async_cuda_core/memory/
host.rs

1use crate::ffi;
2use crate::memory::DeviceBuffer;
3use crate::runtime::Future;
4use crate::stream::Stream;
5
// Module-local shorthand: a `Result` whose error type is fixed to `crate::error::Error`.
type Result<T> = std::result::Result<T, crate::error::Error>;
7
8/// A host buffer.
9///
10/// # Performance
11///
12/// Host buffers are managed by CUDA and can be used for pinned memory transfer. Pinned memory
13/// transfer speeds are usually higher compared to paged memory transfers. Pinned memory buffers are
14/// especially important for this crate because the runtime thread must do the least amount of CPU
15/// work possible. Paged transfers do require the host to move data into a CUDA managed buffer first
16/// (an extra memory copy) whilst pinned transfers do not.
17pub struct HostBuffer<T: Copy + 'static> {
18    inner: ffi::memory::HostBuffer<T>,
19}
20
21impl<T: Copy + 'static> HostBuffer<T> {
22    /// Allocates memory on the host. This creates a pinned buffer. Any transfers to and from this
23    /// buffer automatically become pinned transfers, and will be much faster.
24    ///
25    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c)
26    ///
27    /// # Arguments
28    ///
29    /// * `num_elements` - Number of elements to allocate.
30    pub async fn new(num_elements: usize) -> Self {
31        let inner = Future::new(move || ffi::memory::HostBuffer::<T>::new(num_elements)).await;
32        Self { inner }
33    }
34
35    /// Allocates memory on the host and copies the provided data into it.
36    ///
37    /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
38    /// pinned transfers, and will be much faster.
39    ///
40    /// This is a convenience function that allows the caller to quickly put data into a host
41    /// buffer. It is roughly similar to `buffer.copy_from_slice(slice)`.
42    ///
43    /// # Arguments
44    ///
45    /// * `slice` - Data to copy into the new host buffer.
46    pub async fn from_slice(slice: &[T]) -> Self {
47        let mut this = Self::new(slice.len()).await;
48        this.copy_from_slice(slice);
49        this
50    }
51
52    /// Allocates memory on the host and copies the provided array into it.
53    ///
54    /// This creates a pinned buffer. Any transfers to and from this buffer automatically become
55    /// pinned transfers, and will be much faster.
56    ///
57    /// This is a convenience function that allows the caller to quickly put data into a host
58    /// buffer. It is roughly similar to `buffer.copy_from_array(slice)`.
59    ///
60    /// # Arguments
61    ///
62    /// * `array` - Array to copy into the new host buffer.
63    #[cfg(feature = "ndarray")]
64    pub async fn from_array<D: ndarray::Dimension>(array: &ndarray::ArrayView<'_, T, D>) -> Self {
65        let mut this = Self::new(array.len()).await;
66        this.copy_from_array(array);
67        this
68    }
69
70    /// Copies memory from the provided device buffer to this buffer.
71    ///
72    /// This function synchronizes the stream implicitly.
73    ///
74    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
75    ///
76    /// # Pinned transfer
77    ///
78    /// This function is guaranteed to produce a pinned transfer on the runtime thread.
79    ///
80    /// # Stream ordered semantics
81    ///
82    /// This function uses stream ordered semantics. It can only be guaranteed to complete
83    /// sequentially relative to operations scheduled on the same stream or the default stream.
84    ///
85    /// # Arguments
86    ///
87    /// * `other` - Device buffer to copy from.
88    /// * `stream` - Stream to use.
89    #[inline(always)]
90    pub async fn copy_from(&mut self, other: &DeviceBuffer<T>, stream: &Stream) -> Result<()> {
91        other.copy_to(self, stream).await
92    }
93
94    /// Copies memory from the provided device buffer to this buffer.
95    ///
96    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
97    ///
98    /// # Pinned transfer
99    ///
100    /// This function is guaranteed to produce a pinned transfer on the runtime thread.
101    ///
102    /// # Stream ordered semantics
103    ///
104    /// This function uses stream ordered semantics. It can only be guaranteed to complete
105    /// sequentially relative to operations scheduled on the same stream or the default stream.
106    ///
107    /// # Safety
108    ///
109    /// This function is unsafe because the operation might not have completed when the function
110    /// returns, and thus the state of the buffer is undefined.
111    ///
112    /// # Arguments
113    ///
114    /// * `other` - Device buffer to copy from.
115    /// * `stream` - Stream to use.
116    #[inline(always)]
117    pub async unsafe fn copy_from_async(
118        &mut self,
119        other: &DeviceBuffer<T>,
120        stream: &Stream,
121    ) -> Result<()> {
122        other.copy_to_async(self, stream).await
123    }
124
125    /// Copies memory from this buffer to the provided device buffer.
126    ///
127    /// This function synchronizes the stream implicitly.
128    ///
129    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
130    ///
131    /// # Pinned transfer
132    ///
133    /// This function is guaranteed to produce a pinned transfer on the runtime thread.
134    ///
135    /// # Stream ordered semantics
136    ///
137    /// This function uses stream ordered semantics. It can only be guaranteed to complete
138    /// sequentially relative to operations scheduled on the same stream or the default stream.
139    ///
140    /// # Arguments
141    ///
142    /// * `other` - Device buffer to copy to.
143    /// * `stream` - Stream to use.
144    #[inline(always)]
145    pub async fn copy_to(&self, other: &mut DeviceBuffer<T>, stream: &Stream) -> Result<()> {
146        other.copy_from(self, stream).await
147    }
148
149    /// Copies memory from this buffer to the provided device buffer.
150    ///
151    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
152    ///
153    /// # Pinned transfer
154    ///
155    /// This function is guaranteed to produce a pinned transfer on the runtime thread.
156    ///
157    /// # Stream ordered semantics
158    ///
159    /// This function uses stream ordered semantics. It can only be guaranteed to complete
160    /// sequentially relative to operations scheduled on the same stream or the default stream.
161    ///
162    /// # Safety
163    ///
164    /// This function is unsafe because the operation might not have completed when the function
165    /// returns, and thus the state of the buffer is undefined.
166    ///
167    /// # Arguments
168    ///
169    /// * `other` - Device buffer to copy to.
170    /// * `stream` - Stream to use.
171    #[inline(always)]
172    pub async unsafe fn copy_to_async(
173        &self,
174        other: &mut DeviceBuffer<T>,
175        stream: &Stream,
176    ) -> Result<()> {
177        other.copy_from_async(self, stream).await
178    }
179
180    /// Copy data into the host buffer from a slice.
181    ///
182    /// # Synchronization safety
183    ///
184    /// This call is only synchronization-safe if all streams that have previously been used for
185    /// copy operations either from or to this host buffer have been synchronized, and no operations
186    /// have been scheduled since.
187    ///
188    /// # Arguments
189    ///
190    /// * `slice` - Data to copy into the new host buffer.
191    ///
192    /// # Example
193    ///
194    /// ```
195    /// # use async_cuda_core::HostBuffer;
196    /// # tokio_test::block_on(async {
197    /// let mut host_buffer = HostBuffer::<u8>::new(100).await;
198    /// let some_data = vec![10; 100];
199    /// host_buffer.copy_from_slice(&some_data);
200    /// # })
201    /// ```
202    #[inline(always)]
203    pub fn copy_from_slice(&mut self, slice: &[T]) {
204        self.inner.copy_from_slice(slice);
205    }
206
207    /// Copy array into the host buffer from a slice.
208    ///
209    /// # Synchronization safety
210    ///
211    /// This call is only synchronization-safe if all streams that have previously been used for
212    /// copy operations either from or to this host buffer have been synchronized, and no operations
213    /// have been scheduled since.
214    ///
215    /// # Arguments
216    ///
217    /// * `array` - Array to copy into the new host buffer.
218    #[cfg(feature = "ndarray")]
219    #[inline(always)]
220    pub fn copy_from_array<D: ndarray::Dimension>(&mut self, array: &ndarray::ArrayView<T, D>) {
221        self.inner.copy_from_array(array)
222    }
223
224    /// Copy the data to a [`Vec`] and return it.
225    #[inline(always)]
226    pub fn to_vec(&self) -> Vec<T> {
227        self.inner.to_vec()
228    }
229
230    /// Copy the data to an [`ndarray::Array`] and return it.
231    ///
232    /// Function panics if provided shape does not match size of array.
233    ///
234    /// # Arguments
235    ///
236    /// * `shape` - Shape for array.
237    #[cfg(feature = "ndarray")]
238    #[inline(always)]
239    pub fn to_array_with_shape<D: ndarray::Dimension>(
240        &self,
241        shape: impl Into<ndarray::StrideShape<D>>,
242    ) -> ndarray::Array<T, D> {
243        self.inner.to_array_with_shape::<D>(shape)
244    }
245
246    /// Get number of elements in buffer.
247    #[inline(always)]
248    pub fn num_elements(&self) -> usize {
249        self.inner.num_elements
250    }
251
252    /// Access the inner synchronous implementation of [`HostBuffer`].
253    #[inline(always)]
254    pub fn inner(&self) -> &ffi::memory::HostBuffer<T> {
255        &self.inner
256    }
257
258    /// Access the inner synchronous implementation of [`HostBuffer`].
259    #[inline(always)]
260    pub fn inner_mut(&mut self) -> &mut ffi::memory::HostBuffer<T> {
261        &mut self.inner
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly allocated buffer reports, and yields, the requested number of elements.
    #[tokio::test]
    async fn test_new() {
        let host = HostBuffer::<u32>::new(100).await;
        assert_eq!(host.num_elements(), 100);
        let contents = host.to_vec();
        assert_eq!(contents.len(), 100);
    }

    /// A buffer built from a slice has the slice's length and contents.
    #[tokio::test]
    async fn test_from_slice() {
        let ones = vec![1_u32; 200];
        let host = HostBuffer::<u32>::from_slice(&ones[..]).await;
        assert_eq!(host.num_elements(), 200);
        let contents = host.to_vec();
        assert_eq!(contents.len(), 200);
        assert!(contents.iter().all(|&value| value == 1_u32));
    }

    /// Data survives a host -> device -> host round trip on a single stream.
    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let ones = vec![1_u32; 100];
        let source = HostBuffer::<u32>::from_slice(&ones[..]).await;

        let mut device = DeviceBuffer::<u32>::new(100, &stream).await;
        // SAFETY: neither buffer is inspected until the stream is synchronized further down.
        let upload = unsafe { source.copy_to_async(&mut device, &stream).await };
        upload.unwrap();

        let mut destination = HostBuffer::<u32>::new(100).await;
        // SAFETY: `destination` is only read after the `synchronize` call below.
        let download = unsafe { destination.copy_from_async(&device, &stream).await };
        download.unwrap();

        stream.synchronize().await.unwrap();

        assert_eq!(destination.num_elements(), 100);
        let round_tripped = destination.to_vec();
        assert_eq!(round_tripped.len(), 100);
        assert!(round_tripped.iter().all(|&value| value == 1_u32));
    }

    /// Copying between buffers with different element counts is expected to panic.
    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let host = HostBuffer::<u32>::new(100).await;
        let mut device = DeviceBuffer::<u32>::new(101, &Stream::null()).await;
        // SAFETY: the call is expected to panic; the buffers are never read afterwards.
        let _ = unsafe { host.copy_to_async(&mut device, &stream).await };
    }
}