async_cuda_core/memory/device.rs

use crate::ffi;
use crate::memory::HostBuffer;
use crate::runtime::Future;
use crate::stream::Stream;

type Result<T> = std::result::Result<T, crate::error::Error>;

/// A buffer on the device.
///
/// # Example
///
/// Copying data from a [`HostBuffer`] to a [`DeviceBuffer`]:
///
/// ```
/// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
/// # tokio_test::block_on(async {
/// let stream = Stream::new().await.unwrap();
/// let all_ones = vec![1_u8; 100];
/// let host_buffer = HostBuffer::<u8>::from_slice(&all_ones).await;
/// let mut device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
/// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
/// # })
/// ```
pub struct DeviceBuffer<T: Copy + 'static> {
    inner: ffi::memory::DeviceBuffer<T>,
}

impl<T: Copy + 'static> DeviceBuffer<T> {
    /// Allocates memory on the device.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html#group__CUDART__MEMORY__POOLS_1gbbf70065888d61853c047513baa14081)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `num_elements` - Number of elements to allocate.
    /// * `stream` - Stream to use.
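    ///
    /// # Example
    ///
    /// A minimal sketch, in the style of the struct-level doctest:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
    /// assert_eq!(device_buffer.num_elements(), 100);
    /// # })
    /// ```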
    pub async fn new(num_elements: usize, stream: &Stream) -> Self {
        let inner =
            Future::new(move || ffi::memory::DeviceBuffer::<T>::new(num_elements, stream.inner()))
                .await;
        Self { inner }
    }

    /// Allocates memory on the device and copies data from the host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the slice into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `slice` - Data to copy into the buffer.
    /// * `stream` - Stream to use.
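    ///
    /// # Example
    ///
    /// A minimal sketch, mirroring the struct-level doctest:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer::from_slice(&[1_u8, 2, 3], &stream).await.unwrap();
    /// assert_eq!(device_buffer.num_elements(), 3);
    /// # })
    /// ```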
    pub async fn from_slice(slice: &[T], stream: &Stream) -> Result<Self> {
        let host_buffer = HostBuffer::from_slice(slice).await;
        let mut this = Self::new(slice.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Allocates memory on the device and copies an array from the host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the array into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `array` - Data to copy into the buffer.
    /// * `stream` - Stream to use.
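    ///
    /// # Example
    ///
    /// A sketch assuming the `ndarray` feature is enabled and `ndarray` is available to
    /// doctests:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let array = ndarray::Array1::from(vec![1_u8, 2, 3]);
    /// let device_buffer = DeviceBuffer::from_array(&array.view(), &stream).await.unwrap();
    /// assert_eq!(device_buffer.num_elements(), 3);
    /// # })
    /// ```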
    #[cfg(feature = "ndarray")]
    pub async fn from_array<D: ndarray::Dimension>(
        array: &ndarray::ArrayView<'_, T, D>,
        stream: &Stream,
    ) -> Result<Self> {
        let host_buffer = HostBuffer::from_array(array).await;
        let mut this = Self::new(array.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
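    ///
    /// # Example
    ///
    /// Copying a pinned host buffer to the device (the stream is synchronized implicitly),
    /// based on the tests below:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::from_slice(&[1_u8; 4]).await;
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
    /// # })
    /// ```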
    #[inline]
    pub async fn copy_from(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_from_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
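    ///
    /// # Example
    ///
    /// A sketch of the asynchronous variant, based on the tests below; the explicit
    /// synchronization is what upholds the safety contract:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::from_slice(&[1_u8; 4]).await;
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// // SAFETY: the stream is synchronized before the buffers are used again.
    /// unsafe {
    ///     device_buffer.copy_from_async(&host_buffer, &stream).await.unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```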
    pub async unsafe fn copy_from_async(
        &mut self,
        other: &HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_from_async(other.inner(), stream.inner())).await
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
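    ///
    /// # Example
    ///
    /// Reading back a filled buffer, based on the tests below:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
    /// let mut host_buffer = HostBuffer::<u8>::new(4).await;
    /// device_buffer.copy_to(&mut host_buffer, &stream).await.unwrap();
    /// assert_eq!(host_buffer.to_vec(), vec![0xab; 4]);
    /// # })
    /// ```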
    #[inline]
    pub async fn copy_to(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_to_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
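    ///
    /// # Example
    ///
    /// A sketch of the asynchronous variant, based on the tests below; the stream is
    /// synchronized before the host buffer is read:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
    /// let mut host_buffer = HostBuffer::<u8>::new(4).await;
    /// // SAFETY: the stream is synchronized before the host buffer is read.
    /// unsafe {
    ///     device_buffer.copy_to_async(&mut host_buffer, &stream).await.unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// assert_eq!(host_buffer.to_vec(), vec![0xab; 4]);
    /// # })
    /// ```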
    pub async unsafe fn copy_to_async(
        &self,
        other: &mut HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_to_async(other.inner_mut(), stream.inner())).await
    }

    /// Fills the entire buffer with the given byte.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g7c9761e21d9f0999fd136c51e7b9b2a0)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `value` - Byte value to fill buffer with.
    /// * `stream` - Stream to use.
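    ///
    /// # Example
    ///
    /// Filling a buffer with `0xab`, as in the tests below:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
    /// # })
    /// ```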
    pub async fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
        Future::new(move || self.inner.fill_with_byte(value, stream.inner())).await
    }

    /// Get number of elements in buffer.
    #[inline(always)]
    pub fn num_elements(&self) -> usize {
        self.inner.num_elements
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner(&self) -> &ffi::memory::DeviceBuffer<T> {
        &self.inner
    }

    /// Mutably access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner_mut(&mut self) -> &mut ffi::memory::DeviceBuffer<T> {
        &mut self.inner
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        let buffer = DeviceBuffer::<u32>::new(100, &Stream::null()).await;
        assert_eq!(buffer.num_elements(), 100);
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let all_ones = vec![1_u32; 100];
        let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice()).await;

        let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer_all_ones, &stream)
                .await
                .unwrap();
        }

        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut another_device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            another_device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut return_host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            another_device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }

        stream.synchronize().await.unwrap();

        assert_eq!(return_host_buffer.num_elements(), 100);
        let return_data = return_host_buffer.to_vec();
        assert_eq!(return_data.len(), 100);
        assert!(return_data.into_iter().all(|v| v == 1_u32));
    }

    #[tokio::test]
    async fn test_fill_with_byte() {
        let stream = Stream::new().await.unwrap();
        let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
        let mut host_buffer = HostBuffer::<u8>::new(4).await;
        device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
        device_buffer
            .copy_to(&mut host_buffer, &stream)
            .await
            .unwrap();
        assert_eq!(host_buffer.to_vec(), &[0xab, 0xab, 0xab, 0xab]);
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let device_buffer = DeviceBuffer::<u32>::new(101, &stream).await;
        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream).await };
    }
}