async_cuda_core/memory/device.rs
use crate::ffi;
use crate::memory::HostBuffer;
use crate::runtime::Future;
use crate::stream::Stream;

type Result<T> = std::result::Result<T, crate::error::Error>;

/// A buffer on the device.
///
/// # Example
///
/// Copying data from a [`HostBuffer`] to a [`DeviceBuffer`]:
///
/// ```
/// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
/// # tokio_test::block_on(async {
/// let stream = Stream::new().await.unwrap();
/// let all_ones = vec![1_u8; 100];
/// let host_buffer = HostBuffer::<u8>::from_slice(&all_ones).await;
/// let mut device_buffer = DeviceBuffer::<u8>::new(100, &stream).await;
/// device_buffer.copy_from(&host_buffer, &stream).await.unwrap();
/// # })
/// ```
pub struct DeviceBuffer<T: Copy + 'static> {
    inner: ffi::memory::DeviceBuffer<T>,
}

impl<T: Copy + 'static> DeviceBuffer<T> {
    /// Allocates memory on the device.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html#group__CUDART__MEMORY__POOLS_1gbbf70065888d61853c047513baa14081)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `num_elements` - Number of elements to allocate.
    /// * `stream` - Stream to use.
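    ///
    /// # Example
    ///
    /// Allocating a buffer of 100 bytes on a fresh stream:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let buffer = DeviceBuffer::<u8>::new(100, &stream).await;
    /// assert_eq!(buffer.num_elements(), 100);
    /// # })
    /// ```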
    pub async fn new(num_elements: usize, stream: &Stream) -> Self {
        let inner =
            Future::new(move || ffi::memory::DeviceBuffer::<T>::new(num_elements, stream.inner()))
                .await;
        Self { inner }
    }

    /// Allocates memory on the device and copies data from the host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the slice into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `slice` - Data to copy into the buffer.
    /// * `stream` - Stream to use.
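    ///
    /// # Example
    ///
    /// Allocating and uploading a slice of 100 ones in a single call:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let all_ones = vec![1_u8; 100];
    /// let device_buffer = DeviceBuffer::from_slice(&all_ones, &stream).await.unwrap();
    /// assert_eq!(device_buffer.num_elements(), 100);
    /// # })
    /// ```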
    pub async fn from_slice(slice: &[T], stream: &Stream) -> Result<Self> {
        let host_buffer = HostBuffer::from_slice(slice).await;
        let mut this = Self::new(slice.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Allocates memory on the device and copies an array from the host into it.
    ///
    /// This function creates a temporary [`HostBuffer`], copies the array into it, then finally
    /// copies the data from the host buffer to the [`DeviceBuffer`].
    ///
    /// The given stream is automatically synchronized, since the temporary host buffer might
    /// otherwise be dropped before the copy can complete.
    ///
    /// # Arguments
    ///
    /// * `array` - Array to copy into the buffer.
    /// * `stream` - Stream to use.
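    ///
    /// # Example
    ///
    /// Uploading a small two-dimensional array (requires the `ndarray` feature):
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let array = ndarray::arr2(&[[1_u8, 2], [3, 4]]);
    /// let device_buffer = DeviceBuffer::from_array(&array.view(), &stream).await.unwrap();
    /// assert_eq!(device_buffer.num_elements(), 4);
    /// # })
    /// ```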
    #[cfg(feature = "ndarray")]
    pub async fn from_array<D: ndarray::Dimension>(
        array: &ndarray::ArrayView<'_, T, D>,
        stream: &Stream,
    ) -> Result<Self> {
        let host_buffer = HostBuffer::from_array(array).await;
        let mut this = Self::new(array.len(), stream).await;
        this.copy_from(&host_buffer, stream).await?;
        Ok(this)
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
    #[inline]
    pub async fn copy_from(&mut self, other: &HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_from_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from the provided pinned host buffer to this buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy from.
    /// * `stream` - Stream to use.
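    ///
    /// # Example
    ///
    /// Scheduling a copy, then synchronizing the stream before the host buffer is dropped,
    /// which is what makes the unsafe call sound here:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let host_buffer = HostBuffer::<u8>::from_slice(&[1, 2, 3, 4]).await;
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// // SAFETY: the stream is synchronized before the host buffer is dropped.
    /// unsafe {
    ///     device_buffer
    ///         .copy_from_async(&host_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```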
    pub async unsafe fn copy_from_async(
        &mut self,
        other: &HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_from_async(other.inner(), stream.inner())).await
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// This function synchronizes the stream implicitly.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
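    ///
    /// # Example
    ///
    /// Filling a buffer on the device, then downloading it back to the host:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
    /// let mut host_buffer = HostBuffer::<u8>::new(4).await;
    /// device_buffer.copy_to(&mut host_buffer, &stream).await.unwrap();
    /// assert_eq!(host_buffer.to_vec(), vec![0xab_u8; 4]);
    /// # })
    /// ```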
    #[inline]
    pub async fn copy_to(&self, other: &mut HostBuffer<T>, stream: &Stream) -> Result<()> {
        // SAFETY: Stream is synchronized after this.
        unsafe {
            self.copy_to_async(other, stream).await?;
        }
        stream.synchronize().await?;
        Ok(())
    }

    /// Copies memory from this buffer to the provided pinned host buffer.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79)
    ///
    /// # Pinned transfer
    ///
    /// The other buffer (of type [`HostBuffer`]) is always a pinned buffer. This function is
    /// guaranteed to produce a pinned transfer on the runtime thread.
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Safety
    ///
    /// This function is unsafe because the operation might not have completed when the function
    /// returns, and thus the state of the buffer is undefined.
    ///
    /// # Arguments
    ///
    /// * `other` - Buffer to copy to.
    /// * `stream` - Stream to use.
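    ///
    /// # Example
    ///
    /// Scheduling a download, then synchronizing the stream before the host buffer is read:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, HostBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// let mut host_buffer = HostBuffer::<u8>::new(4).await;
    /// // SAFETY: the stream is synchronized before the host buffer is read.
    /// unsafe {
    ///     device_buffer
    ///         .copy_to_async(&mut host_buffer, &stream)
    ///         .await
    ///         .unwrap();
    /// }
    /// stream.synchronize().await.unwrap();
    /// # })
    /// ```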
    pub async unsafe fn copy_to_async(
        &self,
        other: &mut HostBuffer<T>,
        stream: &Stream,
    ) -> Result<()> {
        assert_eq!(self.num_elements(), other.num_elements());
        Future::new(move || self.inner.copy_to_async(other.inner_mut(), stream.inner())).await
    }

    /// Fills the entire buffer with the given byte.
    ///
    /// [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g7c9761e21d9f0999fd136c51e7b9b2a0)
    ///
    /// # Stream ordered semantics
    ///
    /// This function uses stream ordered semantics. It can only be guaranteed to complete
    /// sequentially relative to operations scheduled on the same stream or the default stream.
    ///
    /// # Arguments
    ///
    /// * `value` - Byte value to fill buffer with.
    /// * `stream` - Stream to use.
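    ///
    /// # Example
    ///
    /// Zeroing out a freshly allocated buffer:
    ///
    /// ```
    /// # use async_cuda_core::{DeviceBuffer, Stream};
    /// # tokio_test::block_on(async {
    /// let stream = Stream::new().await.unwrap();
    /// let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
    /// device_buffer.fill_with_byte(0x00, &stream).await.unwrap();
    /// # })
    /// ```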
    pub async fn fill_with_byte(&mut self, value: u8, stream: &Stream) -> Result<()> {
        Future::new(move || self.inner.fill_with_byte(value, stream.inner())).await
    }

    /// Returns the number of elements in the buffer.
    #[inline(always)]
    pub fn num_elements(&self) -> usize {
        self.inner.num_elements
    }

    /// Access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner(&self) -> &ffi::memory::DeviceBuffer<T> {
        &self.inner
    }

    /// Mutably access the inner synchronous implementation of [`DeviceBuffer`].
    #[inline(always)]
    pub fn inner_mut(&mut self) -> &mut ffi::memory::DeviceBuffer<T> {
        &mut self.inner
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_new() {
        let buffer = DeviceBuffer::<u32>::new(100, &Stream::null()).await;
        assert_eq!(buffer.num_elements(), 100);
    }

    #[tokio::test]
    async fn test_copy() {
        let stream = Stream::new().await.unwrap();
        let all_ones = vec![1_u32; 100];
        let host_buffer_all_ones = HostBuffer::from_slice(all_ones.as_slice()).await;

        let mut device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            device_buffer
                .copy_from_async(&host_buffer_all_ones, &stream)
                .await
                .unwrap();
        }

        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            device_buffer
                .copy_to_async(&mut host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut another_device_buffer = DeviceBuffer::<u32>::new(100, &stream).await;
        unsafe {
            another_device_buffer
                .copy_from_async(&host_buffer, &stream)
                .await
                .unwrap();
        }

        let mut return_host_buffer = HostBuffer::<u32>::new(100).await;
        unsafe {
            another_device_buffer
                .copy_to_async(&mut return_host_buffer, &stream)
                .await
                .unwrap();
        }

        stream.synchronize().await.unwrap();

        assert_eq!(return_host_buffer.num_elements(), 100);
        let return_data = return_host_buffer.to_vec();
        assert_eq!(return_data.len(), 100);
        assert!(return_data.into_iter().all(|v| v == 1_u32));
    }

    #[tokio::test]
    async fn test_fill_with_byte() {
        let stream = Stream::new().await.unwrap();
        let mut device_buffer = DeviceBuffer::<u8>::new(4, &stream).await;
        let mut host_buffer = HostBuffer::<u8>::new(4).await;
        device_buffer.fill_with_byte(0xab, &stream).await.unwrap();
        device_buffer
            .copy_to(&mut host_buffer, &stream)
            .await
            .unwrap();
        assert_eq!(host_buffer.to_vec(), &[0xab, 0xab, 0xab, 0xab]);
    }

    #[tokio::test]
    #[should_panic]
    async fn test_it_panics_when_copying_invalid_size() {
        let stream = Stream::new().await.unwrap();
        let device_buffer = DeviceBuffer::<u32>::new(101, &stream).await;
        let mut host_buffer = HostBuffer::<u32>::new(100).await;
        let _ = unsafe { device_buffer.copy_to_async(&mut host_buffer, &stream).await };
    }
}