// rust_gpu_tools/cuda/mod.rs
//! The CUDA specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
//!
//! The current operation mode is synchronous, in order to have higher safety guarantees. All
//! operations happen on a single stream, which is synchronized after each operation. This is
//! similar behaviour to CUDA's default stream. The default stream isn't used for two reasons:
//!
//! 1. RustaCUDA doesn't expose a higher level function to launch a kernel on the default stream.
//! 2. There was a bug when the default stream was used implicitly via RustaCUDA's synchronous
//!    copy methods. To prevent such kinds of bugs, be explicit about which stream is used.
11pub(crate) mod utils;
12
13use std::convert::TryFrom;
14use std::ffi::{c_void, CStr, CString};
15use std::fmt;
16use std::hash::{Hash, Hasher};
17
18use log::debug;
19use rustacuda::memory::{AsyncCopyDestination, DeviceBuffer};
20use rustacuda::stream::{Stream, StreamFlags};
21
22use crate::device::{DeviceUuid, PciId, Vendor};
23use crate::error::{GPUError, GPUResult};
24use crate::LocalBuffer;
25
/// A Buffer to be used for sending and receiving data to/from the GPU.
#[derive(Debug)]
pub struct Buffer<T> {
    /// The raw device allocation, always sized in bytes (not in `T`-sized elements).
    buffer: DeviceBuffer<u8>,
    /// The number of T-sized elements.
    length: usize,
    /// Marks the buffer as logically containing `T`s without storing any host-side `T`.
    _phantom: std::marker::PhantomData<T>,
}
34
/// CUDA specific device.
#[derive(Debug, Clone)]
pub struct Device {
    vendor: Vendor,
    name: String,
    /// The total memory of the GPU in bytes.
    memory: u64,
    /// Number of streaming multiprocessors.
    compute_units: u32,
    /// The compute capability of the device, major and minor version.
    compute_capability: (u32, u32),
    pci_id: PciId,
    /// The UUID is optional, as not every device exposes one.
    uuid: Option<DeviceUuid>,
    /// Unowned handle to the device's CUDA context.
    context: rustacuda::context::UnownedContext,
}
50
51impl Hash for Device {
52 fn hash<H: Hasher>(&self, state: &mut H) {
53 self.vendor.hash(state);
54 self.name.hash(state);
55 self.memory.hash(state);
56 self.pci_id.hash(state);
57 self.uuid.hash(state);
58 }
59}
60
61impl PartialEq for Device {
62 fn eq(&self, other: &Self) -> bool {
63 self.vendor == other.vendor
64 && self.name == other.name
65 && self.memory == other.memory
66 && self.pci_id == other.pci_id
67 && self.uuid == other.uuid
68 }
69}
70
71impl Eq for Device {}
72
73impl Device {
74 /// Returns the [`Vendor`] of the GPU.
75 pub fn vendor(&self) -> Vendor {
76 self.vendor
77 }
78
79 /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
80 pub fn name(&self) -> String {
81 self.name.clone()
82 }
83
84 /// Returns the memory of the GPU in bytes.
85 pub fn memory(&self) -> u64 {
86 self.memory
87 }
88
89 /// Returns the number of compute units of the GPU.
90 pub fn compute_units(&self) -> u32 {
91 self.compute_units
92 }
93
94 /// Returns the major and minor version of compute capability of the GPU.
95 pub fn compute_capability(&self) -> (u32, u32) {
96 self.compute_capability
97 }
98
99 /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information.
100 pub fn pci_id(&self) -> PciId {
101 self.pci_id
102 }
103
104 /// Returns the PCI-ID of the GPU if available, see the [`DeviceUuid`] type for more
105 /// information.
106 pub fn uuid(&self) -> Option<DeviceUuid> {
107 self.uuid
108 }
109}
110
/// Abstraction that contains everything to run a CUDA kernel on a GPU.
///
/// The majority of methods are the same as [`crate::opencl::Program`], so you can write code using this
/// API, which will then work with OpenCL as well as CUDA kernels.
// When compiled without the `opencl` feature, then the intra-doc link above will be broken.
#[allow(rustdoc::broken_intra_doc_links)]
#[derive(Debug)]
pub struct Program {
    /// Unowned handle to the CUDA context the program was created for.
    context: rustacuda::context::UnownedContext,
    /// The loaded CUDA module containing the compiled kernels.
    module: rustacuda::module::Module,
    /// The single stream all operations run on; it is synchronized after each operation.
    stream: Stream,
    /// Cached device name, returned by [`Program::device_name`].
    device_name: String,
}
124
125impl Program {
126 /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
127 pub fn device_name(&self) -> &str {
128 &self.device_name
129 }
130
131 /// Creates a program for a specific device from a compiled CUDA binary file.
132 pub fn from_binary(device: &Device, filename: &CStr) -> GPUResult<Program> {
133 debug!("Creating CUDA program from binary file.");
134 rustacuda::context::CurrentContext::set_current(&device.context)?;
135 let module = rustacuda::module::Module::load_from_file(filename).map_err(|err| {
136 Self::pop_context();
137 err
138 })?;
139 let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
140 Self::pop_context();
141 err
142 })?;
143 let prog = Program {
144 module,
145 stream,
146 device_name: device.name(),
147 context: device.context.clone(),
148 };
149 Self::pop_context();
150 Ok(prog)
151 }
152
153 /// Creates a program for a specific device from a compiled CUDA binary.
154 pub fn from_bytes(device: &Device, bytes: &[u8]) -> GPUResult<Program> {
155 debug!("Creating CUDA program from bytes.");
156 rustacuda::context::CurrentContext::set_current(&device.context)?;
157 let module = rustacuda::module::Module::load_from_bytes(bytes).map_err(|err| {
158 Self::pop_context();
159 err
160 })?;
161 let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
162 Self::pop_context();
163 err
164 })?;
165 let prog = Program {
166 module,
167 stream,
168 device_name: device.name(),
169 context: device.context.clone(),
170 };
171 Self::pop_context();
172 Ok(prog)
173 }
174
175 /// Creates a new buffer that can be used for input/output with the GPU.
176 ///
177 /// The `length` is the number of elements to create.
178 ///
179 /// It is usually used to create buffers that are initialized by the GPU. If you want to
180 /// directly transfer data from the host to the GPU, you would use the safe
181 /// [`Program::create_buffer_from_slice`] instead.
182 ///
183 /// ### Safety
184 ///
185 /// The buffer needs to be initalized (by the host with [`Program::write_from_buffer`]) or by
186 /// the GPU) before it can be read via [`Program::read_into_buffer`].
187 pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
188 assert!(length > 0);
189 // This is the unsafe call, the rest of the function is safe code.
190 let buffer = DeviceBuffer::<u8>::uninitialized(length * std::mem::size_of::<T>())?;
191
192 Ok(Buffer::<T> {
193 buffer,
194 length,
195 _phantom: std::marker::PhantomData,
196 })
197 }
198
199 /// Creates a new buffer on the GPU and initializes with the given slice.
200 pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
201 // The number of bytes is used for the allocations.
202 let bytes_len = slice.len() * std::mem::size_of::<T>();
203
204 // Transmuting types is safe as long a sizes match.
205 let bytes = unsafe {
206 std::slice::from_raw_parts(slice.as_ptr() as *const T as *const u8, bytes_len)
207 };
208
209 // It is only unsafe as long as the buffer isn't initialized, but that's what we do next.
210 let mut buffer = unsafe { DeviceBuffer::<u8>::uninitialized(bytes_len)? };
211 // It is safe as we synchronize the stream after the call.
212 unsafe { buffer.async_copy_from(bytes, &self.stream)? };
213 self.stream.synchronize()?;
214
215 Ok(Buffer::<T> {
216 buffer,
217 length: slice.len(),
218 _phantom: std::marker::PhantomData,
219 })
220 }
221
222 /// Returns a kernel.
223 ///
224 /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
225 /// number of threads. Instead it follows CUDA's definition and is the number of
226 /// `local_work_size` sized thread groups. So the total number of threads is
227 /// `global_work_size * local_work_size`.
228 pub fn create_kernel(&self, name: &str, gws: usize, lws: usize) -> GPUResult<Kernel> {
229 let function_name = CString::new(name).expect("Kernel name must not contain nul bytes");
230 let function = self.module.get_function(&function_name)?;
231
232 Ok(Kernel {
233 function,
234 global_work_size: gws,
235 local_work_size: lws,
236 stream: &self.stream,
237 args: Vec::new(),
238 })
239 }
240
241 /// Puts data from an existing buffer onto the GPU.
242 pub fn write_from_buffer<T>(&self, buffer: &mut Buffer<T>, data: &[T]) -> GPUResult<()> {
243 assert!(data.len() <= buffer.length, "Buffer is too small");
244
245 // Transmuting types is safe as long a sizes match.
246 let bytes = unsafe {
247 std::slice::from_raw_parts(
248 data.as_ptr() as *const T as *const u8,
249 data.len() * std::mem::size_of::<T>(),
250 )
251 };
252
253 // It is safe as we synchronize the stream after the call.
254 unsafe { buffer.buffer.async_copy_from(bytes, &self.stream)? };
255 self.stream.synchronize()?;
256
257 Ok(())
258 }
259
260 /// Reads data from the GPU into an existing buffer.
261 pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
262 assert!(data.len() <= buffer.length, "Buffer is too small");
263
264 // Transmuting types is safe as long a sizes match.
265 let bytes = unsafe {
266 std::slice::from_raw_parts_mut(
267 data.as_mut_ptr() as *mut T as *mut u8,
268 data.len() * std::mem::size_of::<T>(),
269 )
270 };
271
272 // It is safe as we synchronize the stream after the call.
273 unsafe { buffer.buffer.async_copy_to(bytes, &self.stream)? };
274 self.stream.synchronize()?;
275
276 Ok(())
277 }
278
279 /// Run some code in the context of the program.
280 ///
281 /// It sets the correct contexts.
282 ///
283 /// It takes the program as a parameter, so that we can use the same function body, for both
284 /// the OpenCL and the CUDA code path. The only difference is the type of the program.
285 pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
286 where
287 F: FnOnce(&Self, A) -> Result<R, E>,
288 E: From<GPUError>,
289 {
290 rustacuda::context::CurrentContext::set_current(&self.context).map_err(Into::into)?;
291 let result = fun(self, arg);
292 Self::pop_context();
293 result
294 }
295
296 /// Pop the current context.
297 ///
298 /// It panics as it's an unrecoverable error.
299 fn pop_context() {
300 rustacuda::context::ContextStack::pop().expect("Cannot remove context.");
301 }
302}
303
// TODO vmx 2021-07-07: Check if RustaCUDA types used in `Program` can be made `Send`, so that
// this manual `Send` implementation is no longer needed.
// SAFETY(review): this asserts that module, stream and context handle may be moved across
// threads — that relies on RustaCUDA/CUDA guarantees not visible here; confirm upstream.
unsafe impl Send for Program {}
307
/// Abstraction for kernel arguments.
///
/// Kernel arguments implement this trait, so that they can be converted into the correct
/// pointers needed by the actual kernel call.
pub trait KernelArgument {
    /// Converts into a C void pointer.
    fn as_c_void(&self) -> *mut c_void;

    /// Returns the shared memory size. This is usually 0, except for [`LocalBuffer`]s. This
    /// information is used to allocate the memory correctly.
    fn shared_mem(&self) -> u32 {
        0
    }
}
322
323impl<T> KernelArgument for Buffer<T> {
324 fn as_c_void(&self) -> *mut c_void {
325 &self.buffer as *const _ as _
326 }
327}
328
329impl KernelArgument for i32 {
330 fn as_c_void(&self) -> *mut c_void {
331 self as *const _ as _
332 }
333}
334
335impl KernelArgument for u32 {
336 fn as_c_void(&self) -> *mut c_void {
337 self as *const _ as _
338 }
339}
340
341impl<T> KernelArgument for LocalBuffer<T> {
342 // This is a hack: on CUDA kernels, you cannot have `__shared__` (`__local` in OpenCL lingo)
343 // kernel parameters. Hence, just pass on an arbirtary valid pointer. It won't be used, so it
344 // doesn't matter where it actually points to. A null pointer cannot be used as CUDA would
345 // return an "invalid argument" error.
346 fn as_c_void(&self) -> *mut c_void {
347 self as *const _ as _
348 }
349
350 fn shared_mem(&self) -> u32 {
351 u32::try_from(self.length * std::mem::size_of::<T>())
352 .expect("__shared__ memory allocation is too big.")
353 }
354}
355
/// A kernel that can be executed.
pub struct Kernel<'a> {
    /// The CUDA function to launch.
    function: rustacuda::function::Function<'a>,
    /// Number of thread groups (CUDA grid size), *not* the total number of threads.
    global_work_size: usize,
    /// Number of threads per group (CUDA block size).
    local_work_size: usize,
    /// The stream the kernel is launched on; borrowed from the owning [`Program`].
    stream: &'a Stream,
    /// The kernel arguments; they must outlive the kernel itself.
    args: Vec<&'a dyn KernelArgument>,
}
364
365impl fmt::Debug for Kernel<'_> {
366 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
367 let args = self
368 .args
369 .iter()
370 .map(|arg| (arg.as_c_void(), arg.shared_mem()))
371 .collect::<Vec<_>>();
372 f.debug_struct("Kernel")
373 .field("function", &self.function)
374 .field("global_work_size", &self.global_work_size)
375 .field("local_work_size", &self.local_work_size)
376 .field("stream", &self.stream)
377 .field("args", &args)
378 .finish()
379 }
380}
381
impl<'a> Kernel<'a> {
    /// Set a kernel argument.
    ///
    /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
    /// long as the kernel is in use.
    ///
    /// Example where this behaviour is enforced and leads to a compile-time error:
    ///
    /// ```compile_fail
    /// use rust_gpu_tools::cuda::Program;
    ///
    /// fn would_break(program: &Program) {
    ///     let data = vec![1, 2, 3, 4];
    ///     let buffer = program.create_buffer_from_slice(&data).unwrap();
    ///     let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
    ///     let kernel = kernel.arg(&buffer);
    ///     // This drop wouldn't error if the arguments wouldn't be bound to the kernels lifetime.
    ///     drop(buffer);
    ///     kernel.run().unwrap();
    /// }
    /// ```
    pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
        self.args.push(t);
        self
    }

    /// Actually run the kernel.
    ///
    /// ### Panics
    ///
    /// Panics if the wrong number of arguments was provided.
    pub fn run(self) -> GPUResult<()> {
        // There can only be a single [`LocalBuffer`], due to CUDA restrictions. Fold over the
        // arguments to find the one non-zero shared memory size, erroring if there are several.
        let shared_mem = self
            .args
            .iter()
            .try_fold(0, |acc, &arg| -> GPUResult<u32> {
                let mem = arg.shared_mem();
                match (mem, acc) {
                    // No new shared memory needs to be allocated.
                    (0, _) => Ok(acc),
                    // Some shared memory needs to be allocated.
                    (_, 0) => Ok(mem),
                    // Shared memory would be allocated more than once, which isn't supported.
                    (_, _) => Err(GPUError::Generic(
                        "There cannot be more than one `LocalBuffer`.".to_string(),
                    )),
                }
            })?;
        // Convert the arguments into the raw void pointers the launch call expects.
        let args = self
            .args
            .iter()
            .map(|arg| arg.as_c_void())
            .collect::<Vec<_>>();
        // It is safe to launch the kernel as the arguments need to live when the kernel is called,
        // and the buffers are copied synchronously. At the end of the execution, the underlying
        // stream is synchronized.
        unsafe {
            self.stream.launch(
                &self.function,
                self.global_work_size as u32,
                self.local_work_size as u32,
                shared_mem,
                &args,
            )?;
        };
        // Synchronize after the kernel execution, so that the underlying pointers can be
        // invalidated/dropped.
        self.stream.synchronize()?;
        Ok(())
    }
}
453}