// rust_gpu_tools/cuda/mod.rs

//! The CUDA specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
//!
//! The current operation mode is synchronous, in order to have higher safety guarantees. All
//! operations happen on a single stream, which is synchronized after each operation. This is a
//! similar behaviour to CUDA's default stream. The default stream isn't used for two reasons:
//!
//!   1. RustaCUDA doesn't expose a higher level function to launch a kernel on the default stream
//!   2. There was a bug when the default stream was used implicitly via RustaCUDA's synchronous
//!      copy methods. To prevent such bugs, be explicit about which stream is used.

11pub(crate) mod utils;
12
13use std::convert::TryFrom;
14use std::ffi::{c_void, CStr, CString};
15use std::fmt;
16use std::hash::{Hash, Hasher};
17
18use log::debug;
19use rustacuda::memory::{AsyncCopyDestination, DeviceBuffer};
20use rustacuda::stream::{Stream, StreamFlags};
21
22use crate::device::{DeviceUuid, PciId, Vendor};
23use crate::error::{GPUError, GPUResult};
24use crate::LocalBuffer;
25
/// A Buffer to be used for sending and receiving data to/from the GPU.
#[derive(Debug)]
pub struct Buffer<T> {
    /// The raw byte buffer on the device; the element type is only tracked via `_phantom`.
    buffer: DeviceBuffer<u8>,
    /// The number of T-sized elements.
    length: usize,
    /// Marks the buffer as logically holding `T` elements without storing any `T`.
    _phantom: std::marker::PhantomData<T>,
}
34
/// CUDA specific device.
#[derive(Debug, Clone)]
pub struct Device {
    /// The vendor of the GPU.
    vendor: Vendor,
    /// The human readable name of the GPU, e.g. "GeForce RTX 3090".
    name: String,
    /// The total memory of the GPU in bytes.
    memory: u64,
    /// Number of streaming multiprocessors.
    compute_units: u32,
    /// The compute capability of the device, major and minor version.
    compute_capability: (u32, u32),
    /// The PCI-ID identifying the card in this machine, see [`PciId`].
    pci_id: PciId,
    /// The UUID of the device; `None` when not available, see [`DeviceUuid`].
    uuid: Option<DeviceUuid>,
    /// The CUDA context for this device; cloned into each [`Program`] created from it.
    context: rustacuda::context::UnownedContext,
}
50
impl Hash for Device {
    /// Hashes exactly the fields that [`PartialEq`] compares, so that equal devices hash
    /// equally (the `Hash`/`Eq` consistency requirement).
    ///
    /// `compute_units`, `compute_capability` and `context` are intentionally excluded,
    /// matching the `PartialEq` implementation below.
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.vendor.hash(state);
        self.name.hash(state);
        self.memory.hash(state);
        self.pci_id.hash(state);
        self.uuid.hash(state);
    }
}
60
61impl PartialEq for Device {
62    fn eq(&self, other: &Self) -> bool {
63        self.vendor == other.vendor
64            && self.name == other.name
65            && self.memory == other.memory
66            && self.pci_id == other.pci_id
67            && self.uuid == other.uuid
68    }
69}
70
71impl Eq for Device {}
72
impl Device {
    /// Returns the [`Vendor`] of the GPU.
    pub fn vendor(&self) -> Vendor {
        self.vendor
    }

    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
    pub fn name(&self) -> String {
        self.name.clone()
    }

    /// Returns the memory of the GPU in bytes.
    pub fn memory(&self) -> u64 {
        self.memory
    }

    /// Returns the number of compute units (streaming multiprocessors) of the GPU.
    pub fn compute_units(&self) -> u32 {
        self.compute_units
    }

    /// Returns the major and minor version of the compute capability of the GPU.
    pub fn compute_capability(&self) -> (u32, u32) {
        self.compute_capability
    }

    /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information.
    pub fn pci_id(&self) -> PciId {
        self.pci_id
    }

    /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more
    /// information.
    pub fn uuid(&self) -> Option<DeviceUuid> {
        self.uuid
    }
}
110
/// Abstraction that contains everything to run a CUDA kernel on a GPU.
///
/// The majority of methods are the same as [`crate::opencl::Program`], so you can write code using this
/// API, which will then work with OpenCL as well as CUDA kernels.
// When compiled without the `opencl` feature, then the intra-doc link above will be broken.
#[allow(rustdoc::broken_intra_doc_links)]
#[derive(Debug)]
pub struct Program {
    /// The context of the device this program was created for; made current before operations.
    context: rustacuda::context::UnownedContext,
    /// The loaded CUDA module the kernel functions are looked up in.
    module: rustacuda::module::Module,
    /// The single stream all operations are issued on and synchronized with.
    stream: Stream,
    /// Cached device name, returned by [`Program::device_name`].
    device_name: String,
}
124
125impl Program {
126    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
127    pub fn device_name(&self) -> &str {
128        &self.device_name
129    }
130
131    /// Creates a program for a specific device from a compiled CUDA binary file.
132    pub fn from_binary(device: &Device, filename: &CStr) -> GPUResult<Program> {
133        debug!("Creating CUDA program from binary file.");
134        rustacuda::context::CurrentContext::set_current(&device.context)?;
135        let module = rustacuda::module::Module::load_from_file(filename).map_err(|err| {
136            Self::pop_context();
137            err
138        })?;
139        let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
140            Self::pop_context();
141            err
142        })?;
143        let prog = Program {
144            module,
145            stream,
146            device_name: device.name(),
147            context: device.context.clone(),
148        };
149        Self::pop_context();
150        Ok(prog)
151    }
152
153    /// Creates a program for a specific device from a compiled CUDA binary.
154    pub fn from_bytes(device: &Device, bytes: &[u8]) -> GPUResult<Program> {
155        debug!("Creating CUDA program from bytes.");
156        rustacuda::context::CurrentContext::set_current(&device.context)?;
157        let module = rustacuda::module::Module::load_from_bytes(bytes).map_err(|err| {
158            Self::pop_context();
159            err
160        })?;
161        let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
162            Self::pop_context();
163            err
164        })?;
165        let prog = Program {
166            module,
167            stream,
168            device_name: device.name(),
169            context: device.context.clone(),
170        };
171        Self::pop_context();
172        Ok(prog)
173    }
174
175    /// Creates a new buffer that can be used for input/output with the GPU.
176    ///
177    /// The `length` is the number of elements to create.
178    ///
179    /// It is usually used to create buffers that are initialized by the GPU. If you want to
180    /// directly transfer data from the host to the GPU, you would use the safe
181    /// [`Program::create_buffer_from_slice`] instead.
182    ///
183    /// ### Safety
184    ///
185    /// The buffer needs to be initalized (by the host with [`Program::write_from_buffer`]) or by
186    /// the GPU) before it can be read via [`Program::read_into_buffer`].
187    pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
188        assert!(length > 0);
189        // This is the unsafe call, the rest of the function is safe code.
190        let buffer = DeviceBuffer::<u8>::uninitialized(length * std::mem::size_of::<T>())?;
191
192        Ok(Buffer::<T> {
193            buffer,
194            length,
195            _phantom: std::marker::PhantomData,
196        })
197    }
198
199    /// Creates a new buffer on the GPU and initializes with the given slice.
200    pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
201        // The number of bytes is used for the allocations.
202        let bytes_len = slice.len() * std::mem::size_of::<T>();
203
204        // Transmuting types is safe as long a sizes match.
205        let bytes = unsafe {
206            std::slice::from_raw_parts(slice.as_ptr() as *const T as *const u8, bytes_len)
207        };
208
209        // It is only unsafe as long as the buffer isn't initialized, but that's what we do next.
210        let mut buffer = unsafe { DeviceBuffer::<u8>::uninitialized(bytes_len)? };
211        // It is safe as we synchronize the stream after the call.
212        unsafe { buffer.async_copy_from(bytes, &self.stream)? };
213        self.stream.synchronize()?;
214
215        Ok(Buffer::<T> {
216            buffer,
217            length: slice.len(),
218            _phantom: std::marker::PhantomData,
219        })
220    }
221
222    /// Returns a kernel.
223    ///
224    /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
225    /// number of threads. Instead it follows CUDA's definition and is the number of
226    /// `local_work_size` sized thread groups. So the total number of threads is
227    /// `global_work_size * local_work_size`.
228    pub fn create_kernel(&self, name: &str, gws: usize, lws: usize) -> GPUResult<Kernel> {
229        let function_name = CString::new(name).expect("Kernel name must not contain nul bytes");
230        let function = self.module.get_function(&function_name)?;
231
232        Ok(Kernel {
233            function,
234            global_work_size: gws,
235            local_work_size: lws,
236            stream: &self.stream,
237            args: Vec::new(),
238        })
239    }
240
241    /// Puts data from an existing buffer onto the GPU.
242    pub fn write_from_buffer<T>(&self, buffer: &mut Buffer<T>, data: &[T]) -> GPUResult<()> {
243        assert!(data.len() <= buffer.length, "Buffer is too small");
244
245        // Transmuting types is safe as long a sizes match.
246        let bytes = unsafe {
247            std::slice::from_raw_parts(
248                data.as_ptr() as *const T as *const u8,
249                data.len() * std::mem::size_of::<T>(),
250            )
251        };
252
253        // It is safe as we synchronize the stream after the call.
254        unsafe { buffer.buffer.async_copy_from(bytes, &self.stream)? };
255        self.stream.synchronize()?;
256
257        Ok(())
258    }
259
260    /// Reads data from the GPU into an existing buffer.
261    pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
262        assert!(data.len() <= buffer.length, "Buffer is too small");
263
264        // Transmuting types is safe as long a sizes match.
265        let bytes = unsafe {
266            std::slice::from_raw_parts_mut(
267                data.as_mut_ptr() as *mut T as *mut u8,
268                data.len() * std::mem::size_of::<T>(),
269            )
270        };
271
272        // It is safe as we synchronize the stream after the call.
273        unsafe { buffer.buffer.async_copy_to(bytes, &self.stream)? };
274        self.stream.synchronize()?;
275
276        Ok(())
277    }
278
279    /// Run some code in the context of the program.
280    ///
281    /// It sets the correct contexts.
282    ///
283    /// It takes the program as a parameter, so that we can use the same function body, for both
284    /// the OpenCL and the CUDA code path. The only difference is the type of the program.
285    pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
286    where
287        F: FnOnce(&Self, A) -> Result<R, E>,
288        E: From<GPUError>,
289    {
290        rustacuda::context::CurrentContext::set_current(&self.context).map_err(Into::into)?;
291        let result = fun(self, arg);
292        Self::pop_context();
293        result
294    }
295
296    /// Pop the current context.
297    ///
298    /// It panics as it's an unrecoverable error.
299    fn pop_context() {
300        rustacuda::context::ContextStack::pop().expect("Cannot remove context.");
301    }
302}
303
// TODO vmx 2021-07-07: Check if RustaCUDA types used in `Program` can be made `Send`, so that
// this manual `Send` implementation is no longer needed.
//
// SAFETY: NOTE(review): presumed sound because every operation first makes the program's own
// context current and synchronizes its own stream — TODO confirm that the underlying
// `Module`/`Stream`/`UnownedContext` handles may be moved across threads.
unsafe impl Send for Program {}
307
/// Abstraction for kernel arguments.
///
/// Kernel arguments implement this trait, so that they can be converted into the correct
/// pointers needed by the actual kernel call.
pub trait KernelArgument {
    /// Converts into a C void pointer.
    fn as_c_void(&self) -> *mut c_void;

    /// Returns the shared memory size. This is usually 0, except for [`LocalBuffer`]s. This
    /// information is used to allocate the memory correctly.
    fn shared_mem(&self) -> u32 {
        0
    }
}
322
323impl<T> KernelArgument for Buffer<T> {
324    fn as_c_void(&self) -> *mut c_void {
325        &self.buffer as *const _ as _
326    }
327}
328
329impl KernelArgument for i32 {
330    fn as_c_void(&self) -> *mut c_void {
331        self as *const _ as _
332    }
333}
334
335impl KernelArgument for u32 {
336    fn as_c_void(&self) -> *mut c_void {
337        self as *const _ as _
338    }
339}
340
341impl<T> KernelArgument for LocalBuffer<T> {
342    // This is a hack: on CUDA kernels, you cannot have `__shared__` (`__local` in OpenCL lingo)
343    // kernel parameters. Hence, just pass on an arbirtary valid pointer. It won't be used, so it
344    // doesn't matter where it actually points to. A null pointer cannot be used as CUDA would
345    // return an "invalid argument" error.
346    fn as_c_void(&self) -> *mut c_void {
347        self as *const _ as _
348    }
349
350    fn shared_mem(&self) -> u32 {
351        u32::try_from(self.length * std::mem::size_of::<T>())
352            .expect("__shared__ memory allocation is too big.")
353    }
354}
355
/// A kernel that can be executed.
pub struct Kernel<'a> {
    /// The CUDA function that will be launched.
    function: rustacuda::function::Function<'a>,
    /// The number of `local_work_size` sized thread groups (CUDA grid size), *not* the total
    /// number of threads, see [`Program::create_kernel`].
    global_work_size: usize,
    /// The number of threads per group (CUDA block size).
    local_work_size: usize,
    /// The stream the kernel is launched on, borrowed from the owning [`Program`].
    stream: &'a Stream,
    /// The arguments passed to the kernel at launch, in order.
    args: Vec<&'a dyn KernelArgument>,
}
364
365impl fmt::Debug for Kernel<'_> {
366    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
367        let args = self
368            .args
369            .iter()
370            .map(|arg| (arg.as_c_void(), arg.shared_mem()))
371            .collect::<Vec<_>>();
372        f.debug_struct("Kernel")
373            .field("function", &self.function)
374            .field("global_work_size", &self.global_work_size)
375            .field("local_work_size", &self.local_work_size)
376            .field("stream", &self.stream)
377            .field("args", &args)
378            .finish()
379    }
380}
381
382impl<'a> Kernel<'a> {
    /// Set a kernel argument.
    ///
    /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
    /// long as the kernel is in use.
    ///
    /// Example where this behaviour is enforced and leads to a compile-time error:
    ///
    /// ```compile_fail
    /// use rust_gpu_tools::cuda::Program;
    ///
    /// fn would_break(program: &Program) {
    ///    let data = vec![1, 2, 3, 4];
    ///    let buffer = program.create_buffer_from_slice(&data).unwrap();
    ///    let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
    ///    let kernel = kernel.arg(&buffer);
    ///    // This drop wouldn't error if the arguments weren't bound to the kernel's lifetime.
    ///    drop(buffer);
    ///    kernel.run().unwrap();
    /// }
    /// ```
    pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
        // The `'a` bound on the reference ties the argument's lifetime to the kernel.
        self.args.push(t);
        self
    }
407
    /// Actually run the kernel.
    ///
    /// Launches the function on the stream with the configured grid/block sizes and shared
    /// memory, then synchronizes the stream before returning.
    ///
    /// ### Panics
    ///
    /// Panics if the wrong number of arguments was provided.
    pub fn run(self) -> GPUResult<()> {
        // There can only be a single [`LocalBuffer`], due to CUDA restrictions.
        let shared_mem = self
            .args
            .iter()
            .try_fold(0, |acc, &arg| -> GPUResult<u32> {
                let mem = arg.shared_mem();
                match (mem, acc) {
                    // No new shared memory needs to be allocated.
                    (0, _) => Ok(acc),
                    // Some shared memory needs to be allocated.
                    (_, 0) => Ok(mem),
                    // Shared memory must not be allocated more than once.
                    (_, _) => Err(GPUError::Generic(
                        "There cannot be more than one `LocalBuffer`.".to_string(),
                    )),
                }
            })?;
        let args = self
            .args
            .iter()
            .map(|arg| arg.as_c_void())
            .collect::<Vec<_>>();
        // SAFETY: It is safe to launch the kernel as the arguments need to live when the kernel
        // is called, and the buffers are copied synchronously. At the end of the execution, the
        // underlying stream is synchronized.
        unsafe {
            self.stream.launch(
                &self.function,
                self.global_work_size as u32,
                self.local_work_size as u32,
                shared_mem,
                &args,
            )?;
        };
        // Synchronize after the kernel execution, so that the underlying pointers can be
        // invalidated/dropped.
        self.stream.synchronize()?;
        Ok(())
    }
453}