1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
use std::ffi::c_void;
use crate::{Buffer, number::Number, CudaDevice};
use super::{fn_cache, api::{culaunch_kernel, cuOccupancyMaxPotentialBlockSize}};
/// Conversion of a kernel argument into an untyped raw pointer.
///
/// `launch_kernel1d` calls [`AsCudaCvoidPtr::as_cvoid_ptr`] on every entry of
/// its `params` list and forwards the resulting pointers to the kernel launch.
pub trait AsCudaCvoidPtr {
    /// Returns a `*mut c_void` derived from `self`.
    ///
    /// The pointer is produced from a shared borrow, so it is only meaningful
    /// while the value it was derived from is still alive.
    fn as_cvoid_ptr(&self) -> *mut c_void;
}
/// Lets a borrowed [`Buffer`] be passed directly as a kernel argument.
impl<T> AsCudaCvoidPtr for &Buffer<T> {
    /// Returns the address of the buffer's `ptr.2` field (a `u64`), not the
    /// value stored in that field.
    ///
    /// NOTE(review): `ptr.2` is presumably the CUDA device pointer; confirm
    /// against the `Buffer` definition.
    fn as_cvoid_ptr(&self) -> *mut c_void {
        // Field access auto-dereferences through the extra reference layer, so
        // this points into the underlying `Buffer`, not at the `&Buffer`.
        let field_addr: *const u64 = &self.ptr.2;
        field_addr as *mut c_void
    }
}
/// Lets an owned [`Buffer`] be used as a kernel argument.
impl<T> AsCudaCvoidPtr for Buffer<T> {
    /// Returns the address of the buffer's `ptr.2` field (a `u64`), not the
    /// value stored in that field.
    ///
    /// NOTE(review): `ptr.2` is presumably the CUDA device pointer; confirm
    /// against the `Buffer` definition.
    fn as_cvoid_ptr(&self) -> *mut c_void {
        let field_addr: *const u64 = &self.ptr.2;
        field_addr as *mut c_void
    }
}
/// Lets any `Number` value be passed to a kernel by address.
impl<T: Number> AsCudaCvoidPtr for T {
    /// Returns the address of `self`, type-erased to `*mut c_void`.
    ///
    /// The pointer is derived from a shared borrow, so it is only valid while
    /// the borrowed value is alive.
    fn as_cvoid_ptr(&self) -> *mut c_void {
        let value_addr: *const T = self;
        value_addr as *mut c_void
    }
}
/// Launches a one-dimensional CUDA kernel that covers `len` elements.
///
/// The kernel `fn_name` is obtained from `src` through `fn_cache`. The block
/// size is the one suggested by `cuOccupancyMaxPotentialBlockSize` (limited to
/// `len`), and the grid size is `ceil(len / block_size)`.
///
/// # Parameters
/// * `len` - number of elements the launch must cover.
/// * `device` - device whose stream the kernel is launched on.
/// * `src` - kernel source handed to `fn_cache`.
/// * `fn_name` - name of the kernel function inside `src`.
/// * `params` - kernel arguments; each is converted with
///   [`AsCudaCvoidPtr::as_cvoid_ptr`] and must stay alive for the call.
///
/// # Errors
/// Propagates any error returned by `fn_cache`, by the occupancy query, or by
/// `culaunch_kernel`.
///
/// A `len` of zero is not an error: the kernel is still resolved (so invalid
/// sources are reported), but nothing is launched.
pub fn launch_kernel1d(len: usize, device: &CudaDevice, src: &str, fn_name: &str, params: Vec<&dyn AsCudaCvoidPtr>) -> crate::Result<()> {
    let params = params
        .into_iter()
        .map(|param| param.as_cvoid_ptr())
        .collect::<Vec<_>>();

    let func = fn_cache(device, src, fn_name)?;

    // A zero-sized grid is rejected by the driver; there is nothing to do.
    if len == 0 {
        return Ok(());
    }

    // The occupancy API takes the block size limit as an `i32`. Saturate
    // instead of letting an `as` cast wrap to a negative or tiny limit.
    let block_size_limit = i32::try_from(len).unwrap_or(i32::MAX);

    let mut min_grid_size = 0;
    let mut block_size: i32 = 0;
    // SAFETY: both out-pointers refer to live, initialized locals, and
    // `func.0` is the function handle that `fn_cache` just returned.
    let status = unsafe {
        cuOccupancyMaxPotentialBlockSize(
            &mut min_grid_size, &mut block_size,
            func.0,
            0, 0,
            block_size_limit,
        )
    };
    status.to_result()?;

    // Guard against a non-positive suggestion so the division below cannot
    // divide by zero.
    let threads_per_block = u32::try_from(block_size).unwrap_or(0).max(1);

    // Ceiling division in `usize` without the `len + block - 1` overflow.
    // `u32 -> usize` is lossless on 32- and 64-bit targets.
    let threads = threads_per_block as usize;
    let blocks = len / threads + usize::from(len % threads != 0);

    // Saturate rather than wrap: an oversized grid is then rejected by the
    // driver instead of silently launching too few blocks.
    let grid_size = u32::try_from(blocks).unwrap_or(u32::MAX);

    culaunch_kernel(
        &func, [grid_size, 1, 1], [threads_per_block, 1, 1],
        &device.stream(), &params
    )?;
    Ok(())
}