impl CudaExecutor {
pub fn q5k_gemv(
&mut self,
weights: &[u8],
input: &[f32],
output: &mut [f32],
n: u32,
k: u32,
) -> Result<(), GpuError> {
let kernel_type = KernelType::Q5KGemv { k, n };
let kernel_name = self.kernels.kernel_name(&kernel_type);
let cache_key = format!("q5k_gemv_{}_{}", k, n);
if !self.modules.contains_key(&cache_key) {
let ptx = self.kernels.generate_ptx(&kernel_type);
let module = self.compile_ptx(&ptx)?;
self.modules.insert(cache_key.clone(), module);
}
let module = self
.modules
.get_mut(&cache_key)
.expect("module just inserted");
let buf_weights = GpuBuffer::from_host(&self.context, weights)?;
let buf_input = GpuBuffer::from_host(&self.context, input)?;
let buf_output = GpuBuffer::<f32>::new(&self.context, n as usize)?;
let config = LaunchConfig::grid_2d(n, 1, 32, 1);
let mut ptr_output = buf_output.as_ptr();
let mut ptr_weights = buf_weights.as_ptr();
let mut ptr_input = buf_input.as_ptr();
let mut k_val = k;
let mut n_val = n;
unsafe {
self.stream.launch_kernel(
module,
kernel_name,
&config,
&mut [
std::ptr::from_mut(&mut ptr_output) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut ptr_weights) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut ptr_input) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut k_val) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut n_val) as *mut std::ffi::c_void,
],
)?;
}
self.stream.synchronize()?;
buf_output.copy_to_host(output)?;
Ok(())
}
pub fn q6k_gemv(
&mut self,
weights: &[u8],
input: &[f32],
output: &mut [f32],
n: u32,
k: u32,
) -> Result<(), GpuError> {
let kernel_type = KernelType::Q6KGemv { k, n };
let kernel_name = self.kernels.kernel_name(&kernel_type);
let cache_key = format!("q6k_gemv_{}_{}", k, n);
if !self.modules.contains_key(&cache_key) {
let ptx = self.kernels.generate_ptx(&kernel_type);
let module = self.compile_ptx(&ptx)?;
self.modules.insert(cache_key.clone(), module);
}
let module = self
.modules
.get_mut(&cache_key)
.expect("module just inserted");
let buf_weights = GpuBuffer::from_host(&self.context, weights)?;
let buf_input = GpuBuffer::from_host(&self.context, input)?;
let buf_output = GpuBuffer::<f32>::new(&self.context, n as usize)?;
let config = LaunchConfig::grid_2d(n, 1, 32, 1);
let mut ptr_output = buf_output.as_ptr();
let mut ptr_weights = buf_weights.as_ptr();
let mut ptr_input = buf_input.as_ptr();
let mut k_val = k;
let mut n_val = n;
unsafe {
self.stream.launch_kernel(
module,
kernel_name,
&config,
&mut [
std::ptr::from_mut(&mut ptr_output) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut ptr_weights) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut ptr_input) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut k_val) as *mut std::ffi::c_void,
std::ptr::from_mut(&mut n_val) as *mut std::ffi::c_void,
],
)?;
}
self.stream.synchronize()?;
buf_output.copy_to_host(output)?;
Ok(())
}
}
include!("streams.rs");
include!("gemm_tiled.rs");
include!("execute.rs");