#![allow(clippy::similar_names)]
#[cfg(feature = "cuda")]
use crate::driver::{CudaContext, CudaStream, GpuBuffer, LaunchConfig};
#[cfg(feature = "cuda")]
use crate::error::Result;
#[cfg(feature = "cuda")]
use crate::kernels::{GemmKernel, Kernel, LongRowSoftmaxKernel, ScaleKernel, SoftmaxKernel};
#[cfg(feature = "cuda")]
use super::cache::get_or_compile_kernel;
#[cfg(feature = "cuda")]
use super::GpuResidentTensor;
#[cfg(feature = "cuda")]
impl GpuResidentTensor<f32> {
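/// Dense GEMM: treats `self` as an `m x k` matrix A and `other` as a `k x n`
/// matrix B, returning the `m x n` product as a new GPU-resident tensor.
/// Returns `InvalidParameter` if either operand's element count does not match
/// the given dimensions.
///
/// Kernel selection: a WMMA FP16 tensor-core kernel when `m`, `n`, and `k` are
/// all >= 64 (unless `TRUENO_FORCE_FP32_GEMM` is set), a tiled/unrolled FP32
/// kernel when `k >= 64`, and a naive kernel otherwise. Launches on a fresh
/// stream and synchronizes before returning.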
pub fn matmul(
&self,
ctx: &CudaContext,
other: &GpuResidentTensor<f32>,
m: u32,
n: u32,
k: u32,
) -> Result<GpuResidentTensor<f32>> {
let expected_a = (m * k) as usize;
let expected_b = (k * n) as usize;
let output_size = (m * n) as usize;
if self.len() != expected_a {
return Err(crate::GpuError::InvalidParameter(format!(
"A has {} elements, expected {} ({}x{})",
self.len(),
expected_a,
m,
k
)));
}
if other.len() != expected_b {
return Err(crate::GpuError::InvalidParameter(format!(
"B has {} elements, expected {} ({}x{})",
other.len(),
expected_b,
k,
n
)));
}
let output_buffer = GpuBuffer::new(ctx, output_size)?;
let tile_size = 16u32;
let force_fp32 = std::env::var("TRUENO_FORCE_FP32_GEMM").is_ok();
let use_wmma = !force_fp32 && k >= 64 && m >= 64 && n >= 64;
let use_tiled = !use_wmma && k >= 64;
let (kernel, cache_key, config) = if use_wmma {
let kernel = GemmKernel::wmma_fp16(m, n, k);
let key = format!("gemm_wmma_fp16:{}x{}x{}", m, n, k);
let grid_x = (n + 15) / 16;
let grid_y = (m + 15) / 16;
let cfg = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (32, 1, 1),
shared_mem: 1024,
};
(kernel, key, cfg)
} else if use_tiled {
let kernel = GemmKernel::tiled_unrolled(m, n, k, tile_size);
let key = format!("gemm_tiled_unrolled:{}x{}x{}", m, n, k);
let grid_x = (n + tile_size - 1) / tile_size;
let grid_y = (m + tile_size - 1) / tile_size;
let cfg = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (tile_size, tile_size, 1),
shared_mem: tile_size * tile_size * 4 * 2,
};
(kernel, key, cfg)
} else {
let kernel = GemmKernel::naive(m, n, k);
let key = format!("gemm_naive:{}x{}x{}", m, n, k);
let block_size = 16u32;
let grid_x = (n + block_size - 1) / block_size;
let grid_y = (m + block_size - 1) / block_size;
let cfg = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (block_size, block_size, 1),
shared_mem: 0,
};
(kernel, key, cfg)
};
let ptx = kernel.emit_ptx();
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let a_ptr = self.as_ptr();
let b_ptr = other.as_ptr();
let c_ptr = output_buffer.as_ptr();
let m_val = m;
let n_val = n;
let k_val = k;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(a_ptr) as *mut _,
std::ptr::addr_of!(b_ptr) as *mut _,
std::ptr::addr_of!(c_ptr) as *mut _,
std::ptr::addr_of!(m_val) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
std::ptr::addr_of!(k_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
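/// Same as [`matmul`](Self::matmul), but enqueues the launch on the caller's
/// `stream` and returns without synchronizing; the result is valid only after
/// the stream has been synchronized.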
pub fn matmul_with_stream(
&self,
ctx: &CudaContext,
other: &GpuResidentTensor<f32>,
m: u32,
n: u32,
k: u32,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
let expected_a = (m * k) as usize;
let expected_b = (k * n) as usize;
let output_size = (m * n) as usize;
if self.len() != expected_a {
return Err(crate::GpuError::InvalidParameter(format!(
"A has {} elements, expected {} ({}x{})",
self.len(), expected_a, m, k
)));
}
if other.len() != expected_b {
return Err(crate::GpuError::InvalidParameter(format!(
"B has {} elements, expected {} ({}x{})",
other.len(), expected_b, k, n
)));
}
let output_buffer = GpuBuffer::new(ctx, output_size)?;
let tile_size = 16u32;
let force_fp32 = std::env::var("TRUENO_FORCE_FP32_GEMM").is_ok();
let use_wmma = !force_fp32 && k >= 64 && m >= 64 && n >= 64;
let use_tiled = !use_wmma && k >= 64;
let (kernel, cache_key, config) = if use_wmma {
let kernel = GemmKernel::wmma_fp16(m, n, k);
let key = format!("gemm_wmma_fp16:{}x{}x{}", m, n, k);
let grid_x = (n + 15) / 16;
let grid_y = (m + 15) / 16;
let cfg = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (32, 1, 1),
shared_mem: 1024,
};
(kernel, key, cfg)
} else if use_tiled {
let kernel = GemmKernel::tiled_unrolled(m, n, k, tile_size);
let key = format!("gemm_tiled_unrolled:{}x{}x{}", m, n, k);
let grid_x = (n + tile_size - 1) / tile_size;
let grid_y = (m + tile_size - 1) / tile_size;
let cfg = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (tile_size, tile_size, 1),
shared_mem: tile_size * tile_size * 4 * 2,
};
(kernel, key, cfg)
} else {
let kernel = GemmKernel::naive(m, n, k);
let key = format!("gemm_naive:{}x{}x{}", m, n, k);
let block_size = 16u32;
let grid_x = (n + block_size - 1) / block_size;
let grid_y = (m + block_size - 1) / block_size;
let cfg = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (block_size, block_size, 1),
shared_mem: 0,
};
(kernel, key, cfg)
};
let ptx = kernel.emit_ptx();
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let a_ptr = self.as_ptr();
let b_ptr = other.as_ptr();
let c_ptr = output_buffer.as_ptr();
let m_val = m;
let n_val = n;
let k_val = k;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(a_ptr) as *mut _,
std::ptr::addr_of!(b_ptr) as *mut _,
std::ptr::addr_of!(c_ptr) as *mut _,
std::ptr::addr_of!(m_val) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
std::ptr::addr_of!(k_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
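/// Row-wise softmax over `seq_len` rows of `self.len() / seq_len` elements each.
/// Rows of up to 32 elements use the warp-level `SoftmaxKernel`; longer rows use
/// `LongRowSoftmaxKernel` with 256 threads per row. Launches on a fresh stream
/// and synchronizes before returning.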
pub fn softmax(&self, ctx: &CudaContext, seq_len: u32) -> Result<GpuResidentTensor<f32>> {
let total_elements = self.len();
if seq_len == 0 || total_elements % (seq_len as usize) != 0 {
return Err(crate::GpuError::InvalidParameter(format!(
"Tensor size {} not divisible by seq_len {}",
total_elements, seq_len
)));
}
let row_size = total_elements / (seq_len as usize);
let output_buffer = GpuBuffer::new(ctx, total_elements)?;
let stream = CudaStream::new(ctx)?;
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let row_size_val = row_size as u32;
if row_size <= 32 {
let kernel = SoftmaxKernel::new(row_size as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("softmax:{}", row_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let config = LaunchConfig {
grid: (seq_len, 1, 1),
block: (32, 1, 1),
shared_mem: 0,
};
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(row_size_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
} else {
let kernel = LongRowSoftmaxKernel::new(row_size as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("softmax_long_row:{}", row_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let config = LaunchConfig {
grid: (seq_len, 1, 1),
block: (256, 1, 1),
shared_mem: 72,
};
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(row_size_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
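/// Asynchronous variant of [`softmax`](Self::softmax): enqueues on the caller's
/// `stream` and does not synchronize.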
pub fn softmax_with_stream(
&self,
ctx: &CudaContext,
seq_len: u32,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
let total_elements = self.len();
if seq_len == 0 || total_elements % (seq_len as usize) != 0 {
return Err(crate::GpuError::InvalidParameter(format!(
"Tensor size {} not divisible by seq_len {}",
total_elements, seq_len
)));
}
let row_size = total_elements / (seq_len as usize);
let output_buffer = GpuBuffer::new(ctx, total_elements)?;
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let row_size_val = row_size as u32;
if row_size <= 32 {
let kernel = SoftmaxKernel::new(row_size as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("softmax:{}", row_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let config = LaunchConfig {
grid: (seq_len, 1, 1),
block: (32, 1, 1),
shared_mem: 0,
};
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(row_size_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
} else {
let kernel = LongRowSoftmaxKernel::new(row_size as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("softmax_long_row:{}", row_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let config = LaunchConfig {
grid: (seq_len, 1, 1),
block: (256, 1, 1),
shared_mem: 72,
};
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(row_size_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
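/// Elementwise (residual) addition of two tensors of equal length, producing a
/// new tensor. Launches on a fresh stream and synchronizes before returning.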
pub fn add(
&self,
ctx: &CudaContext,
other: &GpuResidentTensor<f32>,
) -> Result<GpuResidentTensor<f32>> {
if self.len() != other.len() {
return Err(crate::GpuError::InvalidParameter(format!(
"Size mismatch: {} vs {}",
self.len(),
other.len()
)));
}
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
use crate::kernels::ResidualAddKernel;
let kernel = ResidualAddKernel::new(n as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("residual_add:{}", n);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let a_ptr = self.as_ptr();
let b_ptr = other.as_ptr();
let c_ptr = output_buffer.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(a_ptr) as *mut _,
std::ptr::addr_of!(b_ptr) as *mut _,
std::ptr::addr_of!(c_ptr) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
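/// Asynchronous variant of [`add`](Self::add): enqueues on the caller's `stream`
/// and does not synchronize.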
pub fn add_with_stream(
&self,
ctx: &CudaContext,
other: &GpuResidentTensor<f32>,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
if self.len() != other.len() {
return Err(crate::GpuError::InvalidParameter(format!(
"Size mismatch: {} vs {}",
self.len(),
other.len()
)));
}
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
use crate::kernels::ResidualAddKernel;
let kernel = ResidualAddKernel::new(n as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("residual_add:{}", n);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let a_ptr = self.as_ptr();
let b_ptr = other.as_ptr();
let c_ptr = output_buffer.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(a_ptr) as *mut _,
std::ptr::addr_of!(b_ptr) as *mut _,
std::ptr::addr_of!(c_ptr) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
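/// Reorders activations from the interleaved `[seq_len, n_heads, head_dim]`
/// layout into a head-first (per-head batched) layout via
/// `InterleavedToBatchedKernel`; the exact index mapping is defined by that
/// kernel. Enqueues on the caller's `stream` and does not synchronize.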
pub fn interleaved_to_head_first(
&self,
ctx: &CudaContext,
seq_len: u32,
n_heads: u32,
head_dim: u32,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
let d_model = n_heads * head_dim;
let total_elems = (seq_len * d_model) as usize;
if self.len() != total_elems {
return Err(crate::GpuError::InvalidParameter(format!(
"Tensor size {} doesn't match seq_len ({}) × d_model ({})",
self.len(),
seq_len,
d_model
)));
}
let output_buffer = GpuBuffer::new(ctx, total_elems)?;
use crate::kernels::InterleavedToBatchedKernel;
let kernel = InterleavedToBatchedKernel::new(seq_len, n_heads, head_dim);
let ptx = kernel.emit_ptx();
let cache_key = format!("interleaved_to_batched:{}:{}:{}", seq_len, n_heads, head_dim);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let threads = 256u32;
let blocks = (total_elems as u32 + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
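/// Multiplies every element by `scale`, producing a new tensor. Launches on a
/// fresh stream and synchronizes before returning.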
pub fn scale(&self, ctx: &CudaContext, scale: f32) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
let kernel = ScaleKernel::new(n as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("scale:{}", n);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(scale) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
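/// Layer normalization over `batch_size` rows of `hidden_size` elements each,
/// applying the learned `gamma` (scale) and `beta` (shift) parameters. One
/// 32-thread block is launched per row. Synchronizes before returning.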
pub fn layer_norm(
&self,
ctx: &CudaContext,
gamma: &GpuResidentTensor<f32>,
beta: &GpuResidentTensor<f32>,
hidden_size: u32,
batch_size: u32,
) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
use crate::kernels::LayerNormKernel;
let kernel = LayerNormKernel::new(hidden_size);
let ptx = kernel.emit_ptx();
let cache_key = format!("layer_norm:{}", hidden_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let threads = 32u32;
let blocks = batch_size;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let gamma_ptr = gamma.as_ptr();
let beta_ptr = beta.as_ptr();
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(gamma_ptr) as *mut _,
std::ptr::addr_of!(beta_ptr) as *mut _,
std::ptr::addr_of!(hidden_size) as *mut _,
std::ptr::addr_of!(batch_size) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
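/// Asynchronous variant of [`layer_norm`](Self::layer_norm): enqueues on the
/// caller's `stream` and does not synchronize.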
pub fn layer_norm_with_stream(
&self,
ctx: &CudaContext,
gamma: &GpuResidentTensor<f32>,
beta: &GpuResidentTensor<f32>,
hidden_size: u32,
batch_size: u32,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
use crate::kernels::LayerNormKernel;
let kernel = LayerNormKernel::new(hidden_size);
let ptx = kernel.emit_ptx();
let cache_key = format!("layer_norm:{}", hidden_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let threads = 32u32;
let blocks = batch_size;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let gamma_ptr = gamma.as_ptr();
let beta_ptr = beta.as_ptr();
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(gamma_ptr) as *mut _,
std::ptr::addr_of!(beta_ptr) as *mut _,
std::ptr::addr_of!(hidden_size) as *mut _,
std::ptr::addr_of!(batch_size) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
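/// Applies the GELU activation elementwise, producing a new tensor. Launches on
/// a fresh stream and synchronizes before returning.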
pub fn gelu(&self, ctx: &CudaContext) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
use crate::kernels::GeluKernel;
let kernel = GeluKernel::new(n as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("gelu:{}", n);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
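/// Asynchronous variant of [`gelu`](Self::gelu): enqueues on the caller's
/// `stream` and does not synchronize.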
pub fn gelu_with_stream(
&self,
ctx: &CudaContext,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let output_buffer = GpuBuffer::new(ctx, n)?;
use crate::kernels::GeluKernel;
let kernel = GeluKernel::new(n as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("gelu:{}", n);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let output_ptr = output_buffer.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
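/// Adds `bias` (broadcast over the input) to a copy of `self`. The input is
/// first duplicated into the output buffer, then `BiasActivationKernel` updates
/// it in place, so `self` is left unmodified. Synchronizes before returning.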
pub fn bias_add(
&self,
ctx: &CudaContext,
bias: &GpuResidentTensor<f32>,
) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let bias_size = bias.len();
let stream = CudaStream::new(ctx)?;
let mut output_buffer = GpuBuffer::new(ctx, n)?;
unsafe {
output_buffer.copy_from_buffer_async(&self.buffer, &stream)?;
}
use crate::kernels::BiasActivationKernel;
let kernel = BiasActivationKernel::new(n as u32, bias_size as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("bias_add:{}:{}", n, bias_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let output_ptr = output_buffer.as_ptr();
let bias_ptr = bias.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(bias_ptr) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
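/// Asynchronous variant of [`bias_add`](Self::bias_add): clones the underlying
/// buffer, enqueues on the caller's `stream`, and does not synchronize.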
pub fn bias_add_with_stream(
&self,
ctx: &CudaContext,
bias: &GpuResidentTensor<f32>,
stream: &CudaStream,
) -> Result<GpuResidentTensor<f32>> {
let n = self.len();
let bias_size = bias.len();
let output_buffer = self.buffer.clone(ctx)?;
use crate::kernels::BiasActivationKernel;
let kernel = BiasActivationKernel::new(n as u32, bias_size as u32);
let ptx = kernel.emit_ptx();
let cache_key = format!("bias_add:{}:{}", n, bias_size);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let threads = 256u32;
let blocks = ((n as u32) + threads - 1) / threads;
let config = LaunchConfig {
grid: (blocks, 1, 1),
block: (threads, 1, 1),
shared_mem: 0,
};
let output_ptr = output_buffer.as_ptr();
let bias_ptr = bias.as_ptr();
let n_val = n as u32;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(bias_ptr) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
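/// Linear layer: multiplies the `batch_size x in_features` input by `weight` to
/// produce a `batch_size x out_features` output, then applies the optional
/// `bias`. Setting the `WHISPER_DEBUG_LINEAR` environment variable prints
/// input/output statistics to stderr.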
pub fn linear(
&self,
ctx: &CudaContext,
weight: &GpuResidentTensor<f32>,
bias: Option<&GpuResidentTensor<f32>>,
batch_size: u32,
in_features: u32,
out_features: u32,
) -> Result<GpuResidentTensor<f32>> {
let debug = std::env::var("WHISPER_DEBUG_LINEAR").is_ok();
if debug {
eprintln!("[DEBUG-LINEAR] input: len={}, batch={}, in_feat={}, out_feat={}",
self.len(), batch_size, in_features, out_features);
let inp = self.peek_host()?;
eprintln!("[DEBUG-LINEAR] input stats: mean={:.6}, max={:.6}",
inp.iter().sum::<f32>() / inp.len() as f32,
inp.iter().cloned().fold(f32::NEG_INFINITY, f32::max));
}
let result = self.matmul(ctx, weight, batch_size, out_features, in_features)?;
if debug {
let res = result.peek_host()?;
eprintln!("[DEBUG-LINEAR] matmul result: len={}, mean={:.6}, max={:.6}",
res.len(),
res.iter().sum::<f32>() / res.len() as f32,
res.iter().cloned().fold(f32::NEG_INFINITY, f32::max));
}
if let Some(b) = bias {
let output = result.bias_add(ctx, b)?;
if debug {
let out = output.peek_host()?;
eprintln!("[DEBUG-LINEAR] after bias_add: len={}, mean={:.6}, max={:.6}",
out.len(),
out.iter().sum::<f32>() / out.len() as f32,
out.iter().cloned().fold(f32::NEG_INFINITY, f32::max));
}
Ok(output)
} else {
Ok(result)
}
}
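/// Fused linear + GELU: a single `FusedGemmBiasGeluKernel` launch performs the
/// GEMM, bias add, and GELU activation, avoiding intermediate buffers.
/// Synchronizes before returning.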
pub fn fused_linear_gelu(
&self,
ctx: &CudaContext,
weight: &GpuResidentTensor<f32>,
bias: &GpuResidentTensor<f32>,
batch_size: u32,
in_features: u32,
out_features: u32,
) -> Result<GpuResidentTensor<f32>> {
use crate::kernels::FusedGemmBiasGeluKernel;
let output_size = (batch_size * out_features) as usize;
let output_buffer = GpuBuffer::new(ctx, output_size)?;
let kernel = FusedGemmBiasGeluKernel::new(batch_size, out_features, in_features);
let ptx = kernel.emit_ptx();
let cache_key = format!("fused_gemm_bias_gelu:{}x{}x{}", batch_size, out_features, in_features);
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let block_size = 16u32;
let grid_x = (out_features + block_size - 1) / block_size;
let grid_y = (batch_size + block_size - 1) / block_size;
let config = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (block_size, block_size, 1),
shared_mem: 0,
};
let a_ptr = self.as_ptr();
let b_ptr = weight.as_ptr();
let bias_ptr = bias.as_ptr();
let c_ptr = output_buffer.as_ptr();
let m_val = batch_size;
let n_val = out_features;
let k_val = in_features;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(a_ptr) as *mut _,
std::ptr::addr_of!(b_ptr) as *mut _,
std::ptr::addr_of!(bias_ptr) as *mut _,
std::ptr::addr_of!(c_ptr) as *mut _,
std::ptr::addr_of!(m_val) as *mut _,
std::ptr::addr_of!(n_val) as *mut _,
std::ptr::addr_of!(k_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
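/// 1-D convolution over `seq_len` positions with `in_channels` input and
/// `out_channels` output channels, using the given `kernel_size`, `stride`, and
/// `padding`. The output has `(seq_len + 2 * padding - kernel_size) / stride + 1`
/// positions. `bias` is optional (a null device pointer is passed when absent).
/// Synchronizes before returning.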
pub fn conv1d(
&self,
ctx: &CudaContext,
weight: &GpuResidentTensor<f32>,
bias: Option<&GpuResidentTensor<f32>>,
in_channels: u32,
out_channels: u32,
kernel_size: u32,
stride: u32,
padding: u32,
seq_len: u32,
) -> Result<GpuResidentTensor<f32>> {
use crate::kernels::Conv1dKernel;
let out_seq_len = (seq_len + 2 * padding - kernel_size) / stride + 1;
let output_size = (out_seq_len * out_channels) as usize;
let expected_input = (seq_len * in_channels) as usize;
if self.len() != expected_input {
return Err(crate::GpuError::InvalidParameter(format!(
"Input has {} elements, expected {} ({}x{})",
self.len(), expected_input, seq_len, in_channels
)));
}
let expected_weight = (out_channels * in_channels * kernel_size) as usize;
if weight.len() != expected_weight {
return Err(crate::GpuError::InvalidParameter(format!(
"Weight has {} elements, expected {} ({}x{}x{})",
weight.len(), expected_weight, out_channels, in_channels, kernel_size
)));
}
let output_buffer = GpuBuffer::new(ctx, output_size)?;
let kernel = Conv1dKernel::new(in_channels, out_channels, kernel_size, stride, padding);
let cache_key = format!(
"conv1d:{}:{}:{}:{}:{}",
in_channels, out_channels, kernel_size, stride, padding
);
let ptx = kernel.emit_ptx();
let module_arc = get_or_compile_kernel(ctx, &cache_key, &ptx)?;
let stream = CudaStream::new(ctx)?;
let block_x = 32u32;
let block_y = 8u32;
let grid_x = (out_seq_len + block_x - 1) / block_x;
let grid_y = (out_channels + block_y - 1) / block_y;
let config = LaunchConfig {
grid: (grid_x, grid_y, 1),
block: (block_x, block_y, 1),
shared_mem: 0,
};
let input_ptr = self.as_ptr();
let weight_ptr = weight.as_ptr();
let bias_ptr = bias.map_or(0_u64, |b| b.as_ptr());
let output_ptr = output_buffer.as_ptr();
let seq_len_val = seq_len;
let mut args: Vec<*mut std::ffi::c_void> = vec![
std::ptr::addr_of!(input_ptr) as *mut _,
std::ptr::addr_of!(weight_ptr) as *mut _,
std::ptr::addr_of!(bias_ptr) as *mut _,
std::ptr::addr_of!(output_ptr) as *mut _,
std::ptr::addr_of!(seq_len_val) as *mut _,
];
{
let mut module = module_arc.lock().map_err(|e| {
crate::GpuError::KernelLaunch(format!("Module lock poisoned: {}", e))
})?;
unsafe {
stream.launch_kernel(&mut module, kernel.name(), &config, &mut args)?;
}
}
stream.synchronize()?;
Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
}
}