#![allow(clippy::similar_names)]
#[cfg(feature = "cuda")]
use crate::driver::{CudaContext, CudaStream, GpuBuffer, LaunchConfig};
#[cfg(feature = "cuda")]
use crate::error::Result;
#[cfg(feature = "cuda")]
use crate::kernels::{Kernel, LongRowSoftmaxKernel, ScaleKernel, SoftmaxKernel};
#[cfg(feature = "cuda")]
use super::super::cache::compile_lock_launch;
#[cfg(feature = "cuda")]
use super::super::GpuResidentTensor;
#[cfg(feature = "cuda")]
/// Compiles (or fetches from the module cache, keyed by `cache_key`) the given
/// PTX and launches `kernel_name` on `stream` with `config`.
///
/// Thin delegation to `compile_lock_launch`; exists so every op in this file
/// funnels through a single launch entry point.
///
/// `args` holds raw pointers to the kernel parameter values (device pointers
/// and scalars living on the caller's stack). The pointed-to values must stay
/// valid at least for the duration of this call — the callers below satisfy
/// this by keeping the locals alive until after the launch returns.
fn launch_cached_kernel(
ctx: &CudaContext,
stream: &CudaStream,
cache_key: &str,
ptx: &str,
kernel_name: &str,
config: &LaunchConfig,
args: &mut [*mut std::ffi::c_void],
) -> Result<()> {
compile_lock_launch(ctx, stream, cache_key, ptx, kernel_name, config, args)
}
#[cfg(feature = "cuda")]
impl GpuResidentTensor<f32> {
    /// Builds a 1-D launch configuration covering `n` elements with
    /// 256-thread blocks (one thread per element; the last block may be
    /// only partially occupied).
    fn elementwise_launch_config(n: u32) -> LaunchConfig {
        let threads = 256u32;
        // Ceiling division so a trailing partial block is still launched.
        let blocks = (n + threads - 1) / threads;
        LaunchConfig { grid: (blocks, 1, 1), block: (threads, 1, 1), shared_mem: 0 }
    }

    /// Row-wise softmax over `seq_len` contiguous rows of equal length,
    /// synchronized on a private stream before returning, so the result is
    /// immediately readable.
    ///
    /// # Errors
    /// Returns `InvalidParameter` if `seq_len` is zero or does not evenly
    /// divide the tensor length; propagates any CUDA allocation/launch error.
    pub fn softmax(&self, ctx: &CudaContext, seq_len: u32) -> Result<GpuResidentTensor<f32>> {
        let stream = CudaStream::new(ctx)?;
        let result = self.softmax_with_stream(ctx, seq_len, &stream)?;
        // Block until the kernel finishes before handing the tensor back.
        stream.synchronize()?;
        Ok(result)
    }

    /// Row-wise softmax launched on a caller-supplied stream.
    ///
    /// Does NOT synchronize: the returned tensor's contents are only valid
    /// after the caller synchronizes (or otherwise orders work on) `stream`.
    ///
    /// # Errors
    /// Returns `InvalidParameter` if `seq_len` is zero or does not evenly
    /// divide the tensor length; propagates any CUDA allocation/launch error.
    pub fn softmax_with_stream(
        &self,
        ctx: &CudaContext,
        seq_len: u32,
        stream: &CudaStream,
    ) -> Result<GpuResidentTensor<f32>> {
        // Validate before dividing: `seq_len == 0` would otherwise panic on
        // the `%` / `/` operations below.
        if seq_len == 0 {
            return Err(crate::GpuError::InvalidParameter(
                "seq_len must be non-zero".to_string(),
            ));
        }
        let total_elements = self.len();
        if total_elements % (seq_len as usize) != 0 {
            return Err(crate::GpuError::InvalidParameter(format!(
                "Tensor size {} not divisible by seq_len {}",
                total_elements, seq_len
            )));
        }
        let row_size = total_elements / (seq_len as usize);
        let output_buffer = GpuBuffer::new(ctx, total_elements)?;
        let input_ptr = self.as_ptr();
        let output_ptr = output_buffer.as_ptr();
        let row_size_val = row_size as u32;
        // Pick the kernel variant by row length: rows of <= 32 elements are
        // handled by the 32-thread `SoftmaxKernel`; longer rows use the
        // 256-thread `LongRowSoftmaxKernel` with shared-memory scratch.
        let (ptx, kernel_name, cache_key, config) = if row_size <= 32 {
            let kernel = SoftmaxKernel::new(row_size_val);
            (
                kernel.emit_ptx(),
                kernel.name().to_string(),
                format!("softmax:{}", row_size),
                LaunchConfig { grid: (seq_len, 1, 1), block: (32, 1, 1), shared_mem: 0 },
            )
        } else {
            let kernel = LongRowSoftmaxKernel::new(row_size_val);
            (
                kernel.emit_ptx(),
                kernel.name().to_string(),
                format!("softmax_long_row:{}", row_size),
                // 72 bytes of dynamic shared memory — presumably the
                // long-row kernel's reduction scratch; must stay in sync
                // with the PTX emitted by `LongRowSoftmaxKernel`.
                LaunchConfig { grid: (seq_len, 1, 1), block: (256, 1, 1), shared_mem: 72 },
            )
        };
        // Kernel arguments point at locals above; they remain alive until
        // after the launch call returns (see `launch_cached_kernel`).
        let mut args: Vec<*mut std::ffi::c_void> = vec![
            std::ptr::addr_of!(input_ptr) as *mut _,
            std::ptr::addr_of!(output_ptr) as *mut _,
            std::ptr::addr_of!(row_size_val) as *mut _,
        ];
        launch_cached_kernel(ctx, stream, &cache_key, &ptx, &kernel_name, &config, &mut args)?;
        Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
    }

    /// Element-wise addition with `other`, synchronized on a private stream
    /// before returning.
    ///
    /// # Errors
    /// Returns `InvalidParameter` when the tensor lengths differ; propagates
    /// any CUDA allocation/launch error.
    pub fn add(
        &self,
        ctx: &CudaContext,
        other: &GpuResidentTensor<f32>,
    ) -> Result<GpuResidentTensor<f32>> {
        let stream = CudaStream::new(ctx)?;
        let result = self.add_with_stream(ctx, other, &stream)?;
        stream.synchronize()?;
        Ok(result)
    }

    /// Element-wise addition launched on a caller-supplied stream.
    ///
    /// Does NOT synchronize: the result is only valid after the caller
    /// synchronizes `stream`.
    ///
    /// # Errors
    /// Returns `InvalidParameter` when the tensor lengths differ; propagates
    /// any CUDA allocation/launch error.
    pub fn add_with_stream(
        &self,
        ctx: &CudaContext,
        other: &GpuResidentTensor<f32>,
        stream: &CudaStream,
    ) -> Result<GpuResidentTensor<f32>> {
        if self.len() != other.len() {
            return Err(crate::GpuError::InvalidParameter(format!(
                "Size mismatch: {} vs {}",
                self.len(),
                other.len()
            )));
        }
        let n = self.len();
        let output_buffer = GpuBuffer::new(ctx, n)?;
        use crate::kernels::ResidualAddKernel;
        // NOTE(review): `n as u32` truncates for tensors with more than
        // u32::MAX elements — pre-existing behavior, kept as-is.
        let kernel = ResidualAddKernel::new(n as u32);
        let ptx = kernel.emit_ptx();
        let cache_key = format!("residual_add:{}", n);
        let config = Self::elementwise_launch_config(n as u32);
        let a_ptr = self.as_ptr();
        let b_ptr = other.as_ptr();
        let c_ptr = output_buffer.as_ptr();
        let n_val = n as u32;
        let mut args: Vec<*mut std::ffi::c_void> = vec![
            std::ptr::addr_of!(a_ptr) as *mut _,
            std::ptr::addr_of!(b_ptr) as *mut _,
            std::ptr::addr_of!(c_ptr) as *mut _,
            std::ptr::addr_of!(n_val) as *mut _,
        ];
        launch_cached_kernel(ctx, stream, &cache_key, &ptx, kernel.name(), &config, &mut args)?;
        Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
    }

    /// Converts an interleaved `[seq, head, dim]` layout to a head-first
    /// (batched) layout on a caller-supplied stream; does NOT synchronize.
    /// The exact index mapping is defined by `InterleavedToBatchedKernel`.
    ///
    /// # Errors
    /// Returns `InvalidParameter` when the tensor length does not equal
    /// `seq_len * n_heads * head_dim`.
    pub fn interleaved_to_head_first(
        &self,
        ctx: &CudaContext,
        seq_len: u32,
        n_heads: u32,
        head_dim: u32,
        stream: &CudaStream,
    ) -> Result<GpuResidentTensor<f32>> {
        let d_model = n_heads * head_dim;
        // Multiply in usize so large `seq_len * d_model` products cannot
        // overflow u32 before being widened (the original widened too late).
        let total_elems = (seq_len as usize) * (d_model as usize);
        if self.len() != total_elems {
            return Err(crate::GpuError::InvalidParameter(format!(
                "Tensor size {} doesn't match seq_len ({}) x d_model ({})",
                self.len(),
                seq_len,
                d_model
            )));
        }
        let output_buffer = GpuBuffer::new(ctx, total_elems)?;
        use crate::kernels::InterleavedToBatchedKernel;
        let kernel = InterleavedToBatchedKernel::new(seq_len, n_heads, head_dim);
        let ptx = kernel.emit_ptx();
        let cache_key = format!("interleaved_to_batched:{}:{}:{}", seq_len, n_heads, head_dim);
        let config = Self::elementwise_launch_config(total_elems as u32);
        let input_ptr = self.as_ptr();
        let output_ptr = output_buffer.as_ptr();
        let mut args: Vec<*mut std::ffi::c_void> =
            vec![std::ptr::addr_of!(input_ptr) as *mut _, std::ptr::addr_of!(output_ptr) as *mut _];
        launch_cached_kernel(ctx, stream, &cache_key, &ptx, kernel.name(), &config, &mut args)?;
        Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
    }

    /// Multiplies every element by `scale`, synchronized on a private stream
    /// before returning the new tensor.
    ///
    /// # Errors
    /// Propagates any CUDA allocation/launch error.
    pub fn scale(&self, ctx: &CudaContext, scale: f32) -> Result<GpuResidentTensor<f32>> {
        let n = self.len();
        let output_buffer = GpuBuffer::new(ctx, n)?;
        let kernel = ScaleKernel::new(n as u32);
        let ptx = kernel.emit_ptx();
        let cache_key = format!("scale:{}", n);
        let stream = CudaStream::new(ctx)?;
        let config = Self::elementwise_launch_config(n as u32);
        let input_ptr = self.as_ptr();
        let output_ptr = output_buffer.as_ptr();
        let n_val = n as u32;
        let mut args: Vec<*mut std::ffi::c_void> = vec![
            std::ptr::addr_of!(input_ptr) as *mut _,
            std::ptr::addr_of!(output_ptr) as *mut _,
            std::ptr::addr_of!(scale) as *mut _,
            std::ptr::addr_of!(n_val) as *mut _,
        ];
        launch_cached_kernel(ctx, &stream, &cache_key, &ptx, kernel.name(), &config, &mut args)?;
        stream.synchronize()?;
        Ok(GpuResidentTensor::from_buffer_internal(output_buffer, 1))
    }
}