mod attention;
mod cache;
mod ops;
mod stats;
mod weights;
#[cfg(test)]
mod ops_tests;
#[cfg(feature = "cuda")]
pub use attention::{
batched_multihead_attention, batched_multihead_attention_optimized, incremental_attention_gpu,
incremental_attention_gpu_async, incremental_attention_gpu_with_stream, kv_cache_scatter_gpu,
};
pub use cache::{
clear_kernel_cache, kernel_cache_hits, kernel_cache_misses, reset_kernel_cache_stats,
};
pub use stats::{
reset_transfer_counters, total_d2h_bytes, total_d2h_transfers, total_h2d_bytes,
total_h2d_transfers, TransferStats,
};
#[cfg(feature = "cuda")]
pub use weights::{
forward_encoder_block_gpu, GpuConvFrontendWeights, GpuDecoderBlockWeights, GpuDecoderConfig,
GpuEncoderBlockWeights, GpuEncoderConfig, GpuKvCache,
};
#[cfg(feature = "cuda")]
use stats::{record_d2h_transfer, record_h2d_transfer};
#[cfg(feature = "cuda")]
use crate::driver::{CudaContext, GpuBuffer};
#[cfg(feature = "cuda")]
use crate::error::Result;
/// A [`GpuBuffer`] bundled with per-tensor bookkeeping counters.
///
/// The counters record how many host-to-device uploads, device-to-host
/// downloads and kernel launches have been attributed to this particular
/// tensor. They are plain `u64` fields updated by the methods of this type;
/// they are separate from whatever `record_h2d_transfer` /
/// `record_d2h_transfer` track.
///
/// Only compiled when the `cuda` feature is enabled.
#[cfg(feature = "cuda")]
pub struct GpuResidentTensor<T: Copy> {
/// The wrapped buffer holding the tensor's elements.
pub(crate) buffer: GpuBuffer<T>,
/// Host-to-device uploads attributed to this tensor (`1` when built with
/// `from_host`, `0` for the other constructors).
h2d_count: u64,
/// Device-to-host downloads performed through `to_host`.
d2h_count: u64,
/// Kernel launches attributed to this tensor; seeded by
/// `from_buffer_internal` and bumped by `record_kernel_launch`.
kernel_launches: u64,
/// Residency flag reported by `is_device_resident`. Every constructor in
/// this file sets it to `true`.
is_resident: bool,
}
#[cfg(feature = "cuda")]
impl<T: Copy> GpuResidentTensor<T> {
    /// Uploads `data` into a fresh [`GpuBuffer`] and wraps it in a tensor.
    ///
    /// The new tensor starts with an upload count of one, and the size of
    /// `data` in bytes is reported through `record_h2d_transfer`.
    ///
    /// # Errors
    ///
    /// Propagates the error from `GpuBuffer::from_host`; no transfer is
    /// recorded in that case.
    pub fn from_host(ctx: &CudaContext, data: &[T]) -> Result<Self> {
        let uploaded = GpuBuffer::from_host(ctx, data)?;
        // Account for the upload only once it is known to have succeeded.
        record_h2d_transfer(std::mem::size_of_val(data) as u64);
        Ok(Self {
            buffer: uploaded,
            h2d_count: 1,
            d2h_count: 0,
            kernel_launches: 0,
            is_resident: true,
        })
    }

    /// Creates a tensor backed by a new `len`-element buffer from
    /// `GpuBuffer::new`, with every counter at zero.
    ///
    /// NOTE(review): the name suggests the contents are uninitialised, but
    /// that depends on `GpuBuffer::new` — confirm before reading from it.
    ///
    /// # Errors
    ///
    /// Propagates the error from `GpuBuffer::new`.
    pub fn new_uninit(ctx: &CudaContext, len: usize) -> Result<Self> {
        let storage = GpuBuffer::new(ctx, len)?;
        Ok(Self::from_buffer_internal(storage, 0))
    }

    /// Wraps an existing buffer without counting any transfer.
    ///
    /// `kernel_launches` seeds the launch counter, so a caller can carry over
    /// the launches it took to produce `buffer`.
    pub(crate) fn from_buffer_internal(buffer: GpuBuffer<T>, kernel_launches: u64) -> Self {
        Self {
            buffer,
            h2d_count: 0,
            d2h_count: 0,
            kernel_launches,
            is_resident: true,
        }
    }

    /// Copies the tensor into a host `Vec` and accounts for the download.
    ///
    /// On success this tensor's download count goes up by one and the number
    /// of bytes copied is reported through `record_d2h_transfer`.
    ///
    /// # Errors
    ///
    /// Propagates the error from the underlying copy; the counters are left
    /// untouched in that case.
    pub fn to_host(&mut self) -> Result<Vec<T>>
    where
        T: Default + Clone,
    {
        // Same copy as `peek_host`; the only difference is the accounting.
        let host = self.peek_host()?;
        self.d2h_count += 1;
        record_d2h_transfer(std::mem::size_of_val(host.as_slice()) as u64);
        Ok(host)
    }

    /// Copies the tensor into a host `Vec` *without* any accounting.
    ///
    /// Neither this tensor's download count nor `record_d2h_transfer` is
    /// touched, which is why `&self` is sufficient here.
    ///
    /// # Errors
    ///
    /// Propagates the error from `GpuBuffer::copy_to_host`.
    pub fn peek_host(&self) -> Result<Vec<T>>
    where
        T: Default + Clone,
    {
        // Staging vector sized to the buffer, filled with default values that
        // the copy is expected to overwrite.
        let mut host = vec![T::default(); self.buffer.len()];
        self.buffer.copy_to_host(&mut host)?;
        Ok(host)
    }

    /// Returns the residency flag stored on this tensor.
    #[must_use]
    pub const fn is_device_resident(&self) -> bool {
        self.is_resident
    }

    /// Number of host-to-device uploads attributed to this tensor.
    #[must_use]
    pub const fn h2d_transfers(&self) -> u64 {
        self.h2d_count
    }

    /// Long-form alias of [`Self::h2d_transfers`].
    #[must_use]
    pub const fn host_to_device_transfers(&self) -> u64 {
        self.h2d_transfers()
    }

    /// Number of device-to-host downloads performed through [`Self::to_host`].
    #[must_use]
    pub const fn d2h_transfers(&self) -> u64 {
        self.d2h_count
    }

    /// Long-form alias of [`Self::d2h_transfers`].
    #[must_use]
    pub const fn device_to_host_transfers(&self) -> u64 {
        self.d2h_transfers()
    }

    /// Number of kernel launches attributed to this tensor so far.
    #[must_use]
    pub const fn kernel_launches(&self) -> u64 {
        self.kernel_launches
    }

    /// Element count, as reported by the underlying buffer.
    #[must_use]
    pub fn len(&self) -> usize {
        self.buffer.len()
    }

    /// Whether the underlying buffer reports itself as empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Size in bytes, as reported by the underlying buffer.
    #[must_use]
    pub fn size_bytes(&self) -> usize {
        self.buffer.size_bytes()
    }

    /// Shared access to the wrapped [`GpuBuffer`].
    #[must_use]
    pub fn buffer(&self) -> &GpuBuffer<T> {
        &self.buffer
    }

    /// Exclusive access to the wrapped [`GpuBuffer`].
    ///
    /// Work done through this handle is not reflected in the tensor's
    /// counters unless the caller updates them (for example via
    /// [`Self::record_kernel_launch`]).
    #[must_use]
    pub fn buffer_mut(&mut self) -> &mut GpuBuffer<T> {
        &mut self.buffer
    }

    /// Raw pointer value of the underlying buffer, forwarded from
    /// `GpuBuffer::as_ptr` (presumably a device address — confirm against
    /// the driver module).
    #[must_use]
    pub fn as_ptr(&self) -> u64 {
        self.buffer.as_ptr()
    }

    /// Adds one to this tensor's kernel-launch counter.
    pub fn record_kernel_launch(&mut self) {
        self.kernel_launches += 1;
    }
}
#[cfg(test)]
mod tests;