mod attention;
mod cache;
mod ops;
mod stats;
mod weights;
#[cfg(test)]
mod ops_tests;
#[cfg(feature = "cuda")]
pub use attention::{
batched_multihead_attention, batched_multihead_attention_optimized,
incremental_attention_gpu, incremental_attention_gpu_async,
incremental_attention_gpu_with_stream, kv_cache_scatter_gpu,
};
pub use cache::{
clear_kernel_cache, kernel_cache_hits, kernel_cache_misses, reset_kernel_cache_stats,
};
pub use stats::{
reset_transfer_counters, total_d2h_bytes, total_d2h_transfers, total_h2d_bytes,
total_h2d_transfers, TransferStats,
};
#[cfg(feature = "cuda")]
pub use weights::{
forward_encoder_block_gpu, GpuConvFrontendWeights, GpuDecoderBlockWeights,
GpuDecoderConfig, GpuEncoderBlockWeights, GpuEncoderConfig, GpuKvCache,
};
#[cfg(feature = "cuda")]
use stats::{record_d2h_transfer, record_h2d_transfer};
#[cfg(feature = "cuda")]
use crate::driver::{CudaContext, GpuBuffer};
#[cfg(feature = "cuda")]
use crate::error::Result;
/// A tensor whose backing storage is resident in GPU device memory.
///
/// Besides the device buffer itself, the struct keeps per-tensor counters of
/// host<->device transfers and kernel launches so callers can audit how often
/// data crosses the host/device boundary.
#[cfg(feature = "cuda")]
pub struct GpuResidentTensor<T: Copy> {
    /// Underlying device allocation; crate-visible so sibling modules in this
    /// crate can operate on it directly.
    pub(crate) buffer: GpuBuffer<T>,
    /// Host-to-device copies performed through this tensor.
    h2d_count: u64,
    /// Device-to-host copies performed through this tensor.
    d2h_count: u64,
    /// Kernel launches recorded via `record_kernel_launch`.
    kernel_launches: u64,
    /// Set to `true` by every visible constructor; reported by
    /// `is_device_resident`.
    is_resident: bool,
}
#[cfg(feature = "cuda")]
impl<T: Copy> GpuResidentTensor<T> {
    /// Uploads `data` into a fresh device allocation.
    ///
    /// Counts one host-to-device transfer both on this tensor and in the
    /// global transfer statistics.
    ///
    /// # Errors
    /// Propagates any allocation or copy failure from `GpuBuffer::from_host`.
    pub fn from_host(ctx: &CudaContext, data: &[T]) -> Result<Self> {
        let buffer = GpuBuffer::from_host(ctx, data)?;
        let bytes = data.len() * std::mem::size_of::<T>();
        record_h2d_transfer(bytes as u64);
        Ok(Self {
            buffer,
            h2d_count: 1,
            d2h_count: 0,
            kernel_launches: 0,
            is_resident: true,
        })
    }

    /// Allocates an uninitialized device buffer of `len` elements.
    ///
    /// No transfer is recorded: nothing crosses the host/device boundary.
    ///
    /// # Errors
    /// Propagates allocation failure from `GpuBuffer::new`.
    pub fn new_uninit(ctx: &CudaContext, len: usize) -> Result<Self> {
        let buffer = GpuBuffer::new(ctx, len)?;
        Ok(Self {
            buffer,
            h2d_count: 0,
            d2h_count: 0,
            kernel_launches: 0,
            is_resident: true,
        })
    }

    /// Wraps an existing device buffer, carrying over an externally tracked
    /// kernel-launch count (used by crate code that produces new tensors
    /// from kernel outputs).
    pub(crate) fn from_buffer_internal(buffer: GpuBuffer<T>, kernel_launches: u64) -> Self {
        Self {
            buffer,
            h2d_count: 0,
            d2h_count: 0,
            kernel_launches,
            is_resident: true,
        }
    }

    /// Copies the tensor's contents back to host memory.
    ///
    /// Counts one device-to-host transfer on this tensor and in the global
    /// statistics. Only `Default` is required for the staging buffer: `Clone`
    /// is already implied by the `T: Copy` bound on this impl, so the former
    /// `+ Clone` bound was redundant and has been dropped (a backward-
    /// compatible relaxation).
    ///
    /// # Errors
    /// Propagates copy failure from `GpuBuffer::copy_to_host`.
    pub fn to_host(&mut self) -> Result<Vec<T>>
    where
        T: Default,
    {
        let mut result = vec![T::default(); self.buffer.len()];
        self.buffer.copy_to_host(&mut result)?;
        let bytes = result.len() * std::mem::size_of::<T>();
        self.d2h_count += 1;
        record_d2h_transfer(bytes as u64);
        Ok(result)
    }

    /// Copies the contents back to host WITHOUT touching any transfer
    /// counters (the unit tests rely on this). Intended for debugging and
    /// assertions where the bookkeeping would distort the statistics under
    /// test; note the copy itself still happens on the hardware.
    ///
    /// # Errors
    /// Propagates copy failure from `GpuBuffer::copy_to_host`.
    pub fn peek_host(&self) -> Result<Vec<T>>
    where
        T: Default,
    {
        let mut result = vec![T::default(); self.buffer.len()];
        self.buffer.copy_to_host(&mut result)?;
        Ok(result)
    }

    /// `true` while the data lives on the device (always `true` for tensors
    /// built through the current constructors).
    #[must_use]
    pub const fn is_device_resident(&self) -> bool {
        self.is_resident
    }

    /// Host-to-device transfers performed through this tensor.
    #[must_use]
    pub const fn h2d_transfers(&self) -> u64 {
        self.h2d_count
    }

    /// Spelled-out alias for [`Self::h2d_transfers`].
    #[must_use]
    pub const fn host_to_device_transfers(&self) -> u64 {
        self.h2d_count
    }

    /// Device-to-host transfers performed through this tensor.
    #[must_use]
    pub const fn d2h_transfers(&self) -> u64 {
        self.d2h_count
    }

    /// Spelled-out alias for [`Self::d2h_transfers`].
    #[must_use]
    pub const fn device_to_host_transfers(&self) -> u64 {
        self.d2h_count
    }

    /// Kernel launches recorded via [`Self::record_kernel_launch`].
    #[must_use]
    pub const fn kernel_launches(&self) -> u64 {
        self.kernel_launches
    }

    /// Number of elements in the device buffer.
    #[must_use]
    pub fn len(&self) -> usize {
        self.buffer.len()
    }

    /// `true` if the buffer holds zero elements.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Size of the device allocation in bytes.
    #[must_use]
    pub fn size_bytes(&self) -> usize {
        self.buffer.size_bytes()
    }

    /// Shared access to the underlying device buffer.
    #[must_use]
    pub fn buffer(&self) -> &GpuBuffer<T> {
        &self.buffer
    }

    /// Mutable access to the underlying device buffer.
    #[must_use]
    pub fn buffer_mut(&mut self) -> &mut GpuBuffer<T> {
        &mut self.buffer
    }

    /// Raw device pointer value, suitable for passing to kernel launches.
    #[must_use]
    pub fn as_ptr(&self) -> u64 {
        self.buffer.as_ptr()
    }

    /// Increments the per-tensor kernel-launch counter; call once per launch
    /// that reads or writes this tensor.
    pub fn record_kernel_launch(&mut self) {
        self.kernel_launches += 1;
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::memory::resident::stats::{record_d2h_transfer, record_h2d_transfer};
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Serializes every test that reads or mutates the process-global
    /// transfer counters.
    ///
    /// The default test harness runs tests on multiple threads; without this
    /// lock, one test's `reset_transfer_counters()` can interleave with
    /// another's `record_*` calls (including those made internally by
    /// `from_host`/`to_host`) and produce flaky assertion failures.
    static COUNTER_LOCK: Mutex<()> = Mutex::new(());

    /// Takes the counter lock, recovering from poisoning so that one failing
    /// test does not cascade into spurious lock panics in the others.
    fn lock_counters() -> MutexGuard<'static, ()> {
        COUNTER_LOCK.lock().unwrap_or_else(PoisonError::into_inner)
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_gpu_resident_tensor_lifecycle() {
        use crate::driver::CudaContext;
        let _guard = lock_counters();
        let ctx = match CudaContext::new(0) {
            Ok(ctx) => ctx,
            Err(e) => {
                // No CUDA device available in this environment; skip rather
                // than fail.
                eprintln!("Skipping CUDA lifecycle test: {:?}", e);
                return;
            }
        };
        reset_transfer_counters();
        let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mut tensor = GpuResidentTensor::from_host(&ctx, &data)
            .expect("Failed to create GpuResidentTensor");
        assert!(tensor.is_device_resident());
        assert_eq!(tensor.len(), 8);
        assert_eq!(tensor.h2d_transfers(), 1);
        assert_eq!(tensor.d2h_transfers(), 0);
        assert_eq!(tensor.kernel_launches(), 0);
        assert_eq!(total_h2d_transfers(), 1);
        assert_eq!(total_d2h_transfers(), 0);
        // 8 x f32 = 32 bytes uploaded.
        assert_eq!(total_h2d_bytes(), 32);
        let result = tensor.to_host().expect("Failed to read from GPU");
        assert_eq!(result, data);
        assert_eq!(tensor.d2h_transfers(), 1);
        assert_eq!(total_d2h_transfers(), 1);
        assert_eq!(total_d2h_bytes(), 32);
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_gpu_resident_tensor_uninit() {
        use crate::driver::CudaContext;
        let _guard = lock_counters();
        let ctx = match CudaContext::new(0) {
            Ok(ctx) => ctx,
            Err(e) => {
                eprintln!("Skipping CUDA uninit test: {:?}", e);
                return;
            }
        };
        reset_transfer_counters();
        let tensor: GpuResidentTensor<f32> = GpuResidentTensor::new_uninit(&ctx, 16)
            .expect("Failed to create uninit GpuResidentTensor");
        // An uninitialized allocation must not count as a transfer.
        assert_eq!(tensor.h2d_transfers(), 0);
        assert_eq!(tensor.d2h_transfers(), 0);
        assert!(tensor.is_device_resident());
        assert_eq!(tensor.len(), 16);
        assert_eq!(tensor.size_bytes(), 64);
        assert_eq!(total_h2d_transfers(), 0);
        assert_eq!(total_d2h_transfers(), 0);
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_gpu_resident_tensor_peek() {
        use crate::driver::CudaContext;
        let _guard = lock_counters();
        let ctx = match CudaContext::new(0) {
            Ok(ctx) => ctx,
            Err(e) => {
                eprintln!("Skipping CUDA peek test: {:?}", e);
                return;
            }
        };
        reset_transfer_counters();
        let data = vec![42.0f32; 4];
        let tensor = GpuResidentTensor::from_host(&ctx, &data)
            .expect("Failed to create GpuResidentTensor");
        let before_h2d = total_h2d_transfers();
        let before_d2h = total_d2h_transfers();
        // peek_host must return the data without touching any counter.
        let peeked = tensor.peek_host().expect("Failed to peek");
        assert_eq!(peeked, data);
        assert_eq!(total_h2d_transfers(), before_h2d);
        assert_eq!(total_d2h_transfers(), before_d2h);
        assert_eq!(tensor.d2h_transfers(), 0);
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_gpu_resident_tensor_buffer_access() {
        use crate::driver::CudaContext;
        // from_host records into the global counters, so serialize with the
        // counter-asserting tests even though this one makes no assertions
        // about the counters itself.
        let _guard = lock_counters();
        let ctx = match CudaContext::new(0) {
            Ok(ctx) => ctx,
            Err(e) => {
                eprintln!("Skipping CUDA buffer access test: {:?}", e);
                return;
            }
        };
        let data = vec![1.0f32, 2.0, 3.0, 4.0];
        let mut tensor = GpuResidentTensor::from_host(&ctx, &data)
            .expect("Failed to create GpuResidentTensor");
        let buf = tensor.buffer();
        assert_eq!(buf.len(), 4);
        let buf_mut = tensor.buffer_mut();
        assert_eq!(buf_mut.len(), 4);
    }

    #[test]
    fn test_transfer_stats_capture_and_delta() {
        let _guard = lock_counters();
        reset_transfer_counters();
        let before = TransferStats::capture();
        assert_eq!(before.total_transfers(), 0);
        record_h2d_transfer(1024);
        record_h2d_transfer(2048);
        record_h2d_transfer(512);
        record_d2h_transfer(512);
        let after = TransferStats::capture();
        let delta = after.delta_from(&before);
        assert_eq!(delta.h2d_transfers, 3);
        assert_eq!(delta.d2h_transfers, 1);
        assert_eq!(delta.h2d_bytes, 3584);
        assert_eq!(delta.d2h_bytes, 512);
        assert_eq!(delta.total_transfers(), 4);
        assert_eq!(delta.total_bytes(), 4096);
    }

    #[test]
    fn test_transfer_stats_display() {
        // Pure formatting test: builds the struct directly, so it does not
        // touch the global counters and needs no lock.
        let stats = TransferStats {
            h2d_transfers: 5,
            d2h_transfers: 2,
            h2d_bytes: 1024 * 1024 * 10,
            d2h_bytes: 1024 * 1024 * 5,
        };
        let display = format!("{}", stats);
        assert!(display.contains("H2D: 5"));
        assert!(display.contains("D2H: 2"));
        assert!(display.contains("10.00 MB"));
        assert!(display.contains("5.00 MB"));
    }

    #[test]
    fn test_reset_counters() {
        let _guard = lock_counters();
        record_h2d_transfer(100);
        record_d2h_transfer(50);
        reset_transfer_counters();
        assert_eq!(total_h2d_transfers(), 0);
        assert_eq!(total_d2h_transfers(), 0);
        assert_eq!(total_h2d_bytes(), 0);
        assert_eq!(total_d2h_bytes(), 0);
    }

    #[cfg(feature = "cuda")]
    #[test]
    fn test_gpu_allocation_under_pressure() {
        use crate::driver::CudaContext;
        let _guard = lock_counters();
        let ctx = match CudaContext::new(0) {
            Ok(ctx) => ctx,
            Err(e) => {
                eprintln!("Skipping GPU pressure test: {:?}", e);
                return;
            }
        };
        reset_transfer_counters();
        // 64 MB worth of f32 elements per chunk.
        const CHUNK_SIZE: usize = 64 * 1024 * 1024 / 4;
        const MAX_CHUNKS: usize = 1024;
        let mut tensors: Vec<GpuResidentTensor<f32>> = Vec::new();
        let mut allocation_count = 0;
        let mut hit_limit = false;
        for _ in 0..MAX_CHUNKS {
            let data = vec![0.0f32; CHUNK_SIZE];
            match GpuResidentTensor::from_host(&ctx, &data) {
                Ok(tensor) => {
                    tensors.push(tensor);
                    allocation_count += 1;
                }
                Err(_) => {
                    hit_limit = true;
                    break;
                }
            }
        }
        assert!(
            allocation_count > 0,
            "Should have allocated at least one tensor"
        );
        let tensors_at_limit = tensors.len();
        eprintln!(
            "GPU pressure test: Allocated {} tensors ({} MB) before limit",
            tensors_at_limit,
            tensors_at_limit * 64
        );
        // Free half the tensors, then verify the driver can allocate again.
        let drop_count = tensors.len() / 2;
        for _ in 0..drop_count {
            tensors.pop();
        }
        let data = vec![0.0f32; CHUNK_SIZE];
        let recovery_result = GpuResidentTensor::from_host(&ctx, &data);
        if hit_limit {
            assert!(
                recovery_result.is_ok(),
                "Should be able to allocate after freeing tensors"
            );
        }
        let total_transfers = total_h2d_transfers();
        assert!(
            total_transfers >= allocation_count as u64,
            "Transfer counter should track all allocations"
        );
    }

    #[test]
    fn test_memory_pool_exhaustion() {
        use crate::memory::pool::{MemoryPool, PoolConfig};
        // Uses its own pool instance; independent of the global counters.
        let config = PoolConfig {
            total_bytes: 1024 * 1024,
            page_size: 64 * 1024,
        };
        let mut pool = MemoryPool::new(config);
        let mut allocations = Vec::new();
        for _ in 0..16 {
            if let Some(id) = pool.allocate(64 * 1024) {
                allocations.push(id);
            }
        }
        let stats = pool.stats();
        assert_eq!(stats.free_pages, 0, "Pool should be completely full");
        let failed_alloc = pool.allocate(64 * 1024);
        assert!(
            failed_alloc.is_none(),
            "Allocation should fail when pool is exhausted"
        );
        if let Some(id) = allocations.pop() {
            assert!(pool.free(id), "Free should succeed");
        }
        let recovered_alloc = pool.allocate(64 * 1024);
        assert!(
            recovered_alloc.is_some(),
            "Allocation should succeed after freeing"
        );
    }
}