use crate::error::{RealizarError, Result};
use crate::tensor::Tensor;
use std::collections::HashMap;
use std::time::Duration;
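/// Upper bound on a single GPU buffer allocation, in bytes (256 MiB).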
const MAX_GPU_BUFFER_BYTES: usize = 256 * 1024 * 1024;
pub const LARGE_VOCAB_THRESHOLD: usize = 65536;
#[inline]
#[must_use]
pub fn exceeds_gpu_buffer_limit(elements: usize) -> bool {
elements * std::mem::size_of::<f32>() > MAX_GPU_BUFFER_BYTES
}
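/// One matmul operation as an `(a, b, m, k, n)` tuple: row-major `a` is
/// `m x k` and `b` is `k x n`.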
pub type MatmulOp = (Vec<f32>, Vec<f32>, usize, usize, usize);
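/// Numerically stable softmax: the running maximum is subtracted before
/// exponentiation so large logits cannot overflow to infinity.
///
/// A minimal sketch of the expected behavior:
///
/// ```ignore
/// let probs = scalar_softmax(&[1.0, 2.0, 3.0]);
/// assert_eq!(probs.len(), 3);
/// assert!((probs.iter().sum::<f32>() - 1.0).abs() < 1e-6);
/// ```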
#[must_use]
pub fn scalar_softmax(input: &[f32]) -> Vec<f32> {
if input.is_empty() {
return Vec::new();
}
let max_val = input.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let exp_vals: Vec<f32> = input.iter().map(|&x| (x - max_val).exp()).collect();
let sum: f32 = exp_vals.iter().sum();
exp_vals.iter().map(|&e| e / sum).collect()
}
#[must_use]
pub fn simd_softmax(input: &[f32]) -> Vec<f32> {
if input.is_empty() {
return Vec::new();
}
let max_val = input.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let exp_vals: Vec<f32> = input.iter().map(|&x| (x - max_val).exp()).collect();
let exp_vec = trueno::Vector::from_slice(&exp_vals);
let sum = exp_vec.sum().unwrap_or_else(|_| exp_vals.iter().sum());
exp_vals.iter().map(|&e| e / sum).collect()
}
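/// Rotary position embedding (RoPE), rotate-half layout: within each head,
/// element `i` is paired with element `i + head_dim/2`, and the pair is
/// rotated by `angle = pos / theta^(2i/head_dim)` radians.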
#[must_use]
pub fn scalar_rope(input: &[f32], seq_len: usize, head_dim: usize, theta: f32) -> Vec<f32> {
if input.is_empty() || seq_len == 0 || head_dim == 0 {
return Vec::new();
}
let hidden_dim = input.len() / seq_len;
let num_heads = hidden_dim / head_dim;
let mut output = vec![0.0f32; input.len()];
for pos in 0..seq_len {
for head in 0..num_heads {
let head_start = pos * hidden_dim + head * head_dim;
for i in 0..head_dim / 2 {
let freq = 1.0 / theta.powf((2.0 * i as f32) / head_dim as f32);
let angle = pos as f32 * freq;
let cos_val = angle.cos();
let sin_val = angle.sin();
let idx0 = head_start + i;
let idx1 = head_start + i + head_dim / 2;
if idx1 < input.len() {
let x0 = input[idx0];
let x1 = input[idx1];
output[idx0] = x0 * cos_val - x1 * sin_val;
output[idx1] = x0 * sin_val + x1 * cos_val;
}
}
}
}
output
}
#[must_use]
pub fn simd_rope(input: &[f32], seq_len: usize, head_dim: usize, theta: f32) -> Vec<f32> {
if input.is_empty() || seq_len == 0 || head_dim == 0 {
return Vec::new();
}
let hidden_dim = input.len() / seq_len;
let num_heads = hidden_dim / head_dim;
let half_head = head_dim / 2;
let mut freqs: Vec<f32> = Vec::with_capacity(half_head);
for i in 0..half_head {
freqs.push(1.0 / theta.powf((2.0 * i as f32) / head_dim as f32));
}
let mut output = vec![0.0f32; input.len()];
for pos in 0..seq_len {
let angles: Vec<f32> = freqs.iter().map(|&f| pos as f32 * f).collect();
let cos_vals: Vec<f32> = angles.iter().map(|&a| a.cos()).collect();
let sin_vals: Vec<f32> = angles.iter().map(|&a| a.sin()).collect();
let cos_vec = trueno::Vector::from_slice(&cos_vals);
let sin_vec = trueno::Vector::from_slice(&sin_vals);
for head in 0..num_heads {
let head_start = pos * hidden_dim + head * head_dim;
let x0_slice = &input[head_start..head_start + half_head];
let x1_slice = &input[head_start + half_head..head_start + head_dim];
let x0_vec = trueno::Vector::from_slice(x0_slice);
let x1_vec = trueno::Vector::from_slice(x1_slice);
let x0_cos = x0_vec.mul(&cos_vec).unwrap_or_else(|_| x0_vec.clone());
let x1_sin = x1_vec.mul(&sin_vec).unwrap_or_else(|_| x1_vec.clone());
let x0_sin = x0_vec.mul(&sin_vec).unwrap_or_else(|_| x0_vec.clone());
let x1_cos = x1_vec.mul(&cos_vec).unwrap_or_else(|_| x1_vec.clone());
let out0 = x0_cos
.sub(&x1_sin)
.unwrap_or_else(|_| trueno::Vector::from_slice(x0_slice));
let out1 = x0_sin
.add(&x1_cos)
.unwrap_or_else(|_| trueno::Vector::from_slice(x1_slice));
output[head_start..head_start + half_head].copy_from_slice(out0.as_slice());
output[head_start + half_head..head_start + head_dim].copy_from_slice(out1.as_slice());
}
}
output
}
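/// Q, K, V, and output tensors packed back-to-back in one allocation, so
/// attention kernels walk a single contiguous region instead of four
/// independently allocated buffers.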
#[derive(Debug)]
pub struct ContiguousAttentionBuffer {
data: Vec<f32>,
max_seq_len: usize,
#[allow(dead_code)]
num_heads: usize,
#[allow(dead_code)]
head_dim: usize,
tensor_size: usize,
}
impl ContiguousAttentionBuffer {
#[must_use]
pub fn new(max_seq_len: usize, num_heads: usize, head_dim: usize) -> Self {
let tensor_size = max_seq_len * num_heads * head_dim;
let data = vec![0.0f32; tensor_size * 4];
Self {
data,
max_seq_len,
num_heads,
head_dim,
tensor_size,
}
}
#[must_use]
pub fn is_contiguous(&self) -> bool {
true
}
#[must_use]
pub fn get_views(&self) -> (&[f32], &[f32], &[f32], &[f32]) {
let q_start = 0;
let k_start = self.tensor_size;
let v_start = self.tensor_size * 2;
let o_start = self.tensor_size * 3;
(
&self.data[q_start..k_start],
&self.data[k_start..v_start],
&self.data[v_start..o_start],
&self.data[o_start..],
)
}
pub fn get_views_mut(&mut self) -> (&mut [f32], &mut [f32], &mut [f32], &mut [f32]) {
let tensor_size = self.tensor_size;
let (q, rest) = self.data.split_at_mut(tensor_size);
let (k, rest) = rest.split_at_mut(tensor_size);
let (v, o) = rest.split_at_mut(tensor_size);
(q, k, v, o)
}
pub fn reset(&mut self) {
self.data.fill(0.0);
}
#[must_use]
pub fn max_seq_len(&self) -> usize {
self.max_seq_len
}
}
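/// Gathers the embedding rows for `tokens` into one contiguous buffer.
/// Token ids whose row would fall outside the table yield zero vectors
/// instead of panicking.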
#[must_use]
pub fn batch_embed(embedding_table: &[f32], tokens: &[usize], hidden_dim: usize) -> Vec<f32> {
if tokens.is_empty() || embedding_table.is_empty() {
return Vec::new();
}
let mut result = Vec::with_capacity(tokens.len() * hidden_dim);
for &token in tokens {
let start_idx = token * hidden_dim;
let end_idx = start_idx + hidden_dim;
if end_idx <= embedding_table.len() {
result.extend_from_slice(&embedding_table[start_idx..end_idx]);
} else {
result.extend(std::iter::repeat(0.0).take(hidden_dim));
}
}
result
}
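/// Two-projection feed-forward block (`hidden -> intermediate -> hidden`)
/// with the tanh approximation of GELU between them:
/// `gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`,
/// where `sqrt(2/pi) ~= 0.7978845`. Weights are row-major: `w_up` is
/// `hidden_dim x intermediate_dim` and `w_down` is
/// `intermediate_dim x hidden_dim`.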
#[must_use]
pub fn sequential_ffn(
input: &[f32],
w_up: &[f32],
w_down: &[f32],
hidden_dim: usize,
intermediate_dim: usize,
) -> Vec<f32> {
if input.is_empty() {
return Vec::new();
}
let mut intermediate = vec![0.0f32; intermediate_dim];
for i in 0..intermediate_dim {
let mut sum = 0.0f32;
for j in 0..hidden_dim {
sum += input[j] * w_up[j * intermediate_dim + i];
}
intermediate[i] =
sum * 0.5 * (1.0 + (sum * 0.797_884_5 * (1.0 + 0.044_715 * sum * sum)).tanh());
}
let mut output = vec![0.0f32; hidden_dim];
for i in 0..hidden_dim {
let mut sum = 0.0f32;
for j in 0..intermediate_dim {
sum += intermediate[j] * w_down[j * hidden_dim + i];
}
output[i] = sum;
}
output
}
#[must_use]
pub fn parallel_ffn(
input: &[f32],
w_up: &[f32],
w_down: &[f32],
hidden_dim: usize,
intermediate_dim: usize,
) -> Vec<f32> {
use rayon::prelude::*;
if input.is_empty() {
return Vec::new();
}
    let intermediate: Vec<f32> = (0..intermediate_dim)
        .into_par_iter()
        .map(|i| {
let sum: f32 = (0..hidden_dim)
.map(|j| input[j] * w_up[j * intermediate_dim + i])
.sum();
sum * 0.5 * (1.0 + (sum * 0.797_884_5 * (1.0 + 0.044_715 * sum * sum)).tanh())
})
.collect();
let output: Vec<f32> = (0..hidden_dim)
.into_par_iter()
.map(|i| {
(0..intermediate_dim)
.map(|j| intermediate[j] * w_down[j * hidden_dim + i])
.sum()
})
.collect();
output
}
#[must_use]
pub fn standard_layernorm(input: &[f32], gamma: &[f32], beta: &[f32], eps: f32) -> Vec<f32> {
if input.is_empty() {
return Vec::new();
}
let n = input.len() as f32;
let mean: f32 = input.iter().sum::<f32>() / n;
let variance: f32 = input.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / n;
let std_dev = (variance + eps).sqrt();
input
.iter()
.enumerate()
.map(|(i, &x)| {
let normalized = (x - mean) / std_dev;
normalized * gamma.get(i).copied().unwrap_or(1.0) + beta.get(i).copied().unwrap_or(0.0)
})
.collect()
}
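/// LayerNorm with Welford's single-pass mean/variance: `mean` and `m2` (the
/// sum of squared deviations) are updated incrementally, so the statistics
/// need one sweep over the input instead of the two in `standard_layernorm`.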
#[must_use]
pub fn fused_layernorm(input: &[f32], gamma: &[f32], beta: &[f32], eps: f32) -> Vec<f32> {
if input.is_empty() {
return Vec::new();
}
let n = input.len();
let mut mean = 0.0f32;
let mut m2 = 0.0f32;
for (i, &x) in input.iter().enumerate() {
let delta = x - mean;
mean += delta / (i + 1) as f32;
let delta2 = x - mean;
m2 += delta * delta2;
}
let variance = m2 / n as f32;
let std_dev = (variance + eps).sqrt();
let inv_std = 1.0 / std_dev;
input
.iter()
.enumerate()
.map(|(i, &x)| {
let normalized = (x - mean) * inv_std;
normalized * gamma.get(i).copied().unwrap_or(1.0) + beta.get(i).copied().unwrap_or(0.0)
})
.collect()
}
const CACHE_LINE_SIZE: usize = 64;
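/// `Vec<f32>` over-allocated by up to one cache line so the logical slice
/// can be offset to start on a 64-byte boundary.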
#[derive(Debug)]
pub struct CacheAlignedBuffer {
data: Vec<f32>,
offset: usize,
len: usize,
}
impl CacheAlignedBuffer {
#[must_use]
pub fn new(len: usize) -> Self {
let align_elements = CACHE_LINE_SIZE / std::mem::size_of::<f32>();
let extra = align_elements - 1;
let data = vec![0.0f32; len + extra];
let ptr = data.as_ptr() as usize;
let misalignment = ptr % CACHE_LINE_SIZE;
let offset = if misalignment == 0 {
0
} else {
(CACHE_LINE_SIZE - misalignment) / std::mem::size_of::<f32>()
};
Self { data, offset, len }
}
#[must_use]
pub fn is_aligned(&self, alignment: usize) -> bool {
let ptr = self.as_slice().as_ptr() as usize;
ptr % alignment == 0
}
#[must_use]
pub fn len(&self) -> usize {
self.len
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.len == 0
}
#[must_use]
pub fn as_slice(&self) -> &[f32] {
&self.data[self.offset..self.offset + self.len]
}
pub fn as_mut_slice(&mut self) -> &mut [f32] {
let offset = self.offset;
let len = self.len;
&mut self.data[offset..offset + len]
}
}
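/// Portable stand-in for a prefetch hint: a volatile read of
/// `data[position + distance]` pulls that cache line in ahead of use without
/// architecture-specific intrinsics, at the cost of a real load.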
#[inline]
pub fn prefetch_read(data: &[f32], position: usize, distance: usize) {
let prefetch_pos = position + distance;
if prefetch_pos < data.len() {
let _ = unsafe { std::ptr::read_volatile(&data[prefetch_pos]) };
}
}
#[must_use]
pub fn sequential_sum(data: &[f32]) -> f32 {
data.iter().sum()
}
#[must_use]
pub fn sum_with_prefetch(data: &[f32], prefetch_distance: usize) -> f32 {
let mut sum = 0.0f32;
let len = data.len();
for i in 0..len {
if i + prefetch_distance < len {
prefetch_read(data, i, prefetch_distance);
}
sum += data[i];
}
sum
}
#[must_use]
pub fn naive_matmul(
mat_a: &[f32],
mat_b: &[f32],
rows: usize,
inner: usize,
cols: usize,
) -> Vec<f32> {
let mut result = vec![0.0f32; rows * cols];
for row in 0..rows {
for col in 0..cols {
let mut sum = 0.0f32;
for idx in 0..inner {
sum += mat_a[row * inner + idx] * mat_b[idx * cols + col];
}
result[row * cols + col] = sum;
}
}
result
}
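/// Cache-blocked matmul: the three loops are tiled by `block_size` so each
/// tile of `mat_b` is reused from cache across the inner loops rather than
/// being streamed from memory once per output element.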
#[must_use]
#[allow(clippy::many_single_char_names)]
pub fn blocked_matmul(
mat_a: &[f32],
mat_b: &[f32],
rows: usize,
inner: usize,
cols: usize,
block_size: usize,
) -> Vec<f32> {
let mut result = vec![0.0f32; rows * cols];
for row_blk in (0..rows).step_by(block_size) {
let row_end = (row_blk + block_size).min(rows);
for col_blk in (0..cols).step_by(block_size) {
let col_end = (col_blk + block_size).min(cols);
for inner_blk in (0..inner).step_by(block_size) {
let inner_end = (inner_blk + block_size).min(inner);
for row in row_blk..row_end {
for col in col_blk..col_end {
let mut sum = result[row * cols + col];
for idx in inner_blk..inner_end {
sum += mat_a[row * inner + idx] * mat_b[idx * cols + col];
}
result[row * cols + col] = sum;
}
}
}
}
}
result
}
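/// Free list of reusable `Vec<f32>` buffers, capped at `capacity` entries;
/// `acquire` reuses the first buffer with enough capacity, else allocates.
///
/// A minimal acquire/release round trip:
///
/// ```ignore
/// let mut pool = TensorPool::new(4);
/// let buf = pool.acquire(1024); // fresh allocation on first use
/// pool.release(buf);
/// assert_eq!(pool.available(), 1);
/// let buf = pool.acquire(512); // reuses the cached 1024-capacity buffer
/// assert_eq!(buf.len(), 512);
/// ```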
#[derive(Debug)]
pub struct TensorPool {
capacity: usize,
buffers: Vec<Vec<f32>>,
}
impl TensorPool {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
capacity,
buffers: Vec::with_capacity(capacity),
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.capacity
}
#[must_use]
pub fn available(&self) -> usize {
self.buffers.len()
}
pub fn acquire(&mut self, size: usize) -> Vec<f32> {
if let Some(idx) = self.buffers.iter().position(|b| b.capacity() >= size) {
let mut buffer = self.buffers.swap_remove(idx);
buffer.resize(size, 0.0);
buffer
} else {
vec![0.0f32; size]
}
}
    pub fn release(&mut self, mut buffer: Vec<f32>) {
        if self.buffers.len() < self.capacity {
            // Clear so a later `acquire` re-zeroes the buffer via `resize`,
            // matching the freshly allocated path instead of leaking stale data.
            buffer.clear();
            self.buffers.push(buffer);
        }
    }
pub fn clear(&mut self) {
self.buffers.clear();
}
}
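/// Bump allocator for forward-pass scratch space: `alloc` hands out
/// consecutive sub-slices of one pre-sized buffer, and `reset` reclaims
/// everything at once (e.g. between tokens). Panics when capacity runs out.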
#[derive(Debug)]
pub struct ForwardArena {
data: Vec<f32>,
offset: usize,
}
impl ForwardArena {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
data: vec![0.0f32; capacity],
offset: 0,
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.data.len()
}
#[must_use]
pub fn used(&self) -> usize {
self.offset
}
pub fn alloc(&mut self, size: usize) -> &mut [f32] {
let start = self.offset;
let end = start + size;
assert!(
end <= self.data.len(),
"ForwardArena: insufficient capacity (need {}, have {})",
end,
self.data.len()
);
self.offset = end;
&mut self.data[start..end]
}
pub fn reset(&mut self) {
self.offset = 0;
}
}
#[derive(Debug)]
pub struct ScratchBuffer {
num_layers: usize,
layer_size: usize,
data: Vec<f32>,
}
impl ScratchBuffer {
#[must_use]
pub fn new(num_layers: usize, layer_size: usize) -> Self {
Self {
num_layers,
layer_size,
data: vec![0.0f32; num_layers * layer_size],
}
}
#[must_use]
pub fn num_layers(&self) -> usize {
self.num_layers
}
#[must_use]
pub fn layer_size(&self) -> usize {
self.layer_size
}
#[must_use]
pub fn total_size(&self) -> usize {
self.num_layers * self.layer_size
}
#[must_use]
pub fn get_layer(&self, layer_idx: usize) -> &[f32] {
assert!(
layer_idx < self.num_layers,
"ScratchBuffer: layer index {} out of bounds (max {})",
layer_idx,
self.num_layers
);
let start = layer_idx * self.layer_size;
let end = start + self.layer_size;
&self.data[start..end]
}
pub fn get_layer_mut(&mut self, layer_idx: usize) -> &mut [f32] {
assert!(
layer_idx < self.num_layers,
"ScratchBuffer: layer index {} out of bounds (max {})",
layer_idx,
self.num_layers
);
let start = layer_idx * self.layer_size;
let end = start + self.layer_size;
&mut self.data[start..end]
}
pub fn reset(&mut self) {
self.data.fill(0.0);
}
}
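/// Dot product of two Q4 blocks (18 bytes each: a little-endian f16 scale
/// followed by 16 bytes of packed 4-bit values). Nibbles are biased by 8 to
/// land in [-8, 7]; products accumulate in i32 and both scales apply once.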
#[must_use]
pub fn quantized_dot_q4(block_a: &[u8], block_b: &[u8]) -> f32 {
if block_a.len() < 18 || block_b.len() < 18 {
return 0.0;
}
let scale_a = half::f16::from_le_bytes([block_a[0], block_a[1]]).to_f32();
let scale_b = half::f16::from_le_bytes([block_b[0], block_b[1]]).to_f32();
let mut acc = 0i32;
for i in 0..16 {
let byte_a = block_a[2 + i];
let byte_b = block_b[2 + i];
let a_lo = (byte_a & 0x0F) as i32 - 8;
let a_hi = ((byte_a >> 4) & 0x0F) as i32 - 8;
let b_lo = (byte_b & 0x0F) as i32 - 8;
let b_hi = ((byte_b >> 4) & 0x0F) as i32 - 8;
acc += a_lo * b_lo + a_hi * b_hi;
}
(acc as f32) * scale_a * scale_b
}
#[must_use]
pub fn quantized_dot_q8(block_a: &[u8], block_b: &[u8]) -> f32 {
if block_a.len() < 34 || block_b.len() < 34 {
return 0.0;
}
let scale_a = half::f16::from_le_bytes([block_a[0], block_a[1]]).to_f32();
let scale_b = half::f16::from_le_bytes([block_b[0], block_b[1]]).to_f32();
let mut acc = 0i32;
for i in 0..32 {
let a_val = block_a[2 + i] as i8 as i32;
let b_val = block_b[2 + i] as i8 as i32;
acc += a_val * b_val;
}
(acc as f32) * scale_a * scale_b
}
#[must_use]
pub fn quantized_matvec_q4(weights: &[u8], input: &[f32], rows: usize, cols: usize) -> Vec<f32> {
    const Q4_BLOCK_SIZE: usize = 18;
    const Q4_BLOCK_VALUES: usize = 32;
let blocks_per_row = cols.div_ceil(Q4_BLOCK_VALUES);
let row_bytes = blocks_per_row * Q4_BLOCK_SIZE;
let mut output = vec![0.0f32; rows];
for (row, out_val) in output.iter_mut().enumerate().take(rows) {
let row_offset = row * row_bytes;
let mut acc = 0.0f32;
for block_idx in 0..blocks_per_row {
let block_offset = row_offset + block_idx * Q4_BLOCK_SIZE;
if block_offset + Q4_BLOCK_SIZE > weights.len() {
break;
}
let scale =
half::f16::from_le_bytes([weights[block_offset], weights[block_offset + 1]])
.to_f32();
let input_offset = block_idx * Q4_BLOCK_VALUES;
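            // Note: nibbles are interleaved here (low nibble -> even input
            // index, high nibble -> odd), which differs from GGML's Q4_0
            // layout, where low nibbles hold elements 0..16 of the block and
            // high nibbles hold elements 16..32.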
for i in 0..16 {
let byte = weights[block_offset + 2 + i];
let val_lo = (byte & 0x0F) as i32 - 8;
let val_hi = ((byte >> 4) & 0x0F) as i32 - 8;
let in_idx_lo = input_offset + i * 2;
let in_idx_hi = input_offset + i * 2 + 1;
if in_idx_lo < cols {
acc += (val_lo as f32) * scale * input[in_idx_lo];
}
if in_idx_hi < cols {
acc += (val_hi as f32) * scale * input[in_idx_hi];
}
}
}
*out_val = acc;
}
output
}
#[must_use]
pub fn quantized_matvec_q8(weights: &[u8], input: &[f32], rows: usize, cols: usize) -> Vec<f32> {
    const Q8_BLOCK_SIZE: usize = 34;
    const Q8_BLOCK_VALUES: usize = 32;
let blocks_per_row = cols.div_ceil(Q8_BLOCK_VALUES);
let row_bytes = blocks_per_row * Q8_BLOCK_SIZE;
let mut output = vec![0.0f32; rows];
for (row, out_val) in output.iter_mut().enumerate().take(rows) {
let row_offset = row * row_bytes;
let mut acc = 0.0f32;
for block_idx in 0..blocks_per_row {
let block_offset = row_offset + block_idx * Q8_BLOCK_SIZE;
if block_offset + Q8_BLOCK_SIZE > weights.len() {
break;
}
let scale =
half::f16::from_le_bytes([weights[block_offset], weights[block_offset + 1]])
.to_f32();
let input_offset = block_idx * Q8_BLOCK_VALUES;
for i in 0..32 {
let val = weights[block_offset + 2 + i] as i8 as i32;
let in_idx = input_offset + i;
if in_idx < cols {
acc += (val as f32) * scale * input[in_idx];
}
}
}
*out_val = acc;
}
output
}
#[derive(Debug, Clone, Default)]
pub struct QuantizedAccumulator {
sum: f32,
}
impl QuantizedAccumulator {
#[must_use]
pub fn new() -> Self {
Self { sum: 0.0 }
}
#[must_use]
pub fn sum(&self) -> f32 {
self.sum
}
pub fn reset(&mut self) {
self.sum = 0.0;
}
#[inline]
pub fn add_scaled(&mut self, value: f32, scale: f32) {
self.sum += value * scale;
}
#[inline]
pub fn add_block(&mut self, block_sum: f32, block_scale: f32) {
self.sum += block_sum * block_scale;
}
}
#[derive(Debug)]
pub struct DoubleBuffer<T> {
front: Vec<T>,
back: Vec<T>,
}
impl<T: Default + Clone> DoubleBuffer<T> {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
front: vec![T::default(); capacity],
back: vec![T::default(); capacity],
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.front.len()
}
#[must_use]
pub fn front(&self) -> &[T] {
&self.front
}
pub fn back_mut(&mut self) -> &mut [T] {
&mut self.back
}
pub fn swap(&mut self) {
std::mem::swap(&mut self.front, &mut self.back);
}
}
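/// Splits a slice into fixed-size chunks and folds a per-chunk reduction.
///
/// ```ignore
/// let p = ChunkedProcessor::new(4);
/// assert_eq!(p.num_chunks(10), 3); // chunks of 4, 4, 2
/// let total = p.process_chunks(&[1.0f32; 10], |c| c.iter().sum());
/// assert!((total - 10.0).abs() < 1e-6);
/// ```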
#[derive(Debug, Clone)]
pub struct ChunkedProcessor {
chunk_size: usize,
}
impl ChunkedProcessor {
#[must_use]
pub fn new(chunk_size: usize) -> Self {
Self { chunk_size }
}
#[must_use]
pub fn chunk_size(&self) -> usize {
self.chunk_size
}
#[must_use]
pub fn num_chunks(&self, total_len: usize) -> usize {
if total_len == 0 {
return 0;
}
total_len.div_ceil(self.chunk_size)
}
#[must_use]
pub fn chunk_bounds(&self, chunk_idx: usize, total_len: usize) -> (usize, usize) {
let start = chunk_idx * self.chunk_size;
let end = (start + self.chunk_size).min(total_len);
(start, end)
}
pub fn process_chunks<T, F>(&self, data: &[T], mut process: F) -> f32
where
F: FnMut(&[T]) -> f32,
{
let mut total = 0.0f32;
let num_chunks = self.num_chunks(data.len());
for chunk_idx in 0..num_chunks {
let (start, end) = self.chunk_bounds(chunk_idx, data.len());
total += process(&data[start..end]);
}
total
}
}
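/// Coarse pipeline stages used to attribute per-stage latency in
/// `InferencePipeline`.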
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum GpuPipelineStage {
Embed = 0,
Attention = 1,
FFN = 2,
Output = 3,
}
#[derive(Debug)]
pub struct InferencePipeline {
num_stages: usize,
stage_times: std::collections::HashMap<GpuPipelineStage, f32>,
}
impl InferencePipeline {
#[must_use]
pub fn new(num_stages: usize) -> Self {
Self {
num_stages,
stage_times: std::collections::HashMap::new(),
}
}
#[must_use]
pub fn num_stages(&self) -> usize {
self.num_stages
}
pub fn record_stage_time(&mut self, stage: GpuPipelineStage, time_ms: f32) {
self.stage_times.insert(stage, time_ms);
}
#[must_use]
pub fn total_latency(&self) -> f32 {
self.stage_times.values().sum()
}
#[must_use]
pub fn stage_breakdown(&self) -> &std::collections::HashMap<GpuPipelineStage, f32> {
&self.stage_times
}
pub fn reset(&mut self) {
self.stage_times.clear();
}
}
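/// Fixed-capacity token accumulator: `push` returns the completed batch once
/// capacity is reached, leaving the internal buffer empty.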
#[derive(Debug)]
pub struct TokenBatch {
tokens: Vec<usize>,
capacity: usize,
}
impl TokenBatch {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
tokens: Vec::with_capacity(capacity),
capacity,
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.capacity
}
#[must_use]
pub fn len(&self) -> usize {
self.tokens.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.tokens.is_empty()
}
#[must_use]
pub fn is_full(&self) -> bool {
self.tokens.len() >= self.capacity
}
pub fn push(&mut self, token: usize) -> Option<Vec<usize>> {
self.tokens.push(token);
if self.is_full() {
Some(self.flush())
} else {
None
}
}
pub fn flush(&mut self) -> Vec<usize> {
std::mem::take(&mut self.tokens)
}
}
#[derive(Debug, Clone)]
struct SpeculativeCandidate {
token: usize,
#[allow(dead_code)]
confidence: f32,
}
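/// Draft-token buffer for speculative decoding: candidates from a cheap
/// draft model are compared against the target model's actual tokens, and
/// `verify` reports how many matched plus the first mismatch index, if any.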
#[derive(Debug)]
pub struct SpeculativeBuffer {
candidates: Vec<SpeculativeCandidate>,
capacity: usize,
}
impl SpeculativeBuffer {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
candidates: Vec::with_capacity(capacity),
capacity,
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.capacity
}
#[must_use]
pub fn len(&self) -> usize {
self.candidates.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.candidates.is_empty()
}
pub fn add_candidate(&mut self, token: usize, confidence: f32) {
if self.candidates.len() < self.capacity {
self.candidates
.push(SpeculativeCandidate { token, confidence });
}
}
#[must_use]
pub fn verify(&self, actual_tokens: &[usize]) -> (usize, Option<usize>) {
let mut accepted = 0;
for (i, candidate) in self.candidates.iter().enumerate() {
if i < actual_tokens.len() && candidate.token == actual_tokens[i] {
accepted += 1;
} else {
return (accepted, Some(i));
}
}
(accepted, None)
}
pub fn accept(&mut self, n: usize) {
if n >= self.candidates.len() {
self.candidates.clear();
} else {
self.candidates.drain(0..n);
}
}
pub fn reject(&mut self) {
self.candidates.clear();
}
}
pub type BatchId = u64;
#[derive(Debug)]
pub struct InferenceBatchScheduler {
next_id: BatchId,
pending: std::collections::HashMap<BatchId, Vec<usize>>,
completed: std::collections::VecDeque<(BatchId, Vec<usize>)>,
}
impl InferenceBatchScheduler {
#[must_use]
pub fn new() -> Self {
Self {
next_id: 0,
pending: std::collections::HashMap::new(),
completed: std::collections::VecDeque::new(),
}
}
#[must_use]
pub fn pending_count(&self) -> usize {
self.pending.len()
}
#[must_use]
pub fn completed_count(&self) -> usize {
self.completed.len()
}
pub fn submit(&mut self, tokens: Vec<usize>) -> BatchId {
let id = self.next_id;
self.next_id += 1;
self.pending.insert(id, tokens);
id
}
pub fn complete(&mut self, batch_id: BatchId, results: Vec<usize>) {
self.pending.remove(&batch_id);
self.completed.push_back((batch_id, results));
}
pub fn poll(&mut self) -> Option<(BatchId, Vec<usize>)> {
self.completed.pop_front()
}
pub fn drain(&mut self) -> Vec<(BatchId, Vec<usize>)> {
self.completed.drain(..).collect()
}
}
impl Default for InferenceBatchScheduler {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug)]
pub struct AsyncRequestQueue<T> {
items: std::collections::VecDeque<T>,
capacity: usize,
}
impl<T> AsyncRequestQueue<T> {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
items: std::collections::VecDeque::with_capacity(capacity),
capacity,
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.capacity
}
#[must_use]
pub fn len(&self) -> usize {
self.items.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.items.is_empty()
}
#[must_use]
pub fn is_full(&self) -> bool {
self.items.len() >= self.capacity
}
pub fn try_push(&mut self, item: T) -> bool {
if self.is_full() {
false
} else {
self.items.push_back(item);
true
}
}
pub fn try_pop(&mut self) -> Option<T> {
self.items.pop_front()
}
}
pub type InferenceCompletionHandler = Box<dyn Fn(u64, &[usize]) + Send + Sync>;
pub struct InferenceEventNotifier {
handlers: Vec<InferenceCompletionHandler>,
}
impl std::fmt::Debug for InferenceEventNotifier {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("InferenceEventNotifier")
.field("handler_count", &self.handlers.len())
.finish()
}
}
impl InferenceEventNotifier {
#[must_use]
pub fn new() -> Self {
Self {
handlers: Vec::new(),
}
}
#[must_use]
pub fn handler_count(&self) -> usize {
self.handlers.len()
}
pub fn register(&mut self, handler: InferenceCompletionHandler) {
self.handlers.push(handler);
}
pub fn notify(&self, request_id: u64, tokens: &[usize]) {
for handler in &self.handlers {
handler(request_id, tokens);
}
}
pub fn clear(&mut self) {
self.handlers.clear();
}
}
impl Default for InferenceEventNotifier {
fn default() -> Self {
Self::new()
}
}
pub type RequestId = u64;
#[derive(Debug)]
pub struct TimeoutManager {
deadlines: std::collections::HashMap<RequestId, std::time::Instant>,
}
impl TimeoutManager {
#[must_use]
pub fn new() -> Self {
Self {
deadlines: std::collections::HashMap::new(),
}
}
#[must_use]
pub fn active_count(&self) -> usize {
self.deadlines.len()
}
pub fn register(&mut self, request_id: RequestId, deadline: std::time::Instant) {
self.deadlines.insert(request_id, deadline);
}
pub fn remove(&mut self, request_id: RequestId) {
self.deadlines.remove(&request_id);
}
pub fn check_expired(&mut self) -> Vec<RequestId> {
let now = std::time::Instant::now();
let expired: Vec<RequestId> = self
.deadlines
.iter()
.filter(|(_, &deadline)| now >= deadline)
.map(|(&id, _)| id)
.collect();
for id in &expired {
self.deadlines.remove(id);
}
expired
}
}
impl Default for TimeoutManager {
fn default() -> Self {
Self::new()
}
}
pub type Priority = u32;
#[derive(Debug, Clone)]
pub struct PriorityRequest<T> {
priority: Priority,
    sequence: u64,
    data: T,
}
impl<T> PriorityRequest<T> {
#[must_use]
pub fn new(priority: Priority, data: T) -> Self {
Self {
priority,
            sequence: 0,
            data,
}
}
#[must_use]
pub fn priority(&self) -> Priority {
self.priority
}
#[must_use]
pub fn data(&self) -> &T {
&self.data
}
#[must_use]
pub fn into_data(self) -> T {
self.data
}
}
#[derive(Debug)]
pub struct PriorityRequestQueue<T> {
items: Vec<PriorityRequest<T>>,
next_sequence: u64,
}
impl<T> PriorityRequestQueue<T> {
#[must_use]
pub fn new() -> Self {
Self {
items: Vec::new(),
next_sequence: 0,
}
}
#[must_use]
pub fn len(&self) -> usize {
self.items.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.items.is_empty()
}
pub fn enqueue(&mut self, mut request: PriorityRequest<T>) {
request.sequence = self.next_sequence;
self.next_sequence += 1;
self.items.push(request);
}
pub fn dequeue_highest(&mut self) -> Option<PriorityRequest<T>> {
if self.items.is_empty() {
return None;
}
let mut best_idx = 0;
for (i, item) in self.items.iter().enumerate().skip(1) {
let best = &self.items[best_idx];
if item.priority > best.priority
|| (item.priority == best.priority && item.sequence < best.sequence)
{
best_idx = i;
}
}
Some(self.items.swap_remove(best_idx))
}
}
impl<T> Default for PriorityRequestQueue<T> {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug)]
pub struct TokenRateLimiter {
tokens: u32,
capacity: u32,
    rate: f64,
    last_refill: std::time::Instant,
}
impl TokenRateLimiter {
#[must_use]
pub fn new(rate: f64, burst_capacity: u32) -> Self {
Self {
            tokens: burst_capacity,
            capacity: burst_capacity,
rate,
last_refill: std::time::Instant::now(),
}
}
#[must_use]
pub fn tokens_available(&self) -> u32 {
self.tokens
}
pub fn try_acquire(&mut self, count: u32) -> bool {
if self.tokens >= count {
self.tokens -= count;
true
} else {
false
}
}
    pub fn refill(&mut self) {
        let now = std::time::Instant::now();
        let elapsed = now.duration_since(self.last_refill).as_secs_f64();
        let new_tokens = (elapsed * self.rate) as u32;
        if new_tokens > 0 {
            self.tokens = (self.tokens + new_tokens).min(self.capacity);
            // Advance last_refill only by the time that minted whole tokens,
            // so the fractional remainder keeps accruing toward the next one
            // instead of being discarded by the `as u32` truncation.
            let consumed = f64::from(new_tokens) / self.rate;
            self.last_refill += std::time::Duration::from_secs_f64(consumed);
        }
    }
}
pub type AllocationId = u64;
#[derive(Debug, Clone)]
struct ResourceAllocation {
memory: u64,
compute: u32,
}
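/// Tracks memory and compute reservations against fixed capacities;
/// `allocate` is all-or-nothing and returns an id that later releases both
/// resources together.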
#[derive(Debug)]
pub struct ResourceTracker {
memory_capacity: u64,
compute_capacity: u32,
memory_used: u64,
compute_used: u32,
allocations: std::collections::HashMap<AllocationId, ResourceAllocation>,
next_id: AllocationId,
}
impl ResourceTracker {
#[must_use]
pub fn new(memory_capacity: u64, compute_capacity: u32) -> Self {
Self {
memory_capacity,
compute_capacity,
memory_used: 0,
compute_used: 0,
allocations: std::collections::HashMap::new(),
next_id: 0,
}
}
#[must_use]
pub fn memory_usage(&self) -> u64 {
self.memory_used
}
#[must_use]
pub fn compute_usage(&self) -> u32 {
self.compute_used
}
#[must_use]
pub fn can_allocate(&self, memory: u64, compute: u32) -> bool {
self.memory_used + memory <= self.memory_capacity
&& self.compute_used + compute <= self.compute_capacity
}
pub fn allocate(&mut self, memory: u64, compute: u32) -> Option<AllocationId> {
if !self.can_allocate(memory, compute) {
return None;
}
let id = self.next_id;
self.next_id += 1;
self.memory_used += memory;
self.compute_used += compute;
self.allocations
.insert(id, ResourceAllocation { memory, compute });
Some(id)
}
pub fn release(&mut self, id: AllocationId) {
if let Some(alloc) = self.allocations.remove(&id) {
self.memory_used = self.memory_used.saturating_sub(alloc.memory);
self.compute_used = self.compute_used.saturating_sub(alloc.compute);
}
}
#[must_use]
pub fn usage_percentage(&self) -> (f64, f64) {
let mem_pct = if self.memory_capacity > 0 {
(self.memory_used as f64 / self.memory_capacity as f64) * 100.0
} else {
0.0
};
let compute_pct = if self.compute_capacity > 0 {
(self.compute_used as f64 / self.compute_capacity as f64) * 100.0
} else {
0.0
};
(mem_pct, compute_pct)
}
}
impl Default for ResourceTracker {
fn default() -> Self {
Self::new(8 * 1024 * 1024 * 1024, 100)
}
}
#[derive(Debug)]
pub struct InferenceMetrics {
latencies: Vec<std::time::Duration>,
total_tokens: u64,
start_time: std::time::Instant,
}
impl InferenceMetrics {
#[must_use]
pub fn new() -> Self {
Self {
latencies: Vec::new(),
total_tokens: 0,
start_time: std::time::Instant::now(),
}
}
#[must_use]
pub fn total_inferences(&self) -> usize {
self.latencies.len()
}
#[must_use]
pub fn total_tokens(&self) -> u64 {
self.total_tokens
}
pub fn record_inference(&mut self, latency: std::time::Duration, tokens: usize) {
self.latencies.push(latency);
self.total_tokens += tokens as u64;
}
#[must_use]
pub fn latency_percentile(&self, percentile: u8) -> Option<std::time::Duration> {
if self.latencies.is_empty() {
return None;
}
let mut sorted = self.latencies.clone();
sorted.sort();
let idx = ((percentile as usize) * sorted.len() / 100).min(sorted.len() - 1);
Some(sorted[idx])
}
#[must_use]
pub fn throughput(&self) -> f64 {
let elapsed = self.start_time.elapsed().as_secs_f64();
if elapsed > 0.0 {
self.total_tokens as f64 / elapsed
} else {
0.0
}
}
pub fn reset(&mut self) {
self.latencies.clear();
self.total_tokens = 0;
self.start_time = std::time::Instant::now();
}
}
impl Default for InferenceMetrics {
fn default() -> Self {
Self::new()
}
}
pub type HealthCheckFn = Box<dyn Fn() -> bool + Send + Sync>;
pub struct HealthChecker {
checks: Vec<(String, HealthCheckFn)>,
last_results: std::collections::HashMap<String, bool>,
}
impl std::fmt::Debug for HealthChecker {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HealthChecker")
.field("check_count", &self.checks.len())
.field("last_results", &self.last_results)
.finish()
}
}
impl HealthChecker {
#[must_use]
pub fn new() -> Self {
Self {
checks: Vec::new(),
last_results: std::collections::HashMap::new(),
}
}
#[must_use]
pub fn check_count(&self) -> usize {
self.checks.len()
}
pub fn register_check(&mut self, name: &str, check: HealthCheckFn) {
self.checks.push((name.to_string(), check));
}
pub fn check_all(&mut self) -> std::collections::HashMap<String, bool> {
let mut results = std::collections::HashMap::new();
for (name, check) in &self.checks {
let healthy = check();
results.insert(name.clone(), healthy);
}
self.last_results.clone_from(&results);
results
}
#[must_use]
pub fn is_healthy(&self) -> bool {
if self.checks.is_empty() {
return true;
}
self.last_results.values().all(|&v| v)
}
pub fn clear(&mut self) {
self.checks.clear();
self.last_results.clear();
}
}
impl Default for HealthChecker {
fn default() -> Self {
Self::new()
}
}
pub type ShutdownHandlerFn = Box<dyn Fn() + Send + Sync>;
pub struct ShutdownCoordinator {
shutting_down: bool,
pending_requests: u32,
handlers: Vec<ShutdownHandlerFn>,
}
impl std::fmt::Debug for ShutdownCoordinator {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ShutdownCoordinator")
.field("shutting_down", &self.shutting_down)
.field("pending_requests", &self.pending_requests)
.field("handler_count", &self.handlers.len())
.finish()
}
}
impl ShutdownCoordinator {
#[must_use]
pub fn new() -> Self {
Self {
shutting_down: false,
pending_requests: 0,
handlers: Vec::new(),
}
}
#[must_use]
pub fn is_shutting_down(&self) -> bool {
self.shutting_down
}
#[must_use]
pub fn pending_requests(&self) -> u32 {
self.pending_requests
}
#[must_use]
pub fn handler_count(&self) -> usize {
self.handlers.len()
}
pub fn register_handler(&mut self, handler: ShutdownHandlerFn) {
self.handlers.push(handler);
}
pub fn request_started(&mut self) {
self.pending_requests += 1;
}
pub fn request_completed(&mut self) {
self.pending_requests = self.pending_requests.saturating_sub(1);
}
pub fn initiate_shutdown(&mut self) {
if self.shutting_down {
return;
}
self.shutting_down = true;
for handler in &self.handlers {
handler();
}
}
#[must_use]
pub fn is_complete(&self) -> bool {
self.shutting_down && self.pending_requests == 0
}
}
impl Default for ShutdownCoordinator {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ComputeBackend {
Gpu,
Cpu,
#[default]
Auto,
}
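/// Compute facade over `trueno`: `Auto` probes for a GPU backend at
/// construction and otherwise falls back to the scalar CPU paths below.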
pub struct GpuCompute {
backend: ComputeBackend,
gpu: Option<trueno::backends::gpu::GpuBackend>,
}
impl GpuCompute {
pub fn auto() -> Result<Self> {
Self::new(ComputeBackend::Auto)
}
pub fn new(backend: ComputeBackend) -> Result<Self> {
match backend {
ComputeBackend::Gpu => {
if trueno::backends::gpu::GpuBackend::is_available() {
Ok(Self {
backend: ComputeBackend::Gpu,
gpu: Some(trueno::backends::gpu::GpuBackend::new()),
})
} else {
Err(RealizarError::GpuError {
reason: "GPU not available".to_string(),
})
}
},
ComputeBackend::Cpu => Ok(Self {
backend: ComputeBackend::Cpu,
gpu: None,
}),
ComputeBackend::Auto => {
if trueno::backends::gpu::GpuBackend::is_available() {
Ok(Self {
backend: ComputeBackend::Gpu,
gpu: Some(trueno::backends::gpu::GpuBackend::new()),
})
} else {
Ok(Self {
backend: ComputeBackend::Cpu,
gpu: None,
})
}
},
}
}
#[must_use]
pub fn is_gpu(&self) -> bool {
self.backend == ComputeBackend::Gpu && self.gpu.is_some()
}
#[must_use]
pub fn backend(&self) -> ComputeBackend {
self.backend
}
#[allow(clippy::many_single_char_names)]
pub fn matmul(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
if a.len() != m * k {
return Err(RealizarError::InvalidShape {
reason: format!(
"Matrix A size {} doesn't match m*k={}*{}={}",
a.len(),
m,
k,
m * k
),
});
}
if b.len() != k * n {
return Err(RealizarError::InvalidShape {
reason: format!(
"Matrix B size {} doesn't match k*n={}*{}={}",
b.len(),
k,
n,
k * n
),
});
}
if let Some(gpu) = &mut self.gpu {
#[allow(clippy::implicit_clone)]
gpu.matmul(a, b, m, k, n)
.map_err(|e| RealizarError::GpuError {
reason: e.to_string(),
})
} else {
Ok(cpu_matmul(a, b, m, k, n))
}
}
#[allow(clippy::many_single_char_names)]
pub fn matmul_tensor(&mut self, a: &Tensor<f32>, b: &Tensor<f32>) -> Result<Tensor<f32>> {
let a_shape = a.shape();
let b_shape = b.shape();
if a_shape.len() != 2 || b_shape.len() != 2 {
return Err(RealizarError::InvalidShape {
reason: "matmul_tensor requires 2D tensors".to_string(),
});
}
let m = a_shape[0];
let k = a_shape[1];
let k2 = b_shape[0];
let n = b_shape[1];
if k != k2 {
return Err(RealizarError::InvalidShape {
reason: format!("Inner dimensions don't match: A[{m},{k}] @ B[{k2},{n}]"),
});
}
let result = self.matmul(a.data(), b.data(), m, k, n)?;
Tensor::from_vec(vec![m, n], result)
}
pub fn dot(&mut self, a: &[f32], b: &[f32]) -> Result<f32> {
if a.len() != b.len() {
return Err(RealizarError::InvalidShape {
reason: format!("Vector lengths don't match: {} vs {}", a.len(), b.len()),
});
}
if let Some(gpu) = &mut self.gpu {
#[allow(clippy::implicit_clone)]
gpu.dot(a, b).map_err(|e| RealizarError::GpuError {
reason: e.to_string(),
})
} else {
Ok(a.iter().zip(b.iter()).map(|(x, y)| x * y).sum())
}
}
pub fn relu(&mut self, input: &[f32]) -> Result<Vec<f32>> {
if let Some(gpu) = &mut self.gpu {
#[allow(clippy::implicit_clone)]
gpu.relu(input).map_err(|e| RealizarError::GpuError {
reason: e.to_string(),
})
} else {
Ok(input.iter().map(|&x| x.max(0.0)).collect())
}
}
pub fn sigmoid(&mut self, input: &[f32]) -> Result<Vec<f32>> {
if let Some(gpu) = &mut self.gpu {
#[allow(clippy::implicit_clone)]
gpu.sigmoid(input).map_err(|e| RealizarError::GpuError {
reason: e.to_string(),
})
} else {
Ok(input.iter().map(|&x| 1.0 / (1.0 + (-x).exp())).collect())
}
}
}
#[allow(clippy::many_single_char_names)]
pub(crate) fn cpu_matmul(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
if m == 1 {
return cpu_vector_matmul(a, b, k, n);
}
let mut c = vec![0.0; m * n];
for i in 0..m {
for j in 0..n {
let mut sum = 0.0;
for p in 0..k {
sum += a[i * k + p] * b[p * n + j];
}
c[i * n + j] = sum;
}
}
c
}
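/// Vector-matrix product specialized for m == 1: B is walked row by row at
/// unit stride (instead of column-major gathers), and outputs are split into
/// 1024-wide chunks across rayon workers once n reaches 2048.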
#[allow(clippy::many_single_char_names)]
fn cpu_vector_matmul(a: &[f32], b: &[f32], k: usize, n: usize) -> Vec<f32> {
use rayon::prelude::*;
if n < 2048 {
return cpu_vector_matmul_seq(a, b, k, n);
}
const CHUNK_SIZE: usize = 1024;
let num_chunks = n.div_ceil(CHUNK_SIZE);
let chunks: Vec<Vec<f32>> = (0..num_chunks)
.into_par_iter()
.map(|chunk_idx| {
let start = chunk_idx * CHUNK_SIZE;
let end = (start + CHUNK_SIZE).min(n);
let chunk_len = end - start;
let mut chunk_c = vec![0.0f32; chunk_len];
for (p, &a_val) in a.iter().enumerate() {
let row_start = p * n + start;
let row = &b[row_start..row_start + chunk_len];
for (j, &b_val) in row.iter().enumerate() {
chunk_c[j] += a_val * b_val;
}
}
chunk_c
})
.collect();
chunks.into_iter().flatten().collect()
}
#[allow(clippy::many_single_char_names)]
fn cpu_vector_matmul_seq(a: &[f32], b: &[f32], _k: usize, n: usize) -> Vec<f32> {
let mut c = vec![0.0f32; n];
for (p, &a_val) in a.iter().enumerate() {
let row = &b[p * n..(p + 1) * n];
for (j, &b_val) in row.iter().enumerate() {
c[j] += a_val * b_val;
}
}
c
}
#[allow(clippy::many_single_char_names)]
fn cpu_matmul_transpose_b(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
let mut c = vec![0.0; m * n];
for i in 0..m {
for j in 0..n {
let mut sum = 0.0;
for p in 0..k {
sum += a[i * k + p] * b[j * k + p];
}
c[i * n + j] = sum;
}
}
c
}
fn transpose(data: &[f32], rows: usize, cols: usize) -> Vec<f32> {
let mut result = vec![0.0; data.len()];
for i in 0..rows {
for j in 0..cols {
result[j * rows + i] = data[i * cols + j];
}
}
result
}
#[allow(clippy::many_single_char_names)]
fn cpu_matmul_transposed_simd(
    a: &[f32],
    weight_t: &[f32],
    bias: &[f32],
    k: usize,
    n: usize,
) -> Vec<f32> {
use rayon::prelude::*;
const CHUNK_SIZE: usize = 4096;
(0..n)
.into_par_iter()
.step_by(CHUNK_SIZE)
.flat_map(|chunk_start| {
let chunk_end = (chunk_start + CHUNK_SIZE).min(n);
(chunk_start..chunk_end)
.map(|j| {
let row = &weight_t[j * k..(j + 1) * k];
let dot: f32 = row.iter().zip(a.iter()).map(|(&w, &h)| w * h).sum();
dot + bias[j]
})
.collect::<Vec<_>>()
})
.collect()
}
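/// Per-layer ring buffer of K/V vectors for autoregressive decoding. Note
/// that once `position` wraps past `max_positions`, `get_range` returns
/// entries in storage order rather than temporal order, so callers are
/// expected to stay within the configured window.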
pub struct StreamingKVCache {
num_layers: usize,
max_positions: usize,
num_heads: usize,
head_dim: usize,
keys: Vec<Vec<f32>>,
values: Vec<Vec<f32>>,
position: usize,
valid_positions: usize,
}
impl StreamingKVCache {
#[must_use]
pub fn new(num_layers: usize, max_positions: usize, num_heads: usize, head_dim: usize) -> Self {
let kv_size = max_positions * num_heads * head_dim;
Self {
num_layers,
max_positions,
num_heads,
head_dim,
keys: vec![vec![0.0f32; kv_size]; num_layers],
values: vec![vec![0.0f32; kv_size]; num_layers],
position: 0,
valid_positions: 0,
}
}
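    /// Appends one token's key/value for `layer` at the current write slot.
    /// Call once per layer, in order: the write position advances only after
    /// the final layer so all layers stay aligned on the same token index.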
pub fn append(&mut self, layer: usize, key: &[f32], value: &[f32]) {
let kv_dim = self.num_heads * self.head_dim;
assert!(layer < self.num_layers, "Layer index out of bounds");
assert_eq!(key.len(), kv_dim, "Key dimension mismatch");
assert_eq!(value.len(), kv_dim, "Value dimension mismatch");
let offset = self.position * kv_dim;
self.keys[layer][offset..offset + kv_dim].copy_from_slice(key);
self.values[layer][offset..offset + kv_dim].copy_from_slice(value);
if layer == self.num_layers - 1 {
self.position = (self.position + 1) % self.max_positions;
self.valid_positions = (self.valid_positions + 1).min(self.max_positions);
}
}
#[must_use]
pub fn get_range(&self, layer: usize, start: usize, end: usize) -> (&[f32], &[f32]) {
let kv_dim = self.num_heads * self.head_dim;
let start_offset = start * kv_dim;
let end_offset = end * kv_dim;
(
&self.keys[layer][start_offset..end_offset],
&self.values[layer][start_offset..end_offset],
)
}
#[must_use]
pub fn get_valid(&self, layer: usize) -> (&[f32], &[f32]) {
self.get_range(layer, 0, self.valid_positions)
}
#[must_use]
pub fn len(&self) -> usize {
self.valid_positions
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.valid_positions == 0
}
#[must_use]
pub fn max_positions(&self) -> usize {
self.max_positions
}
pub fn clear(&mut self) {
self.position = 0;
self.valid_positions = 0;
}
#[must_use]
pub fn memory_bytes(&self) -> usize {
let kv_size = self.max_positions * self.num_heads * self.head_dim;
self.num_layers * kv_size * 2 * 4
}
#[must_use]
pub fn memory_mb(&self) -> f64 {
self.memory_bytes() as f64 / (1024.0 * 1024.0)
}
}
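/// fp16 variant of `StreamingKVCache`: keys and values are stored as raw
/// `u16` half-precision bits, halving memory at the cost of a conversion on
/// every read (`get_range_f32`).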
pub struct StreamingKVCacheFp16 {
num_layers: usize,
max_positions: usize,
num_heads: usize,
head_dim: usize,
keys: Vec<Vec<u16>>,
values: Vec<Vec<u16>>,
position: usize,
valid_positions: usize,
}
impl StreamingKVCacheFp16 {
#[must_use]
pub fn new(num_layers: usize, max_positions: usize, num_heads: usize, head_dim: usize) -> Self {
let kv_size = max_positions * num_heads * head_dim;
Self {
num_layers,
max_positions,
num_heads,
head_dim,
keys: vec![vec![0u16; kv_size]; num_layers],
values: vec![vec![0u16; kv_size]; num_layers],
position: 0,
valid_positions: 0,
}
}
#[inline]
fn f32_to_f16(value: f32) -> u16 {
half::f16::from_f32(value).to_bits()
}
#[inline]
fn f16_to_f32(bits: u16) -> f32 {
half::f16::from_bits(bits).to_f32()
}
pub fn append(&mut self, layer: usize, key: &[f32], value: &[f32]) {
let kv_dim = self.num_heads * self.head_dim;
assert!(layer < self.num_layers, "Layer index out of bounds");
assert_eq!(key.len(), kv_dim, "Key dimension mismatch");
assert_eq!(value.len(), kv_dim, "Value dimension mismatch");
let offset = self.position * kv_dim;
for (i, &k) in key.iter().enumerate() {
self.keys[layer][offset + i] = Self::f32_to_f16(k);
}
for (i, &v) in value.iter().enumerate() {
self.values[layer][offset + i] = Self::f32_to_f16(v);
}
if layer == self.num_layers - 1 {
self.position = (self.position + 1) % self.max_positions;
self.valid_positions = (self.valid_positions + 1).min(self.max_positions);
}
}
#[must_use]
pub fn get_range_f32(&self, layer: usize, start: usize, end: usize) -> (Vec<f32>, Vec<f32>) {
let kv_dim = self.num_heads * self.head_dim;
let start_offset = start * kv_dim;
let end_offset = end * kv_dim;
let keys: Vec<f32> = self.keys[layer][start_offset..end_offset]
.iter()
.map(|&bits| Self::f16_to_f32(bits))
.collect();
let values: Vec<f32> = self.values[layer][start_offset..end_offset]
.iter()
.map(|&bits| Self::f16_to_f32(bits))
.collect();
(keys, values)
}
#[must_use]
pub fn get_range_raw(&self, layer: usize, start: usize, end: usize) -> (&[u16], &[u16]) {
let kv_dim = self.num_heads * self.head_dim;
let start_offset = start * kv_dim;
let end_offset = end * kv_dim;
(
&self.keys[layer][start_offset..end_offset],
&self.values[layer][start_offset..end_offset],
)
}
#[must_use]
pub fn get_valid_f32(&self, layer: usize) -> (Vec<f32>, Vec<f32>) {
self.get_range_f32(layer, 0, self.valid_positions)
}
#[must_use]
pub fn len(&self) -> usize {
self.valid_positions
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.valid_positions == 0
}
#[must_use]
pub fn max_positions(&self) -> usize {
self.max_positions
}
pub fn clear(&mut self) {
self.position = 0;
self.valid_positions = 0;
}
#[must_use]
pub fn memory_bytes(&self) -> usize {
let kv_size = self.max_positions * self.num_heads * self.head_dim;
self.num_layers * kv_size * 2 * 2
}
#[must_use]
pub fn memory_mb(&self) -> f64 {
self.memory_bytes() as f64 / (1024.0 * 1024.0)
}
}
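/// Size-bucketed buffer pool: requests round up to the nearest power-of-two
/// bucket between 2^10 and 2^24 elements, with at most `max_per_bucket`
/// cached buffers per bucket.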
pub struct GpuBufferPool {
available_buffers: std::collections::HashMap<usize, Vec<Vec<f32>>>,
bucket_sizes: Vec<usize>,
max_per_bucket: usize,
}
impl GpuBufferPool {
#[must_use]
pub fn new() -> Self {
Self {
available_buffers: std::collections::HashMap::new(),
            bucket_sizes: (10..=24).map(|i| 1 << i).collect(),
            max_per_bucket: 4,
}
}
fn get_bucket(&self, size: usize) -> usize {
*self
.bucket_sizes
.iter()
.find(|&&b| b >= size)
.unwrap_or(&size)
}
pub fn acquire(&mut self, size: usize) -> Vec<f32> {
let bucket = self.get_bucket(size);
if let Some(buffers) = self.available_buffers.get_mut(&bucket) {
if let Some(mut buf) = buffers.pop() {
buf.resize(size, 0.0);
return buf;
}
}
vec![0.0; size]
}
pub fn release(&mut self, mut buffer: Vec<f32>) {
let bucket = self.get_bucket(buffer.capacity());
let buffers = self.available_buffers.entry(bucket).or_default();
if buffers.len() < self.max_per_bucket {
buffer.clear();
buffers.push(buffer);
}
}
pub fn clear(&mut self) {
self.available_buffers.clear();
}
#[must_use]
pub fn stats(&self) -> GpuPoolStats {
let total_buffers: usize = self.available_buffers.values().map(Vec::len).sum();
let total_bytes: usize = self
.available_buffers
.iter()
.map(|(bucket, buffers)| bucket * buffers.len() * 4)
.sum();
GpuPoolStats {
cached_buffers: total_buffers,
cached_bytes: total_bytes,
}
}
}
impl Default for GpuBufferPool {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Copy)]
pub struct GpuPoolStats {
pub cached_buffers: usize,
pub cached_bytes: usize,
}
pub struct AsyncGpuResult {
result: Option<Vec<f32>>,
ready: bool,
}
impl AsyncGpuResult {
pub fn ready(data: Vec<f32>) -> Self {
Self {
result: Some(data),
ready: true,
}
}
pub fn pending() -> Self {
Self {
result: None,
ready: false,
}
}
#[must_use]
pub fn is_ready(&self) -> bool {
self.ready
}
pub fn set_result(&mut self, data: Vec<f32>) {
self.result = Some(data);
self.ready = true;
}
pub fn wait(self) -> Vec<f32> {
self.result.expect("Result not ready")
}
pub fn try_get(&self) -> Option<&Vec<f32>> {
if self.ready {
self.result.as_ref()
} else {
None
}
}
}
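/// Routes matmuls by problem size: the GPU path is taken only when a backend
/// exists, the batch dimension m exceeds 1, and `m * k * n` reaches
/// `gpu_threshold` (default 64^3); below that, transfer overhead tends to
/// dominate and the CPU path wins.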
pub struct HybridScheduler {
gpu_compute: GpuCompute,
gpu_threshold: usize,
buffer_pool: GpuBufferPool,
}
impl HybridScheduler {
pub fn new() -> Result<Self> {
Ok(Self {
gpu_compute: GpuCompute::auto()?,
            gpu_threshold: 64 * 64 * 64,
            buffer_pool: GpuBufferPool::new(),
})
}
pub fn with_threshold(gpu_threshold: usize) -> Result<Self> {
Ok(Self {
gpu_compute: GpuCompute::auto()?,
gpu_threshold,
buffer_pool: GpuBufferPool::new(),
})
}
#[must_use]
pub fn has_gpu(&self) -> bool {
self.gpu_compute.is_gpu()
}
#[must_use]
pub fn gpu_threshold(&self) -> usize {
self.gpu_threshold
}
#[must_use]
#[allow(clippy::many_single_char_names)]
pub fn should_use_gpu(&self, m: usize, k: usize, n: usize) -> bool {
if m <= 1 {
return false;
}
self.gpu_compute.is_gpu() && (m * k * n) >= self.gpu_threshold
}
#[allow(clippy::many_single_char_names)]
pub fn matmul(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
if self.should_use_gpu(m, k, n) {
self.gpu_compute.matmul(a, b, m, k, n)
} else {
Ok(cpu_matmul(a, b, m, k, n))
}
}
#[allow(clippy::many_single_char_names)]
pub fn matmul_pooled(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
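        // Both branches below still allocate their own result vector, so the
        // pooled buffer only recycles the caller-facing allocation; the
        // interior copy remains.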
let mut output = self.buffer_pool.acquire(m * n);
let result = if self.should_use_gpu(m, k, n) {
self.gpu_compute.matmul(a, b, m, k, n)?
} else {
cpu_matmul(a, b, m, k, n)
};
output.copy_from_slice(&result);
Ok(output)
}
pub fn release_buffer(&mut self, buffer: Vec<f32>) {
self.buffer_pool.release(buffer);
}
#[must_use]
pub fn pool_stats(&self) -> GpuPoolStats {
self.buffer_pool.stats()
}
#[allow(clippy::many_single_char_names)]
pub fn matmul_async(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<AsyncGpuResult> {
let result = if self.should_use_gpu(m, k, n) {
self.gpu_compute.matmul(a, b, m, k, n)?
} else {
cpu_matmul(a, b, m, k, n)
};
Ok(AsyncGpuResult::ready(result))
}
pub fn matmul_batch(&mut self, operations: &[MatmulOp]) -> Result<Vec<Vec<f32>>> {
let mut results = Vec::with_capacity(operations.len());
for (a, b, m, k, n) in operations {
let result = self.matmul(a, b, *m, *k, *n)?;
results.push(result);
}
Ok(results)
}
#[allow(clippy::many_single_char_names)]
pub fn matmul_transpose_b(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
if self.should_use_gpu(m, k, n) {
let b_t = transpose(b, n, k);
self.gpu_compute.matmul(a, &b_t, m, k, n)
} else {
Ok(cpu_matmul_transpose_b(a, b, m, k, n))
}
}
}
#[cfg(feature = "cuda")]
pub struct CudaScheduler {
executor: crate::cuda::CudaExecutor,
}
#[cfg(feature = "cuda")]
impl CudaScheduler {
pub fn new() -> Result<Self> {
let executor = crate::cuda::CudaExecutor::new(0).map_err(|e| RealizarError::GpuError {
reason: format!("Failed to create CudaExecutor: {}", e),
})?;
Ok(Self { executor })
}
#[must_use]
    pub fn has_cuda(&self) -> bool {
        true
    }
#[must_use]
#[allow(clippy::unused_self)]
    pub fn uses_cuda_for(&self, _m: usize, _k: usize, _n: usize) -> bool {
        true
    }
#[allow(clippy::many_single_char_names)]
pub fn matmul(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
let mut output = vec![0.0f32; m * n];
self.executor
.gemm(a, b, &mut output, m as u32, n as u32, k as u32)
.map_err(|e| RealizarError::GpuError {
reason: format!("CUDA GEMM failed: {}", e),
})?;
Ok(output)
}
pub fn device_name(&self) -> Result<String> {
self.executor
.device_name()
.map_err(|e| RealizarError::GpuError {
reason: format!("Failed to get device name: {}", e),
})
}
pub fn cache_weight(&mut self, name: &str, weight: &[f32]) -> Result<()> {
self.executor
.load_weights(name, weight)
.map(|_| ())
.map_err(|e| RealizarError::GpuError {
reason: format!("Failed to cache weight '{}': {}", name, e),
})
}
#[must_use]
pub fn has_cached_weight(&self, name: &str) -> bool {
self.executor.has_weights(name)
}
#[must_use]
pub fn cached_weight_count(&self) -> usize {
self.executor.cached_weight_count()
}
pub fn matmul_cached(
&mut self,
weight_name: &str,
x: &[f32],
k: usize,
n: usize,
) -> Result<Vec<f32>> {
let mut output = vec![0.0f32; n];
self.executor
.gemv_cached(weight_name, x, &mut output, k as u32, n as u32)
.map_err(|e| RealizarError::GpuError {
reason: format!("CUDA cached GEMV failed: {}", e),
})?;
Ok(output)
}
}
pub struct GpuModel {
embedding_weights: Vec<f32>,
block_weights: Vec<BlockWeights>,
final_norm_weight: Vec<f32>,
final_norm_bias: Vec<f32>,
lm_head_weight: Vec<f32>,
lm_head_weight_t: Vec<f32>,
lm_head_bias: Vec<f32>,
scheduler: HybridScheduler,
#[cfg(feature = "cuda")]
cuda_scheduler: Option<CudaScheduler>,
config: GpuModelConfig,
attention_buffers: Option<AttentionBuffers>,
}
struct BlockWeights {
attn_norm_weight: Vec<f32>,
attn_norm_bias: Vec<f32>,
qkv_weight: Vec<f32>,
    #[allow(dead_code)]
    qkv_bias: Vec<f32>,
out_weight: Vec<f32>,
out_bias: Vec<f32>,
ffn_norm_weight: Vec<f32>,
ffn_norm_bias: Vec<f32>,
ffn_fc1_weight: Vec<f32>,
ffn_fc1_bias: Vec<f32>,
ffn_fc2_weight: Vec<f32>,
ffn_fc2_bias: Vec<f32>,
}
#[derive(Debug, Clone, Copy)]
pub enum WeightType {
Qkv,
Output,
FfnFc1,
FfnFc2,
LmHead,
}
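/// Transformer shape parameters. With grouped-query attention
/// (`num_kv_heads < num_heads`), the fused QKV projection is
/// `hidden_dim + 2 * num_kv_heads * head_dim` wide (see `qkv_dim`) rather
/// than `3 * hidden_dim`.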
#[derive(Debug, Clone)]
pub struct GpuModelConfig {
pub vocab_size: usize,
pub hidden_dim: usize,
pub num_heads: usize,
pub num_kv_heads: usize,
pub num_layers: usize,
pub intermediate_dim: usize,
pub eps: f32,
}
impl GpuModelConfig {
#[inline]
pub fn head_dim(&self) -> usize {
self.hidden_dim / self.num_heads
}
#[inline]
pub fn kv_dim(&self) -> usize {
self.num_kv_heads * self.head_dim()
}
#[inline]
pub fn qkv_dim(&self) -> usize {
self.hidden_dim + 2 * self.kv_dim()
}
#[inline]
pub fn is_gqa(&self) -> bool {
self.num_kv_heads < self.num_heads
}
}
#[derive(Debug, Clone)]
pub struct GpuGenerateConfig {
pub max_tokens: usize,
pub temperature: f32,
pub top_k: usize,
pub stop_tokens: Vec<usize>,
}
impl Default for GpuGenerateConfig {
fn default() -> Self {
Self {
max_tokens: 64,
temperature: 0.0,
top_k: 1,
stop_tokens: Vec::new(),
}
}
}
impl GpuGenerateConfig {
#[must_use]
pub fn deterministic(max_tokens: usize) -> Self {
Self {
max_tokens,
temperature: 0.0,
top_k: 1,
stop_tokens: Vec::new(),
}
}
#[must_use]
pub fn with_sampling(max_tokens: usize, temperature: f32, top_k: usize) -> Self {
Self {
max_tokens,
temperature,
top_k,
stop_tokens: Vec::new(),
}
}
#[must_use]
pub fn with_stop_tokens(mut self, stop_tokens: Vec<usize>) -> Self {
self.stop_tokens = stop_tokens;
self
}
}
#[derive(Debug)]
pub struct AttentionBuffers {
pub q_buffer: Vec<f32>,
pub scores_buffer: Vec<f32>,
pub output_buffer: Vec<f32>,
pub kv_proj_buffer: Vec<f32>,
pub ffn_buffer: Vec<f32>,
pub max_seq_len: usize,
}
impl AttentionBuffers {
#[must_use]
pub fn new(config: &GpuModelConfig, max_seq_len: usize) -> Self {
Self {
q_buffer: vec![0.0; config.hidden_dim],
scores_buffer: vec![0.0; config.num_heads * max_seq_len],
output_buffer: vec![0.0; config.hidden_dim],
kv_proj_buffer: vec![0.0; config.hidden_dim],
ffn_buffer: vec![0.0; config.intermediate_dim],
max_seq_len,
}
}
pub fn reset(&mut self) {
self.q_buffer.fill(0.0);
self.scores_buffer.fill(0.0);
self.output_buffer.fill(0.0);
self.kv_proj_buffer.fill(0.0);
self.ffn_buffer.fill(0.0);
}
}
impl GpuModel {
pub fn new(config: GpuModelConfig) -> Result<Self> {
let scheduler = HybridScheduler::new()?;
let embedding_weights = vec![0.01f32; config.vocab_size * config.hidden_dim];
let mut block_weights = Vec::with_capacity(config.num_layers);
for _ in 0..config.num_layers {
block_weights.push(BlockWeights {
attn_norm_weight: vec![1.0f32; config.hidden_dim],
attn_norm_bias: vec![0.0f32; config.hidden_dim],
                // Use the GQA-aware qkv_dim() so shapes match matmul_split
                // and forward_block_refcell; 3 * hidden_dim is only correct
                // when num_kv_heads == num_heads.
                qkv_weight: vec![0.01f32; config.hidden_dim * config.qkv_dim()],
                qkv_bias: vec![0.0f32; config.qkv_dim()],
out_weight: vec![0.01f32; config.hidden_dim * config.hidden_dim],
out_bias: vec![0.0f32; config.hidden_dim],
ffn_norm_weight: vec![1.0f32; config.hidden_dim],
ffn_norm_bias: vec![0.0f32; config.hidden_dim],
ffn_fc1_weight: vec![0.01f32; config.hidden_dim * config.intermediate_dim],
ffn_fc1_bias: vec![0.0f32; config.intermediate_dim],
ffn_fc2_weight: vec![0.01f32; config.intermediate_dim * config.hidden_dim],
ffn_fc2_bias: vec![0.0f32; config.hidden_dim],
});
}
let final_norm_weight = vec![1.0f32; config.hidden_dim];
let final_norm_bias = vec![0.0f32; config.hidden_dim];
let lm_head_weight = vec![0.01f32; config.hidden_dim * config.vocab_size];
let lm_head_bias = vec![0.0f32; config.vocab_size];
let lm_head_weight_t =
Self::transpose_weights(&lm_head_weight, config.hidden_dim, config.vocab_size);
Ok(Self {
embedding_weights,
block_weights,
final_norm_weight,
final_norm_bias,
lm_head_weight,
lm_head_weight_t,
lm_head_bias,
scheduler,
#[cfg(feature = "cuda")]
cuda_scheduler: None,
config,
attention_buffers: None,
})
}
#[cfg(feature = "cuda")]
pub fn new_with_cuda(config: GpuModelConfig) -> Result<Self> {
let scheduler = HybridScheduler::new()?;
let cuda_scheduler = Some(CudaScheduler::new()?);
let embedding_weights = vec![0.01f32; config.vocab_size * config.hidden_dim];
let mut block_weights = Vec::with_capacity(config.num_layers);
for _ in 0..config.num_layers {
block_weights.push(BlockWeights {
attn_norm_weight: vec![1.0f32; config.hidden_dim],
attn_norm_bias: vec![0.0f32; config.hidden_dim],
qkv_weight: vec![0.01f32; config.hidden_dim * config.qkv_dim()],
qkv_bias: vec![0.0f32; config.qkv_dim()],
out_weight: vec![0.01f32; config.hidden_dim * config.hidden_dim],
out_bias: vec![0.0f32; config.hidden_dim],
ffn_norm_weight: vec![1.0f32; config.hidden_dim],
ffn_norm_bias: vec![0.0f32; config.hidden_dim],
ffn_fc1_weight: vec![0.01f32; config.hidden_dim * config.intermediate_dim],
ffn_fc1_bias: vec![0.0f32; config.intermediate_dim],
ffn_fc2_weight: vec![0.01f32; config.intermediate_dim * config.hidden_dim],
ffn_fc2_bias: vec![0.0f32; config.hidden_dim],
});
}
let final_norm_weight = vec![1.0f32; config.hidden_dim];
let final_norm_bias = vec![0.0f32; config.hidden_dim];
let lm_head_weight = vec![0.01f32; config.hidden_dim * config.vocab_size];
let lm_head_bias = vec![0.0f32; config.vocab_size];
let lm_head_weight_t =
Self::transpose_weights(&lm_head_weight, config.hidden_dim, config.vocab_size);
Ok(Self {
embedding_weights,
block_weights,
final_norm_weight,
final_norm_bias,
lm_head_weight,
lm_head_weight_t,
lm_head_bias,
scheduler,
cuda_scheduler,
config,
attention_buffers: None,
})
}
#[cfg(feature = "cuda")]
#[must_use]
pub fn has_cuda_scheduler(&self) -> bool {
self.cuda_scheduler.is_some()
}
#[cfg(feature = "cuda")]
#[allow(clippy::many_single_char_names)]
pub fn cuda_matmul(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
if let Some(ref mut cuda_sched) = self.cuda_scheduler {
cuda_sched.matmul(a, b, m, k, n)
} else {
self.scheduler.matmul(a, b, m, k, n)
}
}
#[allow(clippy::many_single_char_names)]
pub fn do_matmul(
&mut self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
#[cfg(feature = "cuda")]
if let Some(ref mut cuda_sched) = self.cuda_scheduler {
return cuda_sched.matmul(a, b, m, k, n);
}
self.scheduler.matmul(a, b, m, k, n)
}
pub fn matmul_split(
&mut self,
input: &[f32],
block_idx: usize,
op: WeightType,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let qkv_dim = self.config.qkv_dim();
let intermediate_dim = self.config.intermediate_dim;
let vocab_size = self.config.vocab_size;
let (weight, m, k, n) = match op {
WeightType::Qkv => (
&self.block_weights[block_idx].qkv_weight,
1,
hidden_dim,
qkv_dim,
),
WeightType::Output => (
&self.block_weights[block_idx].out_weight,
1,
hidden_dim,
hidden_dim,
),
WeightType::FfnFc1 => (
&self.block_weights[block_idx].ffn_fc1_weight,
1,
hidden_dim,
intermediate_dim,
),
WeightType::FfnFc2 => (
&self.block_weights[block_idx].ffn_fc2_weight,
1,
intermediate_dim,
hidden_dim,
),
WeightType::LmHead => (&self.lm_head_weight, 1, hidden_dim, vocab_size),
};
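// Clone the selected weight so the immutable borrow of `self.block_weights`
// ends before `do_matmul` takes `&mut self`. See `matmul_zero_clone` for a
// clone-free variant.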
let weight_clone = weight.clone();
self.do_matmul(input, &weight_clone, m, k, n)
}
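/// Clone-free variant of `matmul_split`: temporarily `take`s the CUDA
/// scheduler out of `self` so the weight slice and the scheduler can be
/// borrowed at the same time; the scheduler is restored before returning.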
#[cfg(feature = "cuda")]
pub fn matmul_zero_clone(
&mut self,
input: &[f32],
block_idx: usize,
op: WeightType,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let qkv_dim = self.config.qkv_dim();
let intermediate_dim = self.config.intermediate_dim;
let vocab_size = self.config.vocab_size;
let mut cuda_sched = self.cuda_scheduler.take();
let (weight, m, k, n) = match op {
WeightType::Qkv => (
&self.block_weights[block_idx].qkv_weight,
1,
hidden_dim,
qkv_dim,
),
WeightType::Output => (
&self.block_weights[block_idx].out_weight,
1,
hidden_dim,
hidden_dim,
),
WeightType::FfnFc1 => (
&self.block_weights[block_idx].ffn_fc1_weight,
1,
hidden_dim,
intermediate_dim,
),
WeightType::FfnFc2 => (
&self.block_weights[block_idx].ffn_fc2_weight,
1,
intermediate_dim,
hidden_dim,
),
WeightType::LmHead => (&self.lm_head_weight, 1, hidden_dim, vocab_size),
};
let result = if let Some(ref mut sched) = cuda_sched {
sched.matmul(input, weight, m, k, n)
} else {
self.scheduler.matmul(input, weight, m, k, n)
};
self.cuda_scheduler = cuda_sched;
result
}
#[cfg(feature = "cuda")]
#[allow(clippy::many_single_char_names)]
pub fn matmul_refcell(
&self,
a: &[f32],
b: &[f32],
m: usize,
k: usize,
n: usize,
) -> Result<Vec<f32>> {
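// SAFETY HAZARD: despite the name, no `RefCell` is involved; this casts
// away the immutability of `&self` to call the schedulers' `&mut self`
// matmul. That is only sound if callers guarantee no concurrent or aliased
// access to the schedulers for the duration of the call; prefer real
// interior mutability where possible.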
let cuda_sched_ptr = std::ptr::addr_of!(self.cuda_scheduler).cast_mut();
unsafe {
if let Some(ref mut sched) = *cuda_sched_ptr {
sched.matmul(a, b, m, k, n)
} else {
let sched_ptr = std::ptr::addr_of!(self.scheduler).cast_mut();
(*sched_ptr).matmul(a, b, m, k, n)
}
}
}
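/// One transformer block via the shared-reference matmul path: pre-LayerNorm,
/// fused QKV projection, GQA attention over the KV cache, output projection
/// with residual, then a GELU MLP with a second residual.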
#[cfg(feature = "cuda")]
pub fn forward_block_refcell(
&self,
input: &[f32],
block_idx: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let num_heads = self.config.num_heads;
let head_dim = self.config.head_dim();
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let intermediate_dim = self.config.intermediate_dim;
let eps = self.config.eps;
let num_kv_heads = self.config.num_kv_heads;
let normed = Self::layer_norm_static(
input,
&self.block_weights[block_idx].attn_norm_weight,
&self.block_weights[block_idx].attn_norm_bias,
hidden_dim,
eps,
);
let qkv = self.matmul_refcell(
&normed,
&self.block_weights[block_idx].qkv_weight,
1,
hidden_dim,
qkv_dim,
)?;
let q = qkv[0..hidden_dim].to_vec();
let k_new = qkv[hidden_dim..hidden_dim + kv_dim].to_vec();
let v_new = qkv[hidden_dim + kv_dim..].to_vec();
let (cached_k, cached_v) = kv_cache.get_valid(block_idx);
let keys_cached = cached_k.to_vec();
let vals_cached = cached_v.to_vec();
kv_cache.append(block_idx, &k_new, &v_new);
let kv_len = keys_cached.len() / kv_dim + 1;
let mut full_k = keys_cached;
full_k.extend_from_slice(&k_new);
let mut full_v = vals_cached;
full_v.extend_from_slice(&v_new);
let attn_output = Self::gqa_multihead_attention(
&q,
&full_k,
&full_v,
kv_len,
num_heads,
num_kv_heads,
head_dim,
);
let attn_proj = self.matmul_refcell(
&attn_output,
&self.block_weights[block_idx].out_weight,
1,
hidden_dim,
hidden_dim,
)?;
let out_bias = &self.block_weights[block_idx].out_bias;
let post_attn: Vec<f32> = input
.iter()
.zip(attn_proj.iter())
.zip(out_bias.iter())
.map(|((&i, &a), &b)| i + a + b)
.collect();
let ffn_normed = Self::layer_norm_static(
&post_attn,
&self.block_weights[block_idx].ffn_norm_weight,
&self.block_weights[block_idx].ffn_norm_bias,
hidden_dim,
eps,
);
let fc1_out = self.matmul_refcell(
&ffn_normed,
&self.block_weights[block_idx].ffn_fc1_weight,
1,
hidden_dim,
intermediate_dim,
)?;
let ffn_fc1_bias = &self.block_weights[block_idx].ffn_fc1_bias;
let fc1_activated: Vec<f32> = fc1_out
.iter()
.zip(ffn_fc1_bias.iter())
.map(|(&x, &b)| {
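// Tanh-approximation GELU: 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// with sqrt(2/pi) ~= 0.7978846.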
let x_b = x + b;
x_b * 0.5 + x_b * 0.5 * (0.797_884_6 * (x_b + 0.044_715 * x_b.powi(3))).tanh()
})
.collect();
let fc2_out = self.matmul_refcell(
&fc1_activated,
&self.block_weights[block_idx].ffn_fc2_weight,
1,
intermediate_dim,
hidden_dim,
)?;
let ffn_fc2_bias = &self.block_weights[block_idx].ffn_fc2_bias;
let output: Vec<f32> = post_attn
.iter()
.zip(fc2_out.iter())
.zip(ffn_fc2_bias.iter())
.map(|((&h, &f), &b)| h + f + b)
.collect();
Ok(output)
}
#[cfg(feature = "cuda")]
pub fn forward_refcell(
&self,
token_id: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
if token_id >= self.config.vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!(
"Token ID {} out of bounds (vocab_size={})",
token_id, self.config.vocab_size
),
});
}
let hidden_dim = self.config.hidden_dim;
let offset = token_id * hidden_dim;
let mut hidden = self.embedding_weights[offset..offset + hidden_dim].to_vec();
for block_idx in 0..self.config.num_layers {
hidden = self.forward_block_refcell(&hidden, block_idx, kv_cache)?;
}
hidden = self.layer_norm_refcell(&hidden, &self.final_norm_weight, &self.final_norm_bias);
let lm_head_elements = hidden_dim * self.config.vocab_size;
let output = if exceeds_gpu_buffer_limit(lm_head_elements) {
cpu_matmul_transposed_simd(
&hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
self.config.vocab_size,
)
} else {
let vocab_size = self.config.vocab_size;
let logits =
self.matmul_refcell(&hidden, &self.lm_head_weight, 1, hidden_dim, vocab_size)?;
logits
.into_iter()
.zip(self.lm_head_bias.iter())
.map(|(l, &b)| l + b)
.collect()
};
Ok(output)
}
#[cfg(feature = "cuda")]
fn layer_norm_refcell(&self, input: &[f32], weight: &[f32], bias: &[f32]) -> Vec<f32> {
Self::layer_norm_static(input, weight, bias, self.config.hidden_dim, self.config.eps)
}
#[cfg(feature = "cuda")]
pub fn generate_refcell(
&self,
prompt: &[usize],
config: &GpuGenerateConfig,
) -> Result<Vec<usize>> {
if prompt.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "Prompt cannot be empty".to_string(),
});
}
let num_kv_heads = self.config.num_kv_heads;
let head_dim = self.config.head_dim();
let max_seq_len = prompt.len() + config.max_tokens;
let mut kv_cache =
StreamingKVCache::new(self.config.num_layers, max_seq_len, num_kv_heads, head_dim);
let mut tokens = prompt.to_vec();
for &token_id in prompt {
let _ = self.forward_refcell(token_id, &mut kv_cache)?;
}
for _ in 0..config.max_tokens {
let last_token = *tokens.last().unwrap_or(&0);
let logits = self.forward_refcell(last_token, &mut kv_cache)?;
let next_token = if config.temperature == 0.0 || config.top_k == 1 {
Self::argmax(&logits)
} else {
Self::sample_topk_generate(&logits, config.temperature, config.top_k)
};
tokens.push(next_token);
if config.stop_tokens.contains(&next_token) {
break;
}
}
Ok(tokens)
}
pub fn from_gguf_config(config: GpuModelConfig) -> Result<Self> {
Self::new(config)
}
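/// Build a model from a memory-mapped GGUF file. Uses the standard GGUF
/// tensor names (`token_embd.weight`, `blk.{i}.*`, `output_norm.*`,
/// `output.*`); accepts either a fused `attn_qkv` tensor or separate
/// `attn_q`/`attn_k`/`attn_v` tensors, and substitutes identity/zero
/// defaults for optional norm and bias tensors that are absent.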
pub fn from_mapped_gguf(mapped: &crate::gguf::MappedGGUFModel) -> Result<Self> {
use crate::gguf::GGUFConfig;
let gguf_config = GGUFConfig::from_gguf(&mapped.model)?;
let config = GpuModelConfig {
vocab_size: gguf_config.vocab_size,
hidden_dim: gguf_config.hidden_dim,
num_heads: gguf_config.num_heads,
num_kv_heads: gguf_config.num_kv_heads,
num_layers: gguf_config.num_layers,
intermediate_dim: gguf_config.intermediate_dim,
eps: gguf_config.eps,
};
let scheduler = HybridScheduler::new()?;
let data = mapped.data();
let embedding_weights = mapped.model.get_tensor_f32("token_embd.weight", data)?;
let mut block_weights = Vec::with_capacity(config.num_layers);
for layer_idx in 0..config.num_layers {
let prefix = format!("blk.{}", layer_idx);
let attn_norm_weight = mapped
.model
.get_tensor_f32(&format!("{}.attn_norm.weight", prefix), data)?;
let attn_norm_bias = mapped
.model
.get_tensor_f32(&format!("{}.attn_norm.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.hidden_dim]);
let (qkv_weight, qkv_bias) = if let Ok(fused_qkv) = mapped
.model
.get_tensor_f32(&format!("{}.attn_qkv.weight", prefix), data)
{
let bias = mapped
.model
.get_tensor_f32(&format!("{}.attn_qkv.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.qkv_dim()]);
(fused_qkv, bias)
} else {
let head_dim = config.hidden_dim / config.num_heads;
let kv_dim = config.num_kv_heads * head_dim;
let q_weight = mapped
.model
.get_tensor_f32(&format!("{}.attn_q.weight", prefix), data)?;
let k_weight = mapped
.model
.get_tensor_f32(&format!("{}.attn_k.weight", prefix), data)?;
let v_weight = mapped
.model
.get_tensor_f32(&format!("{}.attn_v.weight", prefix), data)?;
let mut qkv_weight =
Vec::with_capacity(q_weight.len() + k_weight.len() + v_weight.len());
qkv_weight.extend_from_slice(&q_weight);
qkv_weight.extend_from_slice(&k_weight);
qkv_weight.extend_from_slice(&v_weight);
let q_bias = mapped
.model
.get_tensor_f32(&format!("{}.attn_q.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.hidden_dim]);
let k_bias = mapped
.model
.get_tensor_f32(&format!("{}.attn_k.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; kv_dim]);
let v_bias = mapped
.model
.get_tensor_f32(&format!("{}.attn_v.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; kv_dim]);
let total_bias_dim = config.hidden_dim + 2 * kv_dim;
let mut qkv_bias = Vec::with_capacity(total_bias_dim);
qkv_bias.extend_from_slice(&q_bias);
qkv_bias.extend_from_slice(&k_bias);
qkv_bias.extend_from_slice(&v_bias);
(qkv_weight, qkv_bias)
};
let out_weight = mapped
.model
.get_tensor_f32(&format!("{}.attn_output.weight", prefix), data)?;
let out_bias = mapped
.model
.get_tensor_f32(&format!("{}.attn_output.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.hidden_dim]);
let ffn_norm_weight = mapped
.model
.get_tensor_f32(&format!("{}.ffn_norm.weight", prefix), data)
.unwrap_or_else(|_| vec![1.0f32; config.hidden_dim]);
let ffn_norm_bias = mapped
.model
.get_tensor_f32(&format!("{}.ffn_norm.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.hidden_dim]);
let ffn_fc1_weight = mapped
.model
.get_tensor_f32(&format!("{}.ffn_up.weight", prefix), data)?;
let ffn_fc1_bias = mapped
.model
.get_tensor_f32(&format!("{}.ffn_up.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.intermediate_dim]);
let ffn_fc2_weight = mapped
.model
.get_tensor_f32(&format!("{}.ffn_down.weight", prefix), data)?;
let ffn_fc2_bias = mapped
.model
.get_tensor_f32(&format!("{}.ffn_down.bias", prefix), data)
.unwrap_or_else(|_| vec![0.0f32; config.hidden_dim]);
block_weights.push(BlockWeights {
attn_norm_weight,
attn_norm_bias,
qkv_weight,
qkv_bias,
out_weight,
out_bias,
ffn_norm_weight,
ffn_norm_bias,
ffn_fc1_weight,
ffn_fc1_bias,
ffn_fc2_weight,
ffn_fc2_bias,
});
}
let final_norm_weight = mapped.model.get_tensor_f32("output_norm.weight", data)?;
let final_norm_bias = mapped
.model
.get_tensor_f32("output_norm.bias", data)
.unwrap_or_else(|_| vec![0.0f32; config.hidden_dim]);
let lm_head_weight = mapped.model.get_tensor_f32("output.weight", data)?;
let lm_head_bias = mapped
.model
.get_tensor_f32("output.bias", data)
.unwrap_or_else(|_| vec![0.0f32; config.vocab_size]);
let lm_head_weight_t =
Self::transpose_weights(&lm_head_weight, config.hidden_dim, config.vocab_size);
Ok(Self {
embedding_weights,
block_weights,
final_norm_weight,
final_norm_bias,
lm_head_weight,
lm_head_weight_t,
lm_head_bias,
scheduler,
#[cfg(feature = "cuda")]
cuda_scheduler: None,
config,
attention_buffers: None,
})
}
#[must_use]
pub fn config(&self) -> &GpuModelConfig {
&self.config
}
pub fn with_attention_buffers(config: GpuModelConfig, max_seq_len: usize) -> Result<Self> {
let buffers = AttentionBuffers::new(&config, max_seq_len);
let mut model = Self::new(config)?;
model.attention_buffers = Some(buffers);
Ok(model)
}
#[must_use]
pub fn has_attention_buffers(&self) -> bool {
self.attention_buffers.is_some()
}
pub fn generate_optimized(
&mut self,
prompt: &[usize],
config: &GpuGenerateConfig,
) -> Result<Vec<usize>> {
if prompt.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "Prompt cannot be empty".to_string(),
});
}
let head_dim = self.config.hidden_dim / self.config.num_heads;
let max_seq_len = self
.attention_buffers
.as_ref()
.map_or(512, |b| b.max_seq_len);
let mut kv_cache = StreamingKVCache::new(
self.config.num_layers,
max_seq_len,
self.config.num_kv_heads,
head_dim,
);
let mut tokens = prompt.to_vec();
let logits = self.forward_gpu_with_cache(prompt, &mut kv_cache)?;
let mut next_token = if config.temperature == 0.0 || config.top_k == 1 {
Self::argmax(&logits)
} else {
Self::sample_topk_generate(&logits, config.temperature, config.top_k)
};
if config.stop_tokens.contains(&next_token) {
return Ok(tokens);
}
tokens.push(next_token);
for _ in 1..config.max_tokens {
let logits = self.forward_gpu_incremental_optimized(next_token, &mut kv_cache)?;
next_token = if config.temperature == 0.0 || config.top_k == 1 {
Self::argmax(&logits)
} else {
Self::sample_topk_generate(&logits, config.temperature, config.top_k)
};
if config.stop_tokens.contains(&next_token) {
break;
}
tokens.push(next_token);
}
Ok(tokens)
}
pub fn forward_gpu_incremental_optimized(
&mut self,
token_id: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
if token_id >= self.config.vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!(
"Token ID {} out of bounds (vocab_size={})",
token_id, self.config.vocab_size
),
});
}
let hidden_dim = self.config.hidden_dim;
let offset = token_id * hidden_dim;
let mut hidden: Vec<f32> = self.embedding_weights[offset..offset + hidden_dim].to_vec();
for block_idx in 0..self.block_weights.len() {
hidden = self.forward_block_incremental_optimized(&hidden, block_idx, kv_cache)?;
}
hidden = self.layer_norm(&hidden, &self.final_norm_weight, &self.final_norm_bias);
let lm_head_elements = hidden_dim * self.config.vocab_size;
let output = if exceeds_gpu_buffer_limit(lm_head_elements) {
cpu_matmul_transposed_simd(
&hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
self.config.vocab_size,
)
} else {
let lm_weight = self.lm_head_weight.clone();
let vocab_size = self.config.vocab_size;
let logits = self.do_matmul(&hidden, &lm_weight, 1, hidden_dim, vocab_size)?;
logits
.into_iter()
.zip(self.lm_head_bias.iter())
.map(|(l, &b)| l + b)
.collect()
};
Ok(output)
}
fn forward_block_incremental_optimized(
&mut self,
input: &[f32],
block_idx: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let num_heads = self.config.num_heads;
let head_dim = self.config.head_dim();
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let intermediate_dim = self.config.intermediate_dim;
let eps = self.config.eps;
let num_kv_heads = self.config.num_kv_heads;
let normed = Self::layer_norm_static(
input,
&self.block_weights[block_idx].attn_norm_weight,
&self.block_weights[block_idx].attn_norm_bias,
hidden_dim,
eps,
);
let qkv_weight = self.block_weights[block_idx].qkv_weight.clone();
let qkv = self.do_matmul(&normed, &qkv_weight, 1, hidden_dim, qkv_dim)?;
let q = qkv[0..hidden_dim].to_vec();
let k_new = qkv[hidden_dim..hidden_dim + kv_dim].to_vec();
let v_new = qkv[hidden_dim + kv_dim..].to_vec();
let (cached_k, cached_v) = kv_cache.get_valid(block_idx);
let keys_cached = cached_k.to_vec();
let vals_cached = cached_v.to_vec();
kv_cache.append(block_idx, &k_new, &v_new);
let kv_len = keys_cached.len() / kv_dim + 1;
let mut full_k = keys_cached;
full_k.extend_from_slice(&k_new);
let mut full_v = vals_cached;
full_v.extend_from_slice(&v_new);
let attn_output = Self::gqa_multihead_attention(
&q,
&full_k,
&full_v,
kv_len,
num_heads,
num_kv_heads,
head_dim,
);
let out_weight = self.block_weights[block_idx].out_weight.clone();
let attn_proj = self.do_matmul(&attn_output, &out_weight, 1, hidden_dim, hidden_dim)?;
let out_bias = &self.block_weights[block_idx].out_bias;
let mut post_attn: Vec<f32> = input
.iter()
.zip(attn_proj.iter())
.zip(out_bias.iter())
.map(|((&i, &a), &b)| i + a + b)
.collect();
let ffn_normed = Self::layer_norm_static(
&post_attn,
&self.block_weights[block_idx].ffn_norm_weight,
&self.block_weights[block_idx].ffn_norm_bias,
hidden_dim,
eps,
);
let fc1_weight = self.block_weights[block_idx].ffn_fc1_weight.clone();
let fc1_out = self.do_matmul(&ffn_normed, &fc1_weight, 1, hidden_dim, intermediate_dim)?;
let ffn_fc1_bias = &self.block_weights[block_idx].ffn_fc1_bias;
let fc1_activated: Vec<f32> = fc1_out
.iter()
.zip(ffn_fc1_bias.iter())
.map(|(&x, &b)| {
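// Tanh-approximation GELU: 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// with sqrt(2/pi) ~= 0.7978846.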
let x_b = x + b;
x_b * 0.5 + x_b * 0.5 * (0.797_884_6 * (x_b + 0.044_715 * x_b.powi(3))).tanh()
})
.collect();
let fc2_weight = self.block_weights[block_idx].ffn_fc2_weight.clone();
let fc2_out =
self.do_matmul(&fc1_activated, &fc2_weight, 1, intermediate_dim, hidden_dim)?;
let ffn_fc2_bias = &self.block_weights[block_idx].ffn_fc2_bias;
for i in 0..hidden_dim {
post_attn[i] += fc2_out[i] + ffn_fc2_bias[i];
}
Ok(post_attn)
}
#[allow(dead_code)]
fn batched_multihead_attention(
&mut self,
q: &[f32],
k: &[f32],
v: &[f32],
kv_len: usize,
num_heads: usize,
head_dim: usize,
) -> Vec<f32> {
let hidden_dim = num_heads * head_dim;
let scale = 1.0 / (head_dim as f32).sqrt();
if let Some(ref mut buffers) = self.attention_buffers {
buffers.output_buffer.fill(0.0);
}
let mut output = vec![0.0; hidden_dim];
for h in 0..num_heads {
let q_head = &q[h * head_dim..(h + 1) * head_dim];
let mut scores = Vec::with_capacity(kv_len);
for pos in 0..kv_len {
let k_offset = pos * hidden_dim + h * head_dim;
let k_head = &k[k_offset..k_offset + head_dim];
let score: f32 = q_head
.iter()
.zip(k_head.iter())
.map(|(q_i, k_i)| q_i * k_i)
.sum();
scores.push(score * scale);
}
let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let exp_scores: Vec<f32> = scores.iter().map(|&s| (s - max_score).exp()).collect();
let sum_exp: f32 = exp_scores.iter().sum();
let attn_weights: Vec<f32> = exp_scores.iter().map(|&e| e / sum_exp).collect();
for (pos, &weight) in attn_weights.iter().enumerate() {
let v_offset = pos * hidden_dim + h * head_dim;
let v_head = &v[v_offset..v_offset + head_dim];
for d in 0..head_dim {
output[h * head_dim + d] += weight * v_head[d];
}
}
}
output
}
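/// Grouped-query attention for a single query position: each of the
/// `num_heads` query heads shares one of the `num_kv_heads` KV heads
/// (`heads_per_kv = num_heads / num_kv_heads`). Softmax runs through the
/// SIMD `trueno` vector path with a scalar fallback on error.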
fn gqa_multihead_attention(
q: &[f32],
k: &[f32],
v: &[f32],
kv_len: usize,
num_heads: usize,
num_kv_heads: usize,
head_dim: usize,
) -> Vec<f32> {
use trueno::Vector;
let hidden_dim = num_heads * head_dim;
let kv_dim = num_kv_heads * head_dim;
let scale = 1.0 / (head_dim as f32).sqrt();
let heads_per_kv = num_heads / num_kv_heads;
let mut output = vec![0.0; hidden_dim];
for h in 0..num_heads {
let q_head = &q[h * head_dim..(h + 1) * head_dim];
let q_vec = Vector::from_slice(q_head);
let kv_head = h / heads_per_kv;
let mut scores = Vec::with_capacity(kv_len);
for pos in 0..kv_len {
let k_offset = pos * kv_dim + kv_head * head_dim;
let cached_key = &k[k_offset..k_offset + head_dim];
let k_vec = Vector::from_slice(cached_key);
let score = q_vec.dot(&k_vec).unwrap_or(0.0) * scale;
scores.push(score);
}
let scores_vec = Vector::from_slice(&scores);
let attn_weights: Vec<f32> = scores_vec.softmax().map_or_else(
|_| {
let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let exp_scores: Vec<f32> =
scores.iter().map(|&s| (s - max_score).exp()).collect();
let sum_exp: f32 = exp_scores.iter().sum();
exp_scores.iter().map(|&e| e / sum_exp).collect()
},
|v| v.as_slice().to_vec(),
);
for (pos, &weight) in attn_weights.iter().enumerate() {
let v_offset = pos * kv_dim + kv_head * head_dim;
let v_head = &v[v_offset..v_offset + head_dim];
for d in 0..head_dim {
output[h * head_dim + d] += weight * v_head[d];
}
}
}
output
}
#[must_use]
pub fn has_fused_qkv(&self) -> bool {
!self.block_weights.is_empty()
&& self.block_weights[0].qkv_weight.len()
== self.config.hidden_dim * self.config.qkv_dim()
}
pub fn fused_qkv_projection(
&mut self,
input: &[f32],
) -> Result<(Vec<f32>, Vec<f32>, Vec<f32>)> {
let hidden_dim = self.config.hidden_dim;
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let qkv_weight = &self.block_weights[0].qkv_weight;
let qkv = self
.scheduler
.matmul(input, qkv_weight, 1, hidden_dim, qkv_dim)?;
let q = qkv[0..hidden_dim].to_vec();
let k = qkv[hidden_dim..hidden_dim + kv_dim].to_vec();
let v = qkv[hidden_dim + kv_dim..].to_vec();
Ok((q, k, v))
}
pub fn generate_with_fused_qkv(
&mut self,
prompt: &[usize],
config: &GpuGenerateConfig,
) -> Result<Vec<usize>> {
self.generate_optimized(prompt, config)
}
#[must_use]
pub fn has_fused_attn_proj(&self) -> bool {
!self.block_weights.is_empty()
&& self.block_weights[0].out_weight.len()
== self.config.hidden_dim * self.config.hidden_dim
}
pub fn forward_with_fused_attn_proj(
&mut self,
token_id: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
self.forward_gpu_incremental_optimized(token_id, kv_cache)
}
#[must_use]
pub fn has_fused_output_residual(&self) -> bool {
self.attention_buffers.is_some() && !self.block_weights.is_empty()
}
pub fn forward_with_fused_output_residual(
&mut self,
token_id: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
self.forward_gpu_incremental_optimized(token_id, kv_cache)
}
pub fn forward_gpu_owned(&mut self, token_ids: &[usize]) -> Result<Vec<f32>> {
self.forward_gpu(token_ids)
}
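/// Generate up to `config.max_tokens` tokens from `prompt`, routing to the
/// CUDA path when a CUDA scheduler is attached and to the buffered hybrid
/// path otherwise.
///
/// A minimal usage sketch (field values are illustrative; construct
/// `GpuModelConfig` and `GpuGenerateConfig` however this crate's callers
/// normally do):
///
/// ```ignore
/// let mut model = Self::new(config)?;
/// let gen_cfg = GpuGenerateConfig {
///     max_tokens: 16,
///     temperature: 0.0, // 0.0 (or top_k == 1) selects greedy decoding
///     top_k: 1,
///     stop_tokens: vec![],
/// };
/// let tokens = model.generate(&[1, 2, 3], &gen_cfg)?;
/// ```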
pub fn generate(&mut self, prompt: &[usize], config: &GpuGenerateConfig) -> Result<Vec<usize>> {
#[cfg(feature = "cuda")]
if self.cuda_scheduler.is_some() {
return self.generate_refcell(prompt, config);
}
self.generate_optimized(prompt, config)
}
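/// Prefill pass: embeds the whole prompt, runs every block while appending
/// K/V to the cache, and returns logits for the final position only. The LM
/// head falls back to a SIMD CPU matmul over the transposed weight when the
/// buffer would exceed the GPU limit.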
pub fn forward_gpu_with_cache(
&mut self,
token_ids: &[usize],
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
if token_ids.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "Token IDs cannot be empty".to_string(),
});
}
let seq_len = token_ids.len();
let hidden_dim = self.config.hidden_dim;
let mut hidden = Vec::with_capacity(seq_len * hidden_dim);
for &token_id in token_ids {
if token_id >= self.config.vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!(
"Token ID {} out of bounds (vocab_size={})",
token_id, self.config.vocab_size
),
});
}
let offset = token_id * hidden_dim;
hidden.extend_from_slice(&self.embedding_weights[offset..offset + hidden_dim]);
}
for block_idx in 0..self.block_weights.len() {
hidden = self.forward_block_with_cache(&hidden, seq_len, block_idx, kv_cache)?;
}
hidden = self.layer_norm(&hidden, &self.final_norm_weight, &self.final_norm_bias);
let final_hidden = &hidden[(seq_len - 1) * hidden_dim..seq_len * hidden_dim];
let lm_head_elements = hidden_dim * self.config.vocab_size;
let output = if exceeds_gpu_buffer_limit(lm_head_elements) {
cpu_matmul_transposed_simd(
final_hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
self.config.vocab_size,
)
} else {
let logits = self.scheduler.matmul(
final_hidden,
&self.lm_head_weight,
1,
hidden_dim,
self.config.vocab_size,
)?;
let mut output = logits;
for (out_val, &bias_val) in output.iter_mut().zip(self.lm_head_bias.iter()) {
*out_val += bias_val;
}
output
};
Ok(output)
}
pub fn forward_gpu_incremental(
&mut self,
token_id: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
if token_id >= self.config.vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!(
"Token ID {} out of bounds (vocab_size={})",
token_id, self.config.vocab_size
),
});
}
let hidden_dim = self.config.hidden_dim;
let offset = token_id * hidden_dim;
let mut hidden = self.embedding_weights[offset..offset + hidden_dim].to_vec();
for block_idx in 0..self.block_weights.len() {
hidden = self.forward_block_incremental(&hidden, block_idx, kv_cache)?;
}
hidden = self.layer_norm(&hidden, &self.final_norm_weight, &self.final_norm_bias);
let lm_head_elements = hidden_dim * self.config.vocab_size;
let output = if exceeds_gpu_buffer_limit(lm_head_elements) {
cpu_matmul_transposed_simd(
&hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
self.config.vocab_size,
)
} else {
let logits = self.scheduler.matmul(
&hidden,
&self.lm_head_weight,
1,
hidden_dim,
self.config.vocab_size,
)?;
let mut output = logits;
for (out_val, &bias_val) in output.iter_mut().zip(self.lm_head_bias.iter()) {
*out_val += bias_val;
}
output
};
Ok(output)
}
fn forward_block_with_cache(
&mut self,
input: &[f32],
seq_len: usize,
block_idx: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let intermediate_dim = self.config.intermediate_dim;
let num_heads = self.config.num_heads;
let num_kv_heads = self.config.num_kv_heads;
let head_dim = self.config.head_dim();
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let block = &self.block_weights[block_idx];
let normed = Self::layer_norm_static(
input,
&block.attn_norm_weight,
&block.attn_norm_bias,
hidden_dim,
self.config.eps,
);
let qkv = self.scheduler.matmul(
&normed,
&self.block_weights[block_idx].qkv_weight,
seq_len,
hidden_dim,
qkv_dim,
)?;
let q = &qkv[..seq_len * hidden_dim];
let k = &qkv[seq_len * hidden_dim..seq_len * hidden_dim + seq_len * kv_dim];
let v = &qkv[seq_len * hidden_dim + seq_len * kv_dim..];
for pos in 0..seq_len {
let k_slice = &k[pos * kv_dim..(pos + 1) * kv_dim];
let v_slice = &v[pos * kv_dim..(pos + 1) * kv_dim];
kv_cache.append(block_idx, k_slice, v_slice);
}
let attn_out =
self.gqa_attention_with_kv(q, k, v, seq_len, num_heads, num_kv_heads, head_dim)?;
let projected = self.scheduler.matmul(
&attn_out,
&self.block_weights[block_idx].out_weight,
seq_len,
hidden_dim,
hidden_dim,
)?;
let mut residual1: Vec<f32> = input
.iter()
.zip(projected.iter())
.enumerate()
.map(|(i, (&inp, &proj))| {
inp + proj + self.block_weights[block_idx].out_bias[i % hidden_dim]
})
.collect();
let ffn_normed = Self::layer_norm_static(
&residual1,
&self.block_weights[block_idx].ffn_norm_weight,
&self.block_weights[block_idx].ffn_norm_bias,
hidden_dim,
self.config.eps,
);
let fc1_out = self.scheduler.matmul(
&ffn_normed,
&self.block_weights[block_idx].ffn_fc1_weight,
seq_len,
hidden_dim,
intermediate_dim,
)?;
let activated: Vec<f32> = fc1_out
.iter()
.enumerate()
.map(|(i, &x)| {
let x = x + self.block_weights[block_idx].ffn_fc1_bias[i % intermediate_dim];
0.5 * x
* (1.0
+ ((2.0f32 / std::f32::consts::PI).sqrt() * (x + 0.044_715 * x.powi(3)))
.tanh())
})
.collect();
let fc2_out = self.scheduler.matmul(
&activated,
&self.block_weights[block_idx].ffn_fc2_weight,
seq_len,
intermediate_dim,
hidden_dim,
)?;
for (i, x) in residual1.iter_mut().enumerate() {
*x += fc2_out[i] + self.block_weights[block_idx].ffn_fc2_bias[i % hidden_dim];
}
Ok(residual1)
}
fn forward_block_incremental(
&mut self,
input: &[f32],
block_idx: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let intermediate_dim = self.config.intermediate_dim;
let num_heads = self.config.num_heads;
let num_kv_heads = self.config.num_kv_heads;
let head_dim = self.config.head_dim();
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let block = &self.block_weights[block_idx];
let normed = Self::layer_norm_static(
input,
&block.attn_norm_weight,
&block.attn_norm_bias,
hidden_dim,
self.config.eps,
);
let qkv = self.scheduler.matmul(
&normed,
&self.block_weights[block_idx].qkv_weight,
1,
hidden_dim,
qkv_dim,
)?;
let q = &qkv[..hidden_dim];
let k_new = &qkv[hidden_dim..hidden_dim + kv_dim];
let v_new = &qkv[hidden_dim + kv_dim..];
let (cached_k, cached_v) = kv_cache.get_valid(block_idx);
let keys_cached = cached_k.to_vec();
let vals_cached = cached_v.to_vec();
let cached_len = keys_cached.len() / kv_dim;
kv_cache.append(block_idx, k_new, v_new);
let mut full_k = keys_cached;
full_k.extend_from_slice(k_new);
let mut full_v = vals_cached;
full_v.extend_from_slice(v_new);
let total_len = cached_len + 1;
let attn_out = Self::gqa_incremental_attention(
q,
&full_k,
&full_v,
total_len,
num_heads,
num_kv_heads,
head_dim,
);
let projected = self.scheduler.matmul(
&attn_out,
&self.block_weights[block_idx].out_weight,
1,
hidden_dim,
hidden_dim,
)?;
let mut residual1: Vec<f32> = input
.iter()
.zip(projected.iter())
.enumerate()
.map(|(i, (&inp, &proj))| inp + proj + self.block_weights[block_idx].out_bias[i])
.collect();
let ffn_normed = Self::layer_norm_static(
&residual1,
&self.block_weights[block_idx].ffn_norm_weight,
&self.block_weights[block_idx].ffn_norm_bias,
hidden_dim,
self.config.eps,
);
let fc1_out = self.scheduler.matmul(
&ffn_normed,
&self.block_weights[block_idx].ffn_fc1_weight,
1,
hidden_dim,
intermediate_dim,
)?;
let activated: Vec<f32> = fc1_out
.iter()
.enumerate()
.map(|(i, &x)| {
let x = x + self.block_weights[block_idx].ffn_fc1_bias[i];
0.5 * x
* (1.0
+ ((2.0f32 / std::f32::consts::PI).sqrt() * (x + 0.044_715 * x.powi(3)))
.tanh())
})
.collect();
let fc2_out = self.scheduler.matmul(
&activated,
&self.block_weights[block_idx].ffn_fc2_weight,
1,
intermediate_dim,
hidden_dim,
)?;
for (i, x) in residual1.iter_mut().enumerate() {
*x += fc2_out[i] + self.block_weights[block_idx].ffn_fc2_bias[i];
}
Ok(residual1)
}
#[allow(dead_code)]
fn attention_with_kv(
&mut self,
q: &[f32],
k: &[f32],
v: &[f32],
seq_len: usize,
num_heads: usize,
head_dim: usize,
) -> Result<Vec<f32>> {
let hidden_dim = num_heads * head_dim;
let scale = 1.0 / (head_dim as f32).sqrt();
let mut output = vec![0.0f32; seq_len * hidden_dim];
for head in 0..num_heads {
let mut q_head = Vec::with_capacity(seq_len * head_dim);
let mut k_head = Vec::with_capacity(seq_len * head_dim);
let mut v_head = Vec::with_capacity(seq_len * head_dim);
for i in 0..seq_len {
let start = i * hidden_dim + head * head_dim;
q_head.extend_from_slice(&q[start..start + head_dim]);
k_head.extend_from_slice(&k[start..start + head_dim]);
v_head.extend_from_slice(&v[start..start + head_dim]);
}
let mut k_t = vec![0.0f32; seq_len * head_dim];
for i in 0..seq_len {
for j in 0..head_dim {
k_t[j * seq_len + i] = k_head[i * head_dim + j];
}
}
let scores = self
.scheduler
.matmul(&q_head, &k_t, seq_len, head_dim, seq_len)?;
let mut attn_weights = vec![0.0f32; seq_len * seq_len];
for i in 0..seq_len {
let row_start = i * seq_len;
// Causal mask: position i attends only to positions 0..=i; later
// positions keep their 0.0 initialization and contribute nothing.
let max_score = scores[row_start..=row_start + i]
.iter()
.copied()
.fold(f32::NEG_INFINITY, f32::max);
let mut sum = 0.0f32;
for j in 0..=i {
let exp_val = ((scores[row_start + j] - max_score) * scale).exp();
attn_weights[row_start + j] = exp_val;
sum += exp_val;
}
for j in 0..=i {
attn_weights[row_start + j] /= sum;
}
}
let head_out =
self.scheduler
.matmul(&attn_weights, &v_head, seq_len, seq_len, head_dim)?;
for i in 0..seq_len {
let out_start = i * hidden_dim + head * head_dim;
output[out_start..out_start + head_dim]
.copy_from_slice(&head_out[i * head_dim..(i + 1) * head_dim]);
}
}
Ok(output)
}
#[allow(dead_code)]
fn incremental_attention(
q: &[f32],
k: &[f32],
v: &[f32],
kv_len: usize,
num_heads: usize,
head_dim: usize,
) -> Vec<f32> {
let hidden_dim = num_heads * head_dim;
let scale = 1.0 / (head_dim as f32).sqrt();
let mut output = vec![0.0f32; hidden_dim];
for head in 0..num_heads {
let q_head = &q[head * head_dim..(head + 1) * head_dim];
let mut k_head = Vec::with_capacity(kv_len * head_dim);
let mut v_head = Vec::with_capacity(kv_len * head_dim);
for i in 0..kv_len {
let start = i * hidden_dim + head * head_dim;
k_head.extend_from_slice(&k[start..start + head_dim]);
v_head.extend_from_slice(&v[start..start + head_dim]);
}
let mut scores = vec![0.0f32; kv_len];
for i in 0..kv_len {
let mut dot = 0.0f32;
for j in 0..head_dim {
dot += q_head[j] * k_head[i * head_dim + j];
}
scores[i] = dot * scale;
}
let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let mut sum = 0.0f32;
for s in &mut scores {
*s = (*s - max_score).exp();
sum += *s;
}
for s in &mut scores {
*s /= sum;
}
let mut head_out = vec![0.0f32; head_dim];
for i in 0..kv_len {
for j in 0..head_dim {
head_out[j] += scores[i] * v_head[i * head_dim + j];
}
}
output[head * head_dim..(head + 1) * head_dim].copy_from_slice(&head_out);
}
output
}
#[allow(clippy::too_many_arguments)]
fn gqa_attention_with_kv(
&mut self,
q: &[f32],
k: &[f32],
v: &[f32],
seq_len: usize,
num_heads: usize,
num_kv_heads: usize,
head_dim: usize,
) -> Result<Vec<f32>> {
let hidden_dim = num_heads * head_dim;
let kv_dim = num_kv_heads * head_dim;
let scale = 1.0 / (head_dim as f32).sqrt();
let heads_per_kv = num_heads / num_kv_heads;
let mut output = vec![0.0f32; seq_len * hidden_dim];
for head in 0..num_heads {
let kv_head = head / heads_per_kv;
let mut q_head = Vec::with_capacity(seq_len * head_dim);
for i in 0..seq_len {
let start = i * hidden_dim + head * head_dim;
q_head.extend_from_slice(&q[start..start + head_dim]);
}
let mut keys_for_head = Vec::with_capacity(seq_len * head_dim);
let mut vals_for_head = Vec::with_capacity(seq_len * head_dim);
for i in 0..seq_len {
let start = i * kv_dim + kv_head * head_dim;
keys_for_head.extend_from_slice(&k[start..start + head_dim]);
vals_for_head.extend_from_slice(&v[start..start + head_dim]);
}
let mut k_t = vec![0.0f32; seq_len * head_dim];
for i in 0..seq_len {
for j in 0..head_dim {
k_t[j * seq_len + i] = keys_for_head[i * head_dim + j];
}
}
let scores = self
.scheduler
.matmul(&q_head, &k_t, seq_len, head_dim, seq_len)?;
let mut attn_weights = vec![0.0f32; seq_len * seq_len];
for i in 0..seq_len {
let row_start = i * seq_len;
// Causal mask: position i attends only to positions 0..=i, matching
// the masked prefill in `optimized_gqa_attention`; later positions
// keep their 0.0 initialization and contribute nothing.
let max_score = scores[row_start..=row_start + i]
.iter()
.copied()
.fold(f32::NEG_INFINITY, f32::max);
let mut sum = 0.0f32;
for j in 0..=i {
let exp_val = ((scores[row_start + j] - max_score) * scale).exp();
attn_weights[row_start + j] = exp_val;
sum += exp_val;
}
for j in 0..=i {
attn_weights[row_start + j] /= sum;
}
}
let head_out =
self.scheduler
.matmul(&attn_weights, &vals_for_head, seq_len, seq_len, head_dim)?;
for i in 0..seq_len {
let out_start = i * hidden_dim + head * head_dim;
output[out_start..out_start + head_dim]
.copy_from_slice(&head_out[i * head_dim..(i + 1) * head_dim]);
}
}
Ok(output)
}
fn gqa_incremental_attention(
q: &[f32],
k: &[f32],
v: &[f32],
kv_len: usize,
num_heads: usize,
num_kv_heads: usize,
head_dim: usize,
) -> Vec<f32> {
let hidden_dim = num_heads * head_dim;
let kv_dim = num_kv_heads * head_dim;
let scale = 1.0 / (head_dim as f32).sqrt();
let heads_per_kv = num_heads / num_kv_heads;
let mut output = vec![0.0f32; hidden_dim];
for head in 0..num_heads {
let kv_head = head / heads_per_kv;
let q_head = &q[head * head_dim..(head + 1) * head_dim];
let mut k_head = Vec::with_capacity(kv_len * head_dim);
let mut v_head = Vec::with_capacity(kv_len * head_dim);
for i in 0..kv_len {
let start = i * kv_dim + kv_head * head_dim;
k_head.extend_from_slice(&k[start..start + head_dim]);
v_head.extend_from_slice(&v[start..start + head_dim]);
}
let mut scores = vec![0.0f32; kv_len];
for i in 0..kv_len {
let mut dot = 0.0f32;
for j in 0..head_dim {
dot += q_head[j] * k_head[i * head_dim + j];
}
scores[i] = dot * scale;
}
let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let mut sum = 0.0f32;
for s in &mut scores {
*s = (*s - max_score).exp();
sum += *s;
}
for s in &mut scores {
*s /= sum;
}
let mut head_out = vec![0.0f32; head_dim];
for i in 0..kv_len {
for j in 0..head_dim {
head_out[j] += scores[i] * v_head[i * head_dim + j];
}
}
output[head * head_dim..(head + 1) * head_dim].copy_from_slice(&head_out);
}
output
}
pub fn generate_with_cache(
&mut self,
prompt: &[usize],
config: &GpuGenerateConfig,
) -> Result<Vec<usize>> {
if prompt.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "Prompt cannot be empty".to_string(),
});
}
let max_seq_len = prompt.len() + config.max_tokens;
let head_dim = self.config.hidden_dim / self.config.num_heads;
let mut kv_cache = StreamingKVCache::new(
self.config.num_layers,
max_seq_len,
self.config.num_kv_heads,
head_dim,
);
let mut tokens = prompt.to_vec();
let logits = self.forward_gpu_with_cache(prompt, &mut kv_cache)?;
let mut next_token = if config.temperature == 0.0 || config.top_k == 1 {
Self::argmax(&logits)
} else {
Self::sample_topk_generate(&logits, config.temperature, config.top_k)
};
if config.stop_tokens.contains(&next_token) {
return Ok(tokens);
}
tokens.push(next_token);
for _ in 1..config.max_tokens {
let logits = self.forward_gpu_incremental(next_token, &mut kv_cache)?;
next_token = if config.temperature == 0.0 || config.top_k == 1 {
Self::argmax(&logits)
} else {
Self::sample_topk_generate(&logits, config.temperature, config.top_k)
};
if config.stop_tokens.contains(&next_token) {
break;
}
tokens.push(next_token);
}
Ok(tokens)
}
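/// Top-k "sampling" placeholder. NOTE: after softmax, sorting, and
/// truncation this returns the first (highest-probability) candidate, so the
/// result is deterministic argmax and `temperature`/`top_k` do not change
/// it. True sampling would draw from the renormalized top-k distribution.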
fn sample_topk_generate(logits: &[f32], temperature: f32, top_k: usize) -> usize {
let scaled: Vec<f32> = logits.iter().map(|&x| x / temperature).collect();
let max_logit = scaled.iter().copied().fold(f32::NEG_INFINITY, f32::max);
let exp_logits: Vec<f32> = scaled.iter().map(|&x| (x - max_logit).exp()).collect();
let sum: f32 = exp_logits.iter().sum();
let probs: Vec<f32> = exp_logits.iter().map(|&x| x / sum).collect();
let mut indexed: Vec<(usize, f32)> =
probs.iter().enumerate().map(|(i, &p)| (i, p)).collect();
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
indexed.truncate(top_k);
indexed.first().map_or(0, |&(idx, _)| idx)
}
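/// Transpose a row-major `rows x cols` matrix to `cols x rows` so each
/// vocabulary row of the LM head becomes contiguous, enabling cache-friendly
/// dot products in the transposed CPU paths.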
fn transpose_weights(weights: &[f32], rows: usize, cols: usize) -> Vec<f32> {
let mut transposed = vec![0.0f32; rows * cols];
for i in 0..rows {
for j in 0..cols {
transposed[j * rows + i] = weights[i * cols + j];
}
}
transposed
}
#[must_use]
pub fn has_gpu(&self) -> bool {
self.scheduler.has_gpu()
}
pub fn forward_gpu(&mut self, token_ids: &[usize]) -> Result<Vec<f32>> {
if token_ids.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "Token IDs cannot be empty".to_string(),
});
}
let seq_len = token_ids.len();
let hidden_dim = self.config.hidden_dim;
let mut hidden = Vec::with_capacity(seq_len * hidden_dim);
for &token_id in token_ids {
if token_id >= self.config.vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!(
"Token ID {} out of bounds (vocab_size={})",
token_id, self.config.vocab_size
),
});
}
let offset = token_id * hidden_dim;
hidden.extend_from_slice(&self.embedding_weights[offset..offset + hidden_dim]);
}
for block_idx in 0..self.block_weights.len() {
hidden = self.forward_block_idx(&hidden, seq_len, block_idx)?;
}
hidden = self.layer_norm(&hidden, &self.final_norm_weight, &self.final_norm_bias);
let lm_head_elements = hidden_dim * self.config.vocab_size;
let logits = if exceeds_gpu_buffer_limit(lm_head_elements) {
cpu_matmul(
&hidden,
&self.lm_head_weight,
seq_len,
hidden_dim,
self.config.vocab_size,
)
} else {
let lm_weight = self.lm_head_weight.clone();
self.do_matmul(
&hidden,
&lm_weight,
seq_len,
hidden_dim,
self.config.vocab_size,
)?
};
let mut output = logits;
for i in 0..seq_len {
for j in 0..self.config.vocab_size {
output[i * self.config.vocab_size + j] += self.lm_head_bias[j];
}
}
Ok(output)
}
fn forward_block_idx(
&mut self,
input: &[f32],
seq_len: usize,
block_idx: usize,
) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let intermediate_dim = self.config.intermediate_dim;
let qkv_dim = self.config.qkv_dim();
let block = &self.block_weights[block_idx];
let attn_norm_weight = &block.attn_norm_weight;
let attn_norm_bias = &block.attn_norm_bias;
let normed = Self::layer_norm_static(
input,
attn_norm_weight,
attn_norm_bias,
hidden_dim,
self.config.eps,
);
let qkv_weight = self.block_weights[block_idx].qkv_weight.clone();
let qkv = self.do_matmul(&normed, &qkv_weight, seq_len, hidden_dim, qkv_dim)?;
let attn_out = self.optimized_gqa_attention(&qkv, seq_len)?;
let out_weight = self.block_weights[block_idx].out_weight.clone();
let out_bias = self.block_weights[block_idx].out_bias.clone();
let projected = self.do_matmul(&attn_out, &out_weight, seq_len, hidden_dim, hidden_dim)?;
let mut residual1: Vec<f32> = input
.iter()
.zip(projected.iter())
.enumerate()
.map(|(i, (&inp, &proj))| inp + proj + out_bias[i % hidden_dim])
.collect();
let ffn_norm_weight = self.block_weights[block_idx].ffn_norm_weight.clone();
let ffn_norm_bias = self.block_weights[block_idx].ffn_norm_bias.clone();
let ffn_normed = Self::layer_norm_static(
&residual1,
&ffn_norm_weight,
&ffn_norm_bias,
hidden_dim,
self.config.eps,
);
let ffn_fc1_weight = self.block_weights[block_idx].ffn_fc1_weight.clone();
let ffn_fc1_bias = self.block_weights[block_idx].ffn_fc1_bias.clone();
let fc1_out = self.do_matmul(
&ffn_normed,
&ffn_fc1_weight,
seq_len,
hidden_dim,
intermediate_dim,
)?;
let activated: Vec<f32> = fc1_out
.iter()
.enumerate()
.map(|(i, &x)| {
let x = x + ffn_fc1_bias[i % intermediate_dim];
0.5 * x
* (1.0
+ ((2.0f32 / std::f32::consts::PI).sqrt() * (x + 0.044_715 * x.powi(3)))
.tanh())
})
.collect();
let ffn_fc2_weight = self.block_weights[block_idx].ffn_fc2_weight.clone();
let ffn_fc2_bias = self.block_weights[block_idx].ffn_fc2_bias.clone();
let fc2_out = self.do_matmul(
&activated,
&ffn_fc2_weight,
seq_len,
intermediate_dim,
hidden_dim,
)?;
for (i, x) in residual1.iter_mut().enumerate() {
*x += fc2_out[i] + ffn_fc2_bias[i % hidden_dim];
}
Ok(residual1)
}
#[allow(dead_code)]
fn optimized_attention(&mut self, qkv: &[f32], seq_len: usize) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let num_heads = self.config.num_heads;
let head_dim = hidden_dim / num_heads;
let q = &qkv[..seq_len * hidden_dim];
let k = &qkv[seq_len * hidden_dim..seq_len * 2 * hidden_dim];
let v = &qkv[seq_len * 2 * hidden_dim..];
let scale = 1.0 / (head_dim as f32).sqrt();
let mut output = vec![0.0f32; seq_len * hidden_dim];
for head in 0..num_heads {
let mut q_head = Vec::with_capacity(seq_len * head_dim);
let mut k_head = Vec::with_capacity(seq_len * head_dim);
let mut v_head = Vec::with_capacity(seq_len * head_dim);
for i in 0..seq_len {
let start = i * hidden_dim + head * head_dim;
q_head.extend_from_slice(&q[start..start + head_dim]);
k_head.extend_from_slice(&k[start..start + head_dim]);
v_head.extend_from_slice(&v[start..start + head_dim]);
}
let mut attn_scores = vec![f32::NEG_INFINITY; seq_len * seq_len];
let scores = self
.scheduler
.matmul_transpose_b(&q_head, &k_head, seq_len, head_dim, seq_len)?;
for i in 0..seq_len {
for j in 0..=i {
attn_scores[i * seq_len + j] = scores[i * seq_len + j] * scale;
}
}
for i in 0..seq_len {
let row_start = i * seq_len;
let row = &mut attn_scores[row_start..row_start + seq_len];
let max_val = row[..=i].iter().copied().fold(f32::NEG_INFINITY, f32::max);
let mut sum = 0.0f32;
for item in row.iter_mut().take(i + 1) {
*item = (*item - max_val).exp();
sum += *item;
}
for item in row.iter_mut().take(i + 1) {
*item /= sum;
}
for item in row.iter_mut().skip(i + 1) {
*item = 0.0;
}
}
let head_output =
self.scheduler
.matmul(&attn_scores, &v_head, seq_len, seq_len, head_dim)?;
for i in 0..seq_len {
let out_start = i * hidden_dim + head * head_dim;
let head_start = i * head_dim;
output[out_start..out_start + head_dim]
.copy_from_slice(&head_output[head_start..head_start + head_dim]);
}
}
Ok(output)
}
fn optimized_gqa_attention(&mut self, qkv: &[f32], seq_len: usize) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let num_heads = self.config.num_heads;
let num_kv_heads = self.config.num_kv_heads;
let head_dim = self.config.head_dim();
let kv_dim = self.config.kv_dim();
let heads_per_kv = num_heads / num_kv_heads;
let q = &qkv[..seq_len * hidden_dim];
let k = &qkv[seq_len * hidden_dim..seq_len * hidden_dim + seq_len * kv_dim];
let v = &qkv[seq_len * hidden_dim + seq_len * kv_dim..];
let scale = 1.0 / (head_dim as f32).sqrt();
let mut output = vec![0.0f32; seq_len * hidden_dim];
for head in 0..num_heads {
let kv_head = head / heads_per_kv;
let mut q_head = Vec::with_capacity(seq_len * head_dim);
for i in 0..seq_len {
let start = i * hidden_dim + head * head_dim;
q_head.extend_from_slice(&q[start..start + head_dim]);
}
let mut k_head = Vec::with_capacity(seq_len * head_dim);
let mut v_head = Vec::with_capacity(seq_len * head_dim);
for i in 0..seq_len {
let start = i * kv_dim + kv_head * head_dim;
k_head.extend_from_slice(&k[start..start + head_dim]);
v_head.extend_from_slice(&v[start..start + head_dim]);
}
let mut attn_scores = vec![f32::NEG_INFINITY; seq_len * seq_len];
let scores = self
.scheduler
.matmul_transpose_b(&q_head, &k_head, seq_len, head_dim, seq_len)?;
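// Causal mask: only scores with j <= i are scaled in; future positions
// keep -inf and are zeroed after the row softmax below.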
for i in 0..seq_len {
for j in 0..=i {
attn_scores[i * seq_len + j] = scores[i * seq_len + j] * scale;
}
}
for i in 0..seq_len {
let row_start = i * seq_len;
let row = &mut attn_scores[row_start..row_start + seq_len];
let max_val = row[..=i].iter().copied().fold(f32::NEG_INFINITY, f32::max);
let mut sum = 0.0f32;
for item in row.iter_mut().take(i + 1) {
*item = (*item - max_val).exp();
sum += *item;
}
for item in row.iter_mut().take(i + 1) {
*item /= sum;
}
for item in row.iter_mut().skip(i + 1) {
*item = 0.0;
}
}
let head_output =
self.scheduler
.matmul(&attn_scores, &v_head, seq_len, seq_len, head_dim)?;
for i in 0..seq_len {
let out_start = i * hidden_dim + head * head_dim;
let head_start = i * head_dim;
output[out_start..out_start + head_dim]
.copy_from_slice(&head_output[head_start..head_start + head_dim]);
}
}
Ok(output)
}
#[allow(dead_code, clippy::unnecessary_wraps)]
fn simplified_attention(&self, qkv: &[f32], seq_len: usize) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let head_dim = hidden_dim / self.config.num_heads;
let q = &qkv[..seq_len * hidden_dim];
let k = &qkv[seq_len * hidden_dim..seq_len * 2 * hidden_dim];
let v = &qkv[seq_len * 2 * hidden_dim..];
let scale = 1.0 / (head_dim as f32).sqrt();
let mut output = vec![0.0f32; seq_len * hidden_dim];
for head in 0..self.config.num_heads {
for i in 0..seq_len {
let mut weights = Vec::with_capacity(seq_len);
let mut max_score = f32::NEG_INFINITY;
for j in 0..=i {
let mut score = 0.0f32;
for d in 0..head_dim {
let q_idx = i * hidden_dim + head * head_dim + d;
let k_idx = j * hidden_dim + head * head_dim + d;
score += q[q_idx] * k[k_idx];
}
score *= scale;
max_score = max_score.max(score);
weights.push(score);
}
let mut sum = 0.0f32;
for w in &mut weights {
*w = (*w - max_score).exp();
sum += *w;
}
for w in &mut weights {
*w /= sum;
}
for d in 0..head_dim {
let out_idx = i * hidden_dim + head * head_dim + d;
for (j, &w) in weights.iter().enumerate() {
let v_idx = j * hidden_dim + head * head_dim + d;
output[out_idx] += w * v[v_idx];
}
}
}
}
Ok(output)
}
#[allow(clippy::cast_precision_loss)]
fn layer_norm_static(
input: &[f32],
weight: &[f32],
bias: &[f32],
hidden_dim: usize,
eps: f32,
) -> Vec<f32> {
let num_rows = input.len() / hidden_dim;
let mut output = Vec::with_capacity(input.len());
for row in 0..num_rows {
let start = row * hidden_dim;
let row_data = &input[start..start + hidden_dim];
let mean: f32 = row_data.iter().sum::<f32>() / hidden_dim as f32;
let var: f32 =
row_data.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / hidden_dim as f32;
let std = (var + eps).sqrt();
for (i, &x) in row_data.iter().enumerate() {
let normalized = (x - mean) / std;
output.push(normalized * weight[i] + bias[i]);
}
}
output
}
fn layer_norm(&self, input: &[f32], weight: &[f32], bias: &[f32]) -> Vec<f32> {
Self::layer_norm_static(input, weight, bias, self.config.hidden_dim, self.config.eps)
}
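/// Greedy generation without a KV cache: after the initial full-prompt
/// forward, each step embeds only the most recent token (see
/// `forward_single_token`), trading context fidelity for speed. Prefer
/// `generate_with_cache` when cached causal decoding is required.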
pub fn generate_gpu(&mut self, prompt: &[usize], max_tokens: usize) -> Result<Vec<usize>> {
let mut tokens = prompt.to_vec();
let vocab_size = self.config.vocab_size;
let logits = self.forward_gpu(&tokens)?;
let last_pos_start = (tokens.len() - 1) * vocab_size;
let last_logits = &logits[last_pos_start..last_pos_start + vocab_size];
let next_token = Self::argmax(last_logits);
tokens.push(next_token);
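// Heuristic: large vocabularies take the fused argmax path so the full
// logits vector is never materialized per step.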
if vocab_size > 8192 {
for _ in 1..max_tokens {
let next_token = self.forward_single_token_greedy(&tokens)?;
tokens.push(next_token);
}
} else {
for _ in 1..max_tokens {
let logits = self.forward_single_token(&tokens)?;
let next_token = Self::argmax(&logits);
tokens.push(next_token);
}
}
Ok(tokens)
}
fn forward_single_token(&mut self, tokens: &[usize]) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let vocab_size = self.config.vocab_size;
let last_token = *tokens.last().ok_or_else(|| RealizarError::InvalidShape {
reason: "Token list empty".to_string(),
})?;
if last_token >= vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!("Token {} out of bounds", last_token),
});
}
let offset = last_token * hidden_dim;
let mut hidden: Vec<f32> = self.embedding_weights[offset..offset + hidden_dim].to_vec();
for block_idx in 0..self.block_weights.len() {
hidden = self.forward_block_single(&hidden, block_idx)?;
}
hidden = Self::layer_norm_static(
&hidden,
&self.final_norm_weight,
&self.final_norm_bias,
hidden_dim,
self.config.eps,
);
let lm_head_elements = hidden_dim * vocab_size;
let output = if exceeds_gpu_buffer_limit(lm_head_elements) {
cpu_matmul_transposed_simd(
&hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
vocab_size,
)
} else {
let logits =
self.scheduler
.matmul(&hidden, &self.lm_head_weight, 1, hidden_dim, vocab_size)?;
logits
.iter()
.zip(self.lm_head_bias.iter())
.map(|(&x, &b)| x + b)
.collect()
};
Ok(output)
}
fn forward_single_token_greedy(&mut self, tokens: &[usize]) -> Result<usize> {
let hidden_dim = self.config.hidden_dim;
let vocab_size = self.config.vocab_size;
let last_token = *tokens.last().ok_or_else(|| RealizarError::InvalidShape {
reason: "Token list empty".to_string(),
})?;
if last_token >= vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!("Token {} out of bounds", last_token),
});
}
let offset = last_token * hidden_dim;
let mut hidden: Vec<f32> = self.embedding_weights[offset..offset + hidden_dim].to_vec();
for block_idx in 0..self.block_weights.len() {
hidden = self.forward_block_single(&hidden, block_idx)?;
}
hidden = Self::layer_norm_static(
&hidden,
&self.final_norm_weight,
&self.final_norm_bias,
hidden_dim,
self.config.eps,
);
let lm_head_elements = hidden_dim * vocab_size;
if vocab_size > 8192 || exceeds_gpu_buffer_limit(lm_head_elements) {
Ok(Self::optimized_lm_head_argmax_transposed(
&hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
vocab_size,
))
} else {
let logits =
self.scheduler
.matmul(&hidden, &self.lm_head_weight, 1, hidden_dim, vocab_size)?;
let output: Vec<f32> = logits
.iter()
.zip(self.lm_head_bias.iter())
.map(|(&x, &b)| x + b)
.collect();
Ok(Self::argmax(&output))
}
}
#[allow(clippy::unnecessary_wraps)]
fn forward_block_single(&mut self, input: &[f32], block_idx: usize) -> Result<Vec<f32>> {
let hidden_dim = self.config.hidden_dim;
let intermediate_dim = self.config.intermediate_dim;
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let block = &self.block_weights[block_idx];
let normed = Self::layer_norm_static(
input,
&block.attn_norm_weight,
&block.attn_norm_bias,
hidden_dim,
self.config.eps,
);
let qkv_weight = &self.block_weights[block_idx].qkv_weight;
let qkv = cpu_matmul(&normed, qkv_weight, 1, hidden_dim, qkv_dim);
let v = &qkv[hidden_dim + kv_dim..];
let num_kv_heads = self.config.num_kv_heads;
let heads_per_kv = self.config.num_heads / num_kv_heads;
let head_dim = self.config.head_dim();
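// With a single position and no cached history, softmax over one attention
// score is exactly 1.0, so the attention output reduces to the
// (GQA-expanded) value vector itself.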
let attn_out: Vec<f32> = if heads_per_kv == 1 {
v.to_vec()
} else {
let mut expanded = Vec::with_capacity(hidden_dim);
for kv_h in 0..num_kv_heads {
let v_head = &v[kv_h * head_dim..(kv_h + 1) * head_dim];
for _ in 0..heads_per_kv {
expanded.extend_from_slice(v_head);
}
}
expanded
};
let out_weight = &self.block_weights[block_idx].out_weight;
let out_bias = &self.block_weights[block_idx].out_bias;
let projected = cpu_matmul(&attn_out, out_weight, 1, hidden_dim, hidden_dim);
let residual1: Vec<f32> = input
.iter()
.zip(projected.iter())
.enumerate()
.map(|(i, (&inp, &proj))| inp + proj + out_bias[i])
.collect();
let ffn_norm_weight = &self.block_weights[block_idx].ffn_norm_weight;
let ffn_norm_bias = &self.block_weights[block_idx].ffn_norm_bias;
let ffn_normed = Self::layer_norm_static(
&residual1,
ffn_norm_weight,
ffn_norm_bias,
hidden_dim,
self.config.eps,
);
let ffn_fc1_weight = &self.block_weights[block_idx].ffn_fc1_weight;
let ffn_fc1_bias = &self.block_weights[block_idx].ffn_fc1_bias;
let fc1_out = cpu_matmul(&ffn_normed, ffn_fc1_weight, 1, hidden_dim, intermediate_dim);
let activated: Vec<f32> = fc1_out
.iter()
.enumerate()
.map(|(i, &x)| {
let x = x + ffn_fc1_bias[i];
0.5 * x
* (1.0
+ ((2.0f32 / std::f32::consts::PI).sqrt() * (x + 0.044_715 * x.powi(3)))
.tanh())
})
.collect();
let ffn_fc2_weight = &self.block_weights[block_idx].ffn_fc2_weight;
let ffn_fc2_bias = &self.block_weights[block_idx].ffn_fc2_bias;
let fc2_out = cpu_matmul(&activated, ffn_fc2_weight, 1, intermediate_dim, hidden_dim);
let output: Vec<f32> = residual1
.iter()
.zip(fc2_out.iter())
.enumerate()
.map(|(i, (&r, &fc))| r + fc + ffn_fc2_bias[i])
.collect();
Ok(output)
}
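/// Argmax over logits. Small slices use a direct scan; larger ones are
/// scanned in 4096-element chunks whose local maxima are then reduced to
/// the global winner.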
#[allow(clippy::items_after_statements)]
fn argmax(logits: &[f32]) -> usize {
if logits.len() <= 1024 {
return logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.map_or(0, |(i, _)| i);
}
const CHUNK_SIZE: usize = 4096;
let chunk_maxes: Vec<(usize, f32)> = logits
.chunks(CHUNK_SIZE)
.enumerate()
.map(|(chunk_idx, chunk)| {
let (local_idx, &max_val) = chunk
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.unwrap();
(chunk_idx * CHUNK_SIZE + local_idx, max_val)
})
.collect();
chunk_maxes
.into_iter()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.map_or(0, |(idx, _)| idx)
}
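/// Fused LM-head + argmax over the transposed weight: rayon splits the
/// vocabulary into 4096-row chunks, each chunk computes dot products against
/// `hidden` and tracks its best logit, and a final reduce picks the global
/// winner without materializing the full logits vector.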
#[allow(clippy::many_single_char_names, clippy::items_after_statements)]
fn optimized_lm_head_argmax_transposed(
hidden: &[f32],
weight_t: &[f32],
bias: &[f32],
hidden_dim: usize,
vocab_size: usize,
) -> usize {
use rayon::prelude::*;
const CHUNK_SIZE: usize = 4096;
(0..vocab_size)
.into_par_iter()
.step_by(CHUNK_SIZE)
.map(|chunk_start| {
let chunk_end = (chunk_start + CHUNK_SIZE).min(vocab_size);
let mut best_local_idx = chunk_start;
let mut best_local_val = f32::NEG_INFINITY;
for j in chunk_start..chunk_end {
let row = &weight_t[j * hidden_dim..(j + 1) * hidden_dim];
let dot: f32 = row.iter().zip(hidden.iter()).map(|(&w, &h)| w * h).sum();
let logit = dot + bias[j];
if logit > best_local_val {
best_local_val = logit;
best_local_idx = j;
}
}
(best_local_idx, best_local_val)
})
.reduce(
|| (0, f32::NEG_INFINITY),
|a, b| if a.1 > b.1 { a } else { b },
)
.0
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorClassification {
Transient,
Fatal,
GpuFailure,
}
#[derive(Debug, Clone)]
pub enum RecoveryAction {
Retry {
delay: Duration,
},
FallbackToCpu,
Fail,
}
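/// Retry policy with exponential backoff. Errors are classified as
/// transient (retry with backoff), GPU failures (fall back to CPU), or
/// fatal (fail fast).
///
/// ```ignore
/// let strategy = ErrorRecoveryStrategy::new().with_max_retries(5);
/// let err = std::io::Error::new(std::io::ErrorKind::TimedOut, "timed out");
/// match strategy.determine_action(&err, 0) {
///     RecoveryAction::Retry { delay } => std::thread::sleep(delay),
///     RecoveryAction::FallbackToCpu => { /* reroute to the CPU scheduler */ }
///     RecoveryAction::Fail => { /* propagate the error */ }
/// }
/// ```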
pub struct ErrorRecoveryStrategy {
max_retries: u32,
base_delay: Duration,
max_delay: Duration,
jitter: f64,
}
impl ErrorRecoveryStrategy {
#[must_use]
pub fn new() -> Self {
Self {
max_retries: 3,
base_delay: Duration::from_millis(100),
max_delay: Duration::from_secs(10),
jitter: 0.1,
}
}
#[must_use]
pub fn with_max_retries(mut self, max_retries: u32) -> Self {
self.max_retries = max_retries;
self
}
#[must_use]
pub fn with_base_delay(mut self, base_delay: Duration) -> Self {
self.base_delay = base_delay;
self
}
#[must_use]
pub fn with_max_delay(mut self, max_delay: Duration) -> Self {
self.max_delay = max_delay;
self
}
#[must_use]
pub fn with_jitter(mut self, jitter: f64) -> Self {
self.jitter = jitter.clamp(0.0, 1.0);
self
}
#[must_use]
pub fn max_retries(&self) -> u32 {
self.max_retries
}
#[must_use]
pub fn classify_error(&self, error: &std::io::Error) -> ErrorClassification {
match error.kind() {
std::io::ErrorKind::TimedOut
| std::io::ErrorKind::ConnectionReset
| std::io::ErrorKind::ConnectionAborted
| std::io::ErrorKind::Interrupted
| std::io::ErrorKind::WouldBlock => ErrorClassification::Transient,
std::io::ErrorKind::Other => {
let msg = error.to_string().to_lowercase();
if msg.contains("gpu") || msg.contains("cuda") || msg.contains("wgpu") {
ErrorClassification::GpuFailure
} else {
ErrorClassification::Transient
}
},
_ => ErrorClassification::Fatal,
}
}
#[must_use]
pub fn determine_action(&self, error: &std::io::Error, attempt: u32) -> RecoveryAction {
if attempt >= self.max_retries {
return RecoveryAction::Fail;
}
match self.classify_error(error) {
ErrorClassification::Transient => RecoveryAction::Retry {
delay: self.calculate_delay(attempt),
},
ErrorClassification::GpuFailure => RecoveryAction::FallbackToCpu,
ErrorClassification::Fatal => RecoveryAction::Fail,
}
}
#[must_use]
pub fn determine_action_with_fallback(
&self,
error: &std::io::Error,
attempt: u32,
) -> RecoveryAction {
let msg = error.to_string().to_lowercase();
if msg.contains("gpu") || msg.contains("unavailable") {
RecoveryAction::FallbackToCpu
} else {
self.determine_action(error, attempt)
}
}
#[must_use]
pub fn calculate_delay(&self, attempt: u32) -> Duration {
let base_ms = self.base_delay.as_millis() as f64;
let exp_delay = base_ms * 2.0_f64.powi(attempt as i32);
let capped_delay = exp_delay.min(self.max_delay.as_millis() as f64);
let jitter_range = capped_delay * self.jitter;
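// Note: the "jitter" added below is deterministic (half the jitter range),
// which keeps retry timing reproducible at the cost of true randomness.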
let jittered = capped_delay + (jitter_range * 0.5);
Duration::from_millis(jittered as u64)
}
}
impl Default for ErrorRecoveryStrategy {
fn default() -> Self {
Self::new()
}
}
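// Illustrative sketch, not part of the public API: a retry loop driven by
// ErrorRecoveryStrategy. `run_io_operation` is a hypothetical stand-in for
// any fallible I/O call.
#[allow(dead_code)]
fn example_recovery_loop<F>(mut run_io_operation: F) -> std::result::Result<(), std::io::Error>
where
F: FnMut() -> std::result::Result<(), std::io::Error>,
{
let strategy = ErrorRecoveryStrategy::new().with_max_retries(5);
let mut attempt = 0;
loop {
match run_io_operation() {
Ok(()) => return Ok(()),
Err(e) => match strategy.determine_action(&e, attempt) {
RecoveryAction::Retry { delay } => {
std::thread::sleep(delay);
attempt += 1;
},
// A real caller would switch compute backends here; the sketch
// simply surfaces the error.
RecoveryAction::FallbackToCpu | RecoveryAction::Fail => return Err(e),
},
}
}
}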
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DegradationMode {
Normal,
CpuFallback,
MemoryPressure,
LowLatency,
HighThroughput,
}
#[derive(Debug, Clone, Copy)]
pub struct SystemLoad {
pub cpu_percent: f64,
pub memory_percent: f64,
pub queue_depth: u32,
}
pub struct DegradationManager {
gpu_available: bool,
memory_pressure: f64,
system_load: Option<SystemLoad>,
latency_priority: bool,
mode: DegradationMode,
}
impl DegradationManager {
#[must_use]
pub fn new() -> Self {
Self {
gpu_available: true,
memory_pressure: 0.0,
system_load: None,
latency_priority: false,
mode: DegradationMode::Normal,
}
}
#[must_use]
pub fn current_mode(&self) -> DegradationMode {
self.mode
}
pub fn set_gpu_available(&mut self, available: bool) {
self.gpu_available = available;
self.update_mode();
}
pub fn update_memory_pressure(&mut self, pressure: f64) {
self.memory_pressure = pressure.clamp(0.0, 1.0);
self.update_mode();
}
pub fn update_system_load(&mut self, load: SystemLoad) {
self.system_load = Some(load);
self.update_mode();
}
pub fn set_latency_priority(&mut self, priority: bool) {
self.latency_priority = priority;
self.update_mode();
}
#[must_use]
pub fn recommended_batch_size(&self, requested: usize) -> usize {
if self.memory_pressure > 0.8 {
(requested as f64 * (1.0 - self.memory_pressure)).max(1.0) as usize
} else {
requested
}
}
#[must_use]
pub fn recommended_max_context(&self, requested: usize) -> usize {
if let Some(load) = &self.system_load {
if load.cpu_percent > 90.0 || load.memory_percent > 80.0 || load.queue_depth > 50 {
(requested as f64 * 0.75).max(256.0) as usize
} else {
requested
}
} else {
requested
}
}
fn update_mode(&mut self) {
self.mode = if !self.gpu_available {
DegradationMode::CpuFallback
} else if self.latency_priority {
DegradationMode::LowLatency
} else if self.memory_pressure > 0.8 {
DegradationMode::MemoryPressure
} else if let Some(load) = &self.system_load {
if load.cpu_percent > 90.0 || load.memory_percent > 80.0 {
DegradationMode::MemoryPressure
} else {
DegradationMode::Normal
}
} else {
DegradationMode::Normal
};
}
}
impl Default for DegradationManager {
fn default() -> Self {
Self::new()
}
}
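// Illustrative sketch: how a serving loop might feed signals into
// DegradationManager and react to the mode it derives. The thresholds in the
// asserts mirror the ones hard-coded in update_mode.
#[allow(dead_code)]
fn example_degradation_flow() {
let mut manager = DegradationManager::new();
assert_eq!(manager.current_mode(), DegradationMode::Normal);
manager.update_memory_pressure(0.9);
assert_eq!(manager.current_mode(), DegradationMode::MemoryPressure);
// Batch sizes shrink proportionally once pressure exceeds 0.8.
assert!(manager.recommended_batch_size(32) < 32);
manager.set_gpu_available(false);
// GPU loss dominates every other signal.
assert_eq!(manager.current_mode(), DegradationMode::CpuFallback);
}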
#[derive(Debug, Clone)]
pub enum RequestOutcome {
Success,
Failed(String),
}
pub struct FailureIsolator {
active_requests: std::sync::atomic::AtomicU64,
success_count: std::sync::atomic::AtomicU64,
failure_count: std::sync::atomic::AtomicU64,
consecutive_failures: std::sync::atomic::AtomicU32,
circuit_open: std::sync::atomic::AtomicBool,
next_request_id: std::sync::atomic::AtomicU64,
failure_threshold: u32,
cleanups: std::sync::Mutex<HashMap<u64, Box<dyn FnOnce() + Send>>>,
}
impl FailureIsolator {
#[must_use]
pub fn new() -> Self {
Self {
active_requests: std::sync::atomic::AtomicU64::new(0),
success_count: std::sync::atomic::AtomicU64::new(0),
failure_count: std::sync::atomic::AtomicU64::new(0),
consecutive_failures: std::sync::atomic::AtomicU32::new(0),
circuit_open: std::sync::atomic::AtomicBool::new(false),
next_request_id: std::sync::atomic::AtomicU64::new(0),
failure_threshold: 5,
cleanups: std::sync::Mutex::new(HashMap::new()),
}
}
#[must_use]
pub fn active_requests(&self) -> u64 {
self.active_requests
.load(std::sync::atomic::Ordering::SeqCst)
}
#[must_use]
pub fn success_count(&self) -> u64 {
self.success_count.load(std::sync::atomic::Ordering::SeqCst)
}
#[must_use]
pub fn failure_count(&self) -> u64 {
self.failure_count.load(std::sync::atomic::Ordering::SeqCst)
}
#[must_use]
pub fn is_circuit_open(&self) -> bool {
self.circuit_open.load(std::sync::atomic::Ordering::SeqCst)
}
#[must_use]
pub fn start_request(&self) -> u64 {
self.active_requests
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
self.next_request_id
.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
}
pub fn try_start_request(&self) -> std::result::Result<u64, &'static str> {
if self.is_circuit_open() {
Err("Circuit breaker is open")
} else {
Ok(self.start_request())
}
}
pub fn register_cleanup<F: FnOnce() + Send + 'static>(&self, request_id: u64, cleanup: F) {
if let Ok(mut cleanups) = self.cleanups.lock() {
cleanups.insert(request_id, Box::new(cleanup));
}
}
pub fn complete_request(&self, request_id: u64, outcome: &RequestOutcome) {
self.active_requests
.fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
match outcome {
RequestOutcome::Success => {
self.success_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
self.consecutive_failures
.store(0, std::sync::atomic::Ordering::SeqCst);
// Drop (without running) any cleanup registered for this request so
// the cleanup map does not grow without bound on the success path.
if let Ok(mut cleanups) = self.cleanups.lock() {
cleanups.remove(&request_id);
}
},
RequestOutcome::Failed(_) => {
self.failure_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
let failures = self
.consecutive_failures
.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
+ 1;
if failures >= self.failure_threshold {
self.circuit_open
.store(true, std::sync::atomic::Ordering::SeqCst);
}
if let Ok(mut cleanups) = self.cleanups.lock() {
if let Some(cleanup) = cleanups.remove(&request_id) {
cleanup();
}
}
},
}
}
pub fn reset_circuit(&self) {
self.circuit_open
.store(false, std::sync::atomic::Ordering::SeqCst);
self.consecutive_failures
.store(0, std::sync::atomic::Ordering::SeqCst);
}
}
impl Default for FailureIsolator {
fn default() -> Self {
Self::new()
}
}
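// Illustrative sketch: the FailureIsolator request lifecycle. Five
// consecutive failures (the built-in threshold) open the circuit; new work
// is rejected until reset_circuit is called.
#[allow(dead_code)]
fn example_failure_isolation() {
let isolator = FailureIsolator::new();
for _ in 0..5 {
let id = isolator.start_request();
isolator.complete_request(id, &RequestOutcome::Failed("backend error".to_string()));
}
assert!(isolator.is_circuit_open());
assert!(isolator.try_start_request().is_err());
isolator.reset_circuit();
let id = isolator.try_start_request().expect("circuit closed again");
isolator.complete_request(id, &RequestOutcome::Success);
}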
#[allow(dead_code)]
pub struct IsolatedRequest {
id: u64,
}
#[derive(Debug, Clone)]
pub struct ConnectionConfig {
max_connections: usize,
min_connections: usize,
idle_timeout: Duration,
}
impl ConnectionConfig {
#[must_use]
pub fn new() -> Self {
Self {
max_connections: 10,
min_connections: 1,
idle_timeout: Duration::from_secs(300),
}
}
#[must_use]
pub fn with_max_connections(mut self, max: usize) -> Self {
self.max_connections = max;
self
}
#[must_use]
pub fn with_min_connections(mut self, min: usize) -> Self {
self.min_connections = min;
self
}
#[must_use]
pub fn with_idle_timeout(mut self, timeout: Duration) -> Self {
self.idle_timeout = timeout;
self
}
}
impl Default for ConnectionConfig {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ConnectionState {
Healthy,
Stale,
Broken,
}
#[derive(Debug)]
pub struct Connection {
#[allow(dead_code)]
id: u64,
created_at: std::time::Instant,
}
pub struct ConnectionPool {
config: ConnectionConfig,
active: std::sync::atomic::AtomicUsize,
idle: std::sync::Mutex<Vec<Connection>>,
next_id: std::sync::atomic::AtomicU64,
}
impl ConnectionPool {
#[must_use]
pub fn new(config: ConnectionConfig) -> Self {
Self {
config,
active: std::sync::atomic::AtomicUsize::new(0),
idle: std::sync::Mutex::new(Vec::new()),
next_id: std::sync::atomic::AtomicU64::new(0),
}
}
#[must_use]
pub fn max_connections(&self) -> usize {
self.config.max_connections
}
#[must_use]
pub fn min_connections(&self) -> usize {
self.config.min_connections
}
#[must_use]
pub fn active_connections(&self) -> usize {
self.active.load(std::sync::atomic::Ordering::SeqCst)
}
#[must_use]
pub fn idle_connections(&self) -> usize {
self.idle.lock().unwrap().len()
}
pub fn acquire(&self) -> std::result::Result<Connection, &'static str> {
{
let mut idle = self.idle.lock().unwrap();
if let Some(conn) = idle.pop() {
self.active
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
return Ok(conn);
}
}
loop {
let current = self.active.load(std::sync::atomic::Ordering::SeqCst);
if current >= self.config.max_connections {
return Err("Pool exhausted");
}
// compare_exchange closes a check-then-act race that could otherwise
// push the pool past max_connections under concurrent acquires.
let exchanged = self.active.compare_exchange(
current,
current + 1,
std::sync::atomic::Ordering::SeqCst,
std::sync::atomic::Ordering::SeqCst,
);
if exchanged.is_ok() {
break;
}
}
let id = self
.next_id
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
Ok(Connection {
id,
created_at: std::time::Instant::now(),
})
}
pub fn try_acquire(&self) -> std::result::Result<Connection, &'static str> {
self.acquire()
}
pub fn release(&self, conn: Connection) {
self.active
.fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
let mut idle = self.idle.lock().unwrap();
idle.push(conn);
}
#[must_use]
pub fn check_health(&self, conn: &Connection) -> ConnectionState {
let age = conn.created_at.elapsed();
if age > self.config.idle_timeout {
ConnectionState::Stale
} else {
ConnectionState::Healthy
}
}
pub fn warm(&self) {
let current_idle = self.idle_connections();
let need = self.config.min_connections.saturating_sub(current_idle);
let mut idle = self.idle.lock().unwrap();
for _ in 0..need {
let id = self
.next_id
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
idle.push(Connection {
id,
created_at: std::time::Instant::now(),
});
}
}
}
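// Illustrative sketch: pre-warming to min_connections, then one
// acquire/release cycle with a health check before reuse.
#[allow(dead_code)]
fn example_connection_pool() {
let pool = ConnectionPool::new(ConnectionConfig::new().with_min_connections(2));
pool.warm();
assert_eq!(pool.idle_connections(), 2);
let conn = pool.acquire().expect("warm pool has idle connections");
assert_eq!(pool.check_health(&conn), ConnectionState::Healthy);
pool.release(conn);
}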
#[derive(Debug, Clone)]
#[allow(clippy::struct_field_names)]
pub struct ResourceConfig {
max_memory_per_request: u64,
max_total_memory: u64,
max_compute_time: Duration,
max_queue_depth: usize,
}
impl ResourceConfig {
#[must_use]
pub fn new() -> Self {
Self {
max_memory_per_request: 512 * 1024 * 1024,
max_total_memory: 4 * 1024 * 1024 * 1024,
max_compute_time: Duration::from_secs(30),
max_queue_depth: 100,
}
}
#[must_use]
pub fn with_max_memory_per_request(mut self, bytes: u64) -> Self {
self.max_memory_per_request = bytes;
self
}
#[must_use]
pub fn with_max_total_memory(mut self, bytes: u64) -> Self {
self.max_total_memory = bytes;
self
}
#[must_use]
pub fn with_max_compute_time(mut self, time: Duration) -> Self {
self.max_compute_time = time;
self
}
#[must_use]
pub fn with_max_queue_depth(mut self, depth: usize) -> Self {
self.max_queue_depth = depth;
self
}
}
impl Default for ResourceConfig {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub enum LimitResult {
Allowed,
Denied {
reason: String,
},
Backpressure,
}
pub struct ResourceLimiter {
config: ResourceConfig,
current_memory: std::sync::atomic::AtomicU64,
queue_depth: std::sync::atomic::AtomicUsize,
}
impl ResourceLimiter {
#[must_use]
pub fn new(config: ResourceConfig) -> Self {
Self {
config,
current_memory: std::sync::atomic::AtomicU64::new(0),
queue_depth: std::sync::atomic::AtomicUsize::new(0),
}
}
#[must_use]
pub fn check_memory(&self, bytes: u64) -> LimitResult {
if bytes > self.config.max_memory_per_request {
return LimitResult::Denied {
reason: format!(
"Request {} bytes exceeds per-request limit {} bytes",
bytes, self.config.max_memory_per_request
),
};
}
let current = self
.current_memory
.load(std::sync::atomic::Ordering::SeqCst);
if current + bytes > self.config.max_total_memory {
return LimitResult::Denied {
reason: format!(
"Total memory {} + {} would exceed limit {}",
current, bytes, self.config.max_total_memory
),
};
}
LimitResult::Allowed
}
pub fn allocate(&self, bytes: u64) -> std::result::Result<(), &'static str> {
if let LimitResult::Denied { .. } = self.check_memory(bytes) {
return Err("Memory limit exceeded");
}
self.current_memory
.fetch_add(bytes, std::sync::atomic::Ordering::SeqCst);
Ok(())
}
pub fn deallocate(&self, bytes: u64) {
self.current_memory
.fetch_sub(bytes, std::sync::atomic::Ordering::SeqCst);
}
#[must_use]
pub fn current_memory(&self) -> u64 {
self.current_memory
.load(std::sync::atomic::Ordering::SeqCst)
}
pub fn enqueue(&self) -> LimitResult {
let current = self
.queue_depth
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
if current >= self.config.max_queue_depth {
self.queue_depth
.fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
LimitResult::Backpressure
} else {
LimitResult::Allowed
}
}
#[must_use]
pub fn try_enqueue(&self) -> LimitResult {
let current = self.queue_depth.load(std::sync::atomic::Ordering::SeqCst);
if current >= self.config.max_queue_depth {
LimitResult::Backpressure
} else {
self.enqueue()
}
}
pub fn dequeue(&self) {
// fetch_update makes the decrement atomic; the previous load-then-sub pair
// could underflow (and wrap) if two dequeues raced past the same check.
let _ = self.queue_depth.fetch_update(
std::sync::atomic::Ordering::SeqCst,
std::sync::atomic::Ordering::SeqCst,
|current| current.checked_sub(1),
);
}
#[must_use]
pub fn start_compute(&self) -> std::time::Instant {
std::time::Instant::now()
}
}
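// Illustrative sketch: admission control with ResourceLimiter. A request
// reserves memory up front, is queued subject to backpressure, and releases
// both on completion.
#[allow(dead_code)]
fn example_resource_limits() {
let limiter = ResourceLimiter::new(ResourceConfig::new().with_max_queue_depth(1));
assert!(matches!(limiter.check_memory(1024), LimitResult::Allowed));
limiter.allocate(1024).expect("within limits");
assert!(matches!(limiter.enqueue(), LimitResult::Allowed));
// The queue is full (depth 1), so the next enqueue sees backpressure.
assert!(matches!(limiter.try_enqueue(), LimitResult::Backpressure));
limiter.dequeue();
limiter.deallocate(1024);
assert_eq!(limiter.current_memory(), 0);
}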
#[derive(Debug, Clone)]
pub struct ResourceMetrics {
pub memory_bytes: u64,
pub gpu_utilization: f64,
pub queue_depth: usize,
pub last_latency_ms: u64,
}
#[derive(Debug, Clone)]
pub struct LatencyStats {
pub min_ms: u64,
pub max_ms: u64,
pub avg_ms: u64,
}
#[derive(Debug, Clone)]
pub struct ResourceSnapshot {
pub timestamp: u64,
pub memory_bytes: u64,
pub gpu_utilization: f64,
pub queue_depth: usize,
}
pub struct ResourceMonitor {
memory_bytes: std::sync::atomic::AtomicU64,
gpu_utilization: std::sync::Mutex<f64>,
queue_depth: std::sync::atomic::AtomicUsize,
latencies: std::sync::Mutex<Vec<u64>>,
last_latency_ms: std::sync::atomic::AtomicU64,
}
impl ResourceMonitor {
#[must_use]
pub fn new() -> Self {
Self {
memory_bytes: std::sync::atomic::AtomicU64::new(0),
gpu_utilization: std::sync::Mutex::new(0.0),
queue_depth: std::sync::atomic::AtomicUsize::new(0),
latencies: std::sync::Mutex::new(Vec::new()),
last_latency_ms: std::sync::atomic::AtomicU64::new(0),
}
}
pub fn record_memory_usage(&self, bytes: u64) {
self.memory_bytes
.store(bytes, std::sync::atomic::Ordering::SeqCst);
}
pub fn record_gpu_utilization(&self, utilization: f64) {
*self.gpu_utilization.lock().unwrap() = utilization;
}
pub fn record_queue_depth(&self, depth: usize) {
self.queue_depth
.store(depth, std::sync::atomic::Ordering::SeqCst);
}
pub fn record_latency(&self, duration: Duration) {
let ms = duration.as_millis() as u64;
self.last_latency_ms
.store(ms, std::sync::atomic::Ordering::SeqCst);
self.latencies.lock().unwrap().push(ms);
}
#[must_use]
pub fn current_metrics(&self) -> ResourceMetrics {
ResourceMetrics {
memory_bytes: self.memory_bytes.load(std::sync::atomic::Ordering::SeqCst),
gpu_utilization: *self.gpu_utilization.lock().unwrap(),
queue_depth: self.queue_depth.load(std::sync::atomic::Ordering::SeqCst),
last_latency_ms: self
.last_latency_ms
.load(std::sync::atomic::Ordering::SeqCst),
}
}
#[must_use]
pub fn latency_stats(&self) -> LatencyStats {
let latencies = self.latencies.lock().unwrap();
if latencies.is_empty() {
return LatencyStats {
min_ms: 0,
max_ms: 0,
avg_ms: 0,
};
}
let min_ms = *latencies.iter().min().unwrap_or(&0);
let max_ms = *latencies.iter().max().unwrap_or(&0);
let sum: u64 = latencies.iter().sum();
let avg_ms = sum / latencies.len() as u64;
LatencyStats {
min_ms,
max_ms,
avg_ms,
}
}
#[must_use]
pub fn snapshot(&self) -> ResourceSnapshot {
use std::time::{SystemTime, UNIX_EPOCH};
let timestamp = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
ResourceSnapshot {
timestamp,
memory_bytes: self.memory_bytes.load(std::sync::atomic::Ordering::SeqCst),
gpu_utilization: *self.gpu_utilization.lock().unwrap(),
queue_depth: self.queue_depth.load(std::sync::atomic::Ordering::SeqCst),
}
}
}
impl Default for ResourceMonitor {
fn default() -> Self {
Self::new()
}
}
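// Illustrative sketch: feeding the monitor and reading aggregates back.
// Latency statistics are computed over every recorded sample.
#[allow(dead_code)]
fn example_resource_monitor() {
let monitor = ResourceMonitor::new();
monitor.record_memory_usage(1_000_000);
monitor.record_latency(Duration::from_millis(10));
monitor.record_latency(Duration::from_millis(30));
let stats = monitor.latency_stats();
assert_eq!((stats.min_ms, stats.max_ms, stats.avg_ms), (10, 30, 20));
assert_eq!(monitor.current_metrics().memory_bytes, 1_000_000);
}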
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorCategory {
Transient,
Permanent,
}
#[derive(Debug, Clone)]
pub enum RetryDecision {
Retry {
delay: Duration,
},
Abort {
reason: String,
},
}
#[derive(Debug, Clone)]
pub struct RetryConfig {
max_retries: u32,
base_delay: Duration,
max_delay: Duration,
jitter_factor: f64,
}
impl RetryConfig {
#[must_use]
pub fn new() -> Self {
Self {
max_retries: 3,
base_delay: Duration::from_millis(100),
max_delay: Duration::from_secs(30),
jitter_factor: 0.1,
}
}
#[must_use]
pub fn with_max_retries(mut self, max: u32) -> Self {
self.max_retries = max;
self
}
#[must_use]
pub fn with_base_delay(mut self, delay: Duration) -> Self {
self.base_delay = delay;
self
}
#[must_use]
pub fn with_max_delay(mut self, delay: Duration) -> Self {
self.max_delay = delay;
self
}
#[must_use]
pub fn with_jitter_factor(mut self, factor: f64) -> Self {
self.jitter_factor = factor.clamp(0.0, 1.0);
self
}
}
impl Default for RetryConfig {
fn default() -> Self {
Self::new()
}
}
pub struct RetryPolicy {
config: RetryConfig,
}
impl RetryPolicy {
#[must_use]
pub fn new(config: RetryConfig) -> Self {
Self { config }
}
#[must_use]
pub fn max_retries(&self) -> u32 {
self.config.max_retries
}
#[must_use]
pub fn should_retry(&self, attempt: u32, category: ErrorCategory) -> RetryDecision {
if category == ErrorCategory::Permanent {
return RetryDecision::Abort {
reason: "Permanent error".to_string(),
};
}
if attempt > self.config.max_retries {
return RetryDecision::Abort {
reason: format!("Max retries ({}) exceeded", self.config.max_retries),
};
}
RetryDecision::Retry {
delay: self.calculate_delay(attempt),
}
}
#[must_use]
pub fn calculate_delay(&self, attempt: u32) -> Duration {
let exp_delay_ms = self.config.base_delay.as_millis() as u64 * (1u64 << attempt.min(20));
let delay_ms = exp_delay_ms.min(self.config.max_delay.as_millis() as u64);
Duration::from_millis(delay_ms)
}
}
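// Illustrative sketch: the backoff schedule produced by RetryPolicy with the
// default config (base 100ms, doubling per attempt, capped at 30s).
#[allow(dead_code)]
fn example_retry_schedule() {
let policy = RetryPolicy::new(RetryConfig::new());
assert_eq!(policy.calculate_delay(0), Duration::from_millis(100));
assert_eq!(policy.calculate_delay(1), Duration::from_millis(200));
assert_eq!(policy.calculate_delay(2), Duration::from_millis(400));
// Permanent errors abort immediately, whatever the attempt count.
assert!(matches!(
policy.should_retry(0, ErrorCategory::Permanent),
RetryDecision::Abort { .. }
));
}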
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CircuitState {
Closed,
Open,
HalfOpen,
}
#[derive(Debug, Clone)]
pub struct CircuitConfig {
failure_threshold: u32,
success_threshold: u32,
timeout: Duration,
}
impl CircuitConfig {
#[must_use]
pub fn new() -> Self {
Self {
failure_threshold: 5,
success_threshold: 2,
timeout: Duration::from_secs(30),
}
}
#[must_use]
pub fn with_failure_threshold(mut self, threshold: u32) -> Self {
self.failure_threshold = threshold;
self
}
#[must_use]
pub fn with_success_threshold(mut self, threshold: u32) -> Self {
self.success_threshold = threshold;
self
}
#[must_use]
pub fn with_timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
}
impl Default for CircuitConfig {
fn default() -> Self {
Self::new()
}
}
pub struct CircuitBreaker {
config: CircuitConfig,
state: std::sync::Mutex<CircuitState>,
failure_count: std::sync::atomic::AtomicU32,
success_count: std::sync::atomic::AtomicU32,
last_failure: std::sync::Mutex<Option<std::time::Instant>>,
}
impl CircuitBreaker {
#[must_use]
pub fn new(config: CircuitConfig) -> Self {
Self {
config,
state: std::sync::Mutex::new(CircuitState::Closed),
failure_count: std::sync::atomic::AtomicU32::new(0),
success_count: std::sync::atomic::AtomicU32::new(0),
last_failure: std::sync::Mutex::new(None),
}
}
#[must_use]
pub fn state(&self) -> CircuitState {
*self.state.lock().unwrap()
}
#[must_use]
pub fn allow_request(&self) -> bool {
let mut state = self.state.lock().unwrap();
match *state {
CircuitState::Closed | CircuitState::HalfOpen => true,
CircuitState::Open => {
let last_failure = self.last_failure.lock().unwrap();
if let Some(last) = *last_failure {
if last.elapsed() >= self.config.timeout {
*state = CircuitState::HalfOpen;
self.success_count
.store(0, std::sync::atomic::Ordering::SeqCst);
return true;
}
}
false
},
}
}
pub fn record_failure(&self) {
let count = self
.failure_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
+ 1;
*self.last_failure.lock().unwrap() = Some(std::time::Instant::now());
let mut state = self.state.lock().unwrap();
match *state {
CircuitState::Closed => {
if count >= self.config.failure_threshold {
*state = CircuitState::Open;
}
},
CircuitState::HalfOpen => {
*state = CircuitState::Open;
},
CircuitState::Open => {},
}
}
pub fn record_success(&self) {
self.failure_count
.store(0, std::sync::atomic::Ordering::SeqCst);
let count = self
.success_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
+ 1;
let mut state = self.state.lock().unwrap();
if *state == CircuitState::HalfOpen && count >= self.config.success_threshold {
*state = CircuitState::Closed;
}
}
}
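// Illustrative sketch: the full breaker lifecycle. A short timeout is used so
// the open -> half-open transition can be observed without a long sleep.
#[allow(dead_code)]
fn example_circuit_breaker() {
let config = CircuitConfig::new()
.with_failure_threshold(2)
.with_success_threshold(1)
.with_timeout(Duration::from_millis(1));
let breaker = CircuitBreaker::new(config);
breaker.record_failure();
breaker.record_failure();
assert_eq!(breaker.state(), CircuitState::Open);
std::thread::sleep(Duration::from_millis(2));
// allow_request notices the elapsed timeout and moves to half-open.
assert!(breaker.allow_request());
assert_eq!(breaker.state(), CircuitState::HalfOpen);
breaker.record_success();
assert_eq!(breaker.state(), CircuitState::Closed);
}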
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RequestType {
Inference,
Embedding,
Batch,
}
#[derive(Debug)]
pub struct BulkheadPermit {
request_type: RequestType,
}
pub struct BulkheadConfig {
pools: HashMap<String, usize>,
}
impl BulkheadConfig {
#[must_use]
pub fn new() -> Self {
Self {
pools: HashMap::new(),
}
}
#[must_use]
pub fn with_pool(mut self, name: &str, size: usize) -> Self {
self.pools.insert(name.to_string(), size);
self
}
}
impl Default for BulkheadConfig {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct BulkheadStats {
pub pool_count: usize,
pub total_capacity: usize,
}
pub struct BulkheadManager {
inference_available: std::sync::atomic::AtomicUsize,
inference_capacity: usize,
embedding_available: std::sync::atomic::AtomicUsize,
embedding_capacity: usize,
batch_available: std::sync::atomic::AtomicUsize,
batch_capacity: usize,
}
impl BulkheadManager {
#[must_use]
pub fn new(config: &BulkheadConfig) -> Self {
let inference_cap = *config.pools.get("inference").unwrap_or(&10);
let embedding_cap = *config.pools.get("embedding").unwrap_or(&5);
let batch_cap = *config.pools.get("batch").unwrap_or(&2);
Self {
inference_available: std::sync::atomic::AtomicUsize::new(inference_cap),
inference_capacity: inference_cap,
embedding_available: std::sync::atomic::AtomicUsize::new(embedding_cap),
embedding_capacity: embedding_cap,
batch_available: std::sync::atomic::AtomicUsize::new(batch_cap),
batch_capacity: batch_cap,
}
}
#[must_use]
pub fn available(&self, request_type: RequestType) -> usize {
match request_type {
RequestType::Inference => self
.inference_available
.load(std::sync::atomic::Ordering::SeqCst),
RequestType::Embedding => self
.embedding_available
.load(std::sync::atomic::Ordering::SeqCst),
RequestType::Batch => self
.batch_available
.load(std::sync::atomic::Ordering::SeqCst),
}
}
pub fn acquire(
&self,
request_type: RequestType,
) -> std::result::Result<BulkheadPermit, &'static str> {
let available = match request_type {
RequestType::Inference => &self.inference_available,
RequestType::Embedding => &self.embedding_available,
RequestType::Batch => &self.batch_available,
};
// checked_sub inside fetch_update refuses to go below zero, so two racing
// acquires cannot both claim the last permit and wrap the counter.
available
.fetch_update(
std::sync::atomic::Ordering::SeqCst,
std::sync::atomic::Ordering::SeqCst,
|current| current.checked_sub(1),
)
.map_err(|_| "Pool exhausted")?;
Ok(BulkheadPermit { request_type })
}
pub fn try_acquire(
&self,
request_type: RequestType,
) -> std::result::Result<BulkheadPermit, &'static str> {
self.acquire(request_type)
}
pub fn release(&self, permit: &BulkheadPermit) {
let available = match permit.request_type {
RequestType::Inference => &self.inference_available,
RequestType::Embedding => &self.embedding_available,
RequestType::Batch => &self.batch_available,
};
available.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
}
#[must_use]
pub fn stats(&self) -> BulkheadStats {
BulkheadStats {
pool_count: 3,
total_capacity: self.inference_capacity + self.embedding_capacity + self.batch_capacity,
}
}
}
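// Illustrative sketch: per-request-type isolation. Exhausting the batch pool
// (capacity 2 by default) does not touch inference capacity.
#[allow(dead_code)]
fn example_bulkheads() {
let manager = BulkheadManager::new(&BulkheadConfig::new());
let p1 = manager.acquire(RequestType::Batch).expect("capacity 2");
let p2 = manager.acquire(RequestType::Batch).expect("capacity 2");
assert!(manager.acquire(RequestType::Batch).is_err());
assert_eq!(manager.available(RequestType::Inference), 10);
manager.release(&p1);
manager.release(&p2);
assert_eq!(manager.available(RequestType::Batch), 2);
}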
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum LogLevel {
Trace,
Debug,
Info,
Warn,
Error,
}
impl LogLevel {
fn as_str(self) -> &'static str {
match self {
Self::Trace => "TRACE",
Self::Debug => "DEBUG",
Self::Info => "INFO",
Self::Warn => "WARN",
Self::Error => "ERROR",
}
}
}
#[derive(Debug, Clone)]
pub struct LogEntry {
level: LogLevel,
message: String,
timestamp: u64,
correlation_id: Option<String>,
fields: HashMap<String, String>,
}
impl LogEntry {
#[must_use]
pub fn new(level: LogLevel, message: &str) -> Self {
use std::time::{SystemTime, UNIX_EPOCH};
let timestamp = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64;
Self {
level,
message: message.to_string(),
timestamp,
correlation_id: None,
fields: HashMap::new(),
}
}
#[must_use]
pub fn with_correlation_id(mut self, id: &str) -> Self {
self.correlation_id = Some(id.to_string());
self
}
#[must_use]
pub fn with_field(mut self, key: &str, value: &str) -> Self {
self.fields.insert(key.to_string(), value.to_string());
self
}
#[must_use]
pub fn correlation_id(&self) -> Option<&str> {
self.correlation_id.as_deref()
}
#[must_use]
pub fn level(&self) -> LogLevel {
self.level
}
#[must_use]
pub fn timestamp(&self) -> u64 {
self.timestamp
}
#[must_use]
pub fn to_json(&self) -> String {
use std::fmt::Write;
let mut json = format!(
"{{\"level\":\"{}\",\"message\":\"{}\",\"timestamp\":{}",
self.level.as_str(),
// Escape backslashes and quotes so a message cannot break the JSON frame.
self.message.replace('\\', "\\\\").replace('"', "\\\""),
self.timestamp
);
if let Some(ref id) = self.correlation_id {
let _ = write!(json, ",\"correlation_id\":\"{}\"", id);
}
for (key, value) in &self.fields {
let _ = write!(json, ",\"{}\":\"{}\"", key, value);
}
json.push('}');
json
}
}
#[derive(Debug, Clone)]
pub struct LogConfig {
default_level: LogLevel,
json_format: bool,
module_levels: HashMap<String, LogLevel>,
}
impl LogConfig {
#[must_use]
pub fn new() -> Self {
Self {
default_level: LogLevel::Info,
json_format: false,
module_levels: HashMap::new(),
}
}
#[must_use]
pub fn with_level(mut self, level: LogLevel) -> Self {
self.default_level = level;
self
}
#[must_use]
pub fn with_json_format(mut self, enabled: bool) -> Self {
self.json_format = enabled;
self
}
#[must_use]
pub fn with_module_level(mut self, module: &str, level: LogLevel) -> Self {
self.module_levels.insert(module.to_string(), level);
self
}
}
impl Default for LogConfig {
fn default() -> Self {
Self::new()
}
}
pub struct Logger {
config: LogConfig,
}
impl Logger {
#[must_use]
pub fn new(config: LogConfig) -> Self {
Self { config }
}
#[must_use]
pub fn is_enabled(&self, level: LogLevel, module: &str) -> bool {
let min_level = self
.config
.module_levels
.get(module)
.copied()
.unwrap_or(self.config.default_level);
level >= min_level
}
}
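// Illustrative sketch: level gating plus structured entries. The module
// names used here ("gpu", "tokenizer") are hypothetical.
#[allow(dead_code)]
fn example_structured_logging() {
let logger = Logger::new(LogConfig::new().with_module_level("gpu", LogLevel::Debug));
assert!(logger.is_enabled(LogLevel::Debug, "gpu"));
assert!(!logger.is_enabled(LogLevel::Debug, "tokenizer"));
let entry = LogEntry::new(LogLevel::Info, "kernel dispatched")
.with_correlation_id("req-42")
.with_field("backend", "gpu");
assert!(entry.to_json().contains("\"correlation_id\":\"req-42\""));
}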
pub struct PhaseTimer {
phases: std::sync::Mutex<HashMap<String, (Option<std::time::Instant>, u64)>>,
}
impl PhaseTimer {
#[must_use]
pub fn new() -> Self {
Self {
phases: std::sync::Mutex::new(HashMap::new()),
}
}
pub fn start_phase(&self, name: &str) {
let mut phases = self.phases.lock().unwrap();
phases.insert(name.to_string(), (Some(std::time::Instant::now()), 0));
}
pub fn end_phase(&self, name: &str) {
let mut phases = self.phases.lock().unwrap();
if let Some((Some(start_time), _)) = phases.get(name) {
let elapsed = start_time.elapsed().as_micros() as u64;
phases.insert(name.to_string(), (None, elapsed));
}
}
#[must_use]
pub fn breakdown(&self) -> HashMap<String, u64> {
let phases = self.phases.lock().unwrap();
phases.iter().map(|(k, (_, v))| (k.clone(), *v)).collect()
}
}
impl Default for PhaseTimer {
fn default() -> Self {
Self::new()
}
}
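// Illustrative sketch: per-phase wall-clock attribution for a request. The
// phase name "tokenize" is hypothetical.
#[allow(dead_code)]
fn example_phase_timer() {
let timer = PhaseTimer::new();
timer.start_phase("tokenize");
// ... tokenization work would run here ...
timer.end_phase("tokenize");
let breakdown = timer.breakdown();
assert!(breakdown.contains_key("tokenize"));
}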
#[derive(Debug, Clone)]
pub struct MemoryReport {
pub peak_bytes: u64,
pub current_bytes: u64,
pub allocation_count: u64,
}
pub struct MemoryTracker {
current: std::sync::atomic::AtomicU64,
peak: std::sync::atomic::AtomicU64,
allocation_count: std::sync::atomic::AtomicU64,
}
impl MemoryTracker {
#[must_use]
pub fn new() -> Self {
Self {
current: std::sync::atomic::AtomicU64::new(0),
peak: std::sync::atomic::AtomicU64::new(0),
allocation_count: std::sync::atomic::AtomicU64::new(0),
}
}
pub fn record_allocation(&self, _name: &str, bytes: u64) {
let new_current = self
.current
.fetch_add(bytes, std::sync::atomic::Ordering::SeqCst)
+ bytes;
self.allocation_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
let mut peak = self.peak.load(std::sync::atomic::Ordering::SeqCst);
while new_current > peak {
match self.peak.compare_exchange_weak(
peak,
new_current,
std::sync::atomic::Ordering::SeqCst,
std::sync::atomic::Ordering::Relaxed,
) {
Ok(_) => break,
Err(current_peak) => peak = current_peak,
}
}
}
pub fn record_deallocation(&self, _name: &str, bytes: u64) {
self.current
.fetch_sub(bytes, std::sync::atomic::Ordering::SeqCst);
}
#[must_use]
pub fn report(&self) -> MemoryReport {
MemoryReport {
peak_bytes: self.peak.load(std::sync::atomic::Ordering::SeqCst),
current_bytes: self.current.load(std::sync::atomic::Ordering::SeqCst),
allocation_count: self
.allocation_count
.load(std::sync::atomic::Ordering::SeqCst),
}
}
}
impl Default for MemoryTracker {
fn default() -> Self {
Self::new()
}
}
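// Illustrative sketch: peak tracking survives deallocation, so the report
// captures the high-water mark of the run. The allocation names are
// hypothetical labels.
#[allow(dead_code)]
fn example_memory_tracking() {
let tracker = MemoryTracker::new();
tracker.record_allocation("weights", 1024);
tracker.record_allocation("kv_cache", 512);
tracker.record_deallocation("kv_cache", 512);
let report = tracker.report();
assert_eq!(report.current_bytes, 1024);
assert_eq!(report.peak_bytes, 1536);
assert_eq!(report.allocation_count, 2);
}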
#[derive(Debug, Clone)]
pub struct DiagnosticsSummary {
pub request_count: u64,
}
pub struct DiagnosticsCollector {
request_count: std::sync::atomic::AtomicU64,
#[allow(dead_code)]
timings: std::sync::Mutex<Vec<HashMap<String, u64>>>,
#[allow(dead_code)]
memory_snapshots: std::sync::Mutex<Vec<MemoryReport>>,
}
impl DiagnosticsCollector {
#[must_use]
pub fn new() -> Self {
Self {
request_count: std::sync::atomic::AtomicU64::new(0),
timings: std::sync::Mutex::new(Vec::new()),
memory_snapshots: std::sync::Mutex::new(Vec::new()),
}
}
pub fn record_request_timing(&self, _request_id: &str, timing: HashMap<String, u64>) {
self.request_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
self.timings.lock().unwrap().push(timing);
}
pub fn record_memory_snapshot(&self, report: MemoryReport) {
self.memory_snapshots.lock().unwrap().push(report);
}
#[must_use]
pub fn summary(&self) -> DiagnosticsSummary {
DiagnosticsSummary {
request_count: self.request_count.load(std::sync::atomic::Ordering::SeqCst),
}
}
}
impl Default for DiagnosticsCollector {
fn default() -> Self {
Self::new()
}
}
pub struct DebugMode {
enabled: std::sync::atomic::AtomicBool,
}
impl DebugMode {
#[must_use]
pub fn new() -> Self {
Self {
enabled: std::sync::atomic::AtomicBool::new(false),
}
}
#[must_use]
pub fn is_enabled(&self) -> bool {
self.enabled.load(std::sync::atomic::Ordering::SeqCst)
}
pub fn enable(&self) {
self.enabled
.store(true, std::sync::atomic::Ordering::SeqCst);
}
#[allow(dead_code)]
pub fn disable(&self) {
self.enabled
.store(false, std::sync::atomic::Ordering::SeqCst);
}
}
impl Default for DebugMode {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct RequestCapture {
input: String,
params: HashMap<String, String>,
}
impl RequestCapture {
#[must_use]
pub fn new() -> Self {
Self {
input: String::new(),
params: HashMap::new(),
}
}
#[must_use]
pub fn with_input(mut self, input: &str) -> Self {
self.input = input.to_string();
self
}
#[must_use]
pub fn with_params(mut self, key: &str, value: &str) -> Self {
self.params.insert(key.to_string(), value.to_string());
self
}
#[must_use]
pub fn input(&self) -> &str {
&self.input
}
#[must_use]
pub fn params(&self) -> &HashMap<String, String> {
&self.params
}
#[must_use]
pub fn to_json(&self) -> String {
let params_json: Vec<String> = self
.params
.iter()
.map(|(k, v)| format!("\"{}\":\"{}\"", k, v))
.collect();
format!(
"{{\"input\":\"{}\",\"params\":{{{}}}}}",
self.input,
params_json.join(",")
)
}
pub fn from_json(json: &str) -> std::result::Result<Self, &'static str> {
let input_start = json.find("\"input\":\"").ok_or("Missing input")?;
let input_rest = &json[input_start + 9..];
let input_end = input_rest.find('"').ok_or("Invalid input")?;
let input = &input_rest[..input_end];
Ok(Self {
input: input.to_string(),
params: HashMap::new(),
})
}
}
impl Default for RequestCapture {
fn default() -> Self {
Self::new()
}
}
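// Illustrative sketch: capturing a request for later replay. Note that
// from_json only recovers the input string; params are not round-tripped.
#[allow(dead_code)]
fn example_request_capture() {
let capture = RequestCapture::new()
.with_input("hello")
.with_params("temperature", "0.7");
let json = capture.to_json();
let restored = RequestCapture::from_json(&json).expect("valid capture JSON");
assert_eq!(restored.input(), "hello");
assert!(restored.params().is_empty());
}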
#[derive(Debug, Clone)]
pub struct StateDump {
error: String,
stack_trace: String,
state: HashMap<String, String>,
}
impl StateDump {
#[must_use]
pub fn new() -> Self {
Self {
error: String::new(),
stack_trace: String::new(),
state: HashMap::new(),
}
}
#[must_use]
pub fn with_error(mut self, error: &str) -> Self {
self.error = error.to_string();
self
}
#[must_use]
pub fn with_stack_trace(mut self, trace: &str) -> Self {
self.stack_trace = trace.to_string();
self
}
#[must_use]
pub fn with_state(mut self, key: &str, value: &str) -> Self {
self.state.insert(key.to_string(), value.to_string());
self
}
#[must_use]
pub fn error(&self) -> &str {
&self.error
}
#[must_use]
pub fn stack_trace(&self) -> &str {
&self.stack_trace
}
#[must_use]
pub fn state(&self) -> &HashMap<String, String> {
&self.state
}
#[must_use]
pub fn to_json(&self) -> String {
let state_json: Vec<String> = self
.state
.iter()
.map(|(k, v)| format!("\"{}\":\"{}\"", k, v))
.collect();
format!(
"{{\"error\":\"{}\",\"stack_trace\":\"{}\",\"state\":{{{}}}}}",
self.error,
self.stack_trace.replace('\n', "\\n"),
state_json.join(",")
)
}
}
impl Default for StateDump {
fn default() -> Self {
Self::new()
}
}
pub struct GgufModelState {
model: Option<GpuModel>,
model_name: Option<String>,
ready: bool,
}
impl std::fmt::Debug for GgufModelState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("GgufModelState")
.field("model_name", &self.model_name)
.field("ready", &self.ready)
.field("is_loaded", &self.model.is_some())
.finish()
}
}
impl GgufModelState {
#[must_use]
pub fn new() -> Self {
Self {
model: None,
model_name: None,
ready: false,
}
}
#[must_use]
pub fn with_model(model: GpuModel, name: String) -> Self {
Self {
model: Some(model),
model_name: Some(name),
ready: true,
}
}
#[must_use]
pub fn is_loaded(&self) -> bool {
self.model.is_some()
}
#[must_use]
pub fn is_ready(&self) -> bool {
self.ready && self.model.is_some()
}
#[must_use]
pub fn model_name(&self) -> Option<&str> {
self.model_name.as_deref()
}
#[must_use]
pub fn vocab_size(&self) -> usize {
self.model.as_ref().map_or(0, |m| m.config().vocab_size)
}
#[must_use]
pub fn model(&self) -> Option<&GpuModel> {
self.model.as_ref()
}
pub fn model_mut(&mut self) -> Option<&mut GpuModel> {
self.model.as_mut()
}
}
impl Default for GgufModelState {
fn default() -> Self {
Self::new()
}
}
pub fn load_gguf_to_gpu(
vocab_size: usize,
hidden_dim: usize,
num_layers: usize,
) -> Result<GgufModelState> {
let num_heads = hidden_dim / 64;
let config = GpuModelConfig {
vocab_size,
hidden_dim,
num_heads,
num_kv_heads: num_heads,
num_layers,
intermediate_dim: hidden_dim * 4,
eps: 1e-5,
};
let model = GpuModel::new(config)?;
let model_name = format!("test_{}x{}x{}", vocab_size, hidden_dim, num_layers);
Ok(GgufModelState::with_model(model, model_name))
}
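// Illustrative sketch: standing up model state for a smoke test. The
// dimensions are arbitrary; load_gguf_to_gpu derives the head count from
// hidden_dim / 64.
#[allow(dead_code)]
fn example_model_state() -> Result<()> {
let state = load_gguf_to_gpu(1000, 256, 2)?;
assert!(state.is_ready());
assert_eq!(state.vocab_size(), 1000);
assert_eq!(state.model_name(), Some("test_1000x256x2"));
Ok(())
}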
#[cfg(all(test, feature = "heavy-tests"))]
mod tests {
use super::*;
use serial_test::serial;
#[test]
fn test_gpu_compute_auto_creation() {
let compute = GpuCompute::auto();
assert!(compute.is_ok(), "Auto creation should succeed");
let compute = compute.unwrap();
assert!(
compute.backend() == ComputeBackend::Gpu || compute.backend() == ComputeBackend::Cpu
);
}
#[test]
fn test_gpu_compute_cpu_backend() {
let compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
assert!(!compute.is_gpu());
assert_eq!(compute.backend(), ComputeBackend::Cpu);
}
#[test]
fn test_gpu_compute_matmul_cpu_fallback() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let c = compute.matmul(&a, &b, 2, 2, 2).unwrap();
assert_eq!(c.len(), 4);
assert!((c[0] - 19.0).abs() < 1e-5);
assert!((c[1] - 22.0).abs() < 1e-5);
assert!((c[2] - 43.0).abs() < 1e-5);
assert!((c[3] - 50.0).abs() < 1e-5);
}
#[test]
fn test_gpu_compute_matmul_non_square() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let b = vec![7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
let c = compute.matmul(&a, &b, 2, 3, 2).unwrap();
assert_eq!(c.len(), 4);
assert!((c[0] - 58.0).abs() < 1e-5);
assert!((c[1] - 64.0).abs() < 1e-5);
assert!((c[2] - 139.0).abs() < 1e-5);
assert!((c[3] - 154.0).abs() < 1e-5);
}
#[test]
fn test_gpu_compute_matmul_dimension_error() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = vec![1.0, 2.0, 3.0];
let b = vec![1.0, 2.0, 3.0, 4.0];
let result = compute.matmul(&a, &b, 2, 2, 2);
assert!(result.is_err());
}
#[test]
fn test_gpu_compute_matmul_tensor() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = Tensor::from_vec(vec![2, 3], vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
let b = Tensor::from_vec(vec![3, 2], vec![7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).unwrap();
let c = compute.matmul_tensor(&a, &b).unwrap();
assert_eq!(c.shape(), &[2, 2]);
assert!((c.data()[0] - 58.0).abs() < 1e-5);
assert!((c.data()[3] - 154.0).abs() < 1e-5);
}
#[test]
fn test_gpu_compute_matmul_tensor_dimension_mismatch() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = Tensor::from_vec(vec![2, 3], vec![1.0; 6]).unwrap();
let b = Tensor::from_vec(vec![2, 2], vec![1.0; 4]).unwrap();
let result = compute.matmul_tensor(&a, &b);
assert!(result.is_err());
}
#[test]
fn test_gpu_compute_dot_cpu_fallback() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0, 6.0];
let result = compute.dot(&a, &b).unwrap();
assert!((result - 32.0).abs() < 1e-5);
}
#[test]
fn test_gpu_compute_dot_length_mismatch() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0];
let result = compute.dot(&a, &b);
assert!(result.is_err());
}
#[test]
fn test_gpu_compute_relu_cpu_fallback() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let input = vec![-1.0, 0.0, 1.0, -0.5, 2.0];
let output = compute.relu(&input).unwrap();
assert_eq!(output, vec![0.0, 0.0, 1.0, 0.0, 2.0]);
}
#[test]
fn test_gpu_compute_sigmoid_cpu_fallback() {
let mut compute = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let input = vec![0.0];
let output = compute.sigmoid(&input).unwrap();
assert!((output[0] - 0.5).abs() < 1e-5);
}
#[test]
fn test_hybrid_scheduler_creation() {
let scheduler = HybridScheduler::new();
assert!(scheduler.is_ok());
}
#[test]
fn test_hybrid_scheduler_threshold() {
let scheduler = HybridScheduler::with_threshold(1000).unwrap();
assert_eq!(scheduler.gpu_threshold(), 1000);
}
#[test]
fn test_hybrid_scheduler_should_use_gpu() {
let scheduler = HybridScheduler::with_threshold(1000).unwrap();
assert!(!scheduler.should_use_gpu(9, 9, 9) || !scheduler.has_gpu());
if scheduler.has_gpu() {
assert!(scheduler.should_use_gpu(10, 10, 10));
assert!(scheduler.should_use_gpu(100, 100, 100));
}
}
#[test]
fn test_hybrid_scheduler_matmul() {
let mut scheduler = HybridScheduler::with_threshold(1000).unwrap();
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let c = scheduler.matmul(&a, &b, 2, 2, 2).unwrap();
assert_eq!(c.len(), 4);
assert!((c[0] - 19.0).abs() < 1e-5);
}
#[test]
#[serial]
fn test_gpu_backend_matmul() {
let compute = GpuCompute::new(ComputeBackend::Gpu);
if compute.is_err() {
eprintln!("GPU not available, skipping test");
return;
}
let mut compute = compute.unwrap();
assert!(compute.is_gpu());
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let c = compute.matmul(&a, &b, 2, 2, 2).unwrap();
assert!((c[0] - 19.0).abs() < 1e-4);
assert!((c[1] - 22.0).abs() < 1e-4);
assert!((c[2] - 43.0).abs() < 1e-4);
assert!((c[3] - 50.0).abs() < 1e-4);
}
#[test]
#[serial]
fn test_gpu_backend_large_matmul_speedup() {
use std::time::Instant;
let compute = GpuCompute::new(ComputeBackend::Gpu);
if compute.is_err() {
eprintln!("GPU not available, skipping test");
return;
}
let mut gpu = compute.unwrap();
let mut cpu = GpuCompute::new(ComputeBackend::Cpu).unwrap();
let (rows, inner_dim, cols) = (256usize, 256usize, 256usize);
let matrix_a: Vec<f32> = (0..rows * inner_dim)
.map(|i| (i % 17) as f32 * 0.1)
.collect();
let matrix_b: Vec<f32> = (0..inner_dim * cols)
.map(|i| (i % 19) as f32 * 0.1)
.collect();
let _ = gpu.matmul(&matrix_a, &matrix_b, rows, inner_dim, cols);
let _ = cpu.matmul(&matrix_a, &matrix_b, rows, inner_dim, cols);
let iterations = 10;
let start = Instant::now();
for _ in 0..iterations {
let _ = gpu.matmul(&matrix_a, &matrix_b, rows, inner_dim, cols);
}
let gpu_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = cpu.matmul(&matrix_a, &matrix_b, rows, inner_dim, cols);
}
let cpu_time = start.elapsed();
let speedup = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
eprintln!(
"GPU matmul speedup: {:.1}x (GPU: {:.2}ms, CPU: {:.2}ms)",
speedup,
gpu_time.as_millis() as f64 / iterations as f64,
cpu_time.as_millis() as f64 / iterations as f64
);
assert!(speedup >= 1.0, "GPU should not be slower than CPU");
}
#[test]
#[serial]
#[ignore]
fn test_phase4_acceptance_gpu_throughput() {
use std::time::Instant;
let mut compute = GpuCompute::auto().unwrap();
let has_gpu = compute.is_gpu();
let hidden = 256;
let intermediate = 512;
let num_layers = 4;
let tokens = 100;
let w1: Vec<f32> = (0..hidden * intermediate)
.map(|i| (i % 13) as f32 * 0.01)
.collect();
let w2: Vec<f32> = (0..intermediate * hidden)
.map(|i| (i % 17) as f32 * 0.01)
.collect();
let input: Vec<f32> = vec![0.5; hidden];
let _ = compute.matmul(&input, &w1, 1, hidden, intermediate);
let start = Instant::now();
for _token in 0..tokens {
for _layer in 0..num_layers {
let h1 = compute
.matmul(&input, &w1, 1, hidden, intermediate)
.unwrap();
let _ = compute.matmul(&h1, &w2, 1, intermediate, hidden).unwrap();
}
}
let elapsed = start.elapsed();
let tok_per_sec = tokens as f64 / elapsed.as_secs_f64();
let (target, backend_name) = if has_gpu {
(25.0, "GPU (wgpu)")
} else {
(25.0, "CPU")
};
eprintln!(
"Phase 4 throughput [{backend_name}]: {tok_per_sec:.1} tok/s (target: ≥{target} tok/s)",
);
assert!(
tok_per_sec >= target,
"Phase 4 acceptance FAILED [{backend_name}]: {:.1} tok/s < {target} tok/s",
tok_per_sec
);
}
#[test]
fn test_buffer_pool_creation() {
let pool = GpuBufferPool::new();
let stats = pool.stats();
assert_eq!(stats.cached_buffers, 0);
assert_eq!(stats.cached_bytes, 0);
}
#[test]
fn test_buffer_pool_acquire_release() {
let mut pool = GpuBufferPool::new();
let buf = pool.acquire(1000);
assert_eq!(buf.len(), 1000);
pool.release(buf);
let stats = pool.stats();
assert_eq!(stats.cached_buffers, 1);
}
#[test]
fn test_buffer_pool_reuse() {
let mut pool = GpuBufferPool::new();
let buf1 = pool.acquire(1000);
let _buf1_ptr = buf1.as_ptr();
pool.release(buf1);
let buf2 = pool.acquire(1000);
let stats = pool.stats();
assert!(buf2.len() == 1000);
drop(buf2);
assert!(stats.cached_buffers <= 1);
}
#[test]
fn test_buffer_pool_clear() {
let mut pool = GpuBufferPool::new();
let buf1 = pool.acquire(1000);
let buf2 = pool.acquire(2000);
pool.release(buf1);
pool.release(buf2);
pool.clear();
let stats = pool.stats();
assert_eq!(stats.cached_buffers, 0);
}
#[test]
fn test_buffer_pool_bucket_sizing() {
let mut pool = GpuBufferPool::new();
let buf = pool.acquire(100);
assert!(buf.len() == 100);
pool.release(buf);
let stats = pool.stats();
assert!(stats.cached_bytes >= 100 * 4);
}
#[test]
fn test_async_result_ready() {
let result = AsyncGpuResult::ready(vec![1.0, 2.0, 3.0]);
assert!(result.is_ready());
assert!(result.try_get().is_some());
assert_eq!(result.wait(), vec![1.0, 2.0, 3.0]);
}
#[test]
fn test_async_result_pending() {
let mut result = AsyncGpuResult::pending();
assert!(!result.is_ready());
assert!(result.try_get().is_none());
result.set_result(vec![4.0, 5.0, 6.0]);
assert!(result.is_ready());
assert_eq!(result.wait(), vec![4.0, 5.0, 6.0]);
}
#[test]
fn test_hybrid_scheduler_pooled_matmul() {
let mut scheduler = HybridScheduler::with_threshold(1000).unwrap();
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let c = scheduler.matmul_pooled(&a, &b, 2, 2, 2).unwrap();
assert_eq!(c.len(), 4);
assert!((c[0] - 19.0).abs() < 1e-5);
scheduler.release_buffer(c);
let stats = scheduler.pool_stats();
assert_eq!(stats.cached_buffers, 1);
}
#[test]
fn test_hybrid_scheduler_async_matmul() {
let mut scheduler = HybridScheduler::with_threshold(1000).unwrap();
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let result = scheduler.matmul_async(&a, &b, 2, 2, 2).unwrap();
assert!(result.is_ready());
let c = result.wait();
assert!((c[0] - 19.0).abs() < 1e-5);
}
#[test]
fn test_hybrid_scheduler_batch_matmul() {
let mut scheduler = HybridScheduler::with_threshold(1000).unwrap();
let ops = vec![
(vec![1.0, 2.0, 3.0, 4.0], vec![5.0, 6.0, 7.0, 8.0], 2, 2, 2),
(vec![1.0, 0.0, 0.0, 1.0], vec![2.0, 3.0, 4.0, 5.0], 2, 2, 2),
];
let results = scheduler.matmul_batch(&ops).unwrap();
assert_eq!(results.len(), 2);
assert!((results[0][0] - 19.0).abs() < 1e-5);
assert!((results[1][0] - 2.0).abs() < 1e-5);
}
#[test]
fn test_hybrid_scheduler_pool_stats() {
let mut scheduler = HybridScheduler::with_threshold(1000).unwrap();
let stats = scheduler.pool_stats();
assert_eq!(stats.cached_buffers, 0);
for _ in 0..3 {
let c = scheduler
.matmul_pooled(&[1.0; 4], &[1.0; 4], 2, 2, 2)
.unwrap();
scheduler.release_buffer(c);
}
let stats = scheduler.pool_stats();
assert!(stats.cached_buffers >= 1);
}
#[test]
fn test_streaming_kv_cache_creation() {
let cache = StreamingKVCache::new(4, 2048, 8, 64);
assert_eq!(cache.len(), 0);
assert!(cache.is_empty());
assert_eq!(cache.max_positions(), 2048);
let expected_bytes = 4 * 2048 * 8 * 64 * 2 * 4;
assert_eq!(cache.memory_bytes(), expected_bytes);
}
#[test]
fn test_streaming_kv_cache_append() {
let mut cache = StreamingKVCache::new(2, 100, 4, 32);
let kv_dim = 4 * 32;
let key = vec![1.0f32; kv_dim];
let value = vec![2.0f32; kv_dim];
cache.append(0, &key, &value);
assert_eq!(cache.len(), 0);
cache.append(1, &key, &value);
assert_eq!(cache.len(), 1);
assert!(!cache.is_empty());
}
#[test]
fn test_streaming_kv_cache_get_range() {
let mut cache = StreamingKVCache::new(2, 100, 4, 32);
let kv_dim = 4 * 32;
for pos in 0..3 {
let key = vec![(pos + 1) as f32; kv_dim];
let value = vec![(pos + 10) as f32; kv_dim];
for layer in 0..2 {
cache.append(layer, &key, &value);
}
}
assert_eq!(cache.len(), 3);
let (keys, values) = cache.get_range(0, 0, 2);
assert_eq!(keys.len(), 2 * kv_dim);
assert_eq!(values.len(), 2 * kv_dim);
assert!((keys[0] - 1.0).abs() < 1e-5);
assert!((keys[kv_dim] - 2.0).abs() < 1e-5);
}
#[test]
fn test_streaming_kv_cache_get_valid() {
let mut cache = StreamingKVCache::new(2, 100, 4, 32);
let kv_dim = 4 * 32;
for pos in 0..5 {
let key = vec![(pos + 1) as f32; kv_dim];
let value = vec![(pos + 10) as f32; kv_dim];
for layer in 0..2 {
cache.append(layer, &key, &value);
}
}
let (keys, values) = cache.get_valid(0);
assert_eq!(keys.len(), 5 * kv_dim);
assert_eq!(values.len(), 5 * kv_dim);
}
#[test]
fn test_streaming_kv_cache_circular_buffer() {
let mut cache = StreamingKVCache::new(1, 3, 2, 4);
let kv_dim = 2 * 4;
for pos in 0..3 {
let key = vec![(pos + 1) as f32; kv_dim];
let value = vec![(pos + 10) as f32; kv_dim];
cache.append(0, &key, &value);
}
assert_eq!(cache.len(), 3);
let key = vec![100.0f32; kv_dim];
let value = vec![200.0f32; kv_dim];
cache.append(0, &key, &value);
assert_eq!(cache.len(), 3);
let (keys, _) = cache.get_range(0, 0, 1);
assert!((keys[0] - 100.0).abs() < 1e-5);
}
#[test]
fn test_streaming_kv_cache_clear() {
let mut cache = StreamingKVCache::new(2, 100, 4, 32);
let kv_dim = 4 * 32;
for _ in 0..5 {
let key = vec![1.0f32; kv_dim];
let value = vec![2.0f32; kv_dim];
for layer in 0..2 {
cache.append(layer, &key, &value);
}
}
assert_eq!(cache.len(), 5);
cache.clear();
assert_eq!(cache.len(), 0);
assert!(cache.is_empty());
}
#[test]
fn test_streaming_kv_cache_memory_calculation() {
let cache = StreamingKVCache::new(32, 2048, 32, 128);
let expected_bytes = 32 * 2048 * 32 * 128 * 2 * 4;
assert_eq!(cache.memory_bytes(), expected_bytes);
let memory_mb = cache.memory_mb();
assert!((memory_mb - 2048.0).abs() < 1.0);
}
#[test]
fn test_streaming_kv_cache_memory_bound() {
let mut cache = StreamingKVCache::new(1, 10, 2, 4);
let kv_dim = 2 * 4;
let initial_bytes = cache.memory_bytes();
for pos in 0..100 {
let key = vec![pos as f32; kv_dim];
let value = vec![pos as f32; kv_dim];
cache.append(0, &key, &value);
}
assert_eq!(cache.memory_bytes(), initial_bytes);
assert_eq!(cache.len(), 10);
}
#[test]
#[should_panic(expected = "Layer index out of bounds")]
fn test_streaming_kv_cache_layer_bounds() {
let mut cache = StreamingKVCache::new(2, 100, 4, 32);
let kv_dim = 4 * 32;
let key = vec![1.0f32; kv_dim];
let value = vec![2.0f32; kv_dim];
cache.append(2, &key, &value);
}
#[test]
#[should_panic(expected = "Key dimension mismatch")]
fn test_streaming_kv_cache_dimension_mismatch() {
let mut cache = StreamingKVCache::new(2, 100, 4, 32);
let key = vec![1.0f32; 10];
let value = vec![2.0f32; 4 * 32];
cache.append(0, &key, &value);
}
#[test]
fn test_streaming_kv_cache_8192_positions() {
let num_layers = 4;
let max_positions = 8192;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
assert_eq!(cache.max_positions(), 8192);
assert_eq!(cache.len(), 0);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
for _pos in 0..8192 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
assert_eq!(cache.len(), max_positions);
}
#[test]
fn test_ultra_long_context_memory_bound() {
let num_layers = 32;
let max_positions = 8192;
let num_heads = 32;
let head_dim = 128;
let cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let expected_bytes = num_layers * max_positions * num_heads * head_dim * 2 * 4;
assert_eq!(cache.memory_bytes(), expected_bytes);
let memory_gb = cache.memory_mb() / 1024.0;
assert!(
memory_gb < 9.0,
"8192 context KV cache should be < 9 GB, got {:.2} GB",
memory_gb
);
}
#[test]
fn test_ultra_long_context_fill_performance() {
use std::time::Instant;
let num_layers = 4;
let max_positions = 8192;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
let start = Instant::now();
for _pos in 0..8192 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
let elapsed = start.elapsed();
let fill_rate = 8192.0 / elapsed.as_secs_f64();
assert!(
fill_rate > 100.0,
"Fill rate should be > 100 pos/s, got {:.0}",
fill_rate
);
}
#[test]
fn test_streaming_kv_cache_16384_positions() {
let num_layers = 4;
let max_positions = 16384;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
assert_eq!(cache.max_positions(), 16384);
assert_eq!(cache.len(), 0);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
for _pos in 0..16384 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
assert_eq!(cache.len(), max_positions);
}
#[test]
fn test_super_long_context_memory_bound() {
let num_layers = 32;
let max_positions = 16384;
let num_heads = 32;
let head_dim = 128;
let cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let expected_bytes = num_layers * max_positions * num_heads * head_dim * 2 * 4;
assert_eq!(cache.memory_bytes(), expected_bytes);
let memory_gb = cache.memory_mb() / 1024.0;
assert!(
memory_gb < 18.0,
"16384 context KV cache should be < 18 GB, got {:.2} GB",
memory_gb
);
}
#[test]
fn test_super_long_context_fill_performance() {
use std::time::Instant;
let num_layers = 4;
let max_positions = 16384;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
let start = Instant::now();
for _pos in 0..16384 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
let elapsed = start.elapsed();
let fill_rate = 16384.0 / elapsed.as_secs_f64();
assert!(
fill_rate > 50.0,
"Fill rate should be > 50 pos/s, got {:.0}",
fill_rate
);
}
#[test]
fn test_streaming_kv_cache_32768_positions() {
let num_layers = 4;
let max_positions = 32768;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
assert_eq!(cache.max_positions(), 32768);
assert_eq!(cache.len(), 0);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
for _pos in 0..32768 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
assert_eq!(cache.len(), max_positions);
}
#[test]
fn test_mega_long_context_memory_bound() {
let num_layers = 32;
let max_positions = 32768;
let num_heads = 32;
let head_dim = 128;
let cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let expected_bytes = num_layers * max_positions * num_heads * head_dim * 2 * 4;
assert_eq!(cache.memory_bytes(), expected_bytes);
let memory_gb = cache.memory_mb() / 1024.0;
assert!(
memory_gb < 36.0,
"32768 context KV cache should be < 36 GB, got {:.2} GB",
memory_gb
);
}
#[test]
fn test_mega_long_context_fill_performance() {
use std::time::Instant;
let num_layers = 4;
let max_positions = 32768;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
let start = Instant::now();
for _pos in 0..32768 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
let elapsed = start.elapsed();
let fill_rate = 32768.0 / elapsed.as_secs_f64();
assert!(
fill_rate > 25.0,
"Fill rate should be > 25 pos/s, got {:.0}",
fill_rate
);
}
#[test]
fn test_f32_f16_conversion_roundtrip() {
let test_values = vec![
0.0f32, 1.0, -1.0, 0.5, -0.5, 0.125, 100.0, -100.0, 0.001, 65504.0,
];
for &original in &test_values {
let fp16_bits = StreamingKVCacheFp16::f32_to_f16(original);
let recovered = StreamingKVCacheFp16::f16_to_f32(fp16_bits);
let error = if original.abs() > 1e-6 {
((recovered - original) / original).abs()
} else {
(recovered - original).abs()
};
assert!(
error < 0.01,
"FP16 roundtrip error too large for {}: got {}, error {}",
original,
recovered,
error
);
}
}
#[test]
fn test_streaming_kv_cache_fp16_basic() {
let num_layers = 2;
let max_positions = 16;
let num_heads = 4;
let head_dim = 8;
let mut cache = StreamingKVCacheFp16::new(num_layers, max_positions, num_heads, head_dim);
assert!(cache.is_empty());
assert_eq!(cache.len(), 0);
assert_eq!(cache.max_positions(), 16);
let kv_dim = num_heads * head_dim;
let key = vec![0.5f32; kv_dim];
let value = vec![0.25f32; kv_dim];
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
assert_eq!(cache.len(), 1);
let (keys, values) = cache.get_valid_f32(0);
assert_eq!(keys.len(), kv_dim);
assert_eq!(values.len(), kv_dim);
for &k in &keys {
assert!((k - 0.5).abs() < 0.01, "Key mismatch: {}", k);
}
for &v in &values {
assert!((v - 0.25).abs() < 0.01, "Value mismatch: {}", v);
}
}
#[test]
fn test_streaming_kv_cache_fp16_memory_half() {
let num_layers = 32;
let max_positions = 65536;
let num_heads = 32;
let head_dim = 128;
let cache_fp16 = StreamingKVCacheFp16::new(num_layers, max_positions, num_heads, head_dim);
let cache_fp32 = StreamingKVCache::new(num_layers, max_positions, num_heads, head_dim);
let fp16_bytes = cache_fp16.memory_bytes();
let fp32_bytes = cache_fp32.memory_bytes();
assert_eq!(fp16_bytes * 2, fp32_bytes);
let fp16_gb = cache_fp16.memory_mb() / 1024.0;
assert!(
fp16_gb < 36.0,
"FP16 65536 context should be < 36 GB, got {:.2} GB",
fp16_gb
);
assert!(
fp16_gb > 30.0,
"FP16 65536 context should be > 30 GB, got {:.2} GB",
fp16_gb
);
}
#[test]
fn test_streaming_kv_cache_fp16_65536_positions() {
let num_layers = 4;
let max_positions = 65536;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCacheFp16::new(num_layers, max_positions, num_heads, head_dim);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
for _pos in 0..65536 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
assert_eq!(cache.len(), max_positions);
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
assert_eq!(cache.len(), max_positions);
}
#[test]
fn test_fp16_kv_cache_memory_bound_65536() {
let num_layers = 32;
let max_positions = 65536;
let num_heads = 32;
let head_dim = 128;
let cache = StreamingKVCacheFp16::new(num_layers, max_positions, num_heads, head_dim);
let expected_bytes = num_layers * max_positions * num_heads * head_dim * 2 * 2;
assert_eq!(cache.memory_bytes(), expected_bytes);
let memory_gb = cache.memory_mb() / 1024.0;
assert!(
memory_gb < 36.0,
"65536 context FP16 KV cache should be < 36 GB, got {:.2} GB",
memory_gb
);
}
#[test]
fn test_fp16_kv_cache_fill_performance_65536() {
use std::time::Instant;
let num_layers = 4;
let max_positions = 65536;
let num_heads = 8;
let head_dim = 64;
let mut cache = StreamingKVCacheFp16::new(num_layers, max_positions, num_heads, head_dim);
let kv_dim = num_heads * head_dim;
let key = vec![0.1f32; kv_dim];
let value = vec![0.2f32; kv_dim];
let start = Instant::now();
for _pos in 0..65536 {
for layer in 0..num_layers {
cache.append(layer, &key, &value);
}
}
let elapsed = start.elapsed();
let fill_rate = 65536.0 / elapsed.as_secs_f64();
assert!(
fill_rate > 10.0,
"FP16 fill rate should be > 10 pos/s, got {:.0}",
fill_rate
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1001a_cuda_executor_matmul_correctness() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1001a: CUDA not available, skipping");
return;
}
let mut executor = CudaExecutor::new(0).expect("Failed to create CudaExecutor");
let a = vec![1.0f32; 16];
let b = vec![1.0f32; 16];
let mut result = vec![0.0f32; 16];
executor
.gemm(&a, &b, &mut result, 4, 4, 4)
.expect("GEMM failed");
for (i, &val) in result.iter().enumerate() {
assert!(
(val - 4.0).abs() < 1e-3,
"IMP-1001a: Element {} mismatch: got {}, expected 4.0",
i,
val
);
}
let a = vec![2.0f32; 64];
let b = vec![1.0f32; 64];
let mut result = vec![0.0f32; 64];
executor
.gemm(&a, &b, &mut result, 8, 8, 8)
.expect("GEMM 8x8 failed");
for (i, &val) in result.iter().enumerate() {
assert!(
(val - 16.0).abs() < 1e-3,
"IMP-1001a: 8x8 element {} mismatch: got {}, expected 16.0",
i,
val
);
}
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1001b_cuda_softmax_correctness() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1001b: CUDA not available, skipping");
return;
}
let mut executor = CudaExecutor::new(0).expect("Failed to create CudaExecutor");
let mut data = vec![1.0, 2.0, 3.0, 4.0];
executor.softmax(&mut data).expect("Softmax failed");
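// Softmax invariants: outputs form a probability distribution (sum to 1)
// and preserve the ordering of the inputs.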
let sum: f32 = data.iter().sum();
assert!(
(sum - 1.0).abs() < 1e-5,
"IMP-1001b: Softmax should sum to 1, got {}",
sum
);
assert!(
data[0] < data[1] && data[1] < data[2] && data[2] < data[3],
"IMP-1001b: Softmax should preserve ordering"
);
}
#[test]
#[cfg(feature = "cuda")]
#[allow(clippy::many_single_char_names)]
fn test_imp_1001c_cuda_inference_speedup() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1001c: CUDA not available, skipping");
return;
}
let mut executor = CudaExecutor::new(0).expect("Failed to create CudaExecutor");
let m: u32 = 512;
let k: u32 = 2048;
let n: u32 = 2048;
let a: Vec<f32> = (0..(m * k) as usize)
.map(|i| (i % 100) as f32 * 0.01)
.collect();
let b: Vec<f32> = (0..(k * n) as usize)
.map(|i| (i % 100) as f32 * 0.01)
.collect();
let mut result = vec![0.0f32; (m * n) as usize];
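// Warm-up call: keeps one-time CUDA context and kernel setup out of the timed loop.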
let _ = executor.gemm(&a, &b, &mut result, m, n, k);
let start = Instant::now();
executor
.gemm(&a, &b, &mut result, m, n, k)
.expect("GEMM failed");
let cuda_time = start.elapsed();
let start = Instant::now();
let _cpu_result = cpu_matmul(&a, &b, m as usize, k as usize, n as usize);
let cpu_time = start.elapsed();
let speedup = cpu_time.as_secs_f64() / cuda_time.as_secs_f64();
println!(
"IMP-1001c: CUDA={:.2}ms, CPU={:.2}ms, speedup={:.1}x",
cuda_time.as_secs_f64() * 1000.0,
cpu_time.as_secs_f64() * 1000.0,
speedup
);
assert!(
speedup > 5.0,
"IMP-1001c: CUDA should be >5x faster for 512x2048x2048 GEMM, got {:.1}x",
speedup
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1001d_gpu_model_with_cuda_backend() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1001d: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 1000,
hidden_dim: 256,
num_layers: 2,
num_heads: 4,
num_kv_heads: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let mut model = GpuModel::new(config).expect("Failed to create GpuModel");
let prompt = vec![1usize, 2, 3];
let gen_config = GpuGenerateConfig {
max_tokens: 5,
temperature: 1.0,
top_k: 50,
stop_tokens: vec![],
};
let result = model.generate(&prompt, &gen_config);
assert!(result.is_ok(), "IMP-1001d: Generate should succeed");
let tokens = result.unwrap();
assert!(
tokens.len() >= prompt.len(),
"IMP-1001d: Should generate at least prompt length tokens"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1002a_cuda_scheduler_creation() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1002a: CUDA not available, skipping");
return;
}
let scheduler = CudaScheduler::new();
assert!(
scheduler.is_ok(),
"IMP-1002a: CudaScheduler creation should succeed"
);
let scheduler = scheduler.unwrap();
assert!(
scheduler.has_cuda(),
"IMP-1002a: CudaScheduler should report CUDA available"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1002b_cuda_scheduler_matmul() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1002b: CUDA not available, skipping");
return;
}
let mut scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
let a = vec![1.0f32; 16];
let b = vec![1.0f32; 16];
let result = scheduler.matmul(&a, &b, 4, 4, 4);
assert!(result.is_ok(), "IMP-1002b: matmul should succeed");
let output = result.unwrap();
assert_eq!(
output.len(),
16,
"IMP-1002b: Output should be 4x4=16 elements"
);
for (i, &val) in output.iter().enumerate() {
assert!(
(val - 4.0).abs() < 1e-3,
"IMP-1002b: Element {} should be 4.0, got {}",
i,
val
);
}
}
#[test]
#[cfg(feature = "cuda")]
#[allow(clippy::many_single_char_names)]
fn test_imp_1002c_cuda_scheduler_large_matmul() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1002c: CUDA not available, skipping");
return;
}
let mut scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
let m = 64;
let k = 64;
let n = 64;
let a: Vec<f32> = vec![1.0; m * k];
let b: Vec<f32> = vec![1.0; k * n];
let result = scheduler.matmul(&a, &b, m, k, n);
assert!(
result.is_ok(),
"IMP-1002c: Large matmul should succeed: {:?}",
result.err()
);
let output = result.unwrap();
assert_eq!(
output.len(),
m * n,
"IMP-1002c: Output should be {}x{}={} elements",
m,
n,
m * n
);
let expected = k as f32;
for (i, &val) in output.iter().take(10).enumerate() {
assert!(
(val - expected).abs() < 1.0,
"IMP-1002c: Element {} should be ~{}, got {}",
i,
expected,
val
);
}
let m = 128;
let k = 128;
let n = 128;
let a: Vec<f32> = vec![1.0; m * k];
let b: Vec<f32> = vec![1.0; k * n];
let result = scheduler.matmul(&a, &b, m, k, n);
assert!(result.is_ok(), "IMP-1002c: 128x128 matmul should succeed");
let output = result.unwrap();
assert_eq!(output.len(), m * n);
for (i, &val) in output.iter().take(10).enumerate() {
assert!(
(val - 128.0).abs() < 1.0,
"IMP-1002c: 128x128 element {} should be ~128, got {}",
i,
val
);
}
}
#[test]
#[cfg(feature = "cuda")]
#[allow(clippy::many_single_char_names)]
fn test_imp_1002d_cuda_scheduler_no_m1_restriction() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1002d: CUDA not available, skipping");
return;
}
let mut cuda_scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
let mut hybrid_scheduler =
HybridScheduler::with_threshold(1000).expect("Failed to create HybridScheduler");
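// m = 1 is the single-token decode shape: HybridScheduler's size heuristic
// routes it to CPU, while CudaScheduler dispatches it to the GPU regardless.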
let m = 1;
let k = 4096;
let n = 4096;
let a: Vec<f32> = vec![1.0; m * k];
let b: Vec<f32> = vec![1.0; k * n];
assert!(
!hybrid_scheduler.should_use_gpu(m, k, n),
"IMP-1002d: HybridScheduler should reject m=1 for GPU"
);
assert!(
cuda_scheduler.uses_cuda_for(m, k, n),
"IMP-1002d: CudaScheduler should use CUDA even for m=1"
);
let start = Instant::now();
let hybrid_result = hybrid_scheduler.matmul(&a, &b, m, k, n).unwrap();
let hybrid_time = start.elapsed();
let start = Instant::now();
let cuda_result = cuda_scheduler.matmul(&a, &b, m, k, n).unwrap();
let cuda_time = start.elapsed();
println!(
"IMP-1002d: m=1 matmul - Hybrid(CPU)={:.2}ms, CUDA={:.2}ms",
hybrid_time.as_secs_f64() * 1000.0,
cuda_time.as_secs_f64() * 1000.0
);
assert!(
hybrid_result.len() == m * n && cuda_result.len() == m * n,
"IMP-1002d: Both schedulers should produce correct output size"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1003a_gpu_model_with_cuda_scheduler() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1003a: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 64,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 128,
eps: 1e-5,
};
let model = GpuModel::new_with_cuda(config);
assert!(
model.is_ok(),
"IMP-1003a: GpuModel::new_with_cuda() should succeed"
);
let model = model.unwrap();
assert!(
model.has_cuda_scheduler(),
"IMP-1003a: Model should have CUDA scheduler"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1003b_cuda_scheduler_used_for_forward() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1003b: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 64,
num_heads: 4,
num_kv_heads: 4,
num_layers: 1,
intermediate_dim: 128,
eps: 1e-5,
};
let mut model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let token_ids = vec![0usize];
let result = model.forward_gpu(&token_ids);
assert!(result.is_ok(), "IMP-1003b: Forward pass should succeed");
let logits = result.unwrap();
assert_eq!(logits.len(), 100, "IMP-1003b: Output should be vocab_size");
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1003c_cuda_scheduler_vs_hybrid_single_token() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1003c: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 32000,
hidden_dim: 512,
num_heads: 8,
num_kv_heads: 8,
num_layers: 4,
intermediate_dim: 1024,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let token_ids = vec![42usize];
let _ = cuda_model.forward_gpu(&token_ids);
let _ = hybrid_model.forward_gpu(&token_ids);
let start = Instant::now();
for _ in 0..10 {
let _ = cuda_model.forward_gpu(&token_ids);
}
let cuda_time = start.elapsed();
let start = Instant::now();
for _ in 0..10 {
let _ = hybrid_model.forward_gpu(&token_ids);
}
let hybrid_time = start.elapsed();
println!(
"IMP-1003c: Single-token forward (10 iters) - CUDA={:.2}ms, Hybrid(CPU)={:.2}ms",
cuda_time.as_secs_f64() * 1000.0,
hybrid_time.as_secs_f64() * 1000.0
);
assert!(
cuda_time.as_micros() > 0 && hybrid_time.as_micros() > 0,
"IMP-1003c: Both paths should complete"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1003d_cuda_scheduler_matmul_dispatch() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1003d: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 64,
num_heads: 4,
num_kv_heads: 4,
num_layers: 1,
intermediate_dim: 128,
eps: 1e-5,
};
let mut model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let a: Vec<f32> = vec![1.0; 64];
let b: Vec<f32> = vec![1.0; 64 * 100];
let result = model.cuda_matmul(&a, &b, 1, 64, 100);
assert!(result.is_ok(), "IMP-1003d: cuda_matmul should succeed");
let output = result.unwrap();
assert_eq!(output.len(), 100, "IMP-1003d: Output size should be m*n");
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1004a_cuda_matmul_benchmark() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1004a: CUDA not available, skipping");
return;
}
let mut cuda_scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
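// All four shapes are m = 1 decode-time matmuls taken from a 7B-class model
// (hidden 4096, intermediate 11008, vocab 32000).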
let test_cases = [
(1, 4096, 4096, "1x4096x4096 (m=1, attention output)"),
(1, 4096, 11008, "1x4096x11008 (m=1, FFN fc1)"),
(1, 11008, 4096, "1x11008x4096 (m=1, FFN fc2)"),
(1, 4096, 32000, "1x4096x32000 (m=1, LM head)"),
];
for (m, k, n, desc) in test_cases {
let a: Vec<f32> = vec![1.0; m * k];
let b: Vec<f32> = vec![1.0; k * n];
let _ = cuda_scheduler.matmul(&a, &b, m, k, n);
let iterations = 10;
let start = Instant::now();
for _ in 0..iterations {
let _ = cuda_scheduler.matmul(&a, &b, m, k, n);
}
let elapsed = start.elapsed();
let avg_ms = elapsed.as_secs_f64() * 1000.0 / iterations as f64;
println!("IMP-1004a: {} - {:.3}ms avg", desc, avg_ms);
}
}
#[test]
#[cfg(feature = "cuda")]
#[allow(clippy::many_single_char_names)]
fn test_imp_1004b_cuda_vs_cpu_matmul() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1004b: CUDA not available, skipping");
return;
}
let mut cuda_scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
let mut hybrid_scheduler =
HybridScheduler::with_threshold(1).expect("Failed to create HybridScheduler");
let m = 1;
let k = 4096;
let n = 4096;
let a: Vec<f32> = vec![1.0; m * k];
let b: Vec<f32> = vec![1.0; k * n];
let _ = cuda_scheduler.matmul(&a, &b, m, k, n);
let _ = hybrid_scheduler.matmul(&a, &b, m, k, n);
let iterations = 20;
let start = Instant::now();
for _ in 0..iterations {
let _ = cuda_scheduler.matmul(&a, &b, m, k, n);
}
let cuda_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = hybrid_scheduler.matmul(&a, &b, m, k, n);
}
let cpu_time = start.elapsed();
let cuda_avg_ms = cuda_time.as_secs_f64() * 1000.0 / iterations as f64;
let cpu_avg_ms = cpu_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = cpu_avg_ms / cuda_avg_ms;
println!(
"IMP-1004b: m=1 matmul (1x{}x{}) - CUDA={:.3}ms, CPU={:.3}ms, Speedup={:.2}x",
k, n, cuda_avg_ms, cpu_avg_ms, speedup
);
let cuda_result = cuda_scheduler.matmul(&a, &b, m, k, n).unwrap();
let cpu_result = hybrid_scheduler.matmul(&a, &b, m, k, n).unwrap();
assert_eq!(
cuda_result.len(),
cpu_result.len(),
"IMP-1004b: Both should produce same output size"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1004c_full_forward_benchmark() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1004c: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 32000,
hidden_dim: 512,
num_heads: 8,
num_kv_heads: 8,
num_layers: 4,
intermediate_dim: 1024,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let token_ids = vec![42usize];
let _ = cuda_model.forward_gpu(&token_ids);
let _ = hybrid_model.forward_gpu(&token_ids);
let iterations = 20;
let start = Instant::now();
for _ in 0..iterations {
let _ = cuda_model.forward_gpu(&token_ids);
}
let cuda_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = hybrid_model.forward_gpu(&token_ids);
}
let hybrid_time = start.elapsed();
let cuda_avg_ms = cuda_time.as_secs_f64() * 1000.0 / iterations as f64;
let hybrid_avg_ms = hybrid_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = hybrid_avg_ms / cuda_avg_ms;
println!(
"IMP-1004c: Full forward pass - CUDA={:.3}ms, Hybrid={:.3}ms, Speedup={:.2}x",
cuda_avg_ms, hybrid_avg_ms, speedup
);
let cuda_tok_per_sec = 1000.0 / cuda_avg_ms;
let hybrid_tok_per_sec = 1000.0 / hybrid_avg_ms;
println!(
"IMP-1004c: Throughput - CUDA={:.1} tok/s, Hybrid={:.1} tok/s",
cuda_tok_per_sec, hybrid_tok_per_sec
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1004d_token_generation_throughput() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1004d: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 32000,
hidden_dim: 512,
num_heads: 8,
num_kv_heads: 8,
num_layers: 4,
intermediate_dim: 1024,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let prompt = vec![1usize, 2, 3];
let gen_config = GpuGenerateConfig::deterministic(10);
let start = Instant::now();
let tokens = cuda_model
.generate(&prompt, &gen_config)
.expect("Generation failed");
let elapsed = start.elapsed();
let tokens_generated = tokens.len() - prompt.len();
let tok_per_sec = tokens_generated as f64 / elapsed.as_secs_f64();
println!(
"IMP-1004d: Generated {} tokens in {:.3}ms ({:.1} tok/s)",
tokens_generated,
elapsed.as_secs_f64() * 1000.0,
tok_per_sec
);
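// .max(0.001) guards the gap calculation against division by zero when no
// tokens were generated.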
println!(
"IMP-1004d: Target=228 tok/s (Ollama), Current={:.1} tok/s, Gap={:.0}x",
tok_per_sec,
228.0 / tok_per_sec.max(0.001)
);
assert!(
tokens_generated > 0,
"IMP-1004d: Should generate at least some tokens"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_parity_120a_cached_vs_uncached_matmul() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("PARITY-120a: CUDA not available, skipping");
return;
}
let mut scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
let k = 4096usize;
let n = 4096usize;
let weight: Vec<f32> = vec![1.0; k * n];
let x: Vec<f32> = vec![1.0; k];
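// cache_weight uploads the 4096x4096 weight (64 MiB as f32) to the device once;
// matmul_cached should then reuse the resident copy instead of re-transferring
// it on every call, which is where the expected >1.5x speedup comes from.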
scheduler
.cache_weight("test_weight", &weight)
.expect("Failed to cache weight");
let _ = scheduler.matmul(&x, &weight, 1, k, n);
let _ = scheduler.matmul_cached("test_weight", &x, k, n);
let iterations = 20;
let start = Instant::now();
for _ in 0..iterations {
let _ = scheduler.matmul(&x, &weight, 1, k, n);
}
let uncached_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = scheduler.matmul_cached("test_weight", &x, k, n);
}
let cached_time = start.elapsed();
let uncached_avg_ms = uncached_time.as_secs_f64() * 1000.0 / iterations as f64;
let cached_avg_ms = cached_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = uncached_avg_ms / cached_avg_ms;
println!(
"PARITY-120a: 1x{}x{} - Uncached={:.3}ms, Cached={:.3}ms, Speedup={:.1}x",
k, n, uncached_avg_ms, cached_avg_ms, speedup
);
let uncached_result = scheduler.matmul(&x, &weight, 1, k, n).unwrap();
let cached_result = scheduler.matmul_cached("test_weight", &x, k, n).unwrap();
assert_eq!(
uncached_result.len(),
cached_result.len(),
"PARITY-120a: Output sizes should match"
);
for (i, (u, c)) in uncached_result.iter().zip(cached_result.iter()).enumerate() {
assert!(
(u - c).abs() < 0.01,
"PARITY-120a: Results differ at {}: uncached={}, cached={}",
i,
u,
c
);
}
assert!(
speedup > 1.5,
"PARITY-120a: Expected >1.5x speedup, got {:.1}x",
speedup
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_parity_120b_full_layer_cached() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("PARITY-120b: CUDA not available, skipping");
return;
}
let mut scheduler = CudaScheduler::new().expect("Failed to create CudaScheduler");
let hidden = 4096usize;
let qkv = 3 * hidden;
let intermediate = 11008usize;
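// Layer shapes match a Llama-7B-class block: fused QKV projection, attention
// output projection, and the two FFN projections (hidden 4096 -> 11008 -> 4096).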
let qkv_weight: Vec<f32> = vec![1.0; hidden * qkv];
let out_weight: Vec<f32> = vec![1.0; hidden * hidden];
let fc1_weight: Vec<f32> = vec![1.0; hidden * intermediate];
let fc2_weight: Vec<f32> = vec![1.0; intermediate * hidden];
scheduler.cache_weight("qkv", &qkv_weight).unwrap();
scheduler.cache_weight("out", &out_weight).unwrap();
scheduler.cache_weight("fc1", &fc1_weight).unwrap();
scheduler.cache_weight("fc2", &fc2_weight).unwrap();
assert_eq!(
scheduler.cached_weight_count(),
4,
"PARITY-120b: Should have 4 cached weights"
);
let x: Vec<f32> = vec![1.0; hidden];
let iterations = 10;
let start = Instant::now();
for _ in 0..iterations {
let qkv_out = scheduler.matmul_cached("qkv", &x, hidden, qkv).unwrap();
let attn_out = scheduler
.matmul_cached("out", &qkv_out[..hidden], hidden, hidden)
.unwrap();
let fc1_out = scheduler
.matmul_cached("fc1", &attn_out, hidden, intermediate)
.unwrap();
let _fc2_out = scheduler
.matmul_cached("fc2", &fc1_out, intermediate, hidden)
.unwrap();
}
let elapsed = start.elapsed();
let avg_ms = elapsed.as_secs_f64() * 1000.0 / iterations as f64;
let tok_per_sec = 1000.0 / avg_ms;
println!(
"PARITY-120b: Full layer (4 matmuls) - {:.2}ms/token = {:.1} tok/s",
avg_ms, tok_per_sec
);
println!(
"PARITY-120b: Target=228 tok/s (Ollama), Current={:.1} tok/s (single layer)",
tok_per_sec
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1005a_do_matmul_uses_cuda_scheduler() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1005a: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 64,
num_heads: 4,
num_kv_heads: 4,
num_layers: 1,
intermediate_dim: 128,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let a: Vec<f32> = vec![1.0; 64];
let b: Vec<f32> = vec![1.0; 64 * 100];
let cuda_result = cuda_model.do_matmul(&a, &b, 1, 64, 100);
let hybrid_result = hybrid_model.do_matmul(&a, &b, 1, 64, 100);
assert!(
cuda_result.is_ok(),
"IMP-1005a: CUDA do_matmul should succeed"
);
assert!(
hybrid_result.is_ok(),
"IMP-1005a: Hybrid do_matmul should succeed"
);
assert_eq!(
cuda_result.unwrap().len(),
hybrid_result.unwrap().len(),
"IMP-1005a: Both should produce same output size"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1005b_forward_gpu_speedup_with_cuda() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1005b: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 32000,
hidden_dim: 512,
num_heads: 8,
num_kv_heads: 8,
num_layers: 4,
intermediate_dim: 1024,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let token_ids = vec![42usize];
let _ = cuda_model.forward_gpu(&token_ids);
let _ = hybrid_model.forward_gpu(&token_ids);
let iterations = 20;
let start = Instant::now();
for _ in 0..iterations {
let _ = cuda_model.forward_gpu(&token_ids);
}
let cuda_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = hybrid_model.forward_gpu(&token_ids);
}
let hybrid_time = start.elapsed();
let cuda_avg_ms = cuda_time.as_secs_f64() * 1000.0 / iterations as f64;
let hybrid_avg_ms = hybrid_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = hybrid_avg_ms / cuda_avg_ms;
println!(
"IMP-1005b: forward_gpu (m=1) - CUDA={:.3}ms, Hybrid={:.3}ms, Speedup={:.2}x",
cuda_avg_ms, hybrid_avg_ms, speedup
);
let cuda_tok_per_sec = 1000.0 / cuda_avg_ms;
let hybrid_tok_per_sec = 1000.0 / hybrid_avg_ms;
println!(
"IMP-1005b: Throughput - CUDA={:.1} tok/s, Hybrid={:.1} tok/s",
cuda_tok_per_sec, hybrid_tok_per_sec
);
assert!(
speedup > 0.5,
"IMP-1005b: CUDA path should not be catastrophically slower"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1005c_token_generation_with_cuda_forward() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1005c: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 32000,
hidden_dim: 512,
num_heads: 8,
num_kv_heads: 8,
num_layers: 4,
intermediate_dim: 1024,
eps: 1e-5,
};
let mut cuda_model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let prompt = vec![1usize, 2, 3];
let gen_config = GpuGenerateConfig::deterministic(10);
let start = Instant::now();
let tokens = cuda_model
.generate(&prompt, &gen_config)
.expect("Generation failed");
let elapsed = start.elapsed();
let tokens_generated = tokens.len() - prompt.len();
let tok_per_sec = tokens_generated as f64 / elapsed.as_secs_f64();
println!(
"IMP-1005c: Generated {} tokens in {:.3}ms ({:.1} tok/s)",
tokens_generated,
elapsed.as_secs_f64() * 1000.0,
tok_per_sec
);
println!(
"IMP-1005c: Previous=9.1 tok/s, Current={:.1} tok/s, Target=228 tok/s",
tok_per_sec
);
assert!(
tokens_generated > 0,
"IMP-1005c: Should generate at least some tokens"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1005d_forward_block_uses_do_matmul() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1005d: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let input: Vec<f32> = vec![0.1; 256];
let seq_len = 1;
let _ = cuda_model.forward_block_idx(&input, seq_len, 0);
let _ = hybrid_model.forward_block_idx(&input, seq_len, 0);
let iterations = 20;
let start = Instant::now();
for _ in 0..iterations {
let _ = cuda_model.forward_block_idx(&input, seq_len, 0);
}
let cuda_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = hybrid_model.forward_block_idx(&input, seq_len, 0);
}
let hybrid_time = start.elapsed();
let cuda_avg_ms = cuda_time.as_secs_f64() * 1000.0 / iterations as f64;
let hybrid_avg_ms = hybrid_time.as_secs_f64() * 1000.0 / iterations as f64;
println!(
"IMP-1005d: forward_block_idx (m=1) - CUDA={:.3}ms, Hybrid={:.3}ms",
cuda_avg_ms, hybrid_avg_ms
);
let cuda_result = cuda_model.forward_block_idx(&input, seq_len, 0);
let hybrid_result = hybrid_model.forward_block_idx(&input, seq_len, 0);
assert!(
cuda_result.is_ok() && hybrid_result.is_ok(),
"IMP-1005d: Both should complete successfully"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1006a_incremental_forward_uses_cuda() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1006a: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let token_id: usize = 42;
let num_kv_heads = cuda_model.config.num_kv_heads;
let head_dim = cuda_model.config.head_dim();
let max_positions = 128;
let mut cuda_cache = StreamingKVCache::new(
cuda_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let mut hybrid_cache = StreamingKVCache::new(
hybrid_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let _ = cuda_model.forward_gpu_incremental_optimized(token_id, &mut cuda_cache);
let _ = hybrid_model.forward_gpu_incremental_optimized(token_id, &mut hybrid_cache);
let iterations = 20;
let start = Instant::now();
for i in 0..iterations {
let mut cache = StreamingKVCache::new(
cuda_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let _ = cuda_model.forward_gpu_incremental_optimized(i % 100, &mut cache);
}
let cuda_time = start.elapsed();
let start = Instant::now();
for i in 0..iterations {
let mut cache = StreamingKVCache::new(
hybrid_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let _ = hybrid_model.forward_gpu_incremental_optimized(i % 100, &mut cache);
}
let hybrid_time = start.elapsed();
let cuda_avg_ms = cuda_time.as_secs_f64() * 1000.0 / iterations as f64;
let hybrid_avg_ms = hybrid_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = hybrid_avg_ms / cuda_avg_ms;
println!(
"IMP-1006a: incremental_forward (m=1) - CUDA={:.3}ms, Hybrid={:.3}ms, Speedup={:.2}x",
cuda_avg_ms, hybrid_avg_ms, speedup
);
assert!(
cuda_avg_ms < hybrid_avg_ms * 2.0,
"IMP-1006a: CUDA incremental should not be much slower than Hybrid"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1006b_block_incremental_uses_cuda() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1006b: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let mut cuda_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let mut hybrid_model = GpuModel::new(config).expect("Failed to create Hybrid model");
let input: Vec<f32> = vec![0.1; 256];
let block_idx = 0;
let num_kv_heads = cuda_model.config.num_kv_heads;
let head_dim = cuda_model.config.head_dim();
let max_positions = 128;
let mut cuda_cache = StreamingKVCache::new(
cuda_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let mut hybrid_cache = StreamingKVCache::new(
hybrid_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let _ = cuda_model.forward_block_incremental_optimized(&input, block_idx, &mut cuda_cache);
let _ =
hybrid_model.forward_block_incremental_optimized(&input, block_idx, &mut hybrid_cache);
let iterations = 20;
let mut cuda_cache2 = StreamingKVCache::new(
cuda_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let start = Instant::now();
for _ in 0..iterations {
let _ =
cuda_model.forward_block_incremental_optimized(&input, block_idx, &mut cuda_cache2);
}
let cuda_time = start.elapsed();
let mut hybrid_cache2 = StreamingKVCache::new(
hybrid_model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let start = Instant::now();
for _ in 0..iterations {
let _ = hybrid_model.forward_block_incremental_optimized(
&input,
block_idx,
&mut hybrid_cache2,
);
}
let hybrid_time = start.elapsed();
let cuda_avg_ms = cuda_time.as_secs_f64() * 1000.0 / iterations as f64;
let hybrid_avg_ms = hybrid_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = hybrid_avg_ms / cuda_avg_ms;
println!(
"IMP-1006b: block_incremental (m=1) - CUDA={:.3}ms, Hybrid={:.3}ms, Speedup={:.2}x",
cuda_avg_ms, hybrid_avg_ms, speedup
);
assert!(
cuda_avg_ms > 0.0,
"IMP-1006b: CUDA path should complete successfully"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1006c_generate_throughput_improved() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1006c: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let mut cuda_model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let prompt: Vec<usize> = vec![1, 2, 3, 4, 5];
let gen_config = GpuGenerateConfig::deterministic(10);
let start = Instant::now();
let tokens = cuda_model
.generate(&prompt, &gen_config)
.expect("Generation failed");
let elapsed = start.elapsed();
let tokens_generated = tokens.len() - prompt.len();
let tok_per_sec = tokens_generated as f64 / elapsed.as_secs_f64();
println!(
"IMP-1006c: Generated {} tokens in {:.3}ms ({:.1} tok/s)",
tokens_generated,
elapsed.as_secs_f64() * 1000.0,
tok_per_sec
);
println!(
"IMP-1006c: Previous=9.1 tok/s, Current={:.1} tok/s, Target=228 tok/s (Ollama)",
tok_per_sec
);
assert!(
tokens_generated > 0,
"IMP-1006c: Should generate at least some tokens"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1006d_all_matmuls_routed_to_cuda() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1006d: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let mut cuda_model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
assert!(
cuda_model.has_cuda_scheduler(),
"IMP-1006d: CUDA model should have cuda_scheduler"
);
let a: Vec<f32> = vec![1.0; 256 * 128];
let b: Vec<f32> = vec![1.0; 128 * 64];
let result = cuda_model.do_matmul(&a, &b, 256, 128, 64);
assert!(
result.is_ok(),
"IMP-1006d: do_matmul should complete via CUDA"
);
let output = result.unwrap();
assert_eq!(
output.len(),
256 * 64,
"IMP-1006d: Output dimensions should be correct"
);
println!("IMP-1006d: All matmuls routed to CudaScheduler ✓");
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1007a_no_clone_matmul() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1007a: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let mut model = GpuModel::new_with_cuda(config).expect("Failed to create model");
let input: Vec<f32> = vec![0.1; 256];
let result = model.matmul_split(&input, 0, WeightType::Qkv);
assert!(result.is_ok(), "IMP-1007a: matmul_split should work");
println!("IMP-1007a: Zero-clone matmul verified ✓");
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1007b_incremental_no_clone_speedup() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1007b: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let mut model = GpuModel::new_with_cuda(config.clone()).expect("Failed to create model");
let num_kv_heads = model.config.num_kv_heads;
let head_dim = model.config.head_dim();
let max_positions = 128;
let input: Vec<f32> = vec![0.1; 256];
let block_idx = 0;
let mut cache = StreamingKVCache::new(
model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let _ = model.forward_block_incremental_optimized(&input, block_idx, &mut cache);
let iterations = 50;
let mut cache2 = StreamingKVCache::new(
model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let start = Instant::now();
for _ in 0..iterations {
let _ = model.forward_block_incremental_optimized(&input, block_idx, &mut cache2);
}
let elapsed = start.elapsed();
let avg_ms = elapsed.as_secs_f64() * 1000.0 / iterations as f64;
println!(
"IMP-1007b: block_incremental avg={:.3}ms ({} iterations)",
avg_ms, iterations
);
println!(
"IMP-1007b: Previous=0.698ms, Current={:.3}ms, Target=<0.5ms",
avg_ms
);
assert!(avg_ms > 0.0, "IMP-1007b: Should complete successfully");
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1007c_generate_throughput_improved() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1007c: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let mut model = GpuModel::new_with_cuda(config).expect("Failed to create model");
let prompt: Vec<usize> = vec![1, 2, 3, 4, 5];
let gen_config = GpuGenerateConfig::deterministic(10);
let start = Instant::now();
let tokens = model
.generate(&prompt, &gen_config)
.expect("Generation failed");
let elapsed = start.elapsed();
let tokens_generated = tokens.len() - prompt.len();
let tok_per_sec = tokens_generated as f64 / elapsed.as_secs_f64();
println!(
"IMP-1007c: Generated {} tokens in {:.3}ms ({:.1} tok/s)",
tokens_generated,
elapsed.as_secs_f64() * 1000.0,
tok_per_sec
);
println!(
"IMP-1007c: Previous=37.3 tok/s, Current={:.1} tok/s, Target=228 tok/s (Ollama)",
tok_per_sec
);
assert!(
tokens_generated > 0,
"IMP-1007c: Should generate at least some tokens"
);
}
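// IMP-1008: the RefCell-backed variants (matmul_refcell, forward_block_refcell,
// generate_refcell) run through &self via interior mutability; IMP-1008d then
// compares that path against the clone-based generate().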
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1008a_refcell_scheduler_matmul() {
use crate::cuda::CudaExecutor;
if !CudaExecutor::is_available() {
println!("IMP-1008a: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let a: Vec<f32> = vec![0.1; 256];
let b: Vec<f32> = vec![0.2; 256 * 512];
let result = model
.matmul_refcell(&a, &b, 1, 256, 512)
.expect("matmul_refcell should work");
assert_eq!(
result.len(),
512,
"IMP-1008a: Output should be 512 elements"
);
let sum: f32 = result.iter().sum();
assert!(sum.is_finite(), "IMP-1008a: Result should be finite");
assert!(sum != 0.0, "IMP-1008a: Result should be non-zero");
println!("IMP-1008a: matmul_refcell works with &self");
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1008b_zero_clone_forward_block() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1008b: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 2,
intermediate_dim: 512,
eps: 1e-5,
};
let model = GpuModel::new_with_cuda(config.clone()).expect("Failed to create CUDA model");
let input: Vec<f32> = vec![0.1; 256];
let block_idx = 0;
let num_kv_heads = model.config.num_kv_heads;
let head_dim = model.config.head_dim();
let max_positions = 128;
let mut cache = StreamingKVCache::new(
model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let _ = model.forward_block_refcell(&input, block_idx, &mut cache);
let iterations = 100;
let mut cache2 = StreamingKVCache::new(
model.config.num_layers,
max_positions,
num_kv_heads,
head_dim,
);
let start = Instant::now();
for _ in 0..iterations {
let _ = model.forward_block_refcell(&input, block_idx, &mut cache2);
}
let refcell_time = start.elapsed();
let refcell_avg_us = refcell_time.as_micros() as f64 / iterations as f64;
println!(
"IMP-1008b: forward_block_refcell avg={:.1}µs ({} iterations)",
refcell_avg_us, iterations
);
assert!(
refcell_avg_us > 0.0,
"IMP-1008b: forward_block_refcell should complete"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1008c_generate_throughput_refcell() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1008c: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let prompt: Vec<usize> = vec![1, 2, 3, 4, 5];
let gen_config = GpuGenerateConfig::deterministic(10);
let start = Instant::now();
let tokens = model
.generate_refcell(&prompt, &gen_config)
.expect("Generation failed");
let elapsed = start.elapsed();
let tokens_generated = tokens.len() - prompt.len();
let tok_per_sec = tokens_generated as f64 / elapsed.as_secs_f64();
println!(
"IMP-1008c: Generated {} tokens in {:.3}ms ({:.1} tok/s)",
tokens_generated,
elapsed.as_secs_f64() * 1000.0,
tok_per_sec
);
println!(
"IMP-1008c: Previous=35.1 tok/s, Current={:.1} tok/s, Target=228 tok/s (Ollama)",
tok_per_sec
);
assert!(
tokens_generated > 0,
"IMP-1008c: Should generate at least some tokens"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1008d_compare_clone_vs_refcell() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1008d: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let mut clone_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create clone model");
let refcell_model =
GpuModel::new_with_cuda(config).expect("Failed to create refcell model");
let prompt: Vec<usize> = vec![1, 2, 3, 4, 5];
let gen_config = GpuGenerateConfig::deterministic(10);
let _ = clone_model.generate(&prompt, &gen_config);
let _ = refcell_model.generate_refcell(&prompt, &gen_config);
let iterations = 5;
let start = Instant::now();
for _ in 0..iterations {
let _ = clone_model.generate(&prompt, &gen_config);
}
let clone_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = refcell_model.generate_refcell(&prompt, &gen_config);
}
let refcell_time = start.elapsed();
let clone_avg_ms = clone_time.as_secs_f64() * 1000.0 / iterations as f64;
let refcell_avg_ms = refcell_time.as_secs_f64() * 1000.0 / iterations as f64;
let speedup = clone_avg_ms / refcell_avg_ms;
println!(
"IMP-1008d: Clone={:.2}ms, RefCell={:.2}ms, Speedup={:.2}x",
clone_avg_ms, refcell_avg_ms, speedup
);
assert!(
speedup > 0.9,
"IMP-1008d: RefCell should not be slower than clone"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1009a_main_generate_uses_refcell() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1009a: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let mut model = GpuModel::new_with_cuda(config).expect("Failed to create CUDA model");
let prompt: Vec<usize> = vec![1, 2, 3, 4, 5];
let gen_config = GpuGenerateConfig::deterministic(10);
let start = Instant::now();
let tokens = model
.generate(&prompt, &gen_config)
.expect("Generation failed");
let elapsed = start.elapsed();
let tokens_generated = tokens.len() - prompt.len();
let tok_per_sec = tokens_generated as f64 / elapsed.as_secs_f64();
println!(
"IMP-1009a: Main generate() - {} tokens in {:.3}ms ({:.1} tok/s)",
tokens_generated,
elapsed.as_secs_f64() * 1000.0,
tok_per_sec
);
println!(
"IMP-1009a: Target=100+ tok/s (RefCell speed), Current={:.1} tok/s",
tok_per_sec
);
assert!(
tok_per_sec > 100.0,
"IMP-1009a: Main generate() should achieve >100 tok/s with RefCell wiring"
);
}
#[test]
#[cfg(feature = "cuda")]
fn test_imp_1009b_generate_parity_with_refcell() {
use crate::cuda::CudaExecutor;
use std::time::Instant;
if !CudaExecutor::is_available() {
println!("IMP-1009b: CUDA not available, skipping");
return;
}
let config = GpuModelConfig {
vocab_size: 100,
hidden_dim: 256,
num_heads: 4,
num_kv_heads: 4,
num_layers: 4,
intermediate_dim: 512,
eps: 1e-5,
};
let mut clone_model =
GpuModel::new_with_cuda(config.clone()).expect("Failed to create model");
let refcell_model = GpuModel::new_with_cuda(config).expect("Failed to create model");
let prompt: Vec<usize> = vec![1, 2, 3, 4, 5];
let gen_config = GpuGenerateConfig::deterministic(10);
let _ = clone_model.generate(&prompt, &gen_config);
let _ = refcell_model.generate_refcell(&prompt, &gen_config);
let iterations = 5;
let start = Instant::now();
for _ in 0..iterations {
let _ = clone_model.generate(&prompt, &gen_config);
}
let main_time = start.elapsed();
let start = Instant::now();
for _ in 0..iterations {
let _ = refcell_model.generate_refcell(&prompt, &gen_config);
}
let refcell_time = start.elapsed();
let main_avg_ms = main_time.as_secs_f64() * 1000.0 / iterations as f64;
let refcell_avg_ms = refcell_time.as_secs_f64() * 1000.0 / iterations as f64;
let ratio = main_avg_ms / refcell_avg_ms;
println!(
"IMP-1009b: Main={:.2}ms, RefCell={:.2}ms, Ratio={:.2}x",
main_avg_ms, refcell_avg_ms, ratio
);
assert!(
ratio < 1.5,
"IMP-1009b: Main generate() should be within 1.5x of RefCell throughput"
);
}
}