use std::time::Duration;

/// Dot product of two Q8_0 blocks: a 2-byte little-endian f16 scale followed
/// by 32 signed 8-bit values (34 bytes per block). Returns 0.0 if either
/// slice is shorter than one full block.
#[must_use]
pub fn quantized_dot_q8(block_a: &[u8], block_b: &[u8]) -> f32 {
if block_a.len() < 34 || block_b.len() < 34 {
return 0.0;
}
let scale_a = half::f16::from_le_bytes([block_a[0], block_a[1]]).to_f32();
let scale_b = half::f16::from_le_bytes([block_b[0], block_b[1]]).to_f32();
let mut acc = 0i32;
for i in 0..32 {
let a_val = block_a[2 + i] as i8 as i32;
let b_val = block_b[2 + i] as i8 as i32;
acc += a_val * b_val;
}
(acc as f32) * scale_a * scale_b
}
// Q4_0 block layout: a 2-byte f16 scale followed by 16 bytes packing
// 32 4-bit values (two per byte), 18 bytes per block in total.
const Q4_BLOCK_SIZE: usize = 18;
const Q4_BLOCK_VALUES: usize = 32;
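/// Accumulates one Q4_0 block of `weights` against `input`. Each nibble is
/// decoded as an unsigned 4-bit value with a zero point of 8; the low nibble
/// of byte `i` holds element `2 * i` and the high nibble element `2 * i + 1`.
/// Elements at or past `cols` are skipped, so a ragged final block is safe.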
#[inline]
fn process_q4_block(
weights: &[u8],
block_offset: usize,
input: &[f32],
input_offset: usize,
cols: usize,
) -> f32 {
let scale =
half::f16::from_le_bytes([weights[block_offset], weights[block_offset + 1]]).to_f32();
let mut acc = 0.0f32;
for i in 0..16 {
let byte = weights[block_offset + 2 + i];
let val_lo = (byte & 0x0F) as i32 - 8;
let val_hi = ((byte >> 4) & 0x0F) as i32 - 8;
let in_idx_lo = input_offset + i * 2;
let in_idx_hi = input_offset + i * 2 + 1;
if in_idx_lo < cols {
acc += (val_lo as f32) * scale * input[in_idx_lo];
}
if in_idx_hi < cols {
acc += (val_hi as f32) * scale * input[in_idx_hi];
}
}
acc
}
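/// Matrix-vector product over Q4_0-quantized, row-major `weights`. Each row
/// occupies `cols.div_ceil(32)` blocks of 18 bytes; blocks that would run
/// past the end of `weights` are skipped rather than causing a panic.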
#[must_use]
pub fn quantized_matvec_q4(weights: &[u8], input: &[f32], rows: usize, cols: usize) -> Vec<f32> {
let blocks_per_row = cols.div_ceil(Q4_BLOCK_VALUES);
let row_bytes = blocks_per_row * Q4_BLOCK_SIZE;
let mut output = vec![0.0f32; rows];
    for (row, out_val) in output.iter_mut().enumerate() {
let row_offset = row * row_bytes;
let mut acc = 0.0f32;
for block_idx in 0..blocks_per_row {
let block_offset = row_offset + block_idx * Q4_BLOCK_SIZE;
if block_offset + Q4_BLOCK_SIZE > weights.len() {
break;
}
acc += process_q4_block(
weights,
block_offset,
input,
block_idx * Q4_BLOCK_VALUES,
cols,
);
}
*out_val = acc;
}
output
}
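/// Matrix-vector product over Q8_0-quantized, row-major `weights`: each
/// 34-byte block holds a 2-byte f16 scale and 32 signed 8-bit values.
/// Out-of-range blocks and elements are skipped, as in the Q4 variant.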
#[must_use]
pub fn quantized_matvec_q8(weights: &[u8], input: &[f32], rows: usize, cols: usize) -> Vec<f32> {
    const Q8_BLOCK_SIZE: usize = 34;
    const Q8_BLOCK_VALUES: usize = 32;
let blocks_per_row = cols.div_ceil(Q8_BLOCK_VALUES);
let row_bytes = blocks_per_row * Q8_BLOCK_SIZE;
let mut output = vec![0.0f32; rows];
    for (row, out_val) in output.iter_mut().enumerate() {
let row_offset = row * row_bytes;
let mut acc = 0.0f32;
for block_idx in 0..blocks_per_row {
let block_offset = row_offset + block_idx * Q8_BLOCK_SIZE;
if block_offset + Q8_BLOCK_SIZE > weights.len() {
break;
}
let scale =
half::f16::from_le_bytes([weights[block_offset], weights[block_offset + 1]])
.to_f32();
let input_offset = block_idx * Q8_BLOCK_VALUES;
            for i in 0..Q8_BLOCK_VALUES {
let val = weights[block_offset + 2 + i] as i8 as i32;
let in_idx = input_offset + i;
if in_idx < cols {
acc += (val as f32) * scale * input[in_idx];
}
}
}
*out_val = acc;
}
output
}
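/// Running f32 accumulator for scaled partial sums from quantized blocks.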
#[derive(Debug, Clone, Default)]
pub struct QuantizedAccumulator {
sum: f32,
}
impl QuantizedAccumulator {
#[must_use]
pub fn new() -> Self {
Self { sum: 0.0 }
}
#[must_use]
pub fn sum(&self) -> f32 {
self.sum
}
pub fn reset(&mut self) {
self.sum = 0.0;
}
#[inline]
pub fn add_scaled(&mut self, value: f32, scale: f32) {
self.sum += value * scale;
}
#[inline]
pub fn add_block(&mut self, block_sum: f32, block_scale: f32) {
self.sum += block_sum * block_scale;
}
}
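/// A pair of equally sized buffers swapped in O(1): consumers read `front`
/// while producers fill `back`, then `swap` exchanges the two roles.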
#[derive(Debug)]
pub struct DoubleBuffer<T> {
front: Vec<T>,
back: Vec<T>,
}
impl<T: Default + Clone> DoubleBuffer<T> {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
front: vec![T::default(); capacity],
back: vec![T::default(); capacity],
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.front.len()
}
#[must_use]
pub fn front(&self) -> &[T] {
&self.front
}
pub fn back_mut(&mut self) -> &mut [T] {
&mut self.back
}
pub fn swap(&mut self) {
std::mem::swap(&mut self.front, &mut self.back);
}
}
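/// Splits a slice into fixed-size chunks (the final chunk may be shorter)
/// and folds a per-chunk reduction into a single f32 total.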
#[derive(Debug, Clone)]
pub struct ChunkedProcessor {
chunk_size: usize,
}
impl ChunkedProcessor {
#[must_use]
    pub fn new(chunk_size: usize) -> Self {
        // A zero chunk size would make `num_chunks` divide by zero; fail
        // loudly at construction instead.
        assert!(chunk_size > 0, "chunk_size must be non-zero");
        Self { chunk_size }
}
#[must_use]
pub fn chunk_size(&self) -> usize {
self.chunk_size
}
#[must_use]
pub fn num_chunks(&self, total_len: usize) -> usize {
if total_len == 0 {
return 0;
}
total_len.div_ceil(self.chunk_size)
}
#[must_use]
pub fn chunk_bounds(&self, chunk_idx: usize, total_len: usize) -> (usize, usize) {
let start = chunk_idx * self.chunk_size;
let end = (start + self.chunk_size).min(total_len);
(start, end)
}
pub fn process_chunks<T, F>(&self, data: &[T], mut process: F) -> f32
where
F: FnMut(&[T]) -> f32,
{
let mut total = 0.0f32;
let num_chunks = self.num_chunks(data.len());
for chunk_idx in 0..num_chunks {
let (start, end) = self.chunk_bounds(chunk_idx, data.len());
total += process(&data[start..end]);
}
total
}
}
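/// Stages of the GPU inference pipeline, in execution order.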
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum GpuPipelineStage {
Embed = 0,
Attention = 1,
FFN = 2,
Output = 3,
}
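/// Records per-stage latencies (in milliseconds) for one pipeline pass;
/// recording a stage again overwrites its previous entry.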
#[derive(Debug)]
pub struct InferencePipeline {
num_stages: usize,
stage_times: std::collections::HashMap<GpuPipelineStage, f32>,
}
impl InferencePipeline {
#[must_use]
pub fn new(num_stages: usize) -> Self {
Self {
num_stages,
stage_times: std::collections::HashMap::new(),
}
}
#[must_use]
pub fn num_stages(&self) -> usize {
self.num_stages
}
pub fn record_stage_time(&mut self, stage: GpuPipelineStage, time_ms: f32) {
self.stage_times.insert(stage, time_ms);
}
#[must_use]
pub fn total_latency(&self) -> f32 {
self.stage_times.values().sum()
}
#[must_use]
pub fn stage_breakdown(&self) -> &std::collections::HashMap<GpuPipelineStage, f32> {
&self.stage_times
}
pub fn reset(&mut self) {
self.stage_times.clear();
}
}
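/// Coarse classification of a runtime error, used to choose a `RecoveryAction`.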
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorClassification {
Transient,
Fatal,
GpuFailure,
}
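/// What to do after a classified failure: retry after a delay, fall back to
/// CPU execution, or give up.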
#[derive(Debug, Clone)]
pub enum RecoveryAction {
    Retry { delay: Duration },
FallbackToCpu,
Fail,
}
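/// Tuning knobs for retry-based recovery: an attempt cap, backoff delay
/// bounds, and a jitter factor.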
pub struct ErrorRecoveryStrategy {
max_retries: u32,
base_delay: Duration,
max_delay: Duration,
jitter: f64,
}
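
// Minimal sanity checks for the quantized kernels above, assuming only the
// block layouts documented on each function (Q8_0: 34 bytes, Q4_0: 18 bytes).
// The values and tolerances here are illustrative, not exhaustive.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn q8_dot_unit_scale_ones() {
        // Two identical blocks: scale 1.0, all 32 values equal to 1.
        // The dot product is 32 * 1.0 * 1.0 = 32.
        let scale = half::f16::from_f32(1.0).to_le_bytes();
        let mut a = vec![scale[0], scale[1]];
        a.extend([1u8; 32]);
        let b = a.clone();
        assert!((quantized_dot_q8(&a, &b) - 32.0).abs() < 1e-3);
    }

    #[test]
    fn q4_matvec_single_block() {
        // One 18-byte row: scale 1.0 followed by sixteen 0x99 bytes. Every
        // nibble decodes to 9 - 8 = 1, so dotting against ones yields 32.
        let scale = half::f16::from_f32(1.0).to_le_bytes();
        let mut weights = vec![scale[0], scale[1]];
        weights.extend([0x99u8; 16]);
        let input = vec![1.0f32; 32];
        let out = quantized_matvec_q4(&weights, &input, 1, 32);
        assert!((out[0] - 32.0).abs() < 1e-3);
    }
}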