use std::fs::File;
use std::path::Path;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use trueno::{Matrix as TruenoMatrix, Vector as TruenoVector};
use crate::apr::MAGIC;
use crate::error::{RealizarError, Result};
/// Magic bytes identifying an APR transformer file: `b"APRT"` in ASCII.
pub const APR_TRANSFORMER_MAGIC: [u8; 4] = [0x41, 0x50, 0x52, 0x54];
/// Current (and highest readable) APR transformer format version.
pub const APR_TRANSFORMER_VERSION: u32 = 1;
/// Fixed size of the APR transformer file header, in bytes.
pub const APR_TRANSFORMER_HEADER_SIZE: usize = 64;
/// A transformer model backed by a memory-mapped APR file.
///
/// Holds the raw mapping plus the configuration decoded from the 64-byte
/// header; tensor bytes are read lazily out of the mapping by offset.
#[derive(Debug)]
pub struct MmapAprTransformer {
    /// Read-only memory map of the entire APR file.
    mmap: Mmap,
    /// Model hyper-parameters parsed from the fixed header.
    pub config: AprTransformerConfig,
    /// Byte offset (from file start) where the tensor data section begins.
    tensor_data_offset: usize,
    /// True when backed by a memory map (always true for this type).
    is_mmap: bool,
}
impl MmapAprTransformer {
    /// Reads a little-endian `u32` at byte offset `at` (caller ensures bounds).
    fn read_u32_le(bytes: &[u8], at: usize) -> u32 {
        u32::from_le_bytes([bytes[at], bytes[at + 1], bytes[at + 2], bytes[at + 3]])
    }

    /// Reads a little-endian `f32` at byte offset `at` (caller ensures bounds).
    fn read_f32_le(bytes: &[u8], at: usize) -> f32 {
        f32::from_le_bytes([bytes[at], bytes[at + 1], bytes[at + 2], bytes[at + 3]])
    }

    /// Opens `path` and memory-maps it as an APR transformer file.
    ///
    /// Validates the minimum size, the 4-byte magic (either the generic APR
    /// magic or the transformer-specific `APRT`), and the format version,
    /// then decodes the fixed 64-byte header into an [`AprTransformerConfig`].
    ///
    /// # Errors
    /// `IoError` if the file cannot be opened or mapped; `FormatError` if the
    /// header is truncated, the magic is unknown, or the version is newer
    /// than `APR_TRANSFORMER_VERSION`.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path.as_ref()).map_err(|e| RealizarError::IoError {
            message: format!("Failed to open APR file: {e}"),
        })?;
        // SAFETY: the mapping is read-only; soundness relies on the file not
        // being truncated or mutated by another process while mapped, which
        // is the documented contract of `memmap2::Mmap::map`.
        let mmap = unsafe {
            Mmap::map(&file).map_err(|e| RealizarError::IoError {
                message: format!("Failed to mmap APR file: {e}"),
            })?
        };
        if mmap.len() < APR_TRANSFORMER_HEADER_SIZE {
            return Err(RealizarError::FormatError {
                reason: format!(
                    "APR file too small: {} bytes (need at least {})",
                    mmap.len(),
                    APR_TRANSFORMER_HEADER_SIZE
                ),
            });
        }
        let header = &mmap[..APR_TRANSFORMER_HEADER_SIZE];
        let magic = &header[0..4];
        if magic != MAGIC && magic != APR_TRANSFORMER_MAGIC {
            return Err(RealizarError::FormatError {
                reason: format!(
                    "Invalid APR magic: expected {:?} or {:?}, got {:?}",
                    MAGIC, APR_TRANSFORMER_MAGIC, magic
                ),
            });
        }
        let version = Self::read_u32_le(header, 4);
        if version > APR_TRANSFORMER_VERSION {
            return Err(RealizarError::FormatError {
                reason: format!("Unsupported APR version: {version}"),
            });
        }
        // Fixed header layout: seven u32 dimensions at bytes 8..36, two f32
        // values at 36..44, then the u32 tensor-data offset at 44..48.
        let config = AprTransformerConfig {
            architecture: "apr".to_string(),
            hidden_dim: Self::read_u32_le(header, 8) as usize,
            num_layers: Self::read_u32_le(header, 12) as usize,
            num_heads: Self::read_u32_le(header, 16) as usize,
            num_kv_heads: Self::read_u32_le(header, 20) as usize,
            vocab_size: Self::read_u32_le(header, 24) as usize,
            intermediate_dim: Self::read_u32_le(header, 28) as usize,
            context_length: Self::read_u32_le(header, 32) as usize,
            rope_theta: Self::read_f32_le(header, 36),
            eps: Self::read_f32_le(header, 40),
        };
        let tensor_data_offset = Self::read_u32_le(header, 44) as usize;
        Ok(Self {
            mmap,
            config,
            tensor_data_offset,
            is_mmap: true,
        })
    }

    /// True when the model is backed by a memory map (always the case here).
    #[must_use]
    pub fn is_mmap(&self) -> bool {
        self.is_mmap
    }

    /// Returns `len` raw bytes starting `offset` bytes into the tensor-data
    /// section.
    ///
    /// # Errors
    /// `FormatError` when the requested range overflows `usize` or extends
    /// past the end of the file.
    pub fn get_tensor_bytes(&self, offset: usize, len: usize) -> Result<&[u8]> {
        // Checked arithmetic: a corrupt or hostile header could otherwise
        // wrap `start + len` in release builds and slip past the bounds
        // check below, yielding an out-of-range (or wrong) slice.
        let oob = || RealizarError::FormatError {
            reason: format!(
                "Tensor access out of bounds: offset={offset}, len={len}, file_size={}",
                self.mmap.len()
            ),
        };
        let start = self.tensor_data_offset.checked_add(offset).ok_or_else(oob)?;
        let end = start.checked_add(len).ok_or_else(oob)?;
        if end > self.mmap.len() {
            return Err(oob());
        }
        Ok(&self.mmap[start..end])
    }

    /// Decodes `num_elements` little-endian f32 values from the tensor-data
    /// section.
    ///
    /// # Errors
    /// `FormatError` when the byte length overflows or the range is out of
    /// bounds.
    pub fn get_tensor_f32(&self, offset: usize, num_elements: usize) -> Result<Vec<f32>> {
        let byte_len = num_elements
            .checked_mul(4)
            .ok_or_else(|| RealizarError::FormatError {
                reason: format!("Tensor length overflow: {num_elements} elements"),
            })?;
        let bytes = self.get_tensor_bytes(offset, byte_len)?;
        let floats: Vec<f32> = bytes
            .chunks_exact(4)
            .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
            .collect();
        Ok(floats)
    }

    /// Total size of the underlying file, in bytes.
    #[must_use]
    pub fn file_size(&self) -> usize {
        self.mmap.len()
    }

    /// Estimated parameter count derived from the header dimensions.
    /// Mirrors `QuantizedAprTransformer::num_parameters`.
    #[must_use]
    pub fn num_parameters(&self) -> usize {
        let hidden = self.config.hidden_dim;
        let vocab = self.config.vocab_size;
        let layers = self.config.num_layers;
        let intermediate = self.config.intermediate_dim;
        // Token embedding + LM head (vocab x hidden each).
        let embed_params = vocab * hidden * 2;
        // Per layer: attn norm + fused QKV + attn output + FFN up + FFN down.
        let layer_params = hidden
            + (hidden * 3 * hidden)
            + (hidden * hidden)
            + (hidden * intermediate)
            + (intermediate * hidden);
        let norm_params = hidden;
        embed_params + (layers * layer_params) + norm_params
    }
}
/// Weight quantization schemes supported by the APR format.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[allow(non_camel_case_types)] // variant names mirror GGML-style type names
pub enum AprQuantizationType {
    /// Unquantized IEEE-754 32-bit floats (the default).
    #[default]
    F32,
    /// 4-bit K-quantization: 256-value blocks, ~4.5 effective bits/weight.
    Q4_K,
    /// 8-bit quantization: 32-value blocks with per-block scale.
    Q8_0,
}
impl AprQuantizationType {
#[must_use]
pub fn bits_per_weight(&self) -> f64 {
match self {
Self::F32 => 32.0,
Self::Q4_K => 4.5, Self::Q8_0 => 8.0, }
}
#[must_use]
pub fn bytes_per_block(&self) -> usize {
match self {
Self::F32 => 4, Self::Q4_K => 144, Self::Q8_0 => 36, }
}
#[must_use]
pub fn values_per_block(&self) -> usize {
match self {
Self::F32 => 1,
Self::Q4_K => 256,
Self::Q8_0 => 32,
}
}
#[must_use]
pub fn to_byte(&self) -> u8 {
match self {
Self::F32 => 0,
Self::Q4_K => 1,
Self::Q8_0 => 2,
}
}
#[must_use]
pub fn from_byte(byte: u8) -> Option<Self> {
match byte {
0 => Some(Self::F32),
1 => Some(Self::Q4_K),
2 => Some(Self::Q8_0),
_ => None,
}
}
}
/// A transformer whose bulk weights are stored in quantized (byte) form.
///
/// Embedding and norm weights stay in f32; per-layer weights and the LM
/// head are opaque quantized byte buffers sized by `AprQuantizationType`.
#[derive(Debug, Clone)]
pub struct QuantizedAprTransformer {
    /// Model hyper-parameters.
    config: AprTransformerConfig,
    /// Quantization scheme used for layer and LM-head weights.
    quant_type: AprQuantizationType,
    /// f32 token embedding table, `vocab_size * hidden_dim` values.
    token_embedding: Vec<f32>,
    /// One quantized weight blob per layer.
    layer_weights: Vec<Vec<u8>>,
    /// f32 final-norm scale, `hidden_dim` values.
    output_norm_weight: Vec<f32>,
    /// Quantized LM-head weight blob.
    lm_head_weight: Vec<u8>,
}
impl QuantizedAprTransformer {
    /// Builds a zero-initialized quantized transformer with buffers sized
    /// for `config` under the given quantization scheme.
    #[must_use]
    pub fn new(config: AprTransformerConfig, quant_type: AprQuantizationType) -> Self {
        let hidden_dim = config.hidden_dim;
        let vocab_size = config.vocab_size;
        let _intermediate_dim = config.intermediate_dim;
        let embed_size = vocab_size * hidden_dim;
        let layer_weight_size = Self::calculate_layer_bytes(&config, quant_type);
        let lm_head_size = Self::calculate_quantized_bytes(hidden_dim * vocab_size, quant_type);
        let layer_weights = (0..config.num_layers)
            .map(|_| vec![0u8; layer_weight_size])
            .collect();
        Self {
            config,
            quant_type,
            token_embedding: vec![0.0; embed_size],
            layer_weights,
            // Norm scale defaults to 1.0 (identity).
            output_norm_weight: vec![1.0; hidden_dim],
            lm_head_weight: vec![0u8; lm_head_size],
        }
    }
    /// Creates a quantized shell from an f32 model's configuration.
    ///
    /// NOTE(review): only the config is carried over — the f32 weights are
    /// NOT quantized or copied; the result has zeroed buffers.
    #[must_use]
    pub fn from_f32_transformer(
        f32_model: &AprTransformer,
        quant_type: AprQuantizationType,
    ) -> Self {
        let config = f32_model.config.clone();
        Self::new(config, quant_type)
    }
    /// The quantization scheme used by this model.
    #[must_use]
    pub fn quantization_type(&self) -> AprQuantizationType {
        self.quant_type
    }
    /// Average bits per weight for the active quantization scheme.
    #[must_use]
    pub fn bits_per_weight(&self) -> f64 {
        self.quant_type.bits_per_weight()
    }
    /// Borrow of the model configuration.
    #[must_use]
    pub fn config(&self) -> &AprTransformerConfig {
        &self.config
    }
    /// Actual bytes used by all weight buffers (f32 parts count 4 bytes each).
    #[must_use]
    pub fn weight_bytes(&self) -> usize {
        let embed_bytes = self.token_embedding.len() * 4;
        let layer_bytes: usize = self.layer_weights.iter().map(std::vec::Vec::len).sum();
        let norm_bytes = self.output_norm_weight.len() * 4;
        let lm_head_bytes = self.lm_head_weight.len();
        embed_bytes + layer_bytes + norm_bytes + lm_head_bytes
    }
    /// Bytes this model would occupy if every parameter were stored as f32.
    #[must_use]
    pub fn f32_equivalent_bytes(&self) -> usize {
        let num_params = self.num_parameters();
        num_params * 4
    }
    /// Parameter count derived from the configuration dimensions.
    /// Mirrors `MmapAprTransformer::num_parameters`.
    #[must_use]
    pub fn num_parameters(&self) -> usize {
        let hidden = self.config.hidden_dim;
        let vocab = self.config.vocab_size;
        let layers = self.config.num_layers;
        let intermediate = self.config.intermediate_dim;
        // Token embedding + LM head (vocab x hidden each).
        let embed_params = vocab * hidden * 2;
        // Per layer: attn norm + fused QKV + attn output + FFN up + FFN down.
        let layer_params = hidden
            + (hidden * 3 * hidden)
            + (hidden * hidden)
            + (hidden * intermediate)
            + (intermediate * hidden);
        let norm_params = hidden;
        embed_params + (layers * layer_params) + norm_params
    }
    /// Quantized byte size of one layer's projection weights (norms excluded).
    fn calculate_layer_bytes(
        config: &AprTransformerConfig,
        quant_type: AprQuantizationType,
    ) -> usize {
        let hidden = config.hidden_dim;
        let intermediate = config.intermediate_dim;
        let weight_elements = (hidden * 3 * hidden)
            + (hidden * hidden)
            + (hidden * intermediate)
            + (intermediate * hidden);
        Self::calculate_quantized_bytes(weight_elements, quant_type)
    }
    /// Bytes needed to store `num_elements` values, rounded up to whole blocks.
    fn calculate_quantized_bytes(num_elements: usize, quant_type: AprQuantizationType) -> usize {
        let values_per_block = quant_type.values_per_block();
        let bytes_per_block = quant_type.bytes_per_block();
        let num_blocks = num_elements.div_ceil(values_per_block);
        num_blocks * bytes_per_block
    }
    /// Forward pass over a full token sequence, returning logits for the
    /// last position.
    ///
    /// NOTE(review): this is a skeleton — the quantized layer weights are
    /// never applied (empty loop below) and the LM head produces zeros, so
    /// the output is currently embedding + final norm only.
    ///
    /// # Errors
    /// `InvalidShape` when `token_ids` is empty.
    pub fn forward(&self, token_ids: &[u32]) -> Result<Vec<f32>> {
        if token_ids.is_empty() {
            return Err(RealizarError::InvalidShape {
                reason: "Token sequence cannot be empty".to_string(),
            });
        }
        let hidden_dim = self.config.hidden_dim;
        let _vocab_size = self.config.vocab_size;
        // Embed: one hidden_dim row per token; out-of-range ids map to zeros.
        let mut hidden = Vec::with_capacity(token_ids.len() * hidden_dim);
        for &token_id in token_ids {
            let offset = (token_id as usize) * hidden_dim;
            if offset + hidden_dim <= self.token_embedding.len() {
                hidden.extend_from_slice(&self.token_embedding[offset..offset + hidden_dim]);
            } else {
                hidden.extend(std::iter::repeat(0.0).take(hidden_dim));
            }
        }
        // Placeholder: layer weights are iterated but not applied.
        for _layer_weights in &self.layer_weights {
        }
        let seq_len = token_ids.len();
        let eps = self.config.eps;
        // Final LayerNorm applied independently per position.
        let mut normed = Vec::with_capacity(hidden.len());
        for s in 0..seq_len {
            let start = s * hidden_dim;
            let slice = &hidden[start..start + hidden_dim];
            let mean: f32 = slice.iter().sum::<f32>() / hidden_dim as f32;
            let variance: f32 =
                slice.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / hidden_dim as f32;
            let std_dev = (variance + eps).sqrt();
            for (i, &x) in slice.iter().enumerate() {
                let normalized = (x - mean) / std_dev;
                normed.push(normalized * self.output_norm_weight[i]);
            }
        }
        // Project only the final position through the LM head.
        let last_hidden_start = (seq_len - 1) * hidden_dim;
        let last_hidden = &normed[last_hidden_start..last_hidden_start + hidden_dim];
        let logits = self.compute_lm_head_logits(last_hidden)?;
        Ok(logits)
    }
    /// LM-head projection for a single hidden vector.
    ///
    /// NOTE(review): placeholder — every quantization branch is empty, so
    /// this always returns all-zero logits.
    fn compute_lm_head_logits(&self, _hidden: &[f32]) -> Result<Vec<f32>> {
        let vocab_size = self.config.vocab_size;
        let _hidden_dim = self.config.hidden_dim;
        let logits = vec![0.0f32; vocab_size];
        match self.quant_type {
            AprQuantizationType::F32 => {
            },
            AprQuantizationType::Q4_K => {
            },
            AprQuantizationType::Q8_0 => {
            },
        }
        Ok(logits)
    }
    /// Serializes the model: 64-byte header (magic, version, seven u32 dims,
    /// two f32s, tensor offset at byte 44, quant tag at byte 48, zero pad),
    /// followed by embedding, layer blobs, norm weights, and LM head.
    ///
    /// # Errors
    /// Currently infallible; returns `Result` for interface stability.
    pub fn to_bytes(&self) -> Result<Vec<u8>> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(&APR_TRANSFORMER_MAGIC);
        bytes.extend_from_slice(&APR_TRANSFORMER_VERSION.to_le_bytes());
        bytes.extend_from_slice(&(self.config.hidden_dim as u32).to_le_bytes());
        bytes.extend_from_slice(&(self.config.num_layers as u32).to_le_bytes());
        bytes.extend_from_slice(&(self.config.num_heads as u32).to_le_bytes());
        bytes.extend_from_slice(&(self.config.num_kv_heads as u32).to_le_bytes());
        bytes.extend_from_slice(&(self.config.vocab_size as u32).to_le_bytes());
        bytes.extend_from_slice(&(self.config.intermediate_dim as u32).to_le_bytes());
        bytes.extend_from_slice(&(self.config.context_length as u32).to_le_bytes());
        bytes.extend_from_slice(&self.config.rope_theta.to_le_bytes());
        bytes.extend_from_slice(&self.config.eps.to_le_bytes());
        let tensor_offset = APR_TRANSFORMER_HEADER_SIZE as u32;
        bytes.extend_from_slice(&tensor_offset.to_le_bytes());
        // Quantization tag lands at byte 48 (4+4+7*4+2*4+4).
        bytes.push(self.quant_type.to_byte());
        // Zero-pad the header to its fixed 64-byte size.
        while bytes.len() < APR_TRANSFORMER_HEADER_SIZE {
            bytes.push(0);
        }
        for &v in &self.token_embedding {
            bytes.extend_from_slice(&v.to_le_bytes());
        }
        for layer in &self.layer_weights {
            bytes.extend_from_slice(layer);
        }
        for &v in &self.output_norm_weight {
            bytes.extend_from_slice(&v.to_le_bytes());
        }
        bytes.extend_from_slice(&self.lm_head_weight);
        Ok(bytes)
    }
    /// Deserializes the header written by [`Self::to_bytes`].
    ///
    /// NOTE(review): the version field (bytes 4..8) is not validated here,
    /// and the tensor payload after the header is ignored — the returned
    /// model has zero-initialized weight buffers.
    ///
    /// # Errors
    /// `FormatError` on short input, bad magic, or an unknown quant tag.
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        if data.len() < APR_TRANSFORMER_HEADER_SIZE {
            return Err(RealizarError::FormatError {
                reason: format!("Data too small: {} bytes", data.len()),
            });
        }
        if data[0..4] != APR_TRANSFORMER_MAGIC {
            return Err(RealizarError::FormatError {
                reason: "Invalid APR magic".to_string(),
            });
        }
        // Header field offsets match to_bytes / MmapAprTransformer::from_file.
        let hidden_dim = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
        let num_layers = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;
        let num_heads = u32::from_le_bytes([data[16], data[17], data[18], data[19]]) as usize;
        let num_kv_heads = u32::from_le_bytes([data[20], data[21], data[22], data[23]]) as usize;
        let vocab_size = u32::from_le_bytes([data[24], data[25], data[26], data[27]]) as usize;
        let intermediate_dim =
            u32::from_le_bytes([data[28], data[29], data[30], data[31]]) as usize;
        let context_length = u32::from_le_bytes([data[32], data[33], data[34], data[35]]) as usize;
        let rope_theta = f32::from_le_bytes([data[36], data[37], data[38], data[39]]);
        let eps = f32::from_le_bytes([data[40], data[41], data[42], data[43]]);
        let quant_type =
            AprQuantizationType::from_byte(data[48]).ok_or_else(|| RealizarError::FormatError {
                reason: format!("Invalid quantization type: {}", data[48]),
            })?;
        let config = AprTransformerConfig {
            architecture: "apr".to_string(),
            hidden_dim,
            num_layers,
            num_heads,
            num_kv_heads,
            vocab_size,
            intermediate_dim,
            context_length,
            rope_theta,
            eps,
        };
        Ok(Self::new(config, quant_type))
    }
    /// Single-token forward pass that also records K/V into `cache`.
    ///
    /// NOTE(review): skeleton implementation — zero K/V vectors are appended
    /// for every layer, no attention or FFN is applied, and the LM head
    /// returns zeros; only embedding + final norm are real.
    ///
    /// # Errors
    /// Propagates errors from the LM-head projection.
    pub fn forward_with_cache(
        &self,
        token_id: u32,
        cache: &mut AprKVCache,
        _position: usize,
    ) -> Result<Vec<f32>> {
        let hidden_dim = self.config.hidden_dim;
        let num_heads = self.config.num_heads;
        let num_kv_heads = self.config.num_kv_heads;
        let head_dim = hidden_dim / num_heads;
        // Embed the single token; out-of-range ids map to zeros.
        let mut hidden = Vec::with_capacity(hidden_dim);
        let offset = (token_id as usize) * hidden_dim;
        if offset + hidden_dim <= self.token_embedding.len() {
            hidden.extend_from_slice(&self.token_embedding[offset..offset + hidden_dim]);
        } else {
            hidden.extend(std::iter::repeat(0.0).take(hidden_dim));
        }
        // Placeholder: append zero K/V per layer so the cache advances.
        for layer_idx in 0..self.config.num_layers {
            let kv_size = num_kv_heads * head_dim;
            let k = vec![0.0f32; kv_size];
            let v = vec![0.0f32; kv_size];
            cache.append(layer_idx, &k, &v);
        }
        // Final LayerNorm over the single position.
        let eps = self.config.eps;
        let mean: f32 = hidden.iter().sum::<f32>() / hidden_dim as f32;
        let variance: f32 =
            hidden.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / hidden_dim as f32;
        let std_dev = (variance + eps).sqrt();
        let mut normed = Vec::with_capacity(hidden_dim);
        for (i, &x) in hidden.iter().enumerate() {
            let normalized = (x - mean) / std_dev;
            normed.push(normalized * self.output_norm_weight[i]);
        }
        let logits = self.compute_lm_head_logits(&normed)?;
        Ok(logits)
    }
}
/// Pre-allocated key/value cache for incremental (token-by-token) decoding.
///
/// Storage layout per layer: `capacity` slots of `num_kv_heads * head_dim`
/// f32 values, flattened position-major.
#[derive(Debug, Clone)]
pub struct AprKVCache {
    /// Number of transformer layers cached.
    num_layers: usize,
    /// KV heads per layer.
    num_kv_heads: usize,
    /// Dimension per head (`hidden_dim / num_heads`).
    head_dim: usize,
    /// Maximum number of token positions (context length).
    capacity: usize,
    /// Number of positions currently stored.
    len: usize,
    /// Per-layer key storage, each `capacity * num_kv_heads * head_dim` long.
    k_cache: Vec<Vec<f32>>,
    /// Per-layer value storage, same shape as `k_cache`.
    v_cache: Vec<Vec<f32>>,
}
impl AprKVCache {
    /// Allocates a cache sized for `config.context_length` positions across
    /// all layers; head dim is derived as `hidden_dim / num_heads`.
    #[must_use]
    pub fn new(config: &AprTransformerConfig) -> Self {
        let num_layers = config.num_layers;
        let num_kv_heads = config.num_kv_heads;
        let head_dim = config.hidden_dim / config.num_heads;
        let capacity = config.context_length;
        let kv_size = capacity * num_kv_heads * head_dim;
        let k_cache = (0..num_layers).map(|_| vec![0.0f32; kv_size]).collect();
        let v_cache = (0..num_layers).map(|_| vec![0.0f32; kv_size]).collect();
        Self {
            num_layers,
            num_kv_heads,
            head_dim,
            capacity,
            len: 0,
            k_cache,
            v_cache,
        }
    }
    /// Number of token positions currently stored.
    #[must_use]
    pub fn len(&self) -> usize {
        self.len
    }
    /// True when no positions have been appended yet.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
    /// Maximum number of positions the cache can hold.
    #[must_use]
    pub fn capacity(&self) -> usize {
        self.capacity
    }
    /// Appends one position's K/V vectors for `layer`.
    ///
    /// Callers store a token by appending layer 0 first, then each remaining
    /// layer. Layer 0 opens a new slot and advances `len`; the other layers
    /// must write into the slot layer 0 just opened.
    ///
    /// Bug fix: the slot was previously always `self.len`, so after layer 0
    /// advanced `len`, layers >= 1 wrote one slot too far and `get` never
    /// included their freshly appended entry.
    ///
    /// # Panics
    /// If `layer` is out of range, the cache is full, or a non-zero layer is
    /// appended before layer 0 has stored the current position.
    pub fn append(&mut self, layer: usize, k: &[f32], v: &[f32]) {
        assert!(layer < self.num_layers, "Layer index out of bounds");
        let slot = if layer == 0 {
            assert!(self.len < self.capacity, "KV cache is full");
            self.len += 1;
            self.len - 1
        } else {
            assert!(self.len > 0, "append for layer > 0 before layer 0");
            self.len - 1
        };
        let kv_size = self.num_kv_heads * self.head_dim;
        let offset = slot * kv_size;
        self.k_cache[layer][offset..offset + kv_size].copy_from_slice(k);
        self.v_cache[layer][offset..offset + kv_size].copy_from_slice(v);
    }
    /// Returns the used portions of the K and V buffers for `layer`.
    #[must_use]
    pub fn get(&self, layer: usize) -> (&[f32], &[f32]) {
        let kv_size = self.num_kv_heads * self.head_dim;
        let used_size = self.len * kv_size;
        (
            &self.k_cache[layer][..used_size],
            &self.v_cache[layer][..used_size],
        )
    }
    /// Logically empties the cache; old data is overwritten on reuse.
    pub fn clear(&mut self) {
        self.len = 0;
    }
}
/// Sampling parameters for autoregressive generation.
#[derive(Debug, Clone)]
pub struct GenerateConfig {
    /// Maximum number of new tokens to produce.
    pub max_tokens: usize,
    /// Softmax temperature; 1.0 leaves logits unscaled.
    pub temperature: f32,
    /// Nucleus (top-p) sampling threshold.
    pub top_p: f32,
    /// Top-k cutoff; 0 disables top-k filtering.
    pub top_k: usize,
    /// Penalty applied to already-generated tokens; 1.0 disables it.
    pub repetition_penalty: f32,
}
impl Default for GenerateConfig {
fn default() -> Self {
Self {
max_tokens: 32,
temperature: 1.0,
top_p: 0.9,
top_k: 0,
repetition_penalty: 1.0,
}
}
}
/// Hyper-parameters describing an APR transformer model.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct AprTransformerConfig {
    /// Architecture label (e.g. "apr"; "unknown" when unspecified).
    pub architecture: String,
    /// Model (embedding) dimension.
    pub hidden_dim: usize,
    /// Number of transformer layers.
    pub num_layers: usize,
    /// Number of attention (query) heads.
    pub num_heads: usize,
    /// Number of key/value heads (< num_heads implies grouped-query attention).
    pub num_kv_heads: usize,
    /// Vocabulary size.
    pub vocab_size: usize,
    /// FFN intermediate dimension.
    pub intermediate_dim: usize,
    /// Maximum context length in tokens.
    pub context_length: usize,
    /// RoPE base frequency.
    pub rope_theta: f32,
    /// Numerical-stability epsilon used by layer norm.
    pub eps: f32,
}
impl Default for AprTransformerConfig {
fn default() -> Self {
Self {
architecture: "unknown".to_string(),
hidden_dim: 512,
num_layers: 6,
num_heads: 8,
num_kv_heads: 8,
vocab_size: 32000,
intermediate_dim: 2048,
context_length: 2048,
rope_theta: 10000.0,
eps: 1e-5,
}
}
}
/// Weights for one transformer layer; optional tensors are `None` for
/// architectures that omit them (biases, gate projection, post-attn norm).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AprTransformerLayer {
    /// Pre-attention norm scale, `hidden_dim` values.
    pub attn_norm_weight: Vec<f32>,
    /// Optional pre-attention norm bias.
    pub attn_norm_bias: Option<Vec<f32>>,
    /// Fused Q/K/V projection, `hidden_dim * 3 * hidden_dim` values.
    pub qkv_weight: Vec<f32>,
    /// Optional fused QKV bias.
    pub qkv_bias: Option<Vec<f32>>,
    /// Attention output projection, `hidden_dim * hidden_dim` values.
    pub attn_output_weight: Vec<f32>,
    /// Optional attention output bias.
    pub attn_output_bias: Option<Vec<f32>>,
    /// Optional gate projection (SwiGLU-style FFNs).
    pub ffn_gate_weight: Option<Vec<f32>>,
    /// Optional gate bias.
    pub ffn_gate_bias: Option<Vec<f32>>,
    /// FFN up projection, `hidden_dim * intermediate_dim` values.
    pub ffn_up_weight: Vec<f32>,
    /// Optional FFN up bias.
    pub ffn_up_bias: Option<Vec<f32>>,
    /// FFN down projection, `intermediate_dim * hidden_dim` values.
    pub ffn_down_weight: Vec<f32>,
    /// Optional FFN down bias.
    pub ffn_down_bias: Option<Vec<f32>>,
    /// Optional post-attention (pre-FFN) norm scale.
    pub ffn_norm_weight: Option<Vec<f32>>,
    /// Optional post-attention norm bias.
    pub ffn_norm_bias: Option<Vec<f32>>,
}
impl AprTransformerLayer {
    /// Builds a layer with zeroed projections, unit norm scales, and no
    /// optional (bias / gate / post-attention-norm) tensors.
    pub fn empty(hidden_dim: usize, intermediate_dim: usize) -> Self {
        let square = hidden_dim * hidden_dim;
        let ffn = hidden_dim * intermediate_dim;
        Self {
            attn_norm_weight: vec![1.0; hidden_dim],
            attn_norm_bias: None,
            qkv_weight: vec![0.0; 3 * square],
            qkv_bias: None,
            attn_output_weight: vec![0.0; square],
            attn_output_bias: None,
            ffn_gate_weight: None,
            ffn_gate_bias: None,
            ffn_up_weight: vec![0.0; ffn],
            ffn_up_bias: None,
            ffn_down_weight: vec![0.0; ffn],
            ffn_down_bias: None,
            ffn_norm_weight: None,
            ffn_norm_bias: None,
        }
    }
    /// Total number of stored parameter values, counting optional tensors
    /// only when present.
    #[must_use]
    pub fn num_parameters(&self) -> usize {
        // Length of an optional tensor, or 0 when absent.
        let opt_len = |t: &Option<Vec<f32>>| t.as_ref().map_or(0, Vec::len);
        self.attn_norm_weight.len()
            + self.qkv_weight.len()
            + self.attn_output_weight.len()
            + self.ffn_up_weight.len()
            + self.ffn_down_weight.len()
            + opt_len(&self.attn_norm_bias)
            + opt_len(&self.qkv_bias)
            + opt_len(&self.attn_output_bias)
            + opt_len(&self.ffn_gate_weight)
            + opt_len(&self.ffn_gate_bias)
            + opt_len(&self.ffn_up_bias)
            + opt_len(&self.ffn_down_bias)
            + opt_len(&self.ffn_norm_weight)
            + opt_len(&self.ffn_norm_bias)
    }
}
/// Full-precision (f32) APR transformer model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AprTransformer {
    /// Model hyper-parameters.
    pub config: AprTransformerConfig,
    /// Token embedding table, `vocab_size * hidden_dim` values.
    pub token_embedding: Vec<f32>,
    /// Per-layer weights.
    pub layers: Vec<AprTransformerLayer>,
    /// Final norm scale, `hidden_dim` values.
    pub output_norm_weight: Vec<f32>,
    /// Optional final norm bias.
    pub output_norm_bias: Option<Vec<f32>>,
    /// LM-head projection, `hidden_dim * vocab_size` values.
    pub lm_head_weight: Vec<f32>,
    /// Optional LM-head bias.
    pub lm_head_bias: Option<Vec<f32>>,
}
impl AprTransformer {
pub fn from_apr_file<P: AsRef<Path>>(path: P) -> Result<Self> {
use std::io::Read;
let mut file = File::open(path.as_ref()).map_err(|e| RealizarError::IoError {
message: format!("Failed to open APR file: {e}"),
})?;
let mut data = Vec::new();
file.read_to_end(&mut data)
.map_err(|e| RealizarError::IoError {
message: format!("Failed to read APR file: {e}"),
})?;
Self::from_apr_bytes(&data)
}
    /// Parses a transformer from the `APR2`/`APRN` container format.
    ///
    /// Header layout (little-endian): 4-byte magic, u32 tensor count at
    /// byte 8, u64 metadata offset at 12, u32 metadata size at 20, u64
    /// tensor-index offset at 24, u64 data offset at 32. Metadata is a JSON
    /// blob; missing keys fall back to small defaults. Missing tensors fall
    /// back to zeros (projections) or ones (norm scales).
    ///
    /// # Errors
    /// `FormatError` when the buffer is under 64 bytes, the magic is
    /// unknown, or the metadata range lies outside the buffer.
    pub fn from_apr_bytes(data: &[u8]) -> Result<Self> {
        if data.len() < 64 {
            return Err(RealizarError::FormatError {
                reason: format!("APR file too small: {} bytes (need 64)", data.len()),
            });
        }
        let magic = &data[0..4];
        if magic != b"APR2" && magic != b"APRN" {
            return Err(RealizarError::FormatError {
                reason: format!(
                    "Invalid APR magic: {:?}, expected APR2 or APRN",
                    String::from_utf8_lossy(magic)
                ),
            });
        }
        let tensor_count = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize;
        let metadata_offset = u64::from_le_bytes([
            data[12], data[13], data[14], data[15], data[16], data[17], data[18], data[19],
        ]) as usize;
        let metadata_size = u32::from_le_bytes([data[20], data[21], data[22], data[23]]) as usize;
        let tensor_index_offset = u64::from_le_bytes([
            data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
        ]) as usize;
        let data_offset = u64::from_le_bytes([
            data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
        ]) as usize;
        let metadata_end = metadata_offset + metadata_size;
        if metadata_end > data.len() {
            return Err(RealizarError::FormatError {
                reason: "Metadata extends beyond file".to_string(),
            });
        }
        let metadata_json = &data[metadata_offset..metadata_end];
        // NOTE(review): invalid JSON silently degrades to Null, so every
        // hyper-parameter below falls back to its default.
        let metadata: serde_json::Value = serde_json::from_slice(metadata_json).unwrap_or_default();
        // Each key is looked up under its HF name first, then the short name.
        let hidden_dim = metadata
            .get("hidden_size")
            .or_else(|| metadata.get("hidden_dim"))
            .and_then(serde_json::Value::as_u64)
            .unwrap_or(64) as usize;
        let num_layers = metadata
            .get("num_hidden_layers")
            .or_else(|| metadata.get("num_layers"))
            .and_then(serde_json::Value::as_u64)
            .unwrap_or(1) as usize;
        let num_heads = metadata
            .get("num_attention_heads")
            .or_else(|| metadata.get("num_heads"))
            .and_then(serde_json::Value::as_u64)
            .unwrap_or(4) as usize;
        let vocab_size = metadata
            .get("vocab_size")
            .and_then(serde_json::Value::as_u64)
            .unwrap_or(32000) as usize;
        let intermediate_dim = metadata
            .get("intermediate_size")
            .or_else(|| metadata.get("intermediate_dim"))
            .and_then(serde_json::Value::as_u64)
            .unwrap_or((hidden_dim * 4) as u64) as usize;
        let config = AprTransformerConfig {
            hidden_dim,
            num_layers,
            num_heads,
            // No kv-head metadata key is read: assume plain MHA.
            num_kv_heads: num_heads,
            vocab_size,
            intermediate_dim,
            context_length: 2048,
            ..Default::default()
        };
        // Tensor index entry: u16 name_len, name bytes, u8 dtype, u8 ndim,
        // ndim x u64 dims, u64 offset (relative to data_offset), u64 size.
        let mut tensors: std::collections::BTreeMap<String, (usize, usize, Vec<usize>)> =
            std::collections::BTreeMap::new();
        let mut pos = tensor_index_offset;
        for _ in 0..tensor_count {
            if pos + 4 > data.len() {
                break;
            }
            let name_len = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
            pos += 2;
            // 18 = dtype(1) + ndim(1) + offset(8) + size(8).
            if pos + name_len + 18 > data.len() {
                break;
            }
            let name = String::from_utf8_lossy(&data[pos..pos + name_len]).to_string();
            pos += name_len;
            let _dtype = data[pos];
            pos += 1;
            let ndim = data[pos] as usize;
            pos += 1;
            let mut dims = Vec::with_capacity(ndim);
            for _ in 0..ndim {
                if pos + 8 > data.len() {
                    break;
                }
                let dim = u64::from_le_bytes([
                    data[pos],
                    data[pos + 1],
                    data[pos + 2],
                    data[pos + 3],
                    data[pos + 4],
                    data[pos + 5],
                    data[pos + 6],
                    data[pos + 7],
                ]) as usize;
                dims.push(dim);
                pos += 8;
            }
            if pos + 16 > data.len() {
                break;
            }
            let offset = u64::from_le_bytes([
                data[pos],
                data[pos + 1],
                data[pos + 2],
                data[pos + 3],
                data[pos + 4],
                data[pos + 5],
                data[pos + 6],
                data[pos + 7],
            ]) as usize;
            pos += 8;
            let size = u64::from_le_bytes([
                data[pos],
                data[pos + 1],
                data[pos + 2],
                data[pos + 3],
                data[pos + 4],
                data[pos + 5],
                data[pos + 6],
                data[pos + 7],
            ]) as usize;
            pos += 8;
            tensors.insert(name, (data_offset + offset, size, dims));
        }
        // Decodes a named tensor as little-endian f32s.
        // NOTE(review): an out-of-range payload yields Some(empty vec), not
        // None, so the `.or_else` fallbacks below do not fire for it.
        let get_f32_tensor = |name: &str| -> Option<Vec<f32>> {
            tensors.get(name).map(|(offset, size, _)| {
                let end = offset + size;
                if end > data.len() {
                    return Vec::new();
                }
                data[*offset..end]
                    .chunks_exact(4)
                    .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
                    .collect()
            })
        };
        // Global tensors, tried under HF then GGUF-style names.
        let token_embedding = get_f32_tensor("model.embed_tokens.weight")
            .or_else(|| get_f32_tensor("token_embd.weight"))
            .or_else(|| get_f32_tensor("tok_embeddings.weight"))
            .unwrap_or_else(|| vec![0.0; vocab_size * hidden_dim]);
        let output_norm_weight = get_f32_tensor("model.norm.weight")
            .or_else(|| get_f32_tensor("output_norm.weight"))
            .unwrap_or_else(|| vec![1.0; hidden_dim]);
        let lm_head_weight = get_f32_tensor("lm_head.weight")
            .or_else(|| get_f32_tensor("output.weight"))
            .unwrap_or_else(|| vec![0.0; hidden_dim * vocab_size]);
        let mut layers = Vec::with_capacity(num_layers);
        for i in 0..num_layers {
            let prefix = format!("model.layers.{i}");
            let qkv_dim = 3 * hidden_dim;
            // Prefer a pre-fused QKV tensor; otherwise fuse separate Q/K/V
            // row-by-row so input row r holds [q_row, k_row, v_row] — this
            // matches matmul's [in_dim x out_dim] layout with outputs
            // ordered Q (0..h), K (h..2h), V (2h..3h).
            // Assumes q/k/v are square [hidden x hidden] — TODO confirm for
            // grouped-query models where K/V are smaller.
            let qkv_weight =
                if let Some(qkv) = get_f32_tensor(&format!("{prefix}.self_attn.qkv_proj.weight")) {
                    qkv
                } else {
                    let q = get_f32_tensor(&format!("{prefix}.self_attn.q_proj.weight"))
                        .unwrap_or_else(|| vec![0.0; hidden_dim * hidden_dim]);
                    let k = get_f32_tensor(&format!("{prefix}.self_attn.k_proj.weight"))
                        .unwrap_or_else(|| vec![0.0; hidden_dim * hidden_dim]);
                    let v = get_f32_tensor(&format!("{prefix}.self_attn.v_proj.weight"))
                        .unwrap_or_else(|| vec![0.0; hidden_dim * hidden_dim]);
                    let mut qkv = Vec::with_capacity(hidden_dim * qkv_dim);
                    for row in 0..hidden_dim {
                        let row_start = row * hidden_dim;
                        qkv.extend_from_slice(&q[row_start..row_start + hidden_dim]);
                        qkv.extend_from_slice(&k[row_start..row_start + hidden_dim]);
                        qkv.extend_from_slice(&v[row_start..row_start + hidden_dim]);
                    }
                    qkv
                };
            let attn_output = get_f32_tensor(&format!("{prefix}.self_attn.o_proj.weight"))
                .unwrap_or_else(|| vec![0.0; hidden_dim * hidden_dim]);
            let attn_norm = get_f32_tensor(&format!("{prefix}.input_layernorm.weight"))
                .unwrap_or_else(|| vec![1.0; hidden_dim]);
            let ffn_norm = get_f32_tensor(&format!("{prefix}.post_attention_layernorm.weight"));
            let ffn_gate = get_f32_tensor(&format!("{prefix}.mlp.gate_proj.weight"));
            let ffn_up = get_f32_tensor(&format!("{prefix}.mlp.up_proj.weight"))
                .unwrap_or_else(|| vec![0.0; hidden_dim * intermediate_dim]);
            let ffn_down = get_f32_tensor(&format!("{prefix}.mlp.down_proj.weight"))
                .unwrap_or_else(|| vec![0.0; intermediate_dim * hidden_dim]);
            layers.push(AprTransformerLayer {
                attn_norm_weight: attn_norm,
                attn_norm_bias: None,
                qkv_weight,
                qkv_bias: None,
                attn_output_weight: attn_output,
                attn_output_bias: None,
                ffn_gate_weight: ffn_gate,
                ffn_gate_bias: None,
                ffn_up_weight: ffn_up,
                ffn_up_bias: None,
                ffn_down_weight: ffn_down,
                ffn_down_bias: None,
                ffn_norm_weight: ffn_norm,
                ffn_norm_bias: None,
            });
        }
        Ok(Self {
            config,
            token_embedding,
            layers,
            output_norm_weight,
            output_norm_bias: None,
            lm_head_weight,
            lm_head_bias: None,
        })
    }
pub fn new(config: AprTransformerConfig) -> Self {
let hidden_dim = config.hidden_dim;
let vocab_size = config.vocab_size;
let intermediate_dim = config.intermediate_dim;
let layers = (0..config.num_layers)
.map(|_| AprTransformerLayer::empty(hidden_dim, intermediate_dim))
.collect();
Self {
config,
token_embedding: vec![0.0; vocab_size * hidden_dim],
layers,
output_norm_weight: vec![1.0; hidden_dim],
output_norm_bias: None,
lm_head_weight: vec![0.0; hidden_dim * vocab_size],
lm_head_bias: None,
}
}
    /// Returns a borrow of the model's hyper-parameter configuration.
    #[must_use]
    pub fn config(&self) -> &AprTransformerConfig {
        &self.config
    }
pub fn generate(&self, prompt: &[u32], max_tokens: usize) -> Result<Vec<u32>> {
let mut tokens = prompt.to_vec();
for _ in 0..max_tokens {
let logits = self.forward(&tokens)?;
let next_token = logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(idx, _)| idx as u32);
tokens.push(next_token);
if next_token == 2 {
break;
}
}
Ok(tokens)
}
#[must_use]
pub fn num_parameters(&self) -> usize {
let mut count = 0;
count += self.token_embedding.len();
for layer in &self.layers {
count += layer.num_parameters();
}
count += self.output_norm_weight.len();
count += self.output_norm_bias.as_ref().map_or(0, Vec::len);
count += self.lm_head_weight.len();
count += self.lm_head_bias.as_ref().map_or(0, Vec::len);
count
}
#[must_use]
pub fn memory_size(&self) -> usize {
self.num_parameters() * 4
}
#[must_use]
pub fn embed(&self, token_ids: &[u32]) -> Vec<f32> {
let hidden_dim = self.config.hidden_dim;
let mut embeddings = Vec::with_capacity(token_ids.len() * hidden_dim);
for &token_id in token_ids {
let offset = (token_id as usize) * hidden_dim;
if offset + hidden_dim <= self.token_embedding.len() {
embeddings.extend_from_slice(&self.token_embedding[offset..offset + hidden_dim]);
} else {
embeddings.extend(std::iter::repeat(0.0).take(hidden_dim));
}
}
embeddings
}
fn layer_norm(
&self,
input: &[f32],
weight: &[f32],
bias: Option<&[f32]>,
eps: f32,
) -> Vec<f32> {
let hidden_dim = self.config.hidden_dim;
let seq_len = input.len() / hidden_dim;
let mut output = Vec::with_capacity(input.len());
for s in 0..seq_len {
let start = s * hidden_dim;
let slice = &input[start..start + hidden_dim];
let mean: f32 = slice.iter().sum::<f32>() / hidden_dim as f32;
let variance: f32 =
slice.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / hidden_dim as f32;
let std_dev = (variance + eps).sqrt();
for (i, &x) in slice.iter().enumerate() {
let normalized = (x - mean) / std_dev;
let scaled = normalized * weight[i];
let shifted = if let Some(b) = bias {
scaled + b[i]
} else {
scaled
};
output.push(shifted);
}
}
output
}
    /// Row-major matrix multiply: `input` is `[seq_len x in_dim]`, `weight`
    /// is `[in_dim x out_dim]` (indexed `i * out_dim + o`); returns
    /// `[seq_len x out_dim]`.
    ///
    /// Uses trueno's `matvec` per sequence position; the weight is
    /// transposed up front because `matvec` expects `[out_dim x in_dim]`.
    /// Falls back to scalar multiplication when trueno rejects the shapes.
    #[allow(clippy::unused_self)]
    fn matmul(&self, input: &[f32], weight: &[f32], in_dim: usize, out_dim: usize) -> Vec<f32> {
        let seq_len = input.len() / in_dim;
        // Transpose [in_dim x out_dim] -> [out_dim x in_dim] for matvec.
        let mut weight_transposed = vec![0.0f32; out_dim * in_dim];
        for i in 0..in_dim {
            for o in 0..out_dim {
                weight_transposed[o * in_dim + i] = weight[i * out_dim + o];
            }
        }
        let weight_matrix = match TruenoMatrix::from_vec(out_dim, in_dim, weight_transposed) {
            Ok(m) => m,
            Err(_) => {
                // Shape rejected outright: use the plain scalar path.
                return self.matmul_scalar(input, weight, in_dim, out_dim);
            }
        };
        let mut output = Vec::with_capacity(seq_len * out_dim);
        for s in 0..seq_len {
            let input_start = s * in_dim;
            let input_slice = &input[input_start..input_start + in_dim];
            let x_vec = TruenoVector::from_slice(input_slice);
            match weight_matrix.matvec(&x_vec) {
                Ok(r) => output.extend_from_slice(r.as_slice()),
                Err(_) => {
                    // Per-row scalar fallback; rows already computed via
                    // matvec above are kept as-is.
                    for o in 0..out_dim {
                        let mut sum = 0.0;
                        for (i, &input_val) in input_slice.iter().enumerate() {
                            let weight_idx = i * out_dim + o;
                            if weight_idx < weight.len() {
                                sum += input_val * weight[weight_idx];
                            }
                        }
                        output.push(sum);
                    }
                }
            }
        }
        output
    }
#[allow(clippy::unused_self)]
fn matmul_scalar(&self, input: &[f32], weight: &[f32], in_dim: usize, out_dim: usize) -> Vec<f32> {
let seq_len = input.len() / in_dim;
let mut output = Vec::with_capacity(seq_len * out_dim);
for s in 0..seq_len {
let input_start = s * in_dim;
let input_slice = &input[input_start..input_start + in_dim];
for o in 0..out_dim {
let mut sum = 0.0;
for (i, &input_val) in input_slice.iter().enumerate() {
let weight_idx = i * out_dim + o;
if weight_idx < weight.len() {
sum += input_val * weight[weight_idx];
}
}
output.push(sum);
}
}
output
}
#[allow(clippy::unused_self)]
fn add_bias(&self, data: &mut [f32], bias: &[f32]) {
let dim = bias.len();
for (i, val) in data.iter_mut().enumerate() {
*val += bias[i % dim];
}
}
#[allow(clippy::unused_self)]
fn gelu(&self, data: &mut [f32]) {
const SQRT_2_OVER_PI: f32 = 0.797_884_6;
const GELU_COEFF: f32 = 0.044_715;
for x in data.iter_mut() {
let x3 = *x * *x * *x;
let inner = SQRT_2_OVER_PI * (*x + GELU_COEFF * x3);
*x = 0.5 * *x * (1.0 + inner.tanh());
}
}
    /// Full-sequence forward pass returning the last position's logits.
    ///
    /// NOTE(review): the "attention" step below only copies the Q slice of
    /// the fused QKV output — no score/softmax/value mixing happens — and
    /// `ffn_gate_weight` / `ffn_norm_weight` are never used, so this is a
    /// simplified pipeline: norm -> QKV (Q passthrough) -> output proj ->
    /// residual -> FFN (up, GELU, down) -> residual, then final norm + LM head.
    ///
    /// # Errors
    /// `InvalidShape` when `token_ids` is empty.
    pub fn forward(&self, token_ids: &[u32]) -> Result<Vec<f32>> {
        if token_ids.is_empty() {
            return Err(RealizarError::InvalidShape {
                reason: "Token sequence cannot be empty".to_string(),
            });
        }
        let hidden_dim = self.config.hidden_dim;
        let intermediate_dim = self.config.intermediate_dim;
        let mut hidden = self.embed(token_ids);
        for layer in &self.layers {
            // Pre-attention norm.
            let normed = self.layer_norm(
                &hidden,
                &layer.attn_norm_weight,
                layer.attn_norm_bias.as_deref(),
                self.config.eps,
            );
            let qkv_dim = 3 * hidden_dim;
            let mut qkv = self.matmul(&normed, &layer.qkv_weight, hidden_dim, qkv_dim);
            if let Some(ref bias) = layer.qkv_bias {
                self.add_bias(&mut qkv, bias);
            }
            let seq_len = token_ids.len();
            // Placeholder attention: take only the Q portion (first
            // hidden_dim values) of each position's fused QKV row.
            let mut attn_out = Vec::with_capacity(seq_len * hidden_dim);
            for s in 0..seq_len {
                let qkv_start = s * qkv_dim;
                for h in 0..hidden_dim {
                    attn_out.push(qkv[qkv_start + h]);
                }
            }
            let mut attn_output =
                self.matmul(&attn_out, &layer.attn_output_weight, hidden_dim, hidden_dim);
            if let Some(ref bias) = layer.attn_output_bias {
                self.add_bias(&mut attn_output, bias);
            }
            // Residual connection around attention.
            for i in 0..hidden.len() {
                hidden[i] += attn_output[i];
            }
            // FFN: up-projection, GELU, down-projection.
            let mut ffn_hidden =
                self.matmul(&hidden, &layer.ffn_up_weight, hidden_dim, intermediate_dim);
            if let Some(ref bias) = layer.ffn_up_bias {
                self.add_bias(&mut ffn_hidden, bias);
            }
            self.gelu(&mut ffn_hidden);
            let mut ffn_output = self.matmul(
                &ffn_hidden,
                &layer.ffn_down_weight,
                intermediate_dim,
                hidden_dim,
            );
            if let Some(ref bias) = layer.ffn_down_bias {
                self.add_bias(&mut ffn_output, bias);
            }
            // Residual connection around the FFN.
            for i in 0..hidden.len() {
                hidden[i] += ffn_output[i];
            }
        }
        // Final norm, then project only the last position's hidden state.
        let normed = self.layer_norm(
            &hidden,
            &self.output_norm_weight,
            self.output_norm_bias.as_deref(),
            self.config.eps,
        );
        let seq_len = token_ids.len();
        let last_hidden_start = (seq_len - 1) * hidden_dim;
        let last_hidden = &normed[last_hidden_start..last_hidden_start + hidden_dim];
        let mut logits = self.matmul(
            last_hidden,
            &self.lm_head_weight,
            hidden_dim,
            self.config.vocab_size,
        );
        if let Some(ref bias) = self.lm_head_bias {
            self.add_bias(&mut logits, bias);
        }
        Ok(logits)
    }
/// Greedy next-token prediction: runs a full forward pass over `token_ids`
/// and returns the index of the largest logit. Ties resolve to the LAST
/// maximal index; NaN comparisons are treated as equal.
///
/// # Errors
/// Propagates errors from `forward`, or returns `InvalidShape` if the
/// logits vector is somehow empty.
pub fn predict_next(&self, token_ids: &[u32]) -> Result<u32> {
    let logits = self.forward(token_ids)?;
    // Manual reduce with the same semantics as `Iterator::max_by` using a
    // partial_cmp-with-Equal fallback: keep the accumulator only when it is
    // strictly greater, otherwise advance to the current element.
    let best = logits.iter().copied().enumerate().reduce(|acc, cur| {
        match acc.1.partial_cmp(&cur.1).unwrap_or(std::cmp::Ordering::Equal) {
            std::cmp::Ordering::Greater => acc,
            _ => cur,
        }
    });
    let (max_idx, _) = best.ok_or_else(|| RealizarError::InvalidShape {
        reason: "Empty logits".to_string(),
    })?;
    Ok(max_idx as u32)
}
/// Incremental (single-token) forward pass using the per-layer KV cache.
///
/// Appends this position's K/V projections to `cache`, attends over all
/// cached positions, and returns vocabulary logits for `token_id`.
/// `position` is the absolute index of `token_id` within the sequence.
///
/// NOTE(review): no rotary embedding is applied to Q/K here even though the
/// config carries `rope_theta` — confirm whether RoPE is intended for this
/// model format.
///
/// # Errors
/// Currently only propagates errors from callees.
pub fn forward_with_cache(
    &self,
    token_id: u32,
    cache: &mut AprKVCache,
    position: usize,
) -> Result<Vec<f32>> {
    let hidden_dim = self.config.hidden_dim;
    let num_heads = self.config.num_heads;
    let num_kv_heads = self.config.num_kv_heads;
    // Per-head width; assumes hidden_dim is divisible by num_heads.
    let head_dim = hidden_dim / num_heads;
    // Hidden state for the single input token: [hidden_dim].
    let mut hidden = self.embed(&[token_id]);
    for (layer_idx, layer) in self.layers.iter().enumerate() {
        // Pre-attention layer norm.
        let normed = self.layer_norm(
            &hidden,
            &layer.attn_norm_weight,
            layer.attn_norm_bias.as_deref(),
            self.config.eps,
        );
        // Fused QKV projection for this single position.
        let qkv_dim = 3 * hidden_dim;
        let mut qkv = self.matmul(&normed, &layer.qkv_weight, hidden_dim, qkv_dim);
        if let Some(ref bias) = layer.qkv_bias {
            self.add_bias(&mut qkv, bias);
        }
        let q = &qkv[0..hidden_dim];
        let k = &qkv[hidden_dim..2 * hidden_dim];
        let v = &qkv[2 * hidden_dim..3 * hidden_dim];
        // Grouped-query attention: only num_kv_heads * head_dim floats of
        // K/V are stored per position.
        let kv_size = num_kv_heads * head_dim;
        cache.append(layer_idx, &k[0..kv_size], &v[0..kv_size]);
        let (k_cache, v_cache) = cache.get(layer_idx);
        let seq_len = cache.len();
        let mut attn_out = vec![0.0f32; hidden_dim];
        for h in 0..num_heads {
            // Map each query head to the KV head it shares (GQA).
            let kv_head = h * num_kv_heads / num_heads;
            let q_start = h * head_dim;
            let q_slice = &q[q_start..q_start + head_dim];
            // Scaled dot-product scores against every cached position.
            let mut scores = Vec::with_capacity(seq_len);
            for pos in 0..seq_len {
                let k_start = pos * kv_size + kv_head * head_dim;
                let k_slice = &k_cache[k_start..k_start + head_dim];
                let mut dot = 0.0f32;
                for i in 0..head_dim {
                    dot += q_slice[i] * k_slice[i];
                }
                scores.push(dot / (head_dim as f32).sqrt());
            }
            // Causal mask: never attend to cache entries beyond `position`.
            for pos in (position + 1)..seq_len {
                scores[pos] = f32::NEG_INFINITY;
            }
            // Numerically stable softmax: subtract the running max before exp.
            let max_score = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
            let mut exp_scores: Vec<f32> =
                scores.iter().map(|s| (s - max_score).exp()).collect();
            let sum: f32 = exp_scores.iter().sum();
            if sum > 0.0 {
                for s in &mut exp_scores {
                    *s /= sum;
                }
            }
            // Probability-weighted sum of cached values into this head's slice.
            for pos in 0..seq_len {
                let v_start = pos * kv_size + kv_head * head_dim;
                let v_slice = &v_cache[v_start..v_start + head_dim];
                for i in 0..head_dim {
                    attn_out[q_start + i] += exp_scores[pos] * v_slice[i];
                }
            }
        }
        let mut attn_output =
            self.matmul(&attn_out, &layer.attn_output_weight, hidden_dim, hidden_dim);
        if let Some(ref bias) = layer.attn_output_bias {
            self.add_bias(&mut attn_output, bias);
        }
        // Residual connection around attention.
        for i in 0..hidden.len() {
            hidden[i] += attn_output[i];
        }
        // NOTE(review): FFN input is the raw residual stream; the layer's
        // `ffn_norm_weight` is not applied in this path — confirm intent.
        let mut ffn_hidden = self.matmul(
            &hidden,
            &layer.ffn_up_weight,
            hidden_dim,
            self.config.intermediate_dim,
        );
        if let Some(ref bias) = layer.ffn_up_bias {
            self.add_bias(&mut ffn_hidden, bias);
        }
        self.gelu(&mut ffn_hidden);
        let mut ffn_output = self.matmul(
            &ffn_hidden,
            &layer.ffn_down_weight,
            self.config.intermediate_dim,
            hidden_dim,
        );
        if let Some(ref bias) = layer.ffn_down_bias {
            self.add_bias(&mut ffn_output, bias);
        }
        // Residual connection around the FFN.
        for i in 0..hidden.len() {
            hidden[i] += ffn_output[i];
        }
    }
    // Final norm and projection to vocabulary logits.
    let normed = self.layer_norm(
        &hidden,
        &self.output_norm_weight,
        self.output_norm_bias.as_deref(),
        self.config.eps,
    );
    let mut logits = self.matmul(
        &normed,
        &self.lm_head_weight,
        hidden_dim,
        self.config.vocab_size,
    );
    if let Some(ref bias) = self.lm_head_bias {
        self.add_bias(&mut logits, bias);
    }
    Ok(logits)
}
/// Autoregressive generation with a KV cache.
///
/// Prefills the cache with `prompt`, then decodes up to `config.max_tokens`
/// additional tokens, stopping early when token 0 is produced (assumed to be
/// EOS — TODO confirm against the tokenizer).
///
/// Token selection is deterministic argmax in BOTH branches: a non-zero
/// `config.temperature` only divides the logits first, which can change the
/// result solely for a negative temperature (it inverts the ordering). The
/// previous softmax normalization in the temperature branch was removed as
/// dead work — exponentiation and division by the (always-positive) sum are
/// strictly monotone and cannot change the argmax or its tie-breaking.
/// No stochastic sampling is performed.
///
/// # Errors
/// Returns `RealizarError::InvalidShape` when `prompt` is empty, and
/// propagates any error from `forward_with_cache`.
pub fn generate_with_cache(&self, prompt: &[u32], config: &GenerateConfig) -> Result<Vec<u32>> {
    if prompt.is_empty() {
        return Err(RealizarError::InvalidShape {
            reason: "Prompt cannot be empty".to_string(),
        });
    }
    // Greedy argmax; ties resolve to the LAST maximal index and NaN
    // comparisons fall back to Equal (matches `Iterator::max_by`).
    fn argmax(values: &[f32]) -> u32 {
        values
            .iter()
            .enumerate()
            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
            .map_or(0, |(i, _)| i as u32)
    }
    let mut cache = AprKVCache::new(&self.config);
    let mut output = prompt.to_vec();
    // Prefill: push every prompt token through the cached forward pass.
    for (pos, &token) in prompt.iter().enumerate() {
        let _ = self.forward_with_cache(token, &mut cache, pos)?;
    }
    for _ in 0..config.max_tokens {
        let last_token = *output.last().ok_or_else(|| RealizarError::InvalidShape {
            reason: "Empty output".to_string(),
        })?;
        let logits = self.forward_with_cache(last_token, &mut cache, output.len() - 1)?;
        let next_token = if config.temperature == 0.0 {
            argmax(&logits)
        } else {
            // Temperature scaling retained for exact behavioral parity with
            // the original code path (matters only for negative temperature).
            let scaled: Vec<f32> = logits.iter().map(|l| l / config.temperature).collect();
            argmax(&scaled)
        };
        output.push(next_token);
        if next_token == 0 {
            break;
        }
    }
    Ok(output)
}
}
#[cfg(feature = "default")]
impl From<&crate::gguf::GGUFTransformer> for AprTransformer {
    /// Deep-copies a GGUF-loaded transformer — configuration, token
    /// embeddings, every per-layer weight/bias, the output norm, and the
    /// LM head — into the APR in-memory representation.
    fn from(gguf: &crate::gguf::GGUFTransformer) -> Self {
        let src = &gguf.config;
        Self {
            config: AprTransformerConfig {
                architecture: src.architecture.clone(),
                hidden_dim: src.hidden_dim,
                num_layers: src.num_layers,
                num_heads: src.num_heads,
                num_kv_heads: src.num_kv_heads,
                vocab_size: src.vocab_size,
                intermediate_dim: src.intermediate_dim,
                context_length: src.context_length,
                rope_theta: src.rope_theta,
                eps: src.eps,
            },
            token_embedding: gguf.token_embedding.clone(),
            // Clone each layer's tensors field-for-field; optional biases and
            // gate weights are carried over as-is (None stays None).
            layers: gguf
                .layers
                .iter()
                .map(|layer| AprTransformerLayer {
                    attn_norm_weight: layer.attn_norm_weight.clone(),
                    attn_norm_bias: layer.attn_norm_bias.clone(),
                    qkv_weight: layer.qkv_weight.clone(),
                    qkv_bias: layer.qkv_bias.clone(),
                    attn_output_weight: layer.attn_output_weight.clone(),
                    attn_output_bias: layer.attn_output_bias.clone(),
                    ffn_gate_weight: layer.ffn_gate_weight.clone(),
                    ffn_gate_bias: layer.ffn_gate_bias.clone(),
                    ffn_up_weight: layer.ffn_up_weight.clone(),
                    ffn_up_bias: layer.ffn_up_bias.clone(),
                    ffn_down_weight: layer.ffn_down_weight.clone(),
                    ffn_down_bias: layer.ffn_down_bias.clone(),
                    ffn_norm_weight: layer.ffn_norm_weight.clone(),
                    ffn_norm_bias: layer.ffn_norm_bias.clone(),
                })
                .collect(),
            output_norm_weight: gguf.output_norm_weight.clone(),
            output_norm_bias: gguf.output_norm_bias.clone(),
            lm_head_weight: gguf.lm_head_weight.clone(),
            lm_head_bias: gguf.lm_head_bias.clone(),
        }
    }
}
/// Minimum acceptable CPU decode throughput, in tokens/second.
pub const APR_CPU_DECODE_THRESHOLD_TOK_S: f64 = 50.0;
/// Minimum acceptable prefill throughput, in tokens/second.
pub const APR_PREFILL_THRESHOLD_TOK_S: f64 = 100.0;
/// Required throughput ratio versus a baseline, in percent
/// (consumed by `AprParityComparison::is_parity`).
pub const APR_PARITY_THRESHOLD_PCT: f64 = 95.0;
/// Aggregated statistics from a decode benchmark run
/// (produced by `AprBenchmarkRunner::benchmark_decode`).
#[derive(Debug, Clone, Default)]
pub struct AprBenchmarkResult {
    // Mean number of tokens generated per measured iteration.
    pub tokens_generated: usize,
    // Mean wall-clock time per measured iteration, in milliseconds.
    pub total_time_ms: f64,
    // Mean decode throughput across iterations, tokens/second.
    pub tokens_per_second: f64,
    // Median per-iteration throughput, tokens/second.
    pub throughput_p50: f64,
    // Near-worst-case throughput: the value ~99% of iterations meet or exceed.
    pub throughput_p99: f64,
    // Sample (n-1) standard deviation of per-iteration throughput.
    pub throughput_std_dev: f64,
    // Estimated peak memory in MB (heuristic 1.5x of model weights).
    pub peak_memory_mb: f64,
    // Memory occupied by the model weights, in MB.
    pub model_memory_mb: f64,
}
impl AprBenchmarkResult {
    /// Whether the mean throughput reaches `threshold_tok_s` (inclusive).
    #[must_use]
    pub fn meets_threshold(&self, threshold_tok_s: f64) -> bool {
        self.tokens_per_second >= threshold_tok_s
    }
    /// Builds the throughput/memory ratios of this run against `baseline`.
    /// A non-positive baseline value yields a neutral ratio of 1.0 so the
    /// comparison never divides by zero.
    #[must_use]
    pub fn compare_to_baseline(&self, baseline: &AprBenchmarkResult) -> AprParityComparison {
        let ratio_or_unit =
            |value: f64, base: f64| if base > 0.0 { value / base } else { 1.0 };
        AprParityComparison {
            throughput_ratio: ratio_or_unit(self.tokens_per_second, baseline.tokens_per_second),
            memory_ratio: ratio_or_unit(self.peak_memory_mb, baseline.peak_memory_mb),
            parity_threshold_pct: APR_PARITY_THRESHOLD_PCT,
        }
    }
}
/// Results of a prefill benchmark
/// (produced by `AprBenchmarkRunner::benchmark_prefill`).
#[derive(Debug, Clone, Default)]
pub struct AprPrefillResult {
    // Number of tokens in the benchmarked prompt.
    pub prompt_tokens: usize,
    // Mean prefill latency across iterations, in milliseconds.
    pub prefill_time_ms: f64,
    // Derived prefill throughput, tokens/second.
    pub prefill_tok_s: f64,
}
/// Result of timing a single model load
/// (produced by `AprBenchmarkRunner::benchmark_load`).
#[derive(Debug, Clone, Default)]
pub struct AprLoadResult {
    // Wall-clock load time, in milliseconds.
    pub load_time_ms: f64,
}
/// Ratios comparing one benchmark run against a baseline run
/// (see `AprBenchmarkResult::compare_to_baseline`).
#[derive(Debug, Clone)]
pub struct AprParityComparison {
    // This run's throughput divided by the baseline's (1.0 when baseline is 0).
    pub throughput_ratio: f64,
    // This run's peak memory divided by the baseline's (1.0 when baseline is 0).
    pub memory_ratio: f64,
    // Required throughput ratio expressed in percent (e.g. 95.0).
    pub parity_threshold_pct: f64,
}
impl AprParityComparison {
    /// True when the measured throughput ratio reaches the configured parity
    /// threshold (the percentage is converted to a plain ratio first, so a
    /// 95.0% threshold requires a ratio of at least 0.95).
    #[must_use]
    pub fn is_parity(&self) -> bool {
        let required_ratio = self.parity_threshold_pct / 100.0;
        self.throughput_ratio >= required_ratio
    }
}
/// Drives decode/prefill/load benchmarks against an owned `AprTransformer`.
#[derive(Debug)]
pub struct AprBenchmarkRunner {
    transformer: AprTransformer,
    // Untimed iterations run before measuring, to stabilize caches.
    warmup_iterations: usize,
    // Timed iterations aggregated into the reported statistics.
    measure_iterations: usize,
}
impl AprBenchmarkRunner {
    /// Creates a runner with defaults: 3 warmup and 10 measured iterations.
    #[must_use]
    pub fn new(transformer: AprTransformer) -> Self {
        Self {
            transformer,
            warmup_iterations: 3,
            measure_iterations: 10,
        }
    }
    /// Number of untimed warmup iterations run before measuring.
    #[must_use]
    pub fn warmup_iterations(&self) -> usize {
        self.warmup_iterations
    }
    /// Number of timed iterations used for statistics.
    #[must_use]
    pub fn measure_iterations(&self) -> usize {
        self.measure_iterations
    }
    pub fn set_warmup_iterations(&mut self, n: usize) {
        self.warmup_iterations = n;
    }
    /// Clamped to at least 1 so the per-iteration averages below are safe.
    pub fn set_measure_iterations(&mut self, n: usize) {
        self.measure_iterations = n.max(1);
    }
    /// Benchmarks cached greedy decoding of `num_tokens` tokens after `prompt`.
    ///
    /// Runs `warmup_iterations` short warmups (at most 5 tokens each), then
    /// `measure_iterations` full timed generations, aggregating per-iteration
    /// throughput into mean / p50 / p99 / sample std-dev.
    ///
    /// # Errors
    /// Propagates any generation error from the transformer.
    pub fn benchmark_decode(
        &mut self,
        prompt: &[u32],
        num_tokens: usize,
    ) -> Result<AprBenchmarkResult> {
        use std::time::Instant;
        for _ in 0..self.warmup_iterations {
            let gen_config = GenerateConfig {
                max_tokens: num_tokens.min(5),
                temperature: 0.0,
                ..Default::default()
            };
            let _ = self.transformer.generate_with_cache(prompt, &gen_config)?;
        }
        let mut throughputs = Vec::with_capacity(self.measure_iterations);
        let mut total_tokens = 0usize;
        let mut total_time_ms = 0.0f64;
        for _ in 0..self.measure_iterations {
            let gen_config = GenerateConfig {
                max_tokens: num_tokens,
                temperature: 0.0,
                ..Default::default()
            };
            let start = Instant::now();
            let output = self.transformer.generate_with_cache(prompt, &gen_config)?;
            let elapsed = start.elapsed();
            // Generation may stop early (EOS token), so count what was
            // actually produced rather than assuming num_tokens.
            let generated = output.len().saturating_sub(prompt.len());
            let time_ms = elapsed.as_secs_f64() * 1000.0;
            let throughput = if time_ms > 0.0 {
                (generated as f64) / (time_ms / 1000.0)
            } else {
                0.0
            };
            throughputs.push(throughput);
            total_tokens += generated;
            total_time_ms += time_ms;
        }
        let mean_throughput = if !throughputs.is_empty() {
            throughputs.iter().sum::<f64>() / throughputs.len() as f64
        } else {
            0.0
        };
        // Percentiles are taken over the ascending-sorted throughputs.
        let mut sorted = throughputs.clone();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let p50 = if !sorted.is_empty() {
            sorted[sorted.len() / 2]
        } else {
            0.0
        };
        // "p99" here means the throughput that ~99% of iterations meet or
        // exceed — i.e. the 1st percentile of the ascending sort, a
        // worst-case figure (hence the 0.01 factor).
        let p99_idx =
            ((sorted.len() as f64 * 0.01).floor() as usize).min(sorted.len().saturating_sub(1));
        let p99 = if !sorted.is_empty() {
            sorted[p99_idx]
        } else {
            0.0
        };
        // Sample standard deviation (Bessel-corrected: divide by n - 1).
        let std_dev = if throughputs.len() > 1 {
            let variance = throughputs
                .iter()
                .map(|t| (t - mean_throughput).powi(2))
                .sum::<f64>()
                / (throughputs.len() - 1) as f64;
            variance.sqrt()
        } else {
            0.0
        };
        let model_memory_mb = (self.transformer.memory_size() as f64) / (1024.0 * 1024.0);
        Ok(AprBenchmarkResult {
            tokens_generated: total_tokens / self.measure_iterations.max(1),
            total_time_ms: total_time_ms / self.measure_iterations.max(1) as f64,
            tokens_per_second: mean_throughput,
            throughput_p50: p50,
            throughput_p99: p99,
            throughput_std_dev: std_dev,
            // NOTE(review): peak memory is a heuristic 1.5x of the weight
            // size, not a measured value — TODO replace with real RSS sampling.
            peak_memory_mb: model_memory_mb * 1.5,
            model_memory_mb,
        })
    }
    /// Benchmarks prefill: times full (uncached) forward passes over `prompt`
    /// and reports mean latency plus the derived tokens/second.
    ///
    /// # Errors
    /// Propagates any forward-pass error from the transformer.
    pub fn benchmark_prefill(&mut self, prompt: &[u32]) -> Result<AprPrefillResult> {
        use std::time::Instant;
        for _ in 0..self.warmup_iterations {
            let _ = self.transformer.forward(prompt)?;
        }
        let mut prefill_times_ms = Vec::with_capacity(self.measure_iterations);
        for _ in 0..self.measure_iterations {
            let start = Instant::now();
            let _ = self.transformer.forward(prompt)?;
            let elapsed = start.elapsed();
            prefill_times_ms.push(elapsed.as_secs_f64() * 1000.0);
        }
        let mean_time_ms = if !prefill_times_ms.is_empty() {
            prefill_times_ms.iter().sum::<f64>() / prefill_times_ms.len() as f64
        } else {
            0.0
        };
        let prefill_tok_s = if mean_time_ms > 0.0 {
            (prompt.len() as f64) / (mean_time_ms / 1000.0)
        } else {
            0.0
        };
        Ok(AprPrefillResult {
            prompt_tokens: prompt.len(),
            prefill_time_ms: mean_time_ms,
            prefill_tok_s,
        })
    }
    /// Times a single invocation of `loader` (e.g. a model-load closure).
    /// One-shot: no warmup or averaging is applied.
    ///
    /// # Errors
    /// Currently infallible; `Result` kept for API symmetry.
    pub fn benchmark_load<F>(loader: F) -> Result<AprLoadResult>
    where
        F: Fn() -> AprTransformer,
    {
        use std::time::Instant;
        let start = Instant::now();
        let _transformer = loader();
        let elapsed = start.elapsed();
        Ok(AprLoadResult {
            load_time_ms: elapsed.as_secs_f64() * 1000.0,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // --- Config construction and (de)serialization ---
    #[test]
    fn test_config_default() {
        let config = AprTransformerConfig::default();
        assert_eq!(config.architecture, "unknown");
        assert_eq!(config.hidden_dim, 512);
        assert_eq!(config.num_layers, 6);
        assert_eq!(config.vocab_size, 32000);
    }
    #[test]
    fn test_config_serialization() {
        let config = AprTransformerConfig {
            architecture: "test_arch".to_string(),
            hidden_dim: 256,
            num_layers: 4,
            num_heads: 4,
            num_kv_heads: 4,
            vocab_size: 1000,
            intermediate_dim: 1024,
            context_length: 512,
            rope_theta: 10000.0,
            eps: 1e-6,
        };
        // Round-trip through JSON must be lossless.
        let json = serde_json::to_string(&config).expect("serialize");
        let decoded: AprTransformerConfig = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(config, decoded);
    }
    // --- Layer shape and parameter accounting ---
    #[test]
    fn test_layer_empty() {
        // empty(hidden_dim = 64, intermediate_dim = 256)
        let layer = AprTransformerLayer::empty(64, 256);
        assert_eq!(layer.attn_norm_weight.len(), 64);
        assert_eq!(layer.qkv_weight.len(), 64 * 3 * 64);
        assert_eq!(layer.ffn_up_weight.len(), 64 * 256);
        assert_eq!(layer.ffn_down_weight.len(), 256 * 64);
    }
    #[test]
    fn test_layer_num_parameters() {
        let layer = AprTransformerLayer::empty(64, 256);
        // attn_norm + qkv + attn_output + ffn_up + ffn_down.
        let expected = 64 + 64 * 3 * 64 + 64 * 64 + 64 * 256 + 256 * 64;
        assert_eq!(layer.num_parameters(), expected);
    }
    #[test]
    fn test_transformer_new() {
        let config = AprTransformerConfig {
            hidden_dim: 64,
            num_layers: 2,
            vocab_size: 100,
            intermediate_dim: 128,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        assert_eq!(transformer.layers.len(), 2);
        assert_eq!(transformer.token_embedding.len(), 100 * 64);
        assert_eq!(transformer.output_norm_weight.len(), 64);
        assert_eq!(transformer.lm_head_weight.len(), 64 * 100);
    }
    #[test]
    fn test_transformer_num_parameters() {
        let config = AprTransformerConfig {
            hidden_dim: 64,
            num_layers: 2,
            vocab_size: 100,
            intermediate_dim: 128,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let params = transformer.num_parameters();
        // Sanity bounds only; exact count is covered per-layer above.
        assert!(params > 0);
        assert!(params < 100_000_000);
    }
    #[test]
    fn test_transformer_memory_size() {
        let config = AprTransformerConfig {
            hidden_dim: 64,
            num_layers: 1,
            vocab_size: 100,
            intermediate_dim: 128,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let params = transformer.num_parameters();
        let mem = transformer.memory_size();
        // f32 weights: 4 bytes per parameter.
        assert_eq!(mem, params * 4);
    }
    // --- Embedding lookup ---
    #[test]
    fn test_embed_single_token() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            vocab_size: 10,
            ..Default::default()
        };
        let mut transformer = AprTransformer::new(config);
        // Plant a recognizable row for token id 3.
        transformer.token_embedding[3 * 4..3 * 4 + 4].copy_from_slice(&[1.0, 2.0, 3.0, 4.0]);
        let embedded = transformer.embed(&[3]);
        assert_eq!(embedded, vec![1.0, 2.0, 3.0, 4.0]);
    }
    #[test]
    fn test_embed_multiple_tokens() {
        let config = AprTransformerConfig {
            hidden_dim: 2,
            vocab_size: 5,
            ..Default::default()
        };
        let mut transformer = AprTransformer::new(config);
        transformer.token_embedding[0..2].copy_from_slice(&[1.0, 2.0]);
        transformer.token_embedding[2..4].copy_from_slice(&[3.0, 4.0]);
        transformer.token_embedding[4..6].copy_from_slice(&[5.0, 6.0]);
        // Rows are concatenated in token order.
        let embedded = transformer.embed(&[0, 1, 2]);
        assert_eq!(embedded, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
    }
    #[test]
    fn test_embed_out_of_vocab() {
        let config = AprTransformerConfig {
            hidden_dim: 2,
            vocab_size: 5,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        // Out-of-vocab ids embed to zeros rather than panicking.
        let embedded = transformer.embed(&[100]);
        assert_eq!(embedded, vec![0.0, 0.0]);
    }
    // --- Layer norm ---
    #[test]
    fn test_layer_norm_identity() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let input = vec![1.0, 2.0, 3.0, 4.0];
        let weight = vec![1.0, 1.0, 1.0, 1.0];
        let output = transformer.layer_norm(&input, &weight, None, 1e-5);
        // Normalized output should be zero-mean.
        let mean: f32 = output.iter().sum::<f32>() / 4.0;
        assert!((mean).abs() < 0.001);
    }
    #[test]
    fn test_layer_norm_with_bias() {
        let config = AprTransformerConfig {
            hidden_dim: 2,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let input = vec![1.0, 3.0];
        let weight = vec![1.0, 1.0];
        let bias = vec![10.0, 20.0];
        let output = transformer.layer_norm(&input, &weight, Some(&bias), 1e-5);
        // Inputs normalize to roughly [-1, 1]; bias then shifts each element.
        assert!((output[0] - 9.0).abs() < 0.01);
        assert!((output[1] - 21.0).abs() < 0.01);
    }
    // --- GELU activation (tanh approximation) ---
    #[test]
    fn test_gelu_zero() {
        let config = AprTransformerConfig::default();
        let transformer = AprTransformer::new(config);
        let mut data = vec![0.0];
        transformer.gelu(&mut data);
        assert!((data[0]).abs() < 0.0001);
    }
    #[test]
    fn test_gelu_positive() {
        let config = AprTransformerConfig::default();
        let transformer = AprTransformer::new(config);
        let mut data = vec![1.0];
        transformer.gelu(&mut data);
        // GELU(1) ~= 0.841
        assert!((data[0] - 0.841).abs() < 0.01);
    }
    #[test]
    fn test_gelu_negative() {
        let config = AprTransformerConfig::default();
        let transformer = AprTransformer::new(config);
        let mut data = vec![-1.0];
        transformer.gelu(&mut data);
        // GELU(-1) ~= -0.159
        assert!((data[0] - (-0.159)).abs() < 0.01);
    }
    // --- Matrix multiplication (x * W, W stored row-major [in, out]) ---
    #[test]
    fn test_matmul_identity() {
        let config = AprTransformerConfig {
            hidden_dim: 2,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let input = vec![1.0, 2.0];
        let weight = vec![1.0, 0.0, 0.0, 1.0];
        let output = transformer.matmul(&input, &weight, 2, 2);
        assert_eq!(output, vec![1.0, 2.0]);
    }
    #[test]
    fn test_matmul_simple() {
        let config = AprTransformerConfig {
            hidden_dim: 2,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let input = vec![1.0, 2.0];
        // 2x3 weight: [1 2 3; 4 5 6] => [1*1+2*4, 1*2+2*5, 1*3+2*6].
        let weight = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let output = transformer.matmul(&input, &weight, 2, 3);
        assert_eq!(output, vec![9.0, 12.0, 15.0]);
    }
    // --- Forward pass and prediction ---
    #[test]
    fn test_forward_empty_tokens() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            num_layers: 1,
            vocab_size: 10,
            intermediate_dim: 8,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        // Empty token sequence is rejected with an error.
        let result = transformer.forward(&[]);
        assert!(result.is_err());
    }
    #[test]
    fn test_forward_single_token() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            num_layers: 1,
            vocab_size: 10,
            intermediate_dim: 8,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let result = transformer.forward(&[1]);
        assert!(result.is_ok());
        let logits = result.expect("forward succeeded");
        // One logit per vocabulary entry.
        assert_eq!(logits.len(), 10);
    }
    #[test]
    fn test_forward_multiple_tokens() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            num_layers: 1,
            vocab_size: 10,
            intermediate_dim: 8,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let result = transformer.forward(&[1, 2, 3]);
        assert!(result.is_ok());
        let logits = result.expect("forward succeeded");
        // Logits are returned only for the last position.
        assert_eq!(logits.len(), 10);
    }
    #[test]
    fn test_predict_next() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            num_layers: 1,
            vocab_size: 10,
            intermediate_dim: 8,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let result = transformer.predict_next(&[1]);
        assert!(result.is_ok());
        let token = result.expect("predict succeeded");
        // Prediction must be a valid vocabulary index.
        assert!(token < 10);
    }
    // --- Determinism ---
    #[test]
    fn test_reproducibility_same_input_same_output() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            num_layers: 1,
            vocab_size: 10,
            intermediate_dim: 8,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let tokens = vec![1, 2, 3];
        let output1 = transformer.forward(&tokens).expect("forward 1");
        let output2 = transformer.forward(&tokens).expect("forward 2");
        assert_eq!(output1, output2, "Same input should produce same output");
    }
    #[test]
    fn test_reproducibility_predict_deterministic() {
        let config = AprTransformerConfig {
            hidden_dim: 4,
            num_layers: 1,
            vocab_size: 10,
            intermediate_dim: 8,
            ..Default::default()
        };
        let transformer = AprTransformer::new(config);
        let tokens = vec![1, 2, 3];
        let pred1 = transformer.predict_next(&tokens).expect("predict 1");
        let pred2 = transformer.predict_next(&tokens).expect("predict 2");
        assert_eq!(pred1, pred2, "Predictions should be deterministic");
    }
    // --- Whole-model serialization ---
    #[test]
    fn test_transformer_serialization_roundtrip() {
        let config = AprTransformerConfig {
            architecture: "test".to_string(),
            hidden_dim: 4,
            num_layers: 1,
            num_heads: 2,
            num_kv_heads: 2,
            vocab_size: 10,
            intermediate_dim: 8,
            context_length: 128,
            rope_theta: 10000.0,
            eps: 1e-5,
        };
        let transformer = AprTransformer::new(config);
        let json = serde_json::to_string(&transformer).expect("serialize");
        let decoded: AprTransformer = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(transformer.config, decoded.config);
        assert_eq!(transformer.token_embedding, decoded.token_embedding);
        assert_eq!(transformer.layers.len(), decoded.layers.len());
    }
}