use realizar::gguf::{
MappedGGUFModel, OwnedQuantizedKVCache, OwnedQuantizedModel, QuantizedGenerateConfig,
};
use realizar::RealizarError;
use std::time::Instant;
/// Profiles per-token forward-pass latency while the KV cache grows.
///
/// Steps:
/// 1. Load the GGUF model. The path is taken from the first command-line
///    argument when present, otherwise a built-in default path is used.
/// 2. Warm up with three short `generate_with_cache` runs.
/// 3. Feed the prompt token by token, then greedily decode a fixed number of
///    new tokens, timing every `forward_single_with_cache` call.
/// 4. Print the per-token timings and a prompt-vs-generation summary.
///
/// # Errors
///
/// Propagates any `RealizarError` returned while loading the model or running
/// a forward pass.
fn main() -> Result<(), RealizarError> {
    const DEFAULT_MODEL_PATH: &str = "/home/noah/models/TinyLlama-1.1B-Chat-v1.0-Q4_K_M.gguf";
    // Number of tokens decoded (and timed) after the prompt has been consumed.
    const MAX_NEW_TOKENS: usize = 32;

    // Allow overriding the model location so the profiler is usable on machines
    // other than the one the default path was written for.
    let model_path = std::env::args()
        .nth(1)
        .unwrap_or_else(|| DEFAULT_MODEL_PATH.to_string());

    println!("Loading model...");
    let mapped = MappedGGUFModel::from_path(model_path.as_str())?;
    let model = OwnedQuantizedModel::from_mapped(&mapped)?;
    let hidden_dim = model.config().hidden_dim;
    let num_layers = model.config().num_layers;
    let num_heads = model.config().num_heads;
    println!(
        "Model: {} layers, {} hidden, {} heads",
        num_layers, hidden_dim, num_heads
    );

    let prompt = vec![1u32, 29871, 29896];

    // Warm-up: a few short greedy generations before any timing is recorded.
    let config = QuantizedGenerateConfig {
        max_tokens: 5,
        temperature: 0.0,
        top_k: 1,
        stop_tokens: vec![2],
        trace: false,
        ..Default::default()
    };
    for _ in 0..3 {
        let _ = model.generate_with_cache(&prompt, &config)?;
    }

    println!("\n=== Profiling with growing KV cache ===");
    let mut times: Vec<f64> = Vec::with_capacity(prompt.len() + MAX_NEW_TOKENS);
    // NOTE(review): the third argument is presumably the maximum sequence
    // length the cache can hold; prompt + generated tokens (35) stay well
    // below it — confirm against the realizar API.
    let kv_cache_capacity = 512;
    let mut cache = OwnedQuantizedKVCache::new(num_layers, hidden_dim, kv_cache_capacity);

    // Prompt phase: one timed forward pass per prompt token. The argmax is
    // taken outside the timed region; after the loop `next_token` holds the
    // greedy prediction that follows the final prompt token, so decoding does
    // not feed the last prompt token a second time.
    let mut next_token = prompt[prompt.len() - 1];
    for (pos, &token) in prompt.iter().enumerate() {
        let start = Instant::now();
        let logits = model.forward_single_with_cache(token, &mut cache, pos)?;
        times.push(start.elapsed().as_secs_f64() * 1000.0);
        // Index is a vocabulary position; the cast mirrors the u32 token ids.
        next_token = argmax(logits.iter()) as u32;
    }

    // Generation phase: greedy decoding, each step one position further.
    for step in 0..MAX_NEW_TOKENS {
        let pos = prompt.len() + step;
        let start = Instant::now();
        let logits = model.forward_single_with_cache(next_token, &mut cache, pos)?;
        times.push(start.elapsed().as_secs_f64() * 1000.0);
        next_token = argmax(logits.iter()) as u32;
    }

    println!("\n=== Time per token (ms) ===");
    for (i, &t) in times.iter().enumerate() {
        // The i-th forward pass runs at position i, so the reported cache
        // length is simply the token index.
        println!("Token {:2}: {:6.2} ms (cache_len={})", i, t, i);
    }

    let (prompt_times, gen_times) = times.split_at(prompt.len());
    let prompt_avg = prompt_times.iter().sum::<f64>() / prompt_times.len() as f64;
    let gen_avg = gen_times.iter().sum::<f64>() / gen_times.len() as f64;
    println!("\n=== Summary ===");
    println!("Prompt tokens (cache small): {:.2} ms avg", prompt_avg);
    println!("Generated tokens (cache grows): {:.2} ms avg", gen_avg);
    println!("Slowdown with growing cache: {:.2}x", gen_avg / prompt_avg);

    let avg_cache_len = (0..MAX_NEW_TOKENS)
        .map(|step| prompt.len() + step)
        .sum::<usize>() as f64
        / MAX_NEW_TOKENS as f64;
    println!(
        "\nAverage cache length during generation: {:.1}",
        avg_cache_len
    );
    println!("Attention scales as O(n) with cache length");
    println!(
        "Total heads × layers = {} attention computations per token",
        num_heads * num_layers
    );

    let allocations_per_token = num_heads * num_layers;
    println!(
        "\nEstimated Vec allocations per token: {}",
        allocations_per_token
    );
    Ok(())
}

/// Returns the index of the largest value produced by `values`.
///
/// The first occurrence wins on ties. Returns 0 when `values` is empty or when
/// no element compares greater than negative infinity (for example, all NaN).
fn argmax<'a>(values: impl IntoIterator<Item = &'a f32>) -> usize {
    let mut best_idx = 0;
    let mut best_val = f32::NEG_INFINITY;
    for (idx, &val) in values.into_iter().enumerate() {
        if val > best_val {
            best_val = val;
            best_idx = idx;
        }
    }
    best_idx
}