use llama_rs::{
backend::{Backend, cpu::CpuBackend},
model::load_llama_model,
tensor::{DType, Tensor},
};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let model_path = "/home/joseph/Models/qwen2.5-0.5b-instruct-q4_k_m.gguf";
println!("Loading model from {}...", model_path);
let backend = CpuBackend::new();
let model = load_llama_model(model_path)?;
let token_id = 28u32;
let pos = 0usize;
println!(
"\n=== Testing token {} ('=') at position {} ===",
token_id, pos
);
let hidden_size = model.config().hidden_size;
let num_heads = model.config().num_heads;
let num_kv_heads = model.config().num_kv_heads;
let head_dim = model.config().head_dim;
let freq_base = model.config().rope_config.freq_base;
println!("Model config:");
println!(" hidden_size: {}", hidden_size);
println!(" num_heads: {}", num_heads);
println!(" num_kv_heads: {}", num_kv_heads);
println!(" head_dim: {}", head_dim);
println!(" freq_base: {}", freq_base);
println!("\n--- Step 1: Token Embedding ---");
let embedding = model.embed_tokens(&[token_id], &backend)?;
let emb_data = embedding.as_f32()?;
println!("Embedding shape: {:?}", embedding.shape());
let emb_min = emb_data.iter().cloned().fold(f32::INFINITY, f32::min);
let emb_max = emb_data.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let emb_sum: f32 = emb_data.iter().sum();
println!(
"Embedding stats: min={:.6}, max={:.6}, sum={:.6}",
emb_min, emb_max, emb_sum
);
println!(
"Embedding first 10: {:?}",
&emb_data[..10.min(emb_data.len())]
);
println!("\n--- Step 2: Attention RMSNorm ---");
let layer = &model.layers()[0];
let embedding = embedding.reshape(vec![hidden_size])?;
let emb_data = embedding.as_f32()?;
let mut normed = Tensor::zeros(vec![hidden_size], DType::F32);
layer.attn_norm.forward(&embedding, &mut normed, &backend)?;
let normed_data = normed.as_f32()?;
let norm_min = normed_data.iter().cloned().fold(f32::INFINITY, f32::min);
let norm_max = normed_data
.iter()
.cloned()
.fold(f32::NEG_INFINITY, f32::max);
println!("After norm stats: min={:.6}, max={:.6}", norm_min, norm_max);
println!(
"After norm first 10: {:?}",
&normed_data[..10.min(normed_data.len())]
);
println!("\n--- Step 3: Q, K, V Projections ---");
let x_vec = normed.reshape(vec![hidden_size])?;
let mut q = Tensor::zeros(vec![num_heads * head_dim], DType::F32);
layer.attention().unwrap().wq.forward(&x_vec, &mut q, &backend)?;
let q_data = q.as_f32()?;
let q_min = q_data.iter().cloned().fold(f32::INFINITY, f32::min);
let q_max = q_data.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
println!("Q (after bias) stats: min={:.2}, max={:.2}", q_min, q_max);
println!("Q first 10: {:?}", &q_data[..10.min(q_data.len())]);
let mut k = Tensor::zeros(vec![num_kv_heads * head_dim], DType::F32);
layer.attention().unwrap().wk.forward(&x_vec, &mut k, &backend)?;
let k_data = k.as_f32()?;
let k_min = k_data.iter().cloned().fold(f32::INFINITY, f32::min);
let k_max = k_data.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
println!("K (after bias) stats: min={:.2}, max={:.2}", k_min, k_max);
println!("K first 10: {:?}", &k_data[..10.min(k_data.len())]);
let mut v = Tensor::zeros(vec![num_kv_heads * head_dim], DType::F32);
layer.attention().unwrap().wv.forward(&x_vec, &mut v, &backend)?;
let v_data = v.as_f32()?;
let v_min = v_data.iter().cloned().fold(f32::INFINITY, f32::min);
let v_max = v_data.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
println!("V stats: min={:.4}, max={:.4}", v_min, v_max);
println!("V first 10: {:?}", &v_data[..10.min(v_data.len())]);
println!("\n--- Step 4: Apply RoPE ---");
let mut q_reshaped = q.reshape(vec![num_heads, 1, head_dim])?;
let mut k_reshaped = k.reshape(vec![num_kv_heads, 1, head_dim])?;
backend.rope(&mut q_reshaped, &mut k_reshaped, pos, freq_base, 1.0, true)?;
let q_rope_data = q_reshaped.as_f32()?;
let k_rope_data = k_reshaped.as_f32()?;
println!("At pos=0, RoPE should be identity (cos(0)=1, sin(0)=0)");
println!(
"Q after RoPE first 10: {:?}",
&q_rope_data[..10.min(q_rope_data.len())]
);
println!(
"K after RoPE first 10: {:?}",
&k_rope_data[..10.min(k_rope_data.len())]
);
let q_diff: f32 = q_data
.iter()
.zip(q_rope_data.iter())
.map(|(a, b)| (a - b).abs())
.sum();
let k_diff: f32 = k_data
.iter()
.zip(k_rope_data.iter())
.map(|(a, b)| (a - b).abs())
.sum();
println!("Q diff from pre-RoPE: {:.6}", q_diff);
println!("K diff from pre-RoPE: {:.6}", k_diff);
println!("\n--- Step 5: Self-Attention (pos=0) ---");
let scale = 1.0 / (head_dim as f32).sqrt();
println!("Attention scale: {}", scale);
let q_head0: f32 = q_rope_data[..head_dim]
.iter()
.zip(k_rope_data[..head_dim].iter())
.map(|(qi, ki)| qi * ki)
.sum();
let score_head0 = q_head0 * scale;
println!("Head 0 raw attention score: {:.4}", score_head0);
let v_head0 = &v_data[..head_dim];
println!("V head 0 first 10: {:?}", &v_head0[..10.min(v_head0.len())]);
println!("Attention output head 0 = V head 0 (softmax of single element = 1)");
println!("\n--- Step 6: Full Attention Output ---");
let v_reshaped = v.reshape(vec![num_kv_heads, 1, head_dim])?;
let mut attn_out = Tensor::zeros(vec![num_heads, 1, head_dim], DType::F32);
backend.attention(&q_reshaped, &k_reshaped, &v_reshaped, &mut attn_out, scale)?;
let attn_out_data = attn_out.as_f32()?;
let attn_min = attn_out_data.iter().cloned().fold(f32::INFINITY, f32::min);
let attn_max = attn_out_data
.iter()
.cloned()
.fold(f32::NEG_INFINITY, f32::max);
println!(
"Attention output stats: min={:.4}, max={:.4}",
attn_min, attn_max
);
println!(
"Attention output first 10: {:?}",
&attn_out_data[..10.min(attn_out_data.len())]
);
let expected_attn_out: Vec<f32> = (0..num_heads)
.flat_map(|h| {
let kv_h = h / (num_heads / num_kv_heads);
v_data[kv_h * head_dim..(kv_h + 1) * head_dim].to_vec()
})
.collect();
let attn_diff: f32 = attn_out_data
.iter()
.zip(expected_attn_out.iter())
.map(|(a, b)| (a - b).abs())
.sum();
println!("Attention output diff from expected (V): {:.6}", attn_diff);
println!("\n--- Step 7: Output Projection ---");
let attn_out_flat = attn_out.reshape(vec![num_heads * head_dim])?;
let mut output = Tensor::zeros(vec![hidden_size], DType::F32);
layer
.attention()
.unwrap()
.wo
.forward(&attn_out_flat, &mut output, &backend)?;
let out_data = output.as_f32()?;
let out_min = out_data.iter().cloned().fold(f32::INFINITY, f32::min);
let out_max = out_data.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
println!(
"Output projection stats: min={:.4}, max={:.4}",
out_min, out_max
);
println!(
"Output projection first 10: {:?}",
&out_data[..10.min(out_data.len())]
);
println!("\n--- Step 8: Residual Connection ---");
let mut hidden_after_attn = Tensor::zeros(vec![hidden_size], DType::F32);
{
let hidden_data = hidden_after_attn.as_f32_mut()?;
for i in 0..hidden_size {
hidden_data[i] = emb_data[i] + out_data[i];
}
}
let hidden_data = hidden_after_attn.as_f32()?;
let hidden_min = hidden_data.iter().cloned().fold(f32::INFINITY, f32::min);
let hidden_max = hidden_data
.iter()
.cloned()
.fold(f32::NEG_INFINITY, f32::max);
let hidden_sum: f32 = hidden_data.iter().sum();
println!(
"After attention+residual stats: min={:.4}, max={:.4}, sum={:.4}",
hidden_min, hidden_max, hidden_sum
);
println!(
"After attention+residual first 10: {:?}",
&hidden_data[..10.min(hidden_data.len())]
);
println!("\n=== SUMMARY ===");
println!("Embedding sum: {:.4}", emb_sum);
println!("After attention sum: {:.4}", hidden_sum);
println!("Output projection sum: {:.4}", out_data.iter().sum::<f32>());
Ok(())
}