use llama_rs::{
backend::{Backend, cpu::CpuBackend},
model::load_llama_model,
tensor::{DType, Tensor},
};
fn print_stats(name: &str, data: &[f32]) {
let min = data.iter().cloned().fold(f32::INFINITY, f32::min);
let max = data.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let sum: f32 = data.iter().sum();
let mean = sum / data.len() as f32;
println!("{}: min={:.4}, max={:.4}, mean={:.4}", name, min, max, mean);
println!(
" first 5: [{:.4}, {:.4}, {:.4}, {:.4}, {:.4}]",
data[0], data[1], data[2], data[3], data[4]
);
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
let model_path = "/home/joseph/Models/qwen2.5-0.5b-instruct-q4_k_m.gguf";
eprintln!("Loading model...");
let backend = CpuBackend::new();
let model = load_llama_model(model_path)?;
let hidden_size = model.config().hidden_size;
let num_heads = model.config().num_heads;
let num_kv_heads = model.config().num_kv_heads;
let head_dim = model.config().head_dim;
let freq_base = model.config().rope_config.freq_base;
let token_id = 28u32;
let pos = 0usize;
println!("=== Debugging Layer 0 for token {} ===\n", token_id);
let embedding = model.embed_tokens(&[token_id], &backend)?;
let embedding = embedding.reshape(vec![hidden_size])?;
let emb_data = embedding.as_f32()?;
print_stats("Embedding", emb_data);
let layer = &model.layers()[0];
println!("\n--- Step 1: Attention RMSNorm ---");
let mut normed = Tensor::zeros(vec![hidden_size], DType::F32);
layer.attn_norm.forward(&embedding, &mut normed, &backend)?;
let normed_data = normed.as_f32()?;
print_stats("After attn_norm", normed_data);
println!("\n--- Step 2: Q, K, V Projections ---");
let x_vec = normed.reshape(vec![hidden_size])?;
let mut q = Tensor::zeros(vec![num_heads * head_dim], DType::F32);
let mut k = Tensor::zeros(vec![num_kv_heads * head_dim], DType::F32);
let mut v = Tensor::zeros(vec![num_kv_heads * head_dim], DType::F32);
layer.attention().unwrap().wq.forward(&x_vec, &mut q, &backend)?;
layer.attention().unwrap().wk.forward(&x_vec, &mut k, &backend)?;
layer.attention().unwrap().wv.forward(&x_vec, &mut v, &backend)?;
print_stats("Q (with bias)", q.as_f32()?);
print_stats("K (with bias)", k.as_f32()?);
print_stats("V", v.as_f32()?);
println!("\n--- Step 3: RoPE ---");
let mut q_reshaped = q.reshape(vec![num_heads, 1, head_dim])?;
let mut k_reshaped = k.reshape(vec![num_kv_heads, 1, head_dim])?;
backend.rope(&mut q_reshaped, &mut k_reshaped, pos, freq_base, 1.0, true)?;
println!("At pos=0, RoPE should be identity (cos(0)=1, sin(0)=0)");
let q_rope_data = q_reshaped.as_f32()?;
let k_rope_data = k_reshaped.as_f32()?;
print_stats("Q after RoPE", q_rope_data);
print_stats("K after RoPE", k_rope_data);
println!("\n--- Step 4: Attention ---");
let v_reshaped = v.reshape(vec![num_kv_heads, 1, head_dim])?;
let scale = 1.0 / (head_dim as f32).sqrt();
let mut attn_out = Tensor::zeros(vec![num_heads, 1, head_dim], DType::F32);
backend.attention(&q_reshaped, &k_reshaped, &v_reshaped, &mut attn_out, scale)?;
let attn_out_data = attn_out.as_f32()?;
print_stats("Attention output", attn_out_data);
let v_data = v.as_f32()?;
let num_q_per_kv = num_heads / num_kv_heads;
println!(
"\n Expected (V head 0 replicated): first 5 = [{:.4}, {:.4}, {:.4}, {:.4}, {:.4}]",
v_data[0], v_data[1], v_data[2], v_data[3], v_data[4]
);
println!("\n--- Step 5: Output Projection ---");
let attn_out_flat = attn_out.reshape(vec![num_heads * head_dim])?;
let mut output = Tensor::zeros(vec![hidden_size], DType::F32);
layer
.attention()
.unwrap()
.wo
.forward(&attn_out_flat, &mut output, &backend)?;
let output_data = output.as_f32()?;
print_stats("Output projection", output_data);
println!("\n--- Step 6: First Residual ---");
let mut hidden_after_attn = Tensor::zeros(vec![hidden_size], DType::F32);
{
let h_data = hidden_after_attn.as_f32_mut()?;
for i in 0..hidden_size {
h_data[i] = emb_data[i] + output_data[i];
}
}
let hidden_after_attn_data = hidden_after_attn.as_f32()?;
print_stats(
"After first residual (emb + attn_out)",
hidden_after_attn_data,
);
println!("\n--- Step 7: FFN RMSNorm ---");
let mut normed_ffn = Tensor::zeros(vec![hidden_size], DType::F32);
layer
.ffn_norm
.forward(&hidden_after_attn, &mut normed_ffn, &backend)?;
let normed_ffn_data = normed_ffn.as_f32()?;
print_stats("After ffn_norm", normed_ffn_data);
println!("\n--- Step 8: FFN ---");
let intermediate_size = layer.ffn().unwrap().intermediate_size;
println!(" intermediate_size = {}", intermediate_size);
let mut ffn_out = Tensor::zeros(vec![hidden_size], DType::F32);
layer.ffn().unwrap().forward(&normed_ffn, &mut ffn_out, &backend)?;
let ffn_out_data = ffn_out.as_f32()?;
print_stats("FFN output", ffn_out_data);
println!("\n--- Step 9: Second Residual (Layer 0 Output) ---");
let mut layer_output = Tensor::zeros(vec![hidden_size], DType::F32);
{
let l_data = layer_output.as_f32_mut()?;
for i in 0..hidden_size {
l_data[i] = hidden_after_attn_data[i] + ffn_out_data[i];
}
}
let layer_output_data = layer_output.as_f32()?;
print_stats("FINAL Layer 0 output", layer_output_data);
println!("\n=== COMPARISON WITH PYTHON REFERENCE ===");
println!("Python layer_0: min=-4.4456, max=4.0055, mean=-0.0860");
println!("Python first 5: [-1.5666, 0.0268, -1.2803, -0.5673, 1.2690]");
Ok(())
}