use realizar::gguf::{MappedGGUFModel, OwnedQKVWeights, OwnedQuantizedModel};
use realizar::quantize::{fused_q4k_parallel_matvec, fused_q6k_parallel_matvec};
use realizar::rms_norm;
const GGUF_TYPE_Q4_K: u32 = 12;
const GGUF_TYPE_Q6_K: u32 = 14;
fn fused_matmul(input: &[f32], data: &[u8], qtype: u32, in_dim: usize, out_dim: usize) -> Vec<f32> {
match qtype {
GGUF_TYPE_Q4_K => fused_q4k_parallel_matvec(data, input, in_dim, out_dim).expect("q4k"),
GGUF_TYPE_Q6_K => fused_q6k_parallel_matvec(data, input, in_dim, out_dim).expect("q6k"),
_ => panic!("Unsupported qtype: {}", qtype),
}
}
fn main() {
let path = std::env::args().nth(1).unwrap_or_else(|| {
"../aprender/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf".to_string()
});
let mapped = MappedGGUFModel::from_path(&path).expect("Failed to load");
let model = OwnedQuantizedModel::from_mapped(&mapped).expect("Failed to parse");
let hidden_dim = model.config().hidden_dim;
let eps = model.config().eps;
let token_id = 791u32;
let start = token_id as usize * hidden_dim;
let embedding: Vec<f32> = model.token_embedding()[start..start + hidden_dim].to_vec();
println!("=== CPU Layer 0 Trace ===");
println!("Embedding first 5: {:?}", &embedding[..5]);
let layer = &model.layers()[0];
let normed = rms_norm(&embedding, &layer.attn_norm_weight, eps);
println!("RMSNorm first 3: {:?}", &normed[..3]);
let OwnedQKVWeights::Separate {
q: q_weight,
k: k_weight,
v: v_weight,
} = &layer.qkv_weight
else {
panic!("Expected separate QKV")
};
let mut q = fused_matmul(
&normed,
&q_weight.data,
q_weight.qtype,
q_weight.in_dim,
q_weight.out_dim,
);
let mut k = fused_matmul(
&normed,
&k_weight.data,
k_weight.qtype,
k_weight.in_dim,
k_weight.out_dim,
);
let mut v = fused_matmul(
&normed,
&v_weight.data,
v_weight.qtype,
v_weight.in_dim,
v_weight.out_dim,
);
if let Some(ref bias) = layer.qkv_bias {
let q_dim = q.len();
let k_dim = k.len();
for i in 0..q_dim {
q[i] += bias[i];
}
for i in 0..k_dim {
k[i] += bias[q_dim + i];
}
for i in 0..v.len() {
v[i] += bias[q_dim + k_dim + i];
}
println!("QKV bias applied: len={}", bias.len());
} else {
println!("WARNING: No QKV bias found!");
}
println!("Q first 3: {:?}", &q[..3]);
println!("K first 5: {:?}", &k[..5]);
println!("V first 5: {:?}", &v[..5]);
let head_dim = hidden_dim / model.config().num_heads;
let group_size = model.config().num_heads / model.config().num_kv_heads;
let mut attn_out = Vec::with_capacity(hidden_dim);
for h in 0..model.config().num_heads {
let kv_head = h / group_size;
let kv_start = kv_head * head_dim;
attn_out.extend_from_slice(&v[kv_start..kv_start + head_dim]);
}
println!("Attn out first 3: {:?}", &attn_out[..3]);
let out_proj = fused_matmul(
&attn_out,
&layer.attn_output_weight.data,
layer.attn_output_weight.qtype,
layer.attn_output_weight.in_dim,
layer.attn_output_weight.out_dim,
);
println!("Output proj first 3: {:?}", &out_proj[..3]);
let residual1: Vec<f32> = embedding
.iter()
.zip(out_proj.iter())
.map(|(e, o)| e + o)
.collect();
println!("Residual1 first 3: {:?}", &residual1[..3]);
let ffn_normed = rms_norm(
&residual1,
layer.ffn_norm_weight.as_ref().expect("ffn"),
eps,
);
let gate_weight = layer.ffn_gate_weight.as_ref().expect("gate");
let gate = fused_matmul(
&ffn_normed,
&gate_weight.data,
gate_weight.qtype,
gate_weight.in_dim,
gate_weight.out_dim,
);
let up = fused_matmul(
&ffn_normed,
&layer.ffn_up_weight.data,
layer.ffn_up_weight.qtype,
layer.ffn_up_weight.in_dim,
layer.ffn_up_weight.out_dim,
);
println!("FFN gate first 3: {:?}", &gate[..3]);
println!("FFN up first 3: {:?}", &up[..3]);
let swiglu: Vec<f32> = gate
.iter()
.zip(up.iter())
.map(|(g, u)| (g / (1.0 + (-*g).exp())) * u)
.collect();
println!("SwiGLU first 3: {:?}", &swiglu[..3]);
let ffn_down = fused_matmul(
&swiglu,
&layer.ffn_down_weight.data,
layer.ffn_down_weight.qtype,
layer.ffn_down_weight.in_dim,
layer.ffn_down_weight.out_dim,
);
println!("FFN down first 3: {:?}", &ffn_down[..3]);
let output: Vec<f32> = residual1
.iter()
.zip(ffn_down.iter())
.map(|(r, f)| r + f)
.collect();
println!("Layer output first 3: {:?}", &output[..3]);
println!("\n=== Comparison with GPU (from debug_layer_by_layer) ===");
println!("GPU RMSNorm: [-0.9617413, 0.3485948, 0.40064743]");
println!("GPU Q: [-0.25516012, -0.6839622, -0.04699441]");
println!("GPU Attn: [-0.11200829, -0.066736706, 0.2103174]");
println!("GPU Out proj: [-0.95968825, 0.011617601, -0.059631474]");
println!("GPU Residual1: [-0.9881397, 0.023871362, -0.047377713]");
println!("GPU FFN gate: [-0.6621405, -0.15498713, -1.1224853]");
println!("GPU FFN up: [0.58142, 0.092911005, 0.32663453]");
println!("GPU SwiGLU: [-0.13099347, -0.0066431654, -0.09002926]");
println!("GPU FFN down: [-0.029813662, 0.17109162, 0.08764193]");
println!("GPU Layer out: [-1.0179534, 0.19496298, 0.04026422]");
}