use realizar::gguf::{
MappedGGUFModel, OwnedQKVWeights, OwnedQuantizedKVCache, OwnedQuantizedModel,
OwnedQuantizedModelCuda,
};
use realizar::quantize::fused_q4k_parallel_matvec;
fn main() -> Result<(), Box<dyn std::error::Error>> {
std::env::set_var("CUDA_GRAPH_DISABLE", "1");
let path = "/home/noah/src/single-shot-eval/models/raw/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf";
let mapped = MappedGGUFModel::from_path(path)?;
let model = OwnedQuantizedModel::from_mapped(&mapped)?;
let hidden_dim = model.config().hidden_dim;
let eps = model.config().eps;
let test_token: u32 = 791;
let embedding = model.embed(&[test_token]);
println!("=== Step 1: Embedding ===");
println!(" first 5: {:?}", &embedding[..5]);
println!(" sum: {:.6}", embedding.iter().sum::<f32>());
let gamma = &model.layers()[0].attn_norm_weight;
let sum_sq: f32 = embedding.iter().map(|x| x * x).sum();
let rms = (sum_sq / hidden_dim as f32 + eps).sqrt();
let normed: Vec<f32> = embedding
.iter()
.zip(gamma)
.map(|(x, g)| x / rms * g)
.collect();
println!("\n=== Step 2: RMSNorm ===");
println!(" first 5: {:?}", &normed[..5]);
println!(" sum: {:.6}", normed.iter().sum::<f32>());
let OwnedQKVWeights::Separate {
q: q_weight,
k: _k_weight,
v: _v_weight,
} = &model.layers()[0].qkv_weight
else {
panic!("Expected separate QKV")
};
let q_cpu = fused_q4k_parallel_matvec(&q_weight.data, &normed, hidden_dim, q_weight.out_dim)?;
println!("\n=== Step 3: Q projection (CPU fused_q4k_parallel_matvec) ===");
println!(" first 5: {:?}", &q_cpu[..5]);
println!(" sum: {:.6}", q_cpu.iter().sum::<f32>());
println!("\n=== GPU Path ===");
let mapped_gpu = MappedGGUFModel::from_path(path)?;
let gpu_model = OwnedQuantizedModel::from_mapped(&mapped_gpu)?;
let mut cuda_model = OwnedQuantizedModelCuda::new(gpu_model, 0)?;
cuda_model.preload_weights_gpu()?;
let executor = cuda_model.executor_mut();
let mut gpu_q = vec![0.0f32; q_weight.out_dim];
executor.q4k_gemv_cached_tiled(
"blk.0.attn_q.weight",
&normed,
&mut gpu_q,
q_weight.out_dim as u32,
hidden_dim as u32,
)?;
println!("GPU Q projection:");
println!(" first 5: {:?}", &gpu_q[..5]);
println!(" sum: {:.6}", gpu_q.iter().sum::<f32>());
let max_diff = q_cpu
.iter()
.zip(&gpu_q)
.map(|(a, b)| (a - b).abs())
.fold(0.0f32, f32::max);
println!("\nQ CPU vs GPU max diff: {:.6}", max_diff);
println!("\n=== CPU model.qkv_matmul() ===");
let cpu_qkv = model.qkv_matmul(&normed, &model.layers()[0].qkv_weight)?;
let cpu_q_from_qkv = &cpu_qkv[0..q_weight.out_dim];
println!(" first 5: {:?}", &cpu_q_from_qkv[..5]);
println!(" sum: {:.6}", cpu_q_from_qkv.iter().sum::<f32>());
let max_diff_qkv = q_cpu
.iter()
.zip(cpu_q_from_qkv)
.map(|(a, b)| (a - b).abs())
.fold(0.0f32, f32::max);
println!(
" vs fused_q4k_parallel_matvec: max diff = {:.6}",
max_diff_qkv
);
println!("\n=== Full CPU forward ===");
let cpu_logits = model.forward(&[test_token])?;
let cpu_argmax = cpu_logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.map(|(i, _)| i)
.unwrap();
println!(" argmax: {}", cpu_argmax);
println!("\n=== Full GPU forward ===");
let kv_dim = model.config().num_kv_heads * (hidden_dim / model.config().num_heads);
let mut gpu_cache = OwnedQuantizedKVCache::new(model.config().num_layers, kv_dim, 64);
let gpu_logits = cuda_model.forward_gpu_resident(test_token, &mut gpu_cache, 0)?;
let gpu_argmax = gpu_logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.map(|(i, _)| i)
.unwrap();
println!(" argmax: {}", gpu_argmax);
println!(
"\nResult: CPU argmax={}, GPU argmax={}",
cpu_argmax, gpu_argmax
);
if cpu_argmax == gpu_argmax {
println!("PASS");
} else {
println!("FAIL - outputs differ!");
}
Ok(())
}