use llama_rs::{
backend::{Backend, cpu::CpuBackend},
model::load_llama_model,
tensor::{DType, Tensor},
};
/// Debug utility: dequantizes the FFN down-projection weight of layer 0 and prints the
/// start of the result so it can be compared by eye with reference values from Python.
///
/// The model path is taken from the first command-line argument; when no argument is
/// given, the original hard-coded path is used, so existing invocations keep working.
///
/// # Errors
///
/// Returns an error if the model cannot be loaded, if dequantization fails, if the
/// dequantized tensor cannot be viewed as `f32`, or if it holds fewer than 256 elements.
///
/// # Panics
///
/// Panics if the model has no layers or if layer 0 has no FFN block.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    const DEFAULT_MODEL_PATH: &str = "/home/joseph/Models/qwen2.5-0.5b-instruct-q4_k_m.gguf";
    // Number of leading elements summarised; the output labels them as the first Q6_K block.
    const BLOCK_LEN: usize = 256;
    // Number of leading elements printed verbatim, matching the length of the Python line.
    const PREVIEW_LEN: usize = 10;

    let model_path = std::env::args()
        .nth(1)
        .unwrap_or_else(|| DEFAULT_MODEL_PATH.to_string());

    eprintln!("Loading model...");
    let backend = CpuBackend::new();
    let model = load_llama_model(model_path.as_str())?;

    let layer = &model.layers()[0];
    let w_down = &layer
        .ffn()
        .expect("layer 0 should expose an FFN block to inspect")
        .w_down
        .weight;

    println!("FFN down weight:");
    println!(" Shape: {:?}", w_down.shape());
    println!(" DType: {:?}", w_down.dtype());

    // Dequantize into an F32 tensor with the same shape as the stored weight.
    let mut w_down_f32 = Tensor::zeros(w_down.shape().to_vec(), DType::F32);
    backend.dequantize(w_down, &mut w_down_f32)?;
    let data = w_down_f32.as_f32()?;

    // Fail with a readable message instead of an out-of-range slice panic.
    if data.len() < BLOCK_LEN {
        return Err(format!(
            "dequantized tensor has {} elements, need at least {}",
            data.len(),
            BLOCK_LEN
        )
        .into());
    }
    // Borrow the leading block; it is only read, so no copy is needed.
    let first_block = &data[..BLOCK_LEN];

    println!("\nFirst 256 dequantized elements (should be first Q6_K block):");
    for value in &first_block[..PREVIEW_LEN] {
        print!("{:.6}, ", value);
    }
    println!("...");

    let min = first_block.iter().copied().fold(f32::INFINITY, f32::min);
    let max = first_block
        .iter()
        .copied()
        .fold(f32::NEG_INFINITY, f32::max);
    println!("First block stats: min={:.6}, max={:.6}", min, max);

    // Hard-coded reference values, printed for manual comparison with the output above.
    println!("\nPython first block first 10:");
    println!("[-0.006, -0.217, -0.196, -0.219, -0.005, -0.007, -0.018, -0.221, -0.219, -0.221]");
    Ok(())
}