use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel};
/// Diagnostic tool: loads a quantized GGUF model, runs a single forward pass
/// on the BOS token, and prints logit and embedding statistics.
///
/// The model path is taken from the first command-line argument; when no
/// argument is given the built-in default path is used.
///
/// # Errors
///
/// Returns an error when the model cannot be mapped or loaded, when the
/// forward pass fails, when the model reports `hidden_dim == 0`, when the
/// forward pass yields no logits, or when the BOS token id does not fit in the
/// embedding table or the logits vector.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// Path used when no model path is supplied on the command line.
    const DEFAULT_MODEL_PATH: &str = "/home/noah/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct-GGUF/snapshots/198f08841147e5196a6a69bd0053690fb1fd3857/qwen2-0_5b-instruct-q4_0.gguf";
    /// Token id fed to the forward pass and used for the embedding comparison.
    const BOS_TOKEN: u32 = 151643;

    let path = match std::env::args_os().nth(1) {
        Some(arg) => arg
            .into_string()
            .map_err(|_| "model path argument is not valid UTF-8")?,
        None => DEFAULT_MODEL_PATH.to_string(),
    };
    let mapped = MappedGGUFModel::from_path(path.as_str())?;
    let model = OwnedQuantizedModel::from_mapped(&mapped)?;

    let hidden_dim = model.config().hidden_dim;
    if hidden_dim == 0 {
        // Guard the division below; integer division by zero would panic.
        return Err("model reports hidden_dim == 0".into());
    }
    let embedding = model.token_embedding();
    let vocab_size = embedding.len() / hidden_dim;
    let lm_head = model.lm_head_weight();

    println!("Model info:");
    println!(" hidden_dim: {}", hidden_dim);
    println!(" vocab_size: {}", vocab_size);
    println!(" lm_head qtype: {}", lm_head.qtype);
    println!(
        " lm_head dims: in={}, out={}",
        lm_head.in_dim, lm_head.out_dim
    );

    let logits = model.forward(&[BOS_TOKEN])?;
    let first_logit = logits
        .first()
        .copied()
        .ok_or("forward pass returned no logits")?;

    println!("\nLogits (first 20 tokens):");
    for (i, logit) in logits.iter().enumerate().take(20) {
        println!(" token {}: {:.4}", i, logit);
    }

    // NaN has no ordering, so it is excluded from the argmax and reported
    // separately instead of aborting the whole diagnostic run.
    let nan_count = logits.iter().filter(|v| v.is_nan()).count();
    if nan_count > 0 {
        eprintln!(
            "warning: {} of {} logits are NaN and were ignored for the argmax",
            nan_count,
            logits.len()
        );
    }
    match argmax(&logits) {
        Some((max_idx, max_val)) => {
            println!("\nArgmax: token {} with logit {:.4}", max_idx, max_val);
        }
        None => println!("\nArgmax: unavailable (every logit is NaN)"),
    }

    // Rows of the embedding table are read as consecutive `hidden_dim`-sized
    // chunks, with row `t` starting at `t * hidden_dim`.
    let bos_index = BOS_TOKEN as usize;
    let bos_start = bos_index * hidden_dim;
    let tok0_emb = embedding
        .get(..hidden_dim)
        .ok_or("token embedding table is shorter than one row")?;
    let bos_emb = embedding
        .get(bos_start..bos_start + hidden_dim)
        .ok_or_else(|| {
            format!(
                "BOS token {} is outside the embedding table ({} rows)",
                BOS_TOKEN, vocab_size
            )
        })?;
    let tok0_norm = l2_norm(tok0_emb);
    let bos_norm = l2_norm(bos_emb);

    println!("\nEmbedding analysis:");
    println!(" token_0 norm: {:.4}", tok0_norm);
    println!(" BOS norm: {:.4}", bos_norm);
    println!("\nRequired hidden state norm (if perfectly aligned with token 0):");
    println!(
        " ||hidden|| = {:.4} / {:.4} = {:.4}",
        first_logit,
        tok0_norm,
        first_logit / tok0_norm
    );

    // A zero norm makes this ratio NaN/inf; it is printed as-is since the
    // value itself is the diagnostic.
    let cos_sim = dot_product(tok0_emb, bos_emb) / (tok0_norm * bos_norm);
    println!(" cosine_sim(token_0, BOS) = {:.4}", cos_sim);

    let bos_logit = logits.get(bos_index).copied().ok_or_else(|| {
        format!(
            "BOS token {} is outside the logits vector (len {})",
            BOS_TOKEN,
            logits.len()
        )
    })?;
    println!(" BOS logit: {:.4}", bos_logit);
    Ok(())
}

/// Returns the index and value of the largest non-NaN element, or `None` when
/// the slice is empty or contains only NaN.
fn argmax(values: &[f32]) -> Option<(usize, f32)> {
    values
        .iter()
        .copied()
        .enumerate()
        .filter(|(_, v)| !v.is_nan())
        .max_by(|a, b| a.1.total_cmp(&b.1))
}

/// Euclidean (L2) norm of a vector.
fn l2_norm(v: &[f32]) -> f32 {
    v.iter().map(|x| x * x).sum::<f32>().sqrt()
}

/// Dot product of two vectors, truncated to the shorter length.
fn dot_product(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}