use std::env;
use std::fs;
use std::process::ExitCode;
use rullama::gguf::GgufReader;
use rullama::tokenizer::BpeTokenizer;
/// Loads the GGUF file named by the first command-line argument, builds a
/// `BpeTokenizer` from it, and compares `encode` output against a fixed table
/// of expected token ids, printing PASS/FAIL for each case.
///
/// Exit status:
/// - success when every case matches,
/// - `1` when the file cannot be read or parsed, the tokenizer cannot be
///   built, or at least one case mismatches,
/// - `2` when the path argument is missing.
fn main() -> ExitCode {
    let path = match env::args().nth(1) {
        Some(p) => p,
        None => {
            eprintln!("usage: encode_check <gguf>");
            return ExitCode::from(2);
        }
    };

    // A missing or unreadable file is an ordinary user error, not a bug:
    // report it the same way as a tokenizer build failure instead of panicking.
    let bytes = match fs::read(&path) {
        Ok(b) => b,
        Err(e) => {
            eprintln!("read error: {path}: {e}");
            return ExitCode::from(1);
        }
    };
    let r = match GgufReader::new(bytes) {
        Ok(r) => r,
        Err(e) => {
            // Debug formatting is used because the previous `expect` call only
            // guarantees that this error type implements `Debug`.
            eprintln!("gguf parse error: {e:?}");
            return ExitCode::from(1);
        }
    };
    let tok = match BpeTokenizer::from_gguf(&r) {
        Ok(t) => t,
        Err(e) => {
            eprintln!("tokenizer build error: {e}");
            return ExitCode::from(1);
        }
    };
    println!("vocab_size = {}", tok.vocab_size());

    // Each entry pairs an input string with the token ids `encode` must return.
    let cases: &[(&str, &[u32])] = &[
        ("Hello, world!", &[9259, 236764, 1902, 236888]),
        (
            "<|turn>user\nWhat is 2+2?<turn|>\n<|turn>model\n",
            &[
                105, 2364, 107, 3689, 563, 236743, 236778, 236862, 236778, 236881, 106, 107, 105,
                4368, 107,
            ],
        ),
    ];

    // Run every case even after a failure so all mismatches are reported.
    let mut all_pass = true;
    for (input, expected) in cases {
        let got = tok.encode(input);
        let ok = got == *expected;
        all_pass &= ok;
        println!(
            "{} {:?}\n got = {:?}\n expected = {:?}",
            if ok { "PASS" } else { "FAIL" },
            input,
            got,
            expected
        );
    }

    if all_pass {
        ExitCode::SUCCESS
    } else {
        ExitCode::from(1)
    }
}