#![allow(clippy::disallowed_methods)]
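// Integration test for `LlamaTokenizer::from_gguf_bytes`. It is marked `#[ignore]`
// because it depends on a locally downloaded TinyLlama GGUF file; when that file is
// present, run it with `cargo test test_tinyllama_tokenizer -- --ignored`.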
#[test]
#[ignore]
fn test_tinyllama_tokenizer() {
    use aprender::text::llama_tokenizer::LlamaTokenizer;
    use std::fs;

    let gguf_path = "/home/noah/src/aprender/tinyllama-1.1b-chat-v1.0.Q4_0.gguf";
    if !std::path::Path::new(gguf_path).exists() {
        println!("Skipping test: TinyLlama GGUF not found at {}", gguf_path);
        return;
    }

    // Load the GGUF file and build the tokenizer from its embedded vocabulary.
    let data = fs::read(gguf_path).expect("Failed to read GGUF");
    println!("Loaded GGUF: {} MB", data.len() / 1_000_000);

    let tokenizer = LlamaTokenizer::from_gguf_bytes(&data).expect("Failed to load tokenizer");
    println!("Tokenizer vocab size: {}", tokenizer.vocab_size());
    println!("BOS token ID: {}", tokenizer.bos_token_id());
    println!("EOS token ID: {}", tokenizer.eos_token_id());

    let test_cases = [
        "Hello",
        "Hello, world!",
        "What is 2+2?",
        "The quick brown fox jumps over the lazy dog.",
    ];

    for text in &test_cases {
        // Round-trip each case: encode, encode with a BOS prefix, then decode.
        let tokens = tokenizer.encode(text);
        let with_bos = tokenizer.encode_with_bos(text);
        let decoded = tokenizer.decode(&with_bos);

        println!("\nText: {:?}", text);
        println!(" Tokens: {:?} ({} tokens)", tokens, tokens.len());
        println!(" With BOS: {:?}", with_bos);
        println!(" Decoded: {:?}", decoded);

        assert_eq!(
            with_bos[0],
            tokenizer.bos_token_id(),
            "BOS token should be first"
        );
        assert!(!tokens.is_empty(), "Encoding should produce tokens");
    }

    println!("\nTokenizer validation complete!");
    println!("To test generation, use: apr chat {}", gguf_path);
}