use std::{
fs,
path::PathBuf,
sync::{Mutex, OnceLock},
};
/// URL the TinyLlama `tokenizer.json` is fetched from when it is not cached locally.
const TINYLLAMA_TOKENIZER_URL: &str =
"https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/resolve/main/tokenizer.json";
/// Directory that holds the cached tokenizer. It is a relative path, so it resolves against
/// the process's current working directory.
const CACHE_DIR: &str = ".tokenizer_cache";
/// File name of the cached tokenizer inside `CACHE_DIR`.
const TINYLLAMA_TOKENIZER_FILENAME: &str = "tinyllama_tokenizer.json";
/// Lazily initialized lock that `ensure_tokenizer_cached` holds for its whole body, so callers
/// within this process run the check-and-download sequence one at a time.
static DOWNLOAD_MUTEX: OnceLock<Mutex<()>> = OnceLock::new();
/// Returns the path of the locally cached TinyLlama `tokenizer.json`, downloading the file
/// first when it is not already present.
///
/// The cache location is `CACHE_DIR/TINYLLAMA_TOKENIZER_FILENAME`, relative to the current
/// working directory. Calls within one process are serialized through `DOWNLOAD_MUTEX`, so at
/// most one thread performs the download. The download is written to a temporary file and then
/// renamed into place, so a failed or interrupted write never leaves a truncated file at the
/// final path for later calls to pick up.
///
/// # Panics
///
/// Panics if the cache directory cannot be created, the HTTP request fails or returns a
/// non-success status, the body is shorter than 100 bytes, or the file cannot be written or
/// moved into the cache.
pub fn ensure_tokenizer_cached() -> PathBuf {
    // The mutex protects no data, so a poisoned lock carries no inconsistent state. Recover
    // the guard instead of unwrapping: otherwise a single failed download would make every
    // later caller in this process panic with a `PoisonError` rather than retry.
    let _guard = DOWNLOAD_MUTEX
        .get_or_init(|| Mutex::new(()))
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());

    let cache_dir = PathBuf::from(CACHE_DIR);
    let tokenizer_path = cache_dir.join(TINYLLAMA_TOKENIZER_FILENAME);
    if tokenizer_path.exists() {
        return tokenizer_path;
    }

    // `create_dir_all` is a no-op for an existing directory, so no prior `exists()` probe is
    // needed (and skipping it avoids a check-then-create race).
    fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");

    println!("Downloading TinyLlama tokenizer from HuggingFace...");
    let response = reqwest::blocking::Client::new()
        .get(TINYLLAMA_TOKENIZER_URL)
        .send()
        .expect("Failed to download tokenizer");
    let status = response.status();
    if !status.is_success() {
        panic!("Failed to download tokenizer: HTTP {}", status);
    }

    let content = response.bytes().expect("Failed to read tokenizer content");
    // Sanity check: a real tokenizer.json is far larger; a tiny body is an error page or stub.
    if content.len() < 100 {
        panic!("Downloaded content too small: {} bytes", content.len());
    }

    // Write to a process-unique temporary file in the same directory, then rename it over the
    // final path. Only a fully written file ever appears under the name that the `exists()`
    // check above treats as a valid cache entry.
    let tmp_path = cache_dir.join(format!(
        "{}.{}.tmp",
        TINYLLAMA_TOKENIZER_FILENAME,
        std::process::id()
    ));
    if let Err(err) = fs::write(&tmp_path, &content) {
        // Best-effort cleanup of the partial temporary file before reporting the failure.
        let _ = fs::remove_file(&tmp_path);
        panic!("Failed to write tokenizer to cache: {}", err);
    }
    if let Err(err) = fs::rename(&tmp_path, &tokenizer_path) {
        let _ = fs::remove_file(&tmp_path);
        panic!("Failed to move tokenizer into cache: {}", err);
    }

    println!(
        "Tokenizer downloaded and cached successfully ({} bytes)",
        content.len()
    );
    tokenizer_path
}
/// Fixed set of four sample prompts shared by tests.
///
/// The first two entries differ only in the case of the leading letter.
pub const TEST_PROMPTS: [&str; 4] = [
    "deep learning is",
    "Deep learning is",
    "has anyone seen nemo lately",
    "another prompt",
];
/// Four reference 64-bit hash values used by tests.
///
/// NOTE(review): presumably entry `i` is the expected hash for `TEST_PROMPTS[i]`; the hashing
/// scheme is not visible in this part of the file — confirm against the tests that use it.
pub const EXPECTED_HASHES: [u64; 4] = [
    1_209_591_529_327_510_910,
    4_181_375_434_596_349_981,
    6_245_658_446_118_930_933,
    5_097_285_695_902_185_237,
];