use std::fs;
use std::path::{Path, PathBuf};
use hf_hub::api::sync::{Api, ApiBuilder};
use crate::tokenizer::{Tokenizer, TokenizerError};
/// Ensures the `tokenizer.json` for `family` is present under `root` and returns its path.
///
/// The file lives at `<root>/<family.as_str()>/tokenizer.json`. If that path already
/// exists it is returned immediately and nothing is downloaded. Otherwise the family
/// directory is created, the file named by `family.hf_source()` is fetched through the
/// `hf_hub` sync API, and the fetched file is copied into place.
///
/// The copy goes to a sibling staging file first and is then renamed onto the final
/// path, so an interrupted or failed copy never leaves a truncated `tokenizer.json`
/// that a later call would mistake for a complete one.
///
/// # Errors
///
/// * [`TokenizerError::Io`] if the family directory cannot be created, or if copying /
///   renaming the fetched file into place fails.
/// * [`TokenizerError::Download`] if the API client cannot be built or the fetch fails.
pub fn ensure_on_disk(root: &Path, family: Tokenizer) -> Result<PathBuf, TokenizerError> {
    let dest_dir = root.join(family.as_str());
    let dest_file = dest_dir.join("tokenizer.json");
    if dest_file.exists() {
        return Ok(dest_file);
    }

    fs::create_dir_all(&dest_dir).map_err(|source| TokenizerError::Io {
        path: dest_dir.display().to_string(),
        source,
    })?;

    let (repo_id, filename) = family.hf_source();
    tracing::info!(
        target: "rover::tokenizer",
        family = %family,
        repo = repo_id,
        "downloading tokenizer from HuggingFace"
    );

    let api: Api = ApiBuilder::new()
        .with_progress(false)
        .build()
        .map_err(|e| TokenizerError::Download { family, source: e })?;
    let staged: PathBuf = api
        .model(repo_id.to_string())
        .get(filename)
        .map_err(|e| TokenizerError::Download { family, source: e })?;

    // Stage next to the destination (same directory, hence same filesystem) so the
    // final rename does not have to cross a mount point. The process id keeps
    // separate processes from sharing one staging file.
    let partial = dest_dir.join(format!(".tokenizer.json.{}.partial", std::process::id()));
    let size = fs::copy(&staged, &partial)
        .and_then(|bytes| fs::rename(&partial, &dest_file).map(|()| bytes))
        .map_err(|source| {
            // Best-effort cleanup: the original I/O error is what the caller needs,
            // so a failure to remove the staging file is deliberately ignored.
            let _ = fs::remove_file(&partial);
            TokenizerError::Io {
                path: dest_file.display().to_string(),
                source,
            }
        })?;

    tracing::info!(
        target: "rover::tokenizer",
        family = %family,
        bytes = size,
        path = %dest_file.display(),
        "downloaded tokenizer"
    );
    Ok(dest_file)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pre-populates `<root>/cl100k/tokenizer.json` and expects `ensure_on_disk`
    /// to hand back exactly that path via its early-return branch.
    #[test]
    fn already_present_file_is_returned_directly() {
        // Arrange: a scratch root that already contains the tokenizer file.
        let scratch = tempfile::tempdir().unwrap();
        let family_dir = scratch.path().join("cl100k");
        fs::create_dir_all(&family_dir).unwrap();
        let expected = family_dir.join("tokenizer.json");
        fs::write(&expected, "{}").unwrap();

        // Act
        let resolved = ensure_on_disk(scratch.path(), Tokenizer::Cl100k).unwrap();

        // Assert
        assert_eq!(resolved, expected);
    }
}