use std::path::Path;
use tokenizers::Tokenizer;
use super::model::MoonshineError;
/// Wrapper around a [`Tokenizer`] from the `tokenizers` crate that reports
/// failures as [`MoonshineError`] values.
pub struct MoonshineTokenizer {
/// The wrapped tokenizer; every decode call is delegated to it.
tokenizer: Tokenizer,
}
impl MoonshineTokenizer {
pub fn new(model_dir: &Path) -> Result<Self, MoonshineError> {
let tokenizer_path = model_dir.join("tokenizer.json");
if !tokenizer_path.exists() {
return Err(MoonshineError::TokenizerNotFound(
tokenizer_path.display().to_string(),
));
}
log::info!("Loading tokenizer from {:?}...", tokenizer_path);
let tokenizer = Tokenizer::from_file(&tokenizer_path)
.map_err(|e| MoonshineError::Tokenization(e.to_string()))?;
Ok(Self { tokenizer })
}
pub fn decode(&self, token_ids: &[i64]) -> Result<String, MoonshineError> {
let ids: Vec<u32> = token_ids.iter().map(|&id| id as u32).collect();
let text = self
.tokenizer
.decode(&ids, true)
.map_err(|e| MoonshineError::Tokenization(e.to_string()))?;
Ok(text)
}
}