use anyhow::{Result, anyhow};
use std::path::Path;
use tokenizers::Tokenizer;
/// A [`Tokenizer`] bundled with the ids of the Whisper special tokens.
///
/// The ids are resolved once, when the value is built by `from_file`, so the
/// public fields can be read directly without further vocabulary lookups.
pub struct WhisperTokenizer {
    /// Underlying tokenizer used for vocabulary lookups, encoding and decoding.
    tokenizer: Tokenizer,
    /// Id of the `<|startoftranscript|>` token.
    pub sot: u32,
    /// Id of the `<|endoftext|>` token.
    pub eot: u32,
    /// Id of the `<|transcribe|>` token.
    pub transcribe: u32,
    /// Id of the `<|translate|>` token.
    pub translate: u32,
    /// Id of the `<|nospeech|>` token, or of `<|nocaptions|>` when the
    /// vocabulary does not contain the former.
    pub no_speech: u32,
    /// Id of the `<|notimestamps|>` token.
    pub no_timestamps: u32,
}
impl WhisperTokenizer {
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
let tokenizer =
Tokenizer::from_file(path).map_err(|e| anyhow!("failed to load tokenizer: {e}"))?;
let lookup = |name: &str| -> Result<u32> {
tokenizer
.token_to_id(name)
.ok_or_else(|| anyhow!("special token not found: {name}"))
};
let no_speech = lookup("<|nospeech|>").or_else(|_| lookup("<|nocaptions|>"))?;
Ok(Self {
sot: lookup("<|startoftranscript|>")?,
eot: lookup("<|endoftext|>")?,
transcribe: lookup("<|transcribe|>")?,
translate: lookup("<|translate|>")?,
no_speech,
no_timestamps: lookup("<|notimestamps|>")?,
tokenizer,
})
}
pub fn language_token(&self, lang: &str) -> Option<u32> {
self.tokenizer.token_to_id(&format!("<|{lang}|>"))
}
pub fn initial_tokens(&self, language: Option<&str>) -> Vec<u32> {
let mut tokens = vec![self.sot];
if let Some(lang) = language
&& let Some(id) = self.language_token(lang)
{
tokens.push(id);
}
tokens.push(self.transcribe);
tokens.push(self.no_timestamps);
tokens
}
pub fn encode(&self, text: &str) -> Result<Vec<u32>> {
let encoding = self
.tokenizer
.encode(text, false)
.map_err(|e| anyhow!("encode error: {e}"))?;
Ok(encoding.get_ids().to_vec())
}
pub fn decode(&self, ids: &[u32]) -> Result<String> {
self.tokenizer
.decode(ids, false)
.map_err(|e| anyhow!("decode error: {e}"))
}
pub fn id_to_token(&self, id: u32) -> Option<String> {
self.tokenizer.id_to_token(id)
}
}