use tokenizers::tokenizer::Tokenizer as HfTokenizer;
use super::{
traits::{Decoder, Encoder, Tokenizer},
Encoding, Error, Result, TokenIdType,
};
/// Adapter that exposes a Hugging Face `tokenizers` tokenizer through this
/// crate's `Encoder`, `Decoder` and `Tokenizer` traits.
pub struct HuggingFaceTokenizer {
/// The wrapped Hugging Face tokenizer; all encode/decode calls are
/// forwarded to it.
tokenizer: HfTokenizer,
}
impl HuggingFaceTokenizer {
pub fn from_file(model_name: &str) -> Result<Self> {
let tokenizer = HfTokenizer::from_file(model_name)
.map_err(|err| Error::msg(format!("Error loading tokenizer: {}", err)))?;
Ok(HuggingFaceTokenizer { tokenizer })
}
pub fn from_tokenizer(tokenizer: HfTokenizer) -> Self {
HuggingFaceTokenizer { tokenizer }
}
}
impl Encoder for HuggingFaceTokenizer {
fn encode(&self, input: &str) -> Result<Encoding> {
let encoding = self
.tokenizer
.encode(input, false)
.map_err(|err| Error::msg(format!("Error encoding input: {}", err)))?;
let token_ids = encoding.get_ids().to_vec();
let tokens = encoding.get_tokens().to_vec();
let spans = encoding.get_offsets().to_vec();
Ok(Encoding {
token_ids,
tokens,
spans,
})
}
}
impl Decoder for HuggingFaceTokenizer {
fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
let text = self
.tokenizer
.decode(token_ids, skip_special_tokens)
.map_err(|err| Error::msg(format!("Error decoding input: {}", err)))?;
Ok(text)
}
}
// Opt `HuggingFaceTokenizer` into the `Tokenizer` trait. The impl body is
// empty, so nothing is overridden here; any behavior comes from the trait
// itself (presumably built on the `Encoder`/`Decoder` impls above — confirm
// against the trait definition).
impl Tokenizer for HuggingFaceTokenizer {}
impl From<HfTokenizer> for HuggingFaceTokenizer {
fn from(tokenizer: HfTokenizer) -> Self {
HuggingFaceTokenizer { tokenizer }
}
}