use std::ops::{Deref, DerefMut};
use std::path::Path;
use anyhow::{anyhow, Result};
use tokenizers::Decoder;
/// File name that [`Tokenizer::new`] appends to the directory path it is given.
const TOKENIZER_FILENAME: &str = "tokenizer.json";
/// Wrapper around a [`tokenizers::Tokenizer`] together with a flag that is
/// forwarded to the wrapped tokenizer on every `encode` call.
pub struct Tokenizer {
/// The wrapped tokenizer; also reachable through `Deref`/`DerefMut`.
tokenizer: tokenizers::Tokenizer,
/// Passed as the second argument of the wrapped tokenizer's `encode`.
/// Every constructor in this file sets it to `true`.
special_token: bool,
}
impl Tokenizer {
    /// Loads a tokenizer from the `tokenizer.json` file inside the directory `path`.
    ///
    /// # Errors
    ///
    /// Returns an error under the same conditions as [`Tokenizer::from_file`].
    pub fn new<T: AsRef<Path>>(path: T) -> Result<Self> {
        Self::from_file(path.as_ref().join(TOKENIZER_FILENAME))
    }

    /// Loads a tokenizer from the file at `path`.
    ///
    /// The special-token flag of the returned wrapper starts out as `true`.
    ///
    /// # Errors
    ///
    /// Returns an error when the underlying `tokenizers` loader fails; the
    /// loader's message is embedded in the returned error.
    pub fn from_file<T: AsRef<Path>>(path: T) -> Result<Self> {
        // The loader's error type is not directly convertible by `?`, so it is
        // rendered into an `anyhow` message instead.
        let tokenizer = tokenizers::Tokenizer::from_file(path)
            .map_err(|err| anyhow!("failed to load a tokenizer: {err}"))?;
        Ok(Self {
            tokenizer,
            special_token: true,
        })
    }

    /// Clears the special-token flag, so subsequent `encode` calls pass
    /// `false` as the second argument of the wrapped tokenizer's `encode`.
    ///
    /// Returns `self` to allow call chaining.
    pub fn disable_special_token(&mut self) -> &mut Self {
        self.special_token = false;
        self
    }

    /// Misspelled alias of [`Tokenizer::disable_special_token`], kept so that
    /// existing callers continue to compile. Prefer the correctly spelled name.
    pub fn disable_spacial_token(&mut self) -> &mut Self {
        self.disable_special_token()
    }

    /// Returns a mutable reference to the wrapped [`tokenizers::Tokenizer`].
    #[inline]
    pub fn inner(&mut self) -> &mut tokenizers::Tokenizer {
        &mut self.tokenizer
    }
}
impl crate::Tokenizer for Tokenizer {
    /// Encodes `input` with the wrapped tokenizer and returns the resulting
    /// tokens as owned strings.
    ///
    /// The wrapper's special-token flag is forwarded as the second argument
    /// of the underlying `encode` call.
    ///
    /// # Errors
    ///
    /// Returns an error carrying the underlying message when encoding fails.
    fn encode(&self, input: &str) -> Result<Vec<String>> {
        let encoding = self
            .tokenizer
            .encode(input, self.special_token)
            .map_err(|err| anyhow!("failed to encode the given input: {err}"))?;
        Ok(encoding.get_tokens().to_vec())
    }

    /// Turns `tokens` back into a single string using the decoder attached
    /// to the wrapped tokenizer.
    ///
    /// # Errors
    ///
    /// Returns an error when the wrapped tokenizer has no decoder, or when
    /// the decoder itself reports a failure.
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        match self.tokenizer.get_decoder() {
            Some(decoder) => decoder
                .decode(tokens)
                .map_err(|err| anyhow!("failed to decode: {err}")),
            None => Err(anyhow!("no decoder is provided")),
        }
    }
}
impl From<tokenizers::Tokenizer> for Tokenizer {
fn from(tokenizer: tokenizers::Tokenizer) -> Self {
Self {
tokenizer,
special_token: true,
}
}
}
/// Lets a `&Tokenizer` be used wherever a `&tokenizers::Tokenizer` is expected.
impl Deref for Tokenizer {
    type Target = tokenizers::Tokenizer;

    /// Borrows the wrapped tokenizer.
    #[inline]
    fn deref(&self) -> &tokenizers::Tokenizer {
        &self.tokenizer
    }
}
impl DerefMut for Tokenizer {
#[inline]
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.tokenizer
}
}