use crate::tokenizers::{
traits::{Decoder, Encoder, Tokenizer},
Encoding, Error, Result, TokenIdType,
};
use sentencepiece::SentencePieceProcessor;
pub struct SentencePieceTokenizer {
spp: SentencePieceProcessor,
}
impl SentencePieceTokenizer {
pub fn from_file(tokenizer_name: &str) -> Result<Self> {
let spp = SentencePieceProcessor::open(tokenizer_name)
.map_err(|err| Error::msg(format!("Error loading tokenizer: {}", err)))?;
Ok(Self { spp })
}
}
impl Encoder for SentencePieceTokenizer {
fn encode(&self, input: &str) -> Result<Encoding> {
let encoding = self
.spp
.encode(input)
.map_err(|err| Error::msg(format!("Error encoding input: {}", err)))?;
let mut token_ids = Vec::new();
let mut tokens = Vec::new();
let mut spans = Vec::new();
for piece in encoding {
token_ids.push(piece.id);
tokens.push(piece.piece);
spans.push((piece.span.0 as usize, piece.span.1 as usize));
}
Ok(Encoding {
token_ids,
tokens,
spans,
})
}
}
impl Decoder for SentencePieceTokenizer {
fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
if skip_special_tokens {
return Err(Error::msg(
"SentencePieceTokenizer does not support skip_special_tokens=true.",
));
}
let text = self
.spp
.decode_piece_ids(token_ids)
.map_err(|err| Error::msg(format!("Error decoding input: {}", err)))?;
Ok(text)
}
}
impl Tokenizer for SentencePieceTokenizer {}