use std::path::Path;
use crate::error::TokenizerError;
use crate::tokenizer::tokenization_utils::{clean_text, decompose_nfkc, is_whitespace, lowercase};
use crate::tokenizer::{MultiThreadedTokenizer, Tokenizer};
use crate::vocab::{SentencePieceBpeModel, SentencePieceVocab, Vocab};
use crate::{Token, TokenRef};
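
/// # SentencePiece BPE tokenizer
///
/// Combines a SentencePiece BPE model with its vocabulary. Before the model splits the
/// input into sub-words, the text is cleaned, NFKC decomposed, optionally lower cased and
/// its whitespace is replaced by the `\u{2581}` (lower one eighth block) marker used by
/// SentencePiece.
///
/// Example (a minimal sketch, not a guaranteed API surface: it assumes the crate is
/// exposed as `rust_tokenizers` and that a SentencePiece protobuf model exists at the
/// given path):
///
/// ```no_run
/// use rust_tokenizers::tokenizer::{SentencePieceBpeTokenizer, Tokenizer};
///
/// let lower_case = false;
/// let tokenizer =
///     SentencePieceBpeTokenizer::from_file("path/to/spiece.model", lower_case).unwrap();
/// let tokens = tokenizer.tokenize("Hello, world!");
/// ```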
pub struct SentencePieceBpeTokenizer {
    /// BPE model performing the actual sub-word splits
    model: SentencePieceBpeModel,
    /// Vocabulary mapping tokens to vocabulary indices
    vocab: SentencePieceVocab,
    /// If `true`, input text is lower cased before tokenization
    lower_case: bool,
}

impl SentencePieceBpeTokenizer {
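    /// Creates a `SentencePieceBpeTokenizer` from a SentencePiece protobuf file and a
    /// special token mapping file. The BPE model and the vocabulary are both read from
    /// `path`; `special_token_mapping_path` points to a file defining the special tokens
    /// used by the vocabulary.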
    pub fn from_file_with_special_token_mapping<P: AsRef<Path>, S: AsRef<Path>>(
        path: P,
        lower_case: bool,
        special_token_mapping_path: S,
    ) -> Result<SentencePieceBpeTokenizer, TokenizerError> {
        let model = SentencePieceBpeModel::from_file(&path)?;
        let vocab = SentencePieceVocab::from_file_with_special_token_mapping(
            path,
            special_token_mapping_path,
        )?;
        Ok(SentencePieceBpeTokenizer {
            model,
            vocab,
            lower_case,
        })
    }
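
    /// Creates a `SentencePieceBpeTokenizer` from a SentencePiece protobuf file, using
    /// the default special token mapping. The BPE model and the vocabulary are read from
    /// the same file.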
    pub fn from_file<P: AsRef<Path>>(
        path: P,
        lower_case: bool,
    ) -> Result<SentencePieceBpeTokenizer, TokenizerError> {
        let model = SentencePieceBpeModel::from_file(&path)?;
        let vocab = SentencePieceVocab::from_file(path)?;
        Ok(SentencePieceBpeTokenizer {
            model,
            vocab,
            lower_case,
        })
    }
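
    /// Creates a `SentencePieceBpeTokenizer` from an already loaded vocabulary and BPE
    /// model, avoiding a second read of the model file.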
    pub fn from_existing_vocab_and_model(
        vocab: SentencePieceVocab,
        model: SentencePieceBpeModel,
        lower_case: bool,
    ) -> SentencePieceBpeTokenizer {
        SentencePieceBpeTokenizer {
            model,
            vocab,
            lower_case,
        }
    }
}
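
// The remaining `Tokenizer` methods (`tokenize`, `encode`, ...) come as default
// implementations on the trait; only the vocabulary accessors, the token-level split and
// the token-to-string conversion are provided (or overridden) here.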
impl Tokenizer<SentencePieceVocab> for SentencePieceBpeTokenizer {
    fn vocab(&self) -> &SentencePieceVocab {
        &self.vocab
    }

    fn vocab_mut(&mut self) -> &mut SentencePieceVocab {
        &mut self.vocab
    }
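
    /// Splits a text (as a `TokenRef`) into tokens. The input is cleaned, NFKC
    /// decomposed, optionally lower cased and its whitespace is replaced by the
    /// `\u{2581}` marker (including a leading marker) before the SentencePiece BPE
    /// model performs the actual split.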
    fn tokenize_to_tokens(&self, text: TokenRef) -> Vec<Token> {
        let mut token = text.to_owned();
        // Remove control characters and normalize whitespace
        clean_text(&mut token, true);
        // Apply NFKC Unicode decomposition
        decompose_nfkc(&mut token);
        if self.lower_case {
            lowercase(&mut token);
        }
        // Replace whitespace with the SentencePiece whitespace marker
        token.text = token.text.replace(|c: char| is_whitespace(&c), "\u{2581}");
        // Prepend the whitespace marker (and a matching reference offset) if missing
        if !token.text.starts_with('\u{2581}') {
            token.text.insert(0, '\u{2581}');
            token.reference_offsets.insert(0, 0);
        }
        // Delegate the sub-word split to the BPE model
        self.model.tokenize_to_tokens(token.as_ref())
    }
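
    /// Converts a list of tokens back to a string by concatenating them and replacing
    /// every `\u{2581}` marker with a plain space. The marker prepended during
    /// tokenization therefore becomes a leading space in the round-tripped text.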
    fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String {
        tokens
            .into_iter()
            .map(|v| v.replace('\u{2581}', " "))
            .collect::<Vec<String>>()
            .join("")
    }
}
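
// All `MultiThreadedTokenizer` methods have default implementations building on the
// single-threaded `Tokenizer`, so an empty `impl` block is sufficient.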
impl MultiThreadedTokenizer<SentencePieceVocab> for SentencePieceBpeTokenizer {}