use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
use tokenizers::models::bpe::BPE;
use tokenizers::models::wordpiece::WordPiece;
use tokenizers::normalizers::bert::BertNormalizer;
use tokenizers::pre_tokenizers::bert::BertPreTokenizer;
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::processors::bert::BertProcessing;
use tokenizers::tokenizer::{Model, Tokenizer};
#[allow(dead_code)]
pub fn get_empty() -> Tokenizer {
Tokenizer::new(BPE::default())
}
#[allow(dead_code)]
pub fn get_byte_level_bpe() -> BPE {
BPE::from_file("data/gpt2-vocab.json", "data/gpt2-merges.txt")
.build()
.expect("Files not found, run `make test` to download these files")
}
#[allow(dead_code)]
pub fn get_byte_level(add_prefix_space: bool, trim_offsets: bool) -> Tokenizer {
let mut tokenizer = Tokenizer::new(get_byte_level_bpe());
tokenizer
.with_pre_tokenizer(Some(
ByteLevel::default().add_prefix_space(add_prefix_space),
))
.with_decoder(Some(ByteLevel::default()))
.with_post_processor(Some(ByteLevel::default().trim_offsets(trim_offsets)));
tokenizer
}
#[allow(dead_code)]
pub fn get_bert_wordpiece() -> WordPiece {
WordPiece::from_file("data/bert-base-uncased-vocab.txt")
.build()
.expect("Files not found, run `make test` to download these files")
}
#[allow(dead_code)]
pub fn get_bert() -> Tokenizer {
let mut tokenizer = Tokenizer::new(get_bert_wordpiece());
let sep = tokenizer.get_model().token_to_id("[SEP]").unwrap();
let cls = tokenizer.get_model().token_to_id("[CLS]").unwrap();
tokenizer
.with_normalizer(Some(BertNormalizer::default()))
.with_pre_tokenizer(Some(BertPreTokenizer))
.with_decoder(Some(WordPieceDecoder::default()))
.with_post_processor(Some(BertProcessing::new(
(String::from("[SEP]"), sep),
(String::from("[CLS]"), cls),
)));
tokenizer
}