/// Something that can report how many tokens a piece of text occupies.
///
/// Implementors must be `Send + Sync`, so a tokenizer can be shared across
/// threads (for example behind an `Arc<dyn Tokenizer>`).
pub trait Tokenizer: Send + Sync {
    /// Returns the number of tokens that `text` is counted as by this
    /// tokenizer.
    fn count(&self, text: &str) -> usize;
}
/// Dependency-free token estimator based purely on character count.
///
/// The estimate is one token per four characters, rounded up.
#[derive(Debug, Clone, Copy, Default)]
pub struct CharApproxTokenizer;

impl Tokenizer for CharApproxTokenizer {
    /// Estimates the token count of `text` as `ceil(chars / 4)`.
    ///
    /// Characters are Unicode scalar values (`str::chars`), not bytes, so a
    /// multi-byte character counts once. Empty input yields `0`.
    fn count(&self, text: &str) -> usize {
        /// Number of characters treated as a single token.
        const CHARS_PER_TOKEN: usize = 4;

        text.chars().count().div_ceil(CHARS_PER_TOKEN)
    }
}
#[cfg(feature = "tiktoken")]
pub use tiktoken_impl::TiktokenTokenizer;
#[cfg(feature = "tiktoken")]
mod tiktoken_impl {
    use super::Tokenizer;
    use tiktoken_rs::CoreBPE;

    /// [`Tokenizer`] that counts tokens by running a `tiktoken_rs` BPE
    /// encoder over the text.
    pub struct TiktokenTokenizer {
        /// Encoder used by [`Tokenizer::count`].
        bpe: CoreBPE,
    }

    impl TiktokenTokenizer {
        /// Builds a tokenizer from `tiktoken_rs::cl100k_base()`.
        ///
        /// # Panics
        ///
        /// Panics with the message `"cl100k_base loads"` if
        /// `tiktoken_rs::cl100k_base()` returns an error.
        pub fn cl100k_base() -> Self {
            let encoder = tiktoken_rs::cl100k_base().expect("cl100k_base loads");
            Self::from_bpe(encoder)
        }

        /// Builds a tokenizer from `tiktoken_rs::o200k_base()`.
        ///
        /// # Panics
        ///
        /// Panics with the message `"o200k_base loads"` if
        /// `tiktoken_rs::o200k_base()` returns an error.
        pub fn o200k_base() -> Self {
            let encoder = tiktoken_rs::o200k_base().expect("o200k_base loads");
            Self::from_bpe(encoder)
        }

        /// Wraps an already-constructed encoder, taking ownership of it.
        pub fn from_bpe(bpe: CoreBPE) -> Self {
            Self { bpe }
        }
    }

    impl Tokenizer for TiktokenTokenizer {
        /// Encodes `text` with `CoreBPE::encode_with_special_tokens` and
        /// returns how many tokens the encoding contains.
        ///
        /// NOTE(review): presumably special-token markers appearing literally
        /// in `text` are encoded as special tokens rather than as plain
        /// text; confirm against the tiktoken-rs documentation if `text` can
        /// be untrusted.
        fn count(&self, text: &str) -> usize {
            let tokens = self.bpe.encode_with_special_tokens(text);
            tokens.len()
        }
    }
}