/// Heuristic tokenizer that estimates how many tokens a text occupies.
///
/// The estimate is the UTF-8 byte length of the input divided by a fixed
/// "characters per token" ratio, rounded up. No real tokenization happens.
#[derive(Debug, Clone)]
pub struct CharacterTokenizer {
    /// Length units (bytes of the input `&str`) assumed to form one token.
    /// Every constructor keeps this at 1 or more so the division in
    /// `estimate_tokens` can never divide by zero.
    chars_per_token: usize,
}

impl Default for CharacterTokenizer {
    /// Returns the same tokenizer as [`CharacterTokenizer::new`].
    ///
    /// A derived `Default` would set the ratio to 0, which makes
    /// `estimate_tokens` panic with a division by zero.
    fn default() -> Self {
        Self::new()
    }
}

impl CharacterTokenizer {
    /// Ratio used by [`CharacterTokenizer::new`] and `Default`.
    const DEFAULT_CHARS_PER_TOKEN: usize = 4;

    /// Creates a tokenizer that assumes 4 characters per token.
    pub fn new() -> Self {
        Self {
            chars_per_token: Self::DEFAULT_CHARS_PER_TOKEN,
        }
    }

    /// Creates a tokenizer with a custom characters-per-token ratio.
    ///
    /// A ratio of 0 is meaningless and would cause a division by zero, so it
    /// is raised to 1 (every length unit counts as one token).
    pub fn with_chars_per_token(chars_per_token: usize) -> Self {
        Self {
            chars_per_token: chars_per_token.max(1),
        }
    }

    /// Estimates the number of tokens in `text`.
    ///
    /// Returns `text.len()` divided by the configured ratio, rounded up; the
    /// empty string yields 0. Note that `str::len` counts UTF-8 bytes, so
    /// non-ASCII characters contribute more than one unit each.
    pub fn estimate_tokens(&self, text: &str) -> usize {
        text.len().div_ceil(self.chars_per_token)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the estimates produced by the tokenizer built with `new()`
    /// for inputs that land exactly on, and just past, a token boundary.
    #[test]
    fn test_character_tokenizer() {
        let tokenizer = CharacterTokenizer::new();

        // (input text, expected token estimate)
        let cases: [(&str, usize); 4] = [
            ("hell", 1),
            ("hello", 2),
            ("hello world", 3),
            ("this is a longer sentence", 7),
        ];

        for (input, expected) in cases {
            assert_eq!(tokenizer.estimate_tokens(input), expected);
        }
    }
}