mixtape_core/tokenizer.rs
1//! Token estimation utilities
2//!
3//! Simple utilities for estimating token counts. The default implementations
4//! in `Model` and `ModelProvider` use ~4 characters per token, but you can
5//! use these utilities for custom token estimation.
6
/// Simple length-based token estimator.
///
/// Uses the common "~4 characters per token" heuristic: the estimate is the
/// length of the text divided by a fixed ratio, rounded up.
///
/// The length is measured with [`str::len`], i.e. in UTF-8 bytes, so
/// non-ASCII characters contribute more than one unit each.
#[derive(Debug, Clone)]
pub struct CharacterTokenizer {
    /// Length units per estimated token. Every constructor keeps this >= 1
    /// so the division in `estimate_tokens` can never divide by zero.
    chars_per_token: usize,
}

impl Default for CharacterTokenizer {
    /// Equivalent to [`CharacterTokenizer::new`].
    ///
    /// Written by hand because a derived `Default` would set the ratio to 0,
    /// which makes `estimate_tokens` panic with a division by zero.
    fn default() -> Self {
        Self::new()
    }
}

impl CharacterTokenizer {
    /// Ratio used by [`CharacterTokenizer::new`] and `Default`.
    const DEFAULT_CHARS_PER_TOKEN: usize = 4;

    /// Create a new tokenizer with the default 4 characters per token.
    pub fn new() -> Self {
        Self {
            chars_per_token: Self::DEFAULT_CHARS_PER_TOKEN,
        }
    }

    /// Create a tokenizer with a custom characters-per-token ratio.
    ///
    /// A ratio of 0 is meaningless and would cause a division by zero, so it
    /// is clamped to 1 (every length unit counts as one token).
    pub fn with_chars_per_token(chars_per_token: usize) -> Self {
        Self {
            chars_per_token: chars_per_token.max(1),
        }
    }

    /// Estimate the number of tokens in the given text.
    ///
    /// Returns the UTF-8 byte length of `text` divided by the configured
    /// ratio, rounded up. Empty text yields 0.
    pub fn estimate_tokens(&self, text: &str) -> usize {
        text.len().div_ceil(self.chars_per_token)
    }
}
30
#[cfg(test)]
mod tests {
    use super::*;

    /// Default ratio: 4 length units per token, rounded up.
    #[test]
    fn test_character_tokenizer() {
        let tokenizer = CharacterTokenizer::new();

        // ~4 chars per token (rounds up)
        assert_eq!(tokenizer.estimate_tokens("hell"), 1); // 4 chars = 1 token
        assert_eq!(tokenizer.estimate_tokens("hello"), 2); // 5 chars rounds up to 2 tokens
        assert_eq!(tokenizer.estimate_tokens("hello world"), 3); // 11 chars rounds up to 3 tokens
        assert_eq!(tokenizer.estimate_tokens("this is a longer sentence"), 7); // 25 chars rounds up to 7 tokens
    }

    /// Empty input must estimate to zero tokens, not one.
    #[test]
    fn test_empty_text_is_zero_tokens() {
        let tokenizer = CharacterTokenizer::new();
        assert_eq!(tokenizer.estimate_tokens(""), 0);
    }

    /// A custom ratio changes the divisor used for the estimate.
    #[test]
    fn test_custom_chars_per_token() {
        let tokenizer = CharacterTokenizer::with_chars_per_token(2);
        assert_eq!(tokenizer.estimate_tokens("hi"), 1); // 2 chars = 1 token
        assert_eq!(tokenizer.estimate_tokens("hello"), 3); // 5 chars rounds up to 3 tokens
    }

    /// Length is taken from `str::len`, so multi-byte characters count per byte.
    #[test]
    fn test_length_is_measured_in_bytes() {
        let tokenizer = CharacterTokenizer::with_chars_per_token(1);
        assert_eq!(tokenizer.estimate_tokens("é"), 2); // 'é' is 2 bytes in UTF-8
    }
}