Skip to main content

mixtape_core/
tokenizer.rs

//! Token estimation utilities
//!
//! Simple utilities for estimating token counts. The default implementations
//! in `Model` and `ModelProvider` use ~4 characters per token, but you can
//! use these utilities for custom token estimation.
6
/// Simple character-based token estimator.
///
/// Estimates token counts by dividing the text length by a fixed
/// characters-per-token ratio (4 by default, a common approximation) and
/// rounding up.
///
/// The stored ratio is always at least 1, so estimation never divides by zero.
#[derive(Debug, Clone)]
pub struct CharacterTokenizer {
    /// Characters assumed per token. Invariant: never zero.
    chars_per_token: usize,
}

impl Default for CharacterTokenizer {
    /// Equivalent to [`CharacterTokenizer::new`].
    ///
    /// Implemented by hand because a derived `Default` would set the ratio to
    /// zero, which makes every call to `estimate_tokens` panic.
    fn default() -> Self {
        Self::new()
    }
}

impl CharacterTokenizer {
    /// Ratio used by [`CharacterTokenizer::new`] and `Default`.
    const DEFAULT_CHARS_PER_TOKEN: usize = 4;

    /// Create a new tokenizer with the default 4 characters per token.
    pub fn new() -> Self {
        Self {
            chars_per_token: Self::DEFAULT_CHARS_PER_TOKEN,
        }
    }

    /// Create a tokenizer with a custom characters-per-token ratio.
    ///
    /// A ratio of `0` is meaningless and would cause a division by zero, so it
    /// is clamped to `1` (one token per character).
    pub fn with_chars_per_token(chars_per_token: usize) -> Self {
        Self {
            chars_per_token: chars_per_token.max(1),
        }
    }

    /// Estimate the number of tokens in the given text.
    ///
    /// The length is measured with [`str::len`], i.e. in UTF-8 bytes, so
    /// non-ASCII characters count as more than one "character". The result is
    /// rounded up; an empty string yields `0`.
    pub fn estimate_tokens(&self, text: &str) -> usize {
        text.len().div_ceil(self.chars_per_token)
    }
}
30
#[cfg(test)]
mod tests {
    use super::*;

    /// The default ratio is 4 characters per token, rounded up.
    #[test]
    fn test_character_tokenizer() {
        let tokenizer = CharacterTokenizer::new();

        // ~4 chars per token (rounds up)
        assert_eq!(tokenizer.estimate_tokens("hell"), 1); // 4 chars = 1 token
        assert_eq!(tokenizer.estimate_tokens("hello"), 2); // 5 chars rounds up to 2 tokens
        assert_eq!(tokenizer.estimate_tokens("hello world"), 3); // 11 chars rounds up to 3 tokens
        assert_eq!(tokenizer.estimate_tokens("this is a longer sentence"), 7); // 25 chars rounds up to 7 tokens
    }

    /// Empty input contains no tokens.
    #[test]
    fn test_empty_text() {
        let tokenizer = CharacterTokenizer::new();

        assert_eq!(tokenizer.estimate_tokens(""), 0);
    }

    /// A custom ratio changes the divisor used for the estimate.
    #[test]
    fn test_custom_chars_per_token() {
        let tokenizer = CharacterTokenizer::with_chars_per_token(2);

        assert_eq!(tokenizer.estimate_tokens("hell"), 2); // 4 chars / 2 = 2 tokens
        assert_eq!(tokenizer.estimate_tokens("hello"), 3); // 5 chars rounds up to 3 tokens
    }
}