Skip to main content

mermaid_cli/utils/
tokenizer.rs

1use anyhow::Result;
2
/// Token counting utility using length-based estimation.
///
/// Approximates one token per ~4 bytes of UTF-8 text, rounding up. Lengths
/// are taken with `str::len()`, which reports bytes rather than Unicode
/// scalar values, so non-ASCII text is weighted by its encoded size.
///
/// Stateless. Only counting is implemented; context-window budget is
/// enforced via `MAX_CONTEXT_TOKENS` / `CONTEXT_RESERVE_TOKENS`
/// constants and the user's `num_ctx` Ollama option. The old
/// per-model-family table (`get_max_tokens` / `remaining_tokens`) and
/// the `base_model_name` field were never consulted by the budget
/// logic and have been deleted.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct Tokenizer;

impl Tokenizer {
    /// Number of UTF-8 bytes assumed to make up one token.
    const BYTES_PER_TOKEN: usize = 4;

    /// Flat per-message allowance, in bytes, added on top of the role and
    /// content lengths when estimating a chat transcript.
    const MESSAGE_OVERHEAD_BYTES: usize = 4;

    /// Create a new tokenizer.
    ///
    /// The `model_name` parameter is accepted for call-site compatibility
    /// but ignored — the count is a pure function of text length.
    pub fn new(_model_name: &str) -> Self {
        Self
    }

    /// Estimate the token count of a single text string.
    ///
    /// Returns `text.len()` (bytes) divided by 4, rounded up; the empty
    /// string yields `0`.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    pub fn count_tokens(&self, text: &str) -> Result<usize> {
        Ok(Self::tokens_for_bytes(text.len()))
    }

    /// Estimate the token count of a chat transcript.
    ///
    /// Each `(role, content)` pair contributes the byte length of both
    /// strings plus a fixed 4-byte overhead. The per-message sizes are
    /// summed first and the total is then divided by 4, rounded up, so
    /// rounding happens once for the whole transcript rather than per
    /// message. An empty slice yields `0`.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    pub fn count_chat_tokens(&self, messages: &[(String, String)]) -> Result<usize> {
        let total_bytes: usize = messages
            .iter()
            .map(|(role, content)| role.len() + content.len() + Self::MESSAGE_OVERHEAD_BYTES)
            .sum();
        Ok(Self::tokens_for_bytes(total_bytes))
    }

    /// Convert a byte length into an estimated token count, rounding up.
    fn tokens_for_bytes(byte_len: usize) -> usize {
        byte_len.div_ceil(Self::BYTES_PER_TOKEN)
    }
}
36
#[cfg(test)]
mod tests {
    use super::*;

    /// Build owned `(role, content)` pairs from string literals.
    fn chat(pairs: &[(&str, &str)]) -> Vec<(String, String)> {
        pairs
            .iter()
            .map(|&(role, content)| (role.to_string(), content.to_string()))
            .collect()
    }

    #[test]
    fn test_token_counting() {
        let tokenizer = Tokenizer::new("gpt-3.5-turbo");
        let text = "Hello, world! This is a test message.";
        let count = tokenizer.count_tokens(text).unwrap();
        // 37 bytes at 4 bytes per token, rounded up.
        assert_eq!(count, 10);
    }

    #[test]
    fn test_token_counting_boundaries() {
        let tokenizer = Tokenizer::new("any-model");
        assert_eq!(tokenizer.count_tokens("").unwrap(), 0);
        assert_eq!(tokenizer.count_tokens("a").unwrap(), 1);
        assert_eq!(tokenizer.count_tokens("abcd").unwrap(), 1);
        assert_eq!(tokenizer.count_tokens("abcde").unwrap(), 2);
    }

    #[test]
    fn test_token_counting_uses_byte_length() {
        let tokenizer = Tokenizer::new("any-model");
        // Three two-byte characters occupy 6 bytes, giving 2 tokens.
        assert_eq!(tokenizer.count_tokens("ééé").unwrap(), 2);
    }

    #[test]
    fn test_count_chat_tokens() {
        let tokenizer = Tokenizer::new("any-model");
        let messages = chat(&[("user", "Hello"), ("assistant", "Hi there")]);
        let count = tokenizer.count_chat_tokens(&messages).unwrap();
        // (4 + 5 + 4) + (9 + 8 + 4) = 34 bytes, rounded up to 9 tokens.
        assert_eq!(count, 9);
    }

    #[test]
    fn test_count_chat_tokens_empty() {
        let tokenizer = Tokenizer::new("any-model");
        assert_eq!(tokenizer.count_chat_tokens(&[]).unwrap(), 0);
    }
}