// token_count/tokenizers/mod.rs
1//! Tokenizer implementations for various LLM models
2//!
3//! This module provides the core tokenization functionality for supported LLM models.
4//!
5//! # Architecture
6//!
7//! The tokenization system uses a trait-based design for extensibility:
8//!
9//! - [`Tokenizer`] - Trait for all tokenizer implementations
10//! - [`openai::OpenAITokenizer`] - OpenAI model tokenizer using tiktoken
11//! - [`registry::ModelRegistry`] - Registry of supported models with lazy initialization
12//!
13//! # Example
14//!
15//! ```
16//! use token_count::tokenizers::registry::ModelRegistry;
17//!
18//! // Get the global model registry
19//! let registry = ModelRegistry::global();
20//!
21//! // Get a tokenizer for a specific model
22//! let tokenizer = registry.get_tokenizer("gpt-4", false).unwrap();
23//!
24//! // Count tokens
25//! let count = tokenizer.count_tokens("Hello world").unwrap();
26//! assert_eq!(count, 2);
27//!
28//! // Get model information
29//! let info = tokenizer.get_model_info();
30//! assert_eq!(info.name, "gpt-4");
31//! assert_eq!(info.encoding, "cl100k_base");
32//! ```
33//!
34//! # Supported Models
35//!
36//! Currently supports:
37//! - OpenAI models: GPT-3.5 Turbo, GPT-4, GPT-4 Turbo, GPT-4o
38//! - Claude models: Claude 4.0-4.6 (Opus, Sonnet, Haiku variants)
39//!
40//! See [`registry::ModelRegistry`] for model configuration and aliases.
41
42pub mod claude;
43pub mod google;
44pub mod openai;
45pub mod registry;
46
47use std::fmt;
48
/// Result of token counting, indicating whether count is estimated or exact
///
/// Both variants wrap the raw count; the distinction only changes how the
/// value is rendered by the `Display` impl below (`~` prefix for estimates).
/// Cheap to copy (`Copy` derive), so pass by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCount {
    /// Estimated count using heuristics (displays with ~ prefix)
    Estimated(usize),

    /// Exact count from official API (displays without prefix)
    Exact(usize),
}
58
59impl TokenCount {
60    /// Get the numeric value regardless of estimation status
61    pub fn value(&self) -> usize {
62        match self {
63            Self::Estimated(n) | Self::Exact(n) => *n,
64        }
65    }
66
67    /// Check if this count is estimated
68    pub fn is_estimated(&self) -> bool {
69        matches!(self, Self::Estimated(_))
70    }
71
72    /// Check if this count is exact
73    pub fn is_exact(&self) -> bool {
74        matches!(self, Self::Exact(_))
75    }
76}
77
78impl fmt::Display for TokenCount {
79    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80        match self {
81            Self::Estimated(n) => write!(f, "~{}", n),
82            Self::Exact(n) => write!(f, "{}", n),
83        }
84    }
85}
86
/// Detailed information about a single token
///
/// Produced by [`Tokenizer::encode_with_details`] for debug output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenDetail {
    /// The token ID as produced by the tokenizer's encoder
    pub id: u32,
    /// The decoded text representation of this token
    // NOTE(review): presumably may be lossy when a single token maps to a
    // partial UTF-8 sequence — confirm against the concrete tokenizer impls.
    pub text: String,
}
95
/// Trait for tokenizing text with a specific model
///
/// Bounded by `Send + Sync` so implementations can be shared across threads
/// (e.g. handed out by [`registry::ModelRegistry::global`]).
pub trait Tokenizer: Send + Sync {
    /// Count the number of tokens in the given text
    ///
    /// # Errors
    ///
    /// Returns an error when the implementation fails to tokenize `text`;
    /// the exact conditions are implementation-defined.
    fn count_tokens(&self, text: &str) -> anyhow::Result<usize>;

    /// Get information about the model
    fn get_model_info(&self) -> ModelInfo;

    /// Encode text and return detailed token information (optional, for debug mode)
    ///
    /// Returns `None` if the tokenizer doesn't support detailed tokenization.
    /// This is used for debug output (`-vvv` flag).
    ///
    /// # Errors
    ///
    /// Implementations that do support details may fail while encoding;
    /// the default implementation never errors (it always returns `Ok(None)`).
    fn encode_with_details(&self, _text: &str) -> anyhow::Result<Option<Vec<TokenDetail>>> {
        Ok(None)
    }
}
112
/// Information about a tokenization model
///
/// Returned by [`Tokenizer::get_model_info`]; rendered as
/// `"{name} ({encoding})"` by its `Display` impl.
#[derive(Debug, Clone)]
pub struct ModelInfo {
    /// Canonical model name, e.g. `"gpt-4"`
    pub name: String,
    /// Tokenizer encoding name, e.g. `"cl100k_base"`
    pub encoding: String,
    /// Maximum context size — presumably in tokens; confirm against registry data
    pub context_window: usize,
    /// Human-readable description of the model
    pub description: String,
}
121
122impl fmt::Display for ModelInfo {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        write!(f, "{} ({})", self.name, self.encoding)
125    }
126}
127
/// Result of tokenization operation
///
/// Bundles the raw count with the model that produced it, plus optional
/// per-token details when debug output was requested.
#[derive(Debug, Clone)]
pub struct TokenizationResult {
    /// Number of tokens counted in the input text
    pub token_count: usize,
    /// Model that produced the count
    pub model_info: ModelInfo,
    /// Optional detailed token information (for debug mode)
    pub token_details: Option<Vec<TokenDetail>>,
}
136
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn estimated_display_has_tilde_prefix() {
        assert_eq!(TokenCount::Estimated(42).to_string(), "~42");
    }

    #[test]
    fn exact_display_has_no_prefix() {
        assert_eq!(TokenCount::Exact(42).to_string(), "42");
    }

    #[test]
    fn value_ignores_estimation_status() {
        assert_eq!(TokenCount::Estimated(42).value(), 42);
        assert_eq!(TokenCount::Exact(42).value(), 42);
    }

    #[test]
    fn is_estimated_tracks_variant() {
        assert!(TokenCount::Estimated(42).is_estimated());
        assert!(!TokenCount::Exact(42).is_estimated());
    }

    #[test]
    fn is_exact_tracks_variant() {
        assert!(TokenCount::Exact(42).is_exact());
        assert!(!TokenCount::Estimated(42).is_exact());
    }

    #[test]
    fn equality_distinguishes_variants() {
        assert_eq!(TokenCount::Estimated(42), TokenCount::Estimated(42));
        assert_eq!(TokenCount::Exact(42), TokenCount::Exact(42));
        assert_ne!(TokenCount::Estimated(42), TokenCount::Exact(42));
    }
}