Skip to main content

token_count/tokenizers/
mod.rs

1//! Tokenizer implementations for various LLM models
2//!
3//! This module provides the core tokenization functionality for supported LLM models.
4//!
5//! # Architecture
6//!
7//! The tokenization system uses a trait-based design for extensibility:
8//!
//! - [`Tokenizer`] - Trait for all tokenizer implementations
//! - [`openai::OpenAITokenizer`] - OpenAI model tokenizer using tiktoken
//! - [`claude`], [`google`] - Additional provider modules declared by this file
//! - [`registry::ModelRegistry`] - Registry of supported models with lazy initialization
12//!
13//! # Example
14//!
15//! ```
16//! use token_count::tokenizers::registry::ModelRegistry;
17//!
18//! // Get the global model registry
19//! let registry = ModelRegistry::global();
20//!
21//! // Get a tokenizer for a specific model
22//! let tokenizer = registry.get_tokenizer("gpt-4", false).unwrap();
23//!
24//! // Count tokens
25//! let count = tokenizer.count_tokens("Hello world").unwrap();
26//! assert_eq!(count, 2);
27//!
28//! // Get model information
29//! let info = tokenizer.get_model_info();
30//! assert_eq!(info.name, "gpt-4");
31//! assert_eq!(info.encoding, "cl100k_base");
32//! ```
33//!
34//! # Supported Models
35//!
36//! Currently supports:
37//! - OpenAI models: GPT-3.5 Turbo, GPT-4, GPT-4 Turbo, GPT-4o
38//! - Claude models: Claude 4.0-4.6 (Opus, Sonnet, Haiku variants)
39//!
40//! See [`registry::ModelRegistry`] for model configuration and aliases.
41
42pub mod claude;
43pub mod google;
44pub mod openai;
45pub mod registry;
46
47use std::fmt;
48
/// A token count tagged with how it was obtained.
///
/// The [`fmt::Display`] output distinguishes the two variants: an estimate is
/// rendered with a leading `~` (for example `~42`), an exact count as the bare
/// number (`42`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCount {
    /// Count estimated using heuristics; displayed with a `~` prefix.
    Estimated(usize),

    /// Exact count from the official API; displayed without a prefix.
    Exact(usize),
}

impl TokenCount {
    /// Returns the wrapped number, whichever variant carries it.
    pub fn value(&self) -> usize {
        // Both variants hold a single `usize`, so one irrefutable or-pattern
        // covers the whole enum.
        let (Self::Estimated(count) | Self::Exact(count)) = *self;
        count
    }

    /// Returns `true` if this is an [`Estimated`](Self::Estimated) count.
    pub fn is_estimated(&self) -> bool {
        matches!(*self, Self::Estimated(..))
    }

    /// Returns `true` if this is an [`Exact`](Self::Exact) count.
    pub fn is_exact(&self) -> bool {
        matches!(*self, Self::Exact(..))
    }
}

impl fmt::Display for TokenCount {
    /// Writes the number, preceded by `~` when the count is an estimate.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let prefix = if self.is_estimated() { "~" } else { "" };
        write!(f, "{}{}", prefix, self.value())
    }
}
86
87/// Trait for tokenizing text with a specific model
88pub trait Tokenizer: Send + Sync {
89    /// Count the number of tokens in the given text
90    fn count_tokens(&self, text: &str) -> anyhow::Result<usize>;
91
92    /// Get information about the model
93    fn get_model_info(&self) -> ModelInfo;
94}
95
/// Descriptive metadata for a tokenization model.
///
/// The [`fmt::Display`] form is `name (encoding)`.
#[derive(Debug, Clone)]
pub struct ModelInfo {
    /// Model name (the module-level example uses `"gpt-4"`).
    pub name: String,
    /// Name of the token encoding (the module-level example uses `"cl100k_base"`).
    pub encoding: String,
    /// Size of the model's context window (presumably in tokens; confirm
    /// against the registry that populates it).
    pub context_window: usize,
    /// Free-form description of the model.
    pub description: String,
}

impl fmt::Display for ModelInfo {
    /// Writes the model name followed by its encoding in parentheses.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only these two fields take part in the rendered form.
        let Self { name, encoding, .. } = self;
        write!(f, "{} ({})", name, encoding)
    }
}
110
111/// Result of tokenization operation
112#[derive(Debug, Clone)]
113pub struct TokenizationResult {
114    pub token_count: usize,
115    pub model_info: ModelInfo,
116}
117
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample count shared by every case; the particular number is arbitrary.
    const N: usize = 42;

    #[test]
    fn test_token_count_display_estimated() {
        // Estimates render with a leading tilde.
        assert_eq!(TokenCount::Estimated(N).to_string(), "~42");
    }

    #[test]
    fn test_token_count_display_exact() {
        // Exact counts render as the bare number.
        assert_eq!(TokenCount::Exact(N).to_string(), "42");
    }

    #[test]
    fn test_token_count_value() {
        let estimated = TokenCount::Estimated(N);
        let exact = TokenCount::Exact(N);
        assert_eq!(estimated.value(), N);
        assert_eq!(exact.value(), N);
    }

    #[test]
    fn test_token_count_is_estimated() {
        let estimated = TokenCount::Estimated(N);
        let exact = TokenCount::Exact(N);
        assert!(estimated.is_estimated());
        assert!(!exact.is_estimated());
    }

    #[test]
    fn test_token_count_is_exact() {
        let estimated = TokenCount::Estimated(N);
        let exact = TokenCount::Exact(N);
        assert!(!estimated.is_exact());
        assert!(exact.is_exact());
    }

    #[test]
    fn test_token_count_equality() {
        let estimated = TokenCount::Estimated(N);
        let exact = TokenCount::Exact(N);
        // Same variant and payload compare equal.
        assert_eq!(estimated, TokenCount::Estimated(N));
        assert_eq!(exact, TokenCount::Exact(N));
        // Same payload in different variants does not.
        assert_ne!(estimated, exact);
    }
}