Skip to main content

token_count/tokenizers/
mod.rs

1//! Tokenizer implementations for various LLM models
2//!
3//! This module provides the core tokenization functionality for supported LLM models.
4//!
5//! # Architecture
6//!
7//! The tokenization system uses a trait-based design for extensibility:
8//!
//! - [`Tokenizer`] - Trait for all tokenizer implementations
//! - [`openai::OpenAITokenizer`] - OpenAI model tokenizer using tiktoken
//! - [`claude`] - Tokenizer support for Claude models
//! - [`registry::ModelRegistry`] - Registry of supported models with lazy initialization
12//!
13//! # Example
14//!
15//! ```
16//! use token_count::tokenizers::registry::ModelRegistry;
17//!
18//! // Get the global model registry
19//! let registry = ModelRegistry::global();
20//!
21//! // Get a tokenizer for a specific model
22//! let tokenizer = registry.get_tokenizer("gpt-4", false).unwrap();
23//!
24//! // Count tokens
25//! let count = tokenizer.count_tokens("Hello world").unwrap();
26//! assert_eq!(count, 2);
27//!
28//! // Get model information
29//! let info = tokenizer.get_model_info();
30//! assert_eq!(info.name, "gpt-4");
31//! assert_eq!(info.encoding, "cl100k_base");
32//! ```
33//!
34//! # Supported Models
35//!
36//! Currently supports:
37//! - OpenAI models: GPT-3.5 Turbo, GPT-4, GPT-4 Turbo, GPT-4o
38//! - Claude models: Claude 4.0-4.6 (Opus, Sonnet, Haiku variants)
39//!
40//! See [`registry::ModelRegistry`] for model configuration and aliases.
41
42pub mod claude;
43pub mod openai;
44pub mod registry;
45
46use std::fmt;
47
/// Result of token counting, tagged by whether the number is estimated or exact.
///
/// The [`fmt::Display`] implementation renders estimated counts with a leading
/// `~` and exact counts as a bare number.
///
/// # Examples
///
/// ```
/// use token_count::tokenizers::TokenCount;
///
/// assert_eq!(TokenCount::Estimated(42).to_string(), "~42");
/// assert_eq!(TokenCount::Exact(42).to_string(), "42");
/// assert_eq!(TokenCount::Exact(42).value(), 42);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenCount {
    /// Estimated count using heuristics (displays with ~ prefix)
    Estimated(usize),

    /// Exact count from official API (displays without prefix)
    Exact(usize),
}

impl TokenCount {
    /// Returns the numeric count, regardless of whether it is estimated or exact.
    ///
    /// Usable in `const` contexts.
    #[must_use]
    pub const fn value(&self) -> usize {
        // Both variants carry the same payload type, so one arm covers them;
        // the match stays exhaustive if a variant is ever added.
        match *self {
            Self::Estimated(n) | Self::Exact(n) => n,
        }
    }

    /// Returns `true` if this count is an estimate.
    #[must_use]
    pub const fn is_estimated(&self) -> bool {
        matches!(self, Self::Estimated(_))
    }

    /// Returns `true` if this count is exact.
    #[must_use]
    pub const fn is_exact(&self) -> bool {
        matches!(self, Self::Exact(_))
    }
}

impl fmt::Display for TokenCount {
    /// Writes `~N` for an estimated count and `N` for an exact one.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Estimated(n) => write!(f, "~{}", n),
            Self::Exact(n) => write!(f, "{}", n),
        }
    }
}
85
86/// Trait for tokenizing text with a specific model
87pub trait Tokenizer: Send + Sync {
88    /// Count the number of tokens in the given text
89    fn count_tokens(&self, text: &str) -> anyhow::Result<usize>;
90
91    /// Get information about the model
92    fn get_model_info(&self) -> ModelInfo;
93}
94
/// Information about a tokenization model.
///
/// Formats via [`fmt::Display`] as `name (encoding)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ModelInfo {
    /// Model name, e.g. `"gpt-4"`.
    pub name: String,
    /// Name of the token encoding the model uses, e.g. `"cl100k_base"`.
    pub encoding: String,
    /// Size of the model's context window.
    // NOTE(review): presumably measured in tokens — confirm against the registry.
    pub context_window: usize,
    /// Free-form description of the model.
    pub description: String,
}

impl fmt::Display for ModelInfo {
    /// Writes the model as `name (encoding)`, e.g. `gpt-4 (cl100k_base)`.
    ///
    /// `context_window` and `description` are not part of the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{} ({})", self.name, self.encoding)
    }
}
109
110/// Result of tokenization operation
111#[derive(Debug, Clone)]
112pub struct TokenizationResult {
113    pub token_count: usize,
114    pub model_info: ModelInfo,
115}
116
#[cfg(test)]
mod tests {
    //! Unit tests for [`TokenCount`] and the `Display` output of [`ModelInfo`].

    use super::*;

    /// Arbitrary non-zero count shared by the `TokenCount` tests.
    const SAMPLE: usize = 42;

    #[test]
    fn test_token_count_display_estimated() {
        assert_eq!(TokenCount::Estimated(SAMPLE).to_string(), "~42");
    }

    #[test]
    fn test_token_count_display_exact() {
        assert_eq!(TokenCount::Exact(SAMPLE).to_string(), "42");
    }

    #[test]
    fn test_token_count_display_zero() {
        // Zero is a boundary value: the `~` prefix must still be present for estimates.
        assert_eq!(TokenCount::Estimated(0).to_string(), "~0");
        assert_eq!(TokenCount::Exact(0).to_string(), "0");
    }

    #[test]
    fn test_token_count_value() {
        for count in [TokenCount::Estimated(SAMPLE), TokenCount::Exact(SAMPLE)] {
            assert_eq!(count.value(), SAMPLE);
        }
    }

    #[test]
    fn test_token_count_is_estimated() {
        assert!(TokenCount::Estimated(SAMPLE).is_estimated());
        assert!(!TokenCount::Exact(SAMPLE).is_estimated());
    }

    #[test]
    fn test_token_count_is_exact() {
        assert!(!TokenCount::Estimated(SAMPLE).is_exact());
        assert!(TokenCount::Exact(SAMPLE).is_exact());
    }

    #[test]
    fn test_token_count_equality() {
        let estimated = TokenCount::Estimated(SAMPLE);
        let exact = TokenCount::Exact(SAMPLE);

        assert_eq!(estimated, TokenCount::Estimated(SAMPLE));
        assert_eq!(exact, TokenCount::Exact(SAMPLE));
        // Same number, different variant: must not compare equal.
        assert_ne!(estimated, exact);
    }

    #[test]
    fn test_model_info_display() {
        let info = ModelInfo {
            name: "gpt-4".to_string(),
            encoding: "cl100k_base".to_string(),
            context_window: 8192,
            description: "test model".to_string(),
        };
        // Only the name and encoding appear in the rendered form.
        assert_eq!(info.to_string(), "gpt-4 (cl100k_base)");
    }
}
153        assert_eq!(TokenCount::Estimated(42), TokenCount::Estimated(42));
154        assert_eq!(TokenCount::Exact(42), TokenCount::Exact(42));
155        assert_ne!(TokenCount::Estimated(42), TokenCount::Exact(42));
156    }
157}