token_count/tokenizers/mod.rs
1//! Tokenizer implementations for various LLM models
2//!
3//! This module provides the core tokenization functionality for supported LLM models.
4//!
5//! # Architecture
6//!
7//! The tokenization system uses a trait-based design for extensibility:
8//!
9//! - [`Tokenizer`] - Trait for all tokenizer implementations
10//! - [`openai::OpenAITokenizer`] - OpenAI model tokenizer using tiktoken
11//! - [`registry::ModelRegistry`] - Registry of supported models with lazy initialization
12//!
13//! # Example
14//!
15//! ```
16//! use token_count::tokenizers::registry::ModelRegistry;
17//!
18//! // Get the global model registry
19//! let registry = ModelRegistry::global();
20//!
21//! // Get a tokenizer for a specific model
22//! let tokenizer = registry.get_tokenizer("gpt-4").unwrap();
23//!
24//! // Count tokens
25//! let count = tokenizer.count_tokens("Hello world").unwrap();
26//! assert_eq!(count, 2);
27//!
28//! // Get model information
29//! let info = tokenizer.get_model_info();
30//! assert_eq!(info.name, "gpt-4");
31//! assert_eq!(info.encoding, "cl100k_base");
32//! ```
33//!
34//! # Supported Models
35//!
36//! Currently supports OpenAI models:
37//! - GPT-3.5 Turbo (cl100k_base encoding)
38//! - GPT-4 (cl100k_base encoding)
39//! - GPT-4 Turbo (cl100k_base encoding)
40//! - GPT-4o (o200k_base encoding)
41//!
42//! See [`registry::ModelRegistry`] for model configuration and aliases.
43
44pub mod openai;
45pub mod registry;
46
47use std::fmt;
48
49/// Trait for tokenizing text with a specific model
50pub trait Tokenizer: Send + Sync {
51 /// Count the number of tokens in the given text
52 fn count_tokens(&self, text: &str) -> anyhow::Result<usize>;
53
54 /// Get information about the model
55 fn get_model_info(&self) -> ModelInfo;
56}
57
/// Information about a tokenization model
///
/// Derives `PartialEq`, `Eq` and `Hash` in addition to `Debug` and `Clone`, so values
/// can be compared directly (for example with `assert_eq!`) and stored in hashed
/// collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ModelInfo {
    /// Model name, e.g. `"gpt-4"`.
    pub name: String,
    /// Name of the token encoding used by the model, e.g. `"cl100k_base"`.
    pub encoding: String,
    /// Size of the model's context window (presumably in tokens; confirm against the
    /// registry that populates this value).
    pub context_window: usize,
    /// Descriptive text about the model.
    pub description: String,
}
66
67impl fmt::Display for ModelInfo {
68 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
69 write!(f, "{} ({})", self.name, self.encoding)
70 }
71}
72
73/// Result of tokenization operation
74#[derive(Debug, Clone)]
75pub struct TokenizationResult {
76 pub token_count: usize,
77 pub model_info: ModelInfo,
78}