// token_count/lib.rs
1//! Token counting library for LLM models
2//!
3//! This library provides exact tokenization for various LLM models using their official tokenizers.
4//!
5//! # Features
6//!
7//! - **Exact tokenization** for OpenAI models (GPT-3.5, GPT-4, GPT-4 Turbo, GPT-4o)
8//! - **Model aliases** with case-insensitive matching
9//! - **Fuzzy suggestions** for typos and unknown models
10//! - **Zero runtime dependencies** - all tokenizers embedded
11//! - **Fast and efficient** - ~2.7µs for small inputs
12//!
13//! # Quick Start
14//!
15//! ```
16//! use token_count::count_tokens;
17//!
18//! // Count tokens for a specific model
19//! let result = count_tokens("Hello world", "gpt-4").unwrap();
20//! assert_eq!(result.token_count, 2);
21//! println!("Tokens: {}", result.token_count);
22//! println!("Model: {}", result.model_info.name);
23//! ```
24//!
25//! # Supported Models
26//!
27//! - `gpt-3.5-turbo` - GPT-3.5 Turbo (16K context)
//! - `gpt-4` - GPT-4 (8K context)
29//! - `gpt-4-turbo` - GPT-4 Turbo (128K context)
30//! - `gpt-4o` - GPT-4o (128K context)
31//!
32//! All models support aliases (e.g., `gpt4`, `GPT-4`, `openai/gpt-4`).
33//!
34//! # Error Handling
35//!
36//! ```
37//! use token_count::{count_tokens, TokenError};
38//!
39//! // Unknown model returns an error with suggestions
//! match count_tokens("test", "gpt-5") {
//!     Ok(_) => panic!("Should have failed"),
//!     Err(TokenError::UnknownModel { model, suggestion }) => {
//!         assert_eq!(model, "gpt-5");
//!         assert!(suggestion.contains("Did you mean"));
//!     }
//!     Err(_) => panic!("Wrong error type"),
//! }
//! ```
49//!
50//! # Architecture
51//!
52//! The library is organized into several modules:
53//!
54//! - [`tokenizers`] - Core tokenization engine and model registry
55//! - [`output`] - Output formatting (simple, verbose, debug)
56//! - [`cli`] - Command-line interface components
57//! - [`error`] - Error types and handling
58//!
59//! The main entry point is the [`count_tokens`] function, which takes text and a model name
60//! and returns a [`TokenizationResult`] with the token count and model information.
61
62pub mod cli;
63pub mod error;
64pub mod output;
65pub mod tokenizers;
66
67pub use error::TokenError;
68pub use output::{select_formatter, OutputFormatter};
69pub use tokenizers::{ModelInfo, TokenizationResult, Tokenizer};
70
71/// Count tokens in the given text using the specified model
72///
73/// # Arguments
74///
75/// * `text` - The input text to tokenize
76/// * `model_name` - The name of the model to use (canonical name or alias)
77///
78/// # Returns
79///
80/// Returns a `TokenizationResult` containing the token count and model information
81///
82/// # Errors
83///
84/// Returns an error if:
85/// - The model name is unknown
86/// - Tokenization fails
87///
88/// # Example
89///
90/// ```
91/// use token_count::count_tokens;
92///
93/// let result = count_tokens("Hello world", "gpt-4").unwrap();
94/// assert_eq!(result.token_count, 2);
95/// ```
96pub fn count_tokens(text: &str, model_name: &str) -> Result<TokenizationResult, TokenError> {
97 use tokenizers::registry::ModelRegistry;
98
99 let registry = ModelRegistry::global();
100 let tokenizer = registry.get_tokenizer(model_name)?;
101
102 let token_count =
103 tokenizer.count_tokens(text).map_err(|e| TokenError::Tokenization(e.to_string()))?;
104 let model_info = tokenizer.get_model_info();
105
106 Ok(TokenizationResult { token_count, model_info })
107}