// token_count/lib.rs
//! Token counting library for LLM models
//!
//! This library provides exact tokenization for various LLM models using their official tokenizers.
//!
//! # Features
//!
//! - **Exact tokenization** for OpenAI models (GPT-3.5, GPT-4, GPT-4 Turbo, GPT-4o)
//! - **Model aliases** with case-insensitive matching
//! - **Fuzzy suggestions** for typos and unknown models
//! - **Zero runtime dependencies** - all tokenizers embedded
//! - **Fast and efficient** - ~2.7µs for small inputs
//!
//! # Quick Start
//!
//! ```
//! use token_count::count_tokens;
//!
//! // Count tokens for a specific model
//! let result = count_tokens("Hello world", "gpt-4", false, 0).unwrap();
//! assert_eq!(result.token_count, 2);
//! println!("Tokens: {}", result.token_count);
//! println!("Model: {}", result.model_info.name);
//! ```
//!
//! # Supported Models
//!
//! - `gpt-3.5-turbo` - GPT-3.5 Turbo (16K context)
//! - `gpt-4` - GPT-4 (128K context)
//! - `gpt-4-turbo` - GPT-4 Turbo (128K context)
//! - `gpt-4o` - GPT-4o (128K context)
//!
//! All models support aliases (e.g., `gpt4`, `GPT-4`, `openai/gpt-4`).
//!
//! # Error Handling
//!
//! ```
//! use token_count::{count_tokens, TokenError};
//!
//! // Unknown model returns an error with suggestions
//! match count_tokens("test", "gpt-5", false, 0) {
//!     Ok(_) => panic!("Should have failed"),
//!     Err(TokenError::UnknownModel { model, suggestion }) => {
//!         assert_eq!(model, "gpt-5");
//!         assert!(suggestion.contains("Did you mean"));
//!     }
//!     Err(_) => panic!("Wrong error type"),
//! }
//! ```
//!
//! # Architecture
//!
//! The library is organized into several modules:
//!
//! - [`tokenizers`] - Core tokenization engine and model registry
//! - [`output`] - Output formatting (simple, verbose, debug)
//! - [`cli`] - Command-line interface components
//! - [`error`] - Error types and handling
//! - [`api`] - API integration utilities (consent prompts, etc.)
//!
//! The main entry point is the [`count_tokens`] function, which takes text and a model name
//! and returns a [`TokenizationResult`] with the token count and model information.

63pub mod api;
64pub mod cli;
65pub mod error;
66pub mod output;
67pub mod tokenizers;
68
69pub use error::TokenError;
70pub use output::{select_formatter, OutputFormatter};
71pub use tokenizers::{ModelInfo, TokenizationResult, Tokenizer};
72
73/// Count tokens in the given text using the specified model
74///
75/// # Arguments
76///
77/// * `text` - The text to tokenize
78/// * `model_name` - The model to use for tokenization (e.g., "gpt-4", "claude-sonnet-4-6")
79/// * `accurate` - Whether to use accurate mode for models that support it (Claude API)
80/// * `verbosity` - Verbosity level (3+ includes detailed token information for debug mode)
81///
82/// # Returns
83///
84/// A `Result` containing the token count, model information, and optionally token details
85///
86/// # Errors
87///
88/// Returns `TokenError` if:
89/// - The model is not supported
90/// - The tokenizer fails to initialize
91/// - Token counting fails
92///
93/// # Example
94///
95/// ```
96/// use token_count::count_tokens;
97///
98/// let result = count_tokens("Hello world", "gpt-4", false, 0).unwrap();
99/// assert_eq!(result.token_count, 2);
100/// ```
101pub fn count_tokens(
102 text: &str,
103 model_name: &str,
104 accurate: bool,
105 verbosity: u8,
106) -> Result<TokenizationResult, TokenError> {
107 use tokenizers::registry::ModelRegistry;
108
109 let registry = ModelRegistry::global();
110 let tokenizer = registry.get_tokenizer(model_name, accurate)?;
111
112 let token_count =
113 tokenizer.count_tokens(text).map_err(|e| TokenError::Tokenization(e.to_string()))?;
114 let model_info = tokenizer.get_model_info();
115
116 // For debug mode (verbosity >= 3), get detailed token information
117 let token_details =
118 if verbosity >= 3 { tokenizer.encode_with_details(text).ok().flatten() } else { None };
119
120 Ok(TokenizationResult { token_count, model_info, token_details })
121}