char_token_est/lib.rs
1//! # char-token-est
2//!
3//! Estimate token counts from raw text without invoking a BPE tokenizer.
4//!
5//! Real tokenization is fast but pulls in tens of MB of vocab data. For
6//! routing, budget gating, log lines, and progress bars you can get
//! within ~10% of the true count with a per-model-family chars-per-token constant
8//! and no dependencies.
9//!
10//! ## Example
11//!
12//! ```
13//! use char_token_est::{estimate, Family};
14//! let text = "The quick brown fox jumps over the lazy dog.";
15//! let n = estimate(text, Family::Gpt);
16//! assert!(n >= 9 && n <= 14, "got {n}");
17//! ```
18//!
19//! ## Calibration
20//!
21//! Constants are derived from average chars-per-token over a multilingual
22//! corpus of typical prompts (English + code + JSON). Pure-code or
//! non-Latin inputs deviate further; call [`estimate_with_ratio`] to
//! supply your own ratio.
25//!
26//! | Family | chars/token |
27//! | --- | --- |
//! | `Gpt` (GPT-4/5, o3/o4; cl100k\_base, o200k\_base) | 4.0 |
29//! | `Claude` | 3.5 |
30//! | `Gemini` | 4.0 |
//! | `Llama` (Llama 3, tiktoken-based 128k vocab) | 3.7 |
32//! | `Cohere` | 3.8 |
33
34#![deny(missing_docs)]
35
/// Tokenizer family that selects which chars-per-token constant applies.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Family {
    /// OpenAI GPT-4 / GPT-5 / o3 / o4 models (cl100k_base, o200k_base).
    Gpt,
    /// Anthropic Claude models.
    Claude,
    /// Google Gemini models.
    Gemini,
    /// Meta Llama 3 models.
    Llama,
    /// Cohere Command R / R+ models.
    Cohere,
}

impl Family {
    /// Average number of characters per token assumed for this family.
    pub fn chars_per_token(self) -> f64 {
        match self {
            Family::Gpt | Family::Gemini => 4.0,
            Family::Cohere => 3.8,
            Family::Llama => 3.7,
            Family::Claude => 3.5,
        }
    }

    /// Infer the family from a model identifier.
    ///
    /// The id is ASCII-lowercased and searched for known substrings, so the
    /// match is case-insensitive and tolerates vendor prefixes or version
    /// suffixes. Ids that contain none of the known substrings map to
    /// [`Family::Gpt`].
    pub fn guess_from_model_id(id: &str) -> Self {
        // Checked top to bottom; the first needle found in the id wins.
        const NEEDLES: [(&str, Family); 5] = [
            ("claude", Family::Claude),
            ("gemini", Family::Gemini),
            ("llama", Family::Llama),
            ("cohere", Family::Cohere),
            ("command-r", Family::Cohere),
        ];

        let lowered = id.to_ascii_lowercase();
        NEEDLES
            .iter()
            .copied()
            .find(|&(needle, _)| lowered.contains(needle))
            .map_or(Family::Gpt, |(_, family)| family)
    }
}
80
81/// Estimate token count for `text` using the family's chars-per-token.
82pub fn estimate(text: &str, family: Family) -> u64 {
83 estimate_with_ratio(text, family.chars_per_token())
84}
85
/// Estimate token count using a caller-supplied chars-per-token ratio.
///
/// The estimate is the number of `char`s (Unicode scalar values) in `text`
/// divided by `chars_per_token`, rounded up.
///
/// Returns 0 for empty `text` and at least 1 otherwise.
///
/// A ratio that is zero, negative, or NaN cannot describe a tokenizer.
/// Dividing by such a value would produce `u64::MAX` or a meaningless `1`,
/// so those ratios are replaced by a fallback of 4.0 chars per token.
pub fn estimate_with_ratio(text: &str, chars_per_token: f64) -> u64 {
    /// Ratio used when the caller-supplied one is not a positive number.
    const FALLBACK_CHARS_PER_TOKEN: f64 = 4.0;

    if text.is_empty() {
        return 0;
    }

    // `> 0.0` is false for NaN, zero, and negatives, so one comparison
    // rejects every ratio that cannot be divided by meaningfully.
    let ratio = if chars_per_token > 0.0 {
        chars_per_token
    } else {
        FALLBACK_CHARS_PER_TOKEN
    };

    let chars = text.chars().count() as f64;
    // Float-to-integer `as` casts saturate, so an enormous quotient clamps to
    // `u64::MAX` instead of wrapping.
    let est = (chars / ratio).ceil() as u64;
    est.max(1)
}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input must produce a zero estimate.
    #[test]
    fn empty_string_is_zero() {
        let tokens = estimate("", Family::Gpt);
        assert_eq!(tokens, 0);
    }

    /// One character at 4.0 chars-per-token is 0.25 tokens, which rounds up to 1.
    #[test]
    fn ratio_picks_floor_of_one() {
        let tokens = estimate("a", Family::Gpt);
        assert_eq!(tokens, 1);
    }

    /// Known model ids map to their family; anything else falls back to `Gpt`.
    #[test]
    fn family_guess_works() {
        let cases = [
            ("claude-sonnet-4-5", Family::Claude),
            ("meta.llama3-70b", Family::Llama),
            ("gemini-2.5-pro", Family::Gemini),
            ("cohere.command-r-plus", Family::Cohere),
            ("gpt-5", Family::Gpt),
            ("something-else", Family::Gpt),
        ];

        for (model_id, expected) in cases {
            assert_eq!(Family::guess_from_model_id(model_id), expected);
        }
    }
}
134}