Skip to main content

char_token_est/
lib.rs

1//! # char-token-est
2//!
3//! Estimate token counts from raw text without invoking a BPE tokenizer.
4//!
5//! Real tokenization is fast but pulls in tens of MB of vocab data. For
6//! routing, budget gating, log lines, and progress bars you can get
7//! within ~10% accuracy with a per-model-family chars-per-token constant
8//! and no dependencies.
9//!
10//! ## Example
11//!
12//! ```
13//! use char_token_est::{estimate, Family};
14//! let text = "The quick brown fox jumps over the lazy dog.";
15//! let n = estimate(text, Family::Gpt);
16//! assert!(n >= 9 && n <= 14, "got {n}");
17//! ```
18//!
19//! ## Calibration
20//!
21//! Constants are derived from average chars-per-token over a multilingual
22//! corpus of typical prompts (English + code + JSON). Pure-code or
//! non-Latin inputs deviate further; call [`estimate_with_ratio`] to
//! supply your own ratio.
25//!
26//! | Family | chars/token |
27//! | --- | --- |
//! | `Gpt` (GPT-4/5, o3/o4; cl100k\_base, o200k\_base) | 4.0 |
29//! | `Claude` | 3.5 |
30//! | `Gemini` | 4.0 |
31//! | `Llama` (Llama 3 tiktoken-32k) | 3.7 |
32//! | `Cohere` | 3.8 |
33
34#![deny(missing_docs)]
35
/// Tokenizer family that selects which chars-per-token ratio an estimate uses.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Family {
    /// GPT-4 / GPT-5 / o3 / o4 (cl100k_base, o200k_base).
    Gpt,
    /// Anthropic Claude.
    Claude,
    /// Google Gemini.
    Gemini,
    /// Meta Llama 3.
    Llama,
    /// Cohere Command R / R+.
    Cohere,
}

impl Family {
    /// Returns the average number of characters per token assumed for this
    /// family.
    pub fn chars_per_token(self) -> f64 {
        match self {
            Family::Gpt | Family::Gemini => 4.0,
            Family::Claude => 3.5,
            Family::Llama => 3.7,
            Family::Cohere => 3.8,
        }
    }

    /// Infers the family from a model identifier.
    ///
    /// The id is ASCII-lowercased and searched for a known marker
    /// substring (`claude`, `gemini`, `llama`, `cohere`, `command-r`).
    /// Markers are tried in that order and the first hit wins; an id that
    /// contains none of them yields [`Family::Gpt`].
    pub fn guess_from_model_id(id: &str) -> Self {
        // Ordered lookup table: earlier entries take precedence when an id
        // contains more than one marker.
        const MARKERS: [(&str, Family); 5] = [
            ("claude", Family::Claude),
            ("gemini", Family::Gemini),
            ("llama", Family::Llama),
            ("cohere", Family::Cohere),
            ("command-r", Family::Cohere),
        ];

        let lowered = id.to_ascii_lowercase();
        MARKERS
            .iter()
            .find(|(marker, _)| lowered.contains(*marker))
            .map_or(Family::Gpt, |&(_, family)| family)
    }
}
80
81/// Estimate token count for `text` using the family's chars-per-token.
82pub fn estimate(text: &str, family: Family) -> u64 {
83    estimate_with_ratio(text, family.chars_per_token())
84}
85
/// Estimate token count using a caller-supplied chars-per-token ratio.
///
/// The estimate is the number of Unicode scalar values in `text` divided
/// by `chars_per_token`, rounded up.
///
/// Returns 0 for empty `text` and at least 1 for any non-empty `text`.
///
/// A `chars_per_token` that is not a positive number (zero, negative, or
/// NaN) cannot produce a meaningful quotient. In that case every character
/// is counted as one token, which is a deliberately pessimistic estimate,
/// rather than returning an arbitrary value.
pub fn estimate_with_ratio(text: &str, chars_per_token: f64) -> u64 {
    if text.is_empty() {
        return 0;
    }
    // NaN fails every comparison, so `> 0.0` rejects NaN, zero and
    // negative ratios in a single test.
    let ratio = if chars_per_token > 0.0 {
        chars_per_token
    } else {
        1.0
    };
    let chars = text.chars().count() as f64;
    // Float-to-integer `as` casts saturate, so an enormous quotient (tiny
    // positive ratio) clamps to `u64::MAX` instead of wrapping.
    let est = (chars / ratio).ceil() as u64;
    // An infinite ratio gives a quotient of 0; non-empty text still costs
    // at least one token.
    est.max(1)
}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input is never charged any tokens.
    #[test]
    fn empty_string_is_zero() {
        let tokens = estimate("", Family::Gpt);
        assert_eq!(tokens, 0);
    }

    /// One char at 4 chars-per-token is 0.25, which rounds up to one token.
    #[test]
    fn ratio_picks_floor_of_one() {
        let tokens = estimate("a", Family::Gpt);
        assert_eq!(tokens, 1);
    }

    /// Known markers map to their family; anything else falls back to `Gpt`.
    #[test]
    fn family_guess_works() {
        let cases = [
            ("claude-sonnet-4-5", Family::Claude),
            ("meta.llama3-70b", Family::Llama),
            ("gemini-2.5-pro", Family::Gemini),
            ("cohere.command-r-plus", Family::Cohere),
            ("gpt-5", Family::Gpt),
            ("something-else", Family::Gpt),
        ];

        for (model_id, expected) in cases {
            assert_eq!(
                Family::guess_from_model_id(model_id),
                expected,
                "model id: {model_id}"
            );
        }
    }
}