Skip to main content

pithy_core/
tokenizers.rs

//! Tokenizer registry and `Measurer` implementation.
//!
//! Wraps `tiktoken-rs` for cl100k_base and o200k_base. HuggingFace
//! tokenizers (Llama-3, Qwen-3, Mistral) are deferred to a follow-up PR
//! because the crate does not yet ship embedded tokenizer fixtures.
//! Until then, calling `tokenize` with an unmapped `Model` returns
//! `TokenizerError::NotRegistered` instead of silently approximating
//! (per the DoR "no hardcoded ratios" rule).
9
10use std::collections::HashMap;
11use std::sync::Arc;
12
13use tiktoken_rs::{cl100k_base, o200k_base, CoreBPE};
14
15use crate::interfaces::{Measurer, Model, TokenizerError};
16
/// Identifies one of the tokenizers this crate can register.
///
/// The variant set is part of the replay contract: it must stay stable
/// across releases so that the measurement-record `pricing_snapshot_id`
/// and dialect-dispatch replay keep reproducing byte-for-byte.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum TokenizerId {
    /// The OpenAI `cl100k_base` encoding (GPT-4, GPT-3.5-turbo).
    Cl100kBase,
    /// The OpenAI `o200k_base` encoding (GPT-4o; also used as the GPT-5
    /// approximation).
    O200kBase,
}
28
29impl TokenizerId {
30    /// Canonical string identifier.
31    #[must_use]
32    pub const fn name(self) -> &'static str {
33        match self {
34            Self::Cl100kBase => "cl100k_base",
35            Self::O200kBase => "o200k_base",
36        }
37    }
38}
39
40/// Registry of tokenizers, keyed by a registry-internal identifier.
41///
42/// Shared via `Arc<CoreBPE>` so clones are cheap and concurrent callers
43/// never block each other during tokenization.
44#[derive(Clone, Default)]
45pub struct TokenizerRegistry {
46    cl100k: Option<Arc<CoreBPE>>,
47    o200k: Option<Arc<CoreBPE>>,
48    model_map: HashMap<String, TokenizerId>,
49}
50
51impl TokenizerRegistry {
52    /// Pre-register cl100k and o200k, and map every `Model` variant whose
53    /// tokenizer is empirically known to this registry per F-Gram findings.
54    ///
55    /// # Errors
56    /// Returns `TokenizerError::Library` if tiktoken-rs fails to load a
57    /// tokenizer (should only happen on packaging error).
58    pub fn with_defaults() -> Result<Self, TokenizerError> {
59        let cl = cl100k_base().map_err(|e| TokenizerError::Library(e.to_string()))?;
60        let oo = o200k_base().map_err(|e| TokenizerError::Library(e.to_string()))?;
61        let mut m = HashMap::new();
62        // Legacy BPE generation -> cl100k_base (per F-Gram)
63        m.insert("ClaudeOpus47".into(), TokenizerId::Cl100kBase);
64        m.insert("ClaudeSonnet47".into(), TokenizerId::Cl100kBase);
65        m.insert("ClaudeHaiku47".into(), TokenizerId::Cl100kBase);
66        m.insert("Gpt4".into(), TokenizerId::Cl100kBase);
67        m.insert("Gpt4o".into(), TokenizerId::O200kBase);
68        m.insert("Gpt5".into(), TokenizerId::O200kBase);
69        Ok(Self {
70            cl100k: Some(Arc::new(cl)),
71            o200k: Some(Arc::new(oo)),
72            model_map: m,
73        })
74    }
75
76    /// Register an additional Model-to-tokenizer mapping.
77    pub fn register(&mut self, model_key: &str, id: TokenizerId) {
78        self.model_map.insert(model_key.to_owned(), id);
79    }
80
81    pub(crate) fn tokenizer_for(&self, id: TokenizerId) -> Option<&CoreBPE> {
82        match id {
83            TokenizerId::Cl100kBase => self.cl100k.as_deref(),
84            TokenizerId::O200kBase => self.o200k.as_deref(),
85        }
86    }
87
88    fn model_key(m: &Model) -> Option<String> {
89        Some(match m {
90            Model::ClaudeOpus47 => "ClaudeOpus47".into(),
91            Model::ClaudeSonnet47 => "ClaudeSonnet47".into(),
92            Model::ClaudeHaiku47 => "ClaudeHaiku47".into(),
93            Model::Gpt4 => "Gpt4".into(),
94            Model::Gpt4o => "Gpt4o".into(),
95            Model::Gpt5 => "Gpt5".into(),
96            // Llama3Custom, Qwen3Custom, Gemini25*, Grok4, Registered(_):
97            // no local tiktoken tokenizer. Return None -> NotRegistered.
98            _ => return None,
99        })
100    }
101}
102
103/// Adapter that implements `Measurer` over the registry.
104#[derive(Clone)]
105pub struct LocalMeasurer {
106    registry: TokenizerRegistry,
107}
108
109impl LocalMeasurer {
110    /// Build from an explicit registry.
111    #[must_use]
112    pub fn new(registry: TokenizerRegistry) -> Self {
113        Self { registry }
114    }
115
116    /// Build with the default tokenizer set.
117    ///
118    /// # Errors
119    /// Same as `TokenizerRegistry::with_defaults`.
120    pub fn with_defaults() -> Result<Self, TokenizerError> {
121        Ok(Self {
122            registry: TokenizerRegistry::with_defaults()?,
123        })
124    }
125
126    /// Tokenize the given text against EVERY registered tokenizer
127    /// rather than the model-specific one. Returns `(name, count)`
128    /// tuples in stable order so cross-tokenizer audit rows replay.
129    ///
130    /// This is the telemetry that powers tokenizer-resilient
131    /// closed-loop tuning: when an upstream provider ships a new
132    /// tokenizer, the per-tokenizer savings stop matching the
133    /// model-specific savings and the auto-tuner sees the drift
134    /// before users do.
135    #[must_use]
136    pub fn cross_tokenize(&self, text: &str) -> Vec<(String, u32)> {
137        let mut out = Vec::with_capacity(2);
138        for id in [TokenizerId::Cl100kBase, TokenizerId::O200kBase] {
139            if let Some(tk) = self.registry.tokenizer_for(id) {
140                let n =
141                    u32::try_from(tk.encode_with_special_tokens(text).len()).unwrap_or(u32::MAX);
142                out.push((id.name().to_owned(), n));
143            }
144        }
145        out
146    }
147}
148
149impl Measurer for LocalMeasurer {
150    fn tokenize(&self, text: &str, model: &Model) -> Result<u32, TokenizerError> {
151        let key = TokenizerRegistry::model_key(model)
152            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
153        let id = self
154            .registry
155            .model_map
156            .get(&key)
157            .copied()
158            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
159        let tk = self
160            .registry
161            .tokenizer_for(id)
162            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
163        Ok(u32::try_from(tk.encode_with_special_tokens(text).len()).unwrap_or(u32::MAX))
164    }
165
166    fn supported(&self, model: &Model) -> bool {
167        TokenizerRegistry::model_key(model)
168            .and_then(|k| self.registry.model_map.get(&k).copied())
169            .and_then(|id| self.registry.tokenizer_for(id))
170            .is_some()
171    }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Measurer over the default registry; panics if the encoders fail to load.
    fn m() -> LocalMeasurer {
        LocalMeasurer::with_defaults().expect("defaults load")
    }

    #[test]
    fn empty_string_is_zero_tokens() {
        let n = m().tokenize("", &Model::Gpt4).expect("ok");
        assert_eq!(n, 0);
    }

    #[test]
    fn ascii_hello_world_tokenizes() {
        let n = m().tokenize("hello world", &Model::Gpt4).expect("ok");
        assert!(n > 0 && n < 10);
    }

    #[test]
    fn cl100k_and_o200k_agree_on_plain_ascii_count() {
        // For short ASCII, cl100k and o200k commonly agree; allow a small
        // slack of 2 tokens so vocabulary differences do not fail the test.
        let mm = m();
        let a = mm.tokenize("hello world", &Model::Gpt4).expect("ok");
        let b = mm.tokenize("hello world", &Model::Gpt4o).expect("ok");
        assert!(a.abs_diff(b) <= 2, "cl100k={a} o200k={b} differ too much");
    }

    #[test]
    fn multibyte_unicode_does_not_panic() {
        let n = m()
            .tokenize("中文 + English mix 🚀", &Model::Gpt4o)
            .expect("ok");
        assert!(n > 0);
    }

    #[test]
    fn long_string_tokenizes() {
        let s = "abcdef ".repeat(1000);
        let n = m().tokenize(&s, &Model::Gpt4).expect("ok");
        assert!(n > 500);
    }

    #[test]
    fn unregistered_model_returns_error_not_estimation() {
        let err = m()
            .tokenize("anything", &Model::Gemini25Pro)
            .expect_err("must refuse");
        match err {
            TokenizerError::NotRegistered(_) => {}
            other => panic!("expected NotRegistered, got {other:?}"),
        }
    }

    #[test]
    fn supported_true_iff_tokenize_ok() {
        let mm = m();
        // Every variant that `with_defaults` maps must be covered here.
        for model in [
            Model::ClaudeOpus47,
            Model::ClaudeSonnet47,
            Model::ClaudeHaiku47,
            Model::Gpt4,
            Model::Gpt4o,
            Model::Gpt5,
        ] {
            assert!(mm.supported(&model), "{model:?} should be supported");
            mm.tokenize("ok", &model).expect("ok");
        }
        for model in [
            Model::Gemini25Pro,
            Model::Gemini25Ultra,
            Model::Grok4,
            Model::Registered("custom".into()),
            Model::Llama3Custom("x".into()),
            Model::Qwen3Custom("y".into()),
        ] {
            assert!(!mm.supported(&model), "{model:?} must be unsupported");
            assert!(mm.tokenize("ok", &model).is_err());
        }
    }

    #[test]
    fn tokenizer_id_names_are_canonical() {
        assert_eq!(TokenizerId::Cl100kBase.name(), "cl100k_base");
        assert_eq!(TokenizerId::O200kBase.name(), "o200k_base");
    }

    #[test]
    fn cross_tokenize_matches_model_specific_counts_in_stable_order() {
        let mm = m();
        let text = "中文 + English mix 🚀";
        // Gpt4 -> cl100k_base and Gpt4o -> o200k_base in the default table,
        // so the audit rows must equal the model-specific counts, in order.
        let expected = vec![
            (
                "cl100k_base".to_owned(),
                mm.tokenize(text, &Model::Gpt4).expect("ok"),
            ),
            (
                "o200k_base".to_owned(),
                mm.tokenize(text, &Model::Gpt4o).expect("ok"),
            ),
        ];
        assert_eq!(mm.cross_tokenize(text), expected);
    }

    #[test]
    fn register_repoints_a_builtin_model() {
        let mut registry = TokenizerRegistry::with_defaults().expect("defaults load");
        registry.register("Gpt4", TokenizerId::O200kBase);
        let mm = LocalMeasurer::new(registry);

        // Both models now resolve to the same encoder, so counts must match.
        let text = "中文 + English mix 🚀";
        let via_gpt4 = mm.tokenize(text, &Model::Gpt4).expect("ok");
        let via_gpt4o = mm.tokenize(text, &Model::Gpt4o).expect("ok");
        assert_eq!(via_gpt4, via_gpt4o);
    }

    #[test]
    fn empty_registry_supports_nothing() {
        let mm = LocalMeasurer::new(TokenizerRegistry::default());
        assert!(!mm.supported(&Model::Gpt4));
        assert!(mm.cross_tokenize("hello world").is_empty());
        match mm.tokenize("hello world", &Model::Gpt4) {
            Err(TokenizerError::NotRegistered(_)) => {}
            other => panic!("expected NotRegistered, got {other:?}"),
        }
    }

    /// DoD §10 perf evidence: tokenize latency on a ~4.5kB ASCII payload.
    ///
    /// Runs 200 iterations against both cl100k and o200k and prints
    /// p50/p95/p99. The §10 target is p95 < 5ms on release builds and is
    /// tracked separately; this test also has to pass on debug builds, so it
    /// asserts a relaxed 50ms p95 ceiling that still catches gross
    /// regressions.
    #[test]
    fn tokenize_latency_meets_dod_section_10() {
        use std::time::Instant;

        const ITERATIONS: usize = 200;
        const DEBUG_P95_CEILING_US: u128 = 50_000;

        let mm = m();
        let payload = "lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(80);
        for (label, model) in [("cl100k", Model::Gpt4), ("o200k", Model::Gpt4o)] {
            let mut samples = Vec::with_capacity(ITERATIONS);
            for _ in 0..ITERATIONS {
                let t = Instant::now();
                mm.tokenize(&payload, &model).expect("tokenize ok");
                samples.push(t.elapsed().as_micros());
            }
            samples.sort_unstable();

            // Index derived from the sample count so changing ITERATIONS
            // cannot silently pick the wrong rank or go out of bounds.
            let percentile =
                |pct: usize| samples[(samples.len() * pct / 100).min(samples.len() - 1)];
            let p50 = percentile(50);
            let p95 = percentile(95);
            let p99 = percentile(99);
            eprintln!(
                "[{label}] tokenize {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
                payload.len()
            );
            assert!(
                p95 < DEBUG_P95_CEILING_US,
                "{label} p95 {p95}us breaches debug ceiling"
            );
        }
    }
}