1use std::collections::HashMap;
11use std::sync::Arc;
12
13use tiktoken_rs::{cl100k_base, o200k_base, CoreBPE};
14
15use crate::interfaces::{Measurer, Model, TokenizerError};
16
/// Identifies one of the BPE encodings this module knows how to load.
///
/// The type is a plain `Copy` tag, so it can be stored as a map value and
/// passed by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenizerId {
    /// The encoding reported by [`TokenizerId::name`] as `cl100k_base`.
    Cl100kBase,
    /// The encoding reported by [`TokenizerId::name`] as `o200k_base`.
    O200kBase,
}
28
29impl TokenizerId {
30 #[must_use]
32 pub const fn name(self) -> &'static str {
33 match self {
34 Self::Cl100kBase => "cl100k_base",
35 Self::O200kBase => "o200k_base",
36 }
37 }
38}
39
40#[derive(Clone, Default)]
45pub struct TokenizerRegistry {
46 cl100k: Option<Arc<CoreBPE>>,
47 o200k: Option<Arc<CoreBPE>>,
48 model_map: HashMap<String, TokenizerId>,
49}
50
51impl TokenizerRegistry {
52 pub fn with_defaults() -> Result<Self, TokenizerError> {
59 let cl = cl100k_base().map_err(|e| TokenizerError::Library(e.to_string()))?;
60 let oo = o200k_base().map_err(|e| TokenizerError::Library(e.to_string()))?;
61 let mut m = HashMap::new();
62 m.insert("ClaudeOpus47".into(), TokenizerId::Cl100kBase);
64 m.insert("ClaudeSonnet47".into(), TokenizerId::Cl100kBase);
65 m.insert("ClaudeHaiku47".into(), TokenizerId::Cl100kBase);
66 m.insert("Gpt4".into(), TokenizerId::Cl100kBase);
67 m.insert("Gpt4o".into(), TokenizerId::O200kBase);
68 m.insert("Gpt5".into(), TokenizerId::O200kBase);
69 Ok(Self {
70 cl100k: Some(Arc::new(cl)),
71 o200k: Some(Arc::new(oo)),
72 model_map: m,
73 })
74 }
75
76 pub fn register(&mut self, model_key: &str, id: TokenizerId) {
78 self.model_map.insert(model_key.to_owned(), id);
79 }
80
81 pub(crate) fn tokenizer_for(&self, id: TokenizerId) -> Option<&CoreBPE> {
82 match id {
83 TokenizerId::Cl100kBase => self.cl100k.as_deref(),
84 TokenizerId::O200kBase => self.o200k.as_deref(),
85 }
86 }
87
88 fn model_key(m: &Model) -> Option<String> {
89 Some(match m {
90 Model::ClaudeOpus47 => "ClaudeOpus47".into(),
91 Model::ClaudeSonnet47 => "ClaudeSonnet47".into(),
92 Model::ClaudeHaiku47 => "ClaudeHaiku47".into(),
93 Model::Gpt4 => "Gpt4".into(),
94 Model::Gpt4o => "Gpt4o".into(),
95 Model::Gpt5 => "Gpt5".into(),
96 _ => return None,
99 })
100 }
101}
102
103#[derive(Clone)]
105pub struct LocalMeasurer {
106 registry: TokenizerRegistry,
107}
108
109impl LocalMeasurer {
110 #[must_use]
112 pub fn new(registry: TokenizerRegistry) -> Self {
113 Self { registry }
114 }
115
116 pub fn with_defaults() -> Result<Self, TokenizerError> {
121 Ok(Self {
122 registry: TokenizerRegistry::with_defaults()?,
123 })
124 }
125
126 #[must_use]
136 pub fn cross_tokenize(&self, text: &str) -> Vec<(String, u32)> {
137 let mut out = Vec::with_capacity(2);
138 for id in [TokenizerId::Cl100kBase, TokenizerId::O200kBase] {
139 if let Some(tk) = self.registry.tokenizer_for(id) {
140 let n =
141 u32::try_from(tk.encode_with_special_tokens(text).len()).unwrap_or(u32::MAX);
142 out.push((id.name().to_owned(), n));
143 }
144 }
145 out
146 }
147}
148
149impl Measurer for LocalMeasurer {
150 fn tokenize(&self, text: &str, model: &Model) -> Result<u32, TokenizerError> {
151 let key = TokenizerRegistry::model_key(model)
152 .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
153 let id = self
154 .registry
155 .model_map
156 .get(&key)
157 .copied()
158 .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
159 let tk = self
160 .registry
161 .tokenizer_for(id)
162 .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
163 Ok(u32::try_from(tk.encode_with_special_tokens(text).len()).unwrap_or(u32::MAX))
164 }
165
166 fn supported(&self, model: &Model) -> bool {
167 TokenizerRegistry::model_key(model)
168 .and_then(|k| self.registry.model_map.get(&k).copied())
169 .and_then(|id| self.registry.tokenizer_for(id))
170 .is_some()
171 }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a measurer with the default registry; panics if loading fails.
    fn default_measurer() -> LocalMeasurer {
        LocalMeasurer::with_defaults().expect("defaults load")
    }

    /// The empty string must encode to zero tokens.
    #[test]
    fn empty_string_is_zero_tokens() {
        let measurer = default_measurer();
        let count = measurer.tokenize("", &Model::Gpt4).expect("ok");
        assert_eq!(count, 0);
    }

    /// A short ASCII phrase yields a small, non-zero count.
    #[test]
    fn ascii_hello_world_tokenizes() {
        let measurer = default_measurer();
        let count = measurer.tokenize("hello world", &Model::Gpt4).expect("ok");
        assert!(count > 0 && count < 10);
    }

    /// Both encodings should give nearly the same count for plain ASCII.
    #[test]
    fn cl100k_and_o200k_agree_on_plain_ascii_count() {
        let measurer = default_measurer();
        let text = "hello world";
        let a = measurer.tokenize(text, &Model::Gpt4).expect("ok");
        let b = measurer.tokenize(text, &Model::Gpt4o).expect("ok");
        assert!(a.abs_diff(b) <= 2, "cl100k={a} o200k={b} differ too much");
    }

    /// Mixed CJK, ASCII and emoji input must encode without panicking.
    #[test]
    fn multibyte_unicode_does_not_panic() {
        let measurer = default_measurer();
        let count = measurer
            .tokenize("中文 + English mix 🚀", &Model::Gpt4o)
            .expect("ok");
        assert!(count > 0);
    }

    /// A 1000-fold repetition of a short word produces well over 500 tokens.
    #[test]
    fn long_string_tokenizes() {
        let text = "abcdef ".repeat(1000);
        let count = default_measurer().tokenize(&text, &Model::Gpt4).expect("ok");
        assert!(count > 500);
    }

    /// A model without a mapping must be refused, not estimated.
    #[test]
    fn unregistered_model_returns_error_not_estimation() {
        let other = default_measurer()
            .tokenize("anything", &Model::Gemini25Pro)
            .expect_err("must refuse");
        assert!(
            matches!(other, TokenizerError::NotRegistered(_)),
            "expected NotRegistered, got {other:?}"
        );
    }

    /// `supported` and `tokenize` must agree for mapped and unmapped models.
    #[test]
    fn supported_true_iff_tokenize_ok() {
        let measurer = default_measurer();

        let mapped = [Model::ClaudeOpus47, Model::Gpt4, Model::Gpt4o, Model::Gpt5];
        for model in &mapped {
            assert!(measurer.supported(model), "{model:?} should be supported");
            measurer.tokenize("ok", model).expect("ok");
        }

        let unmapped = [
            Model::Gemini25Pro,
            Model::Gemini25Ultra,
            Model::Grok4,
            Model::Registered("custom".into()),
            Model::Llama3Custom("x".into()),
            Model::Qwen3Custom("y".into()),
        ];
        for model in &unmapped {
            assert!(!measurer.supported(model), "{model:?} must be unsupported");
            assert!(measurer.tokenize("ok", model).is_err());
        }
    }

    /// Times 200 `tokenize` calls per encoding and checks the p95 latency.
    ///
    /// The percentile indices (100, 190, 198) are fixed for a 200-sample run.
    /// The 50 ms ceiling is described by the assertion message as a debug
    /// ceiling. NOTE(review): "DoD section 10" in the test name refers to a
    /// document not visible from this file.
    #[test]
    fn tokenize_latency_meets_dod_section_10() {
        use std::time::Instant;

        let measurer = default_measurer();
        let payload = "lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(80);

        for (label, model) in [("cl100k", Model::Gpt4), ("o200k", Model::Gpt4o)] {
            let mut samples: Vec<u128> = (0..200)
                .map(|_| {
                    let started = Instant::now();
                    measurer.tokenize(&payload, &model).expect("tokenize ok");
                    started.elapsed().as_micros()
                })
                .collect();
            samples.sort_unstable();

            let p50 = samples[100];
            let p95 = samples[190];
            let p99 = samples[198];
            eprintln!(
                "[{label}] tokenize {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
                payload.len()
            );
            assert!(p95 < 50_000, "{label} p95 {p95}us breaches debug ceiling");
        }
    }
}