use std::collections::HashMap;
use std::sync::Arc;
use tiktoken_rs::{cl100k_base, o200k_base, CoreBPE};
use crate::interfaces::{Measurer, Model, TokenizerError};
/// Identifies one of the BPE encodings this module knows how to load.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenizerId {
    /// The `cl100k_base` encoding.
    Cl100kBase,
    /// The `o200k_base` encoding.
    O200kBase,
}

impl TokenizerId {
    /// Returns the string label of this encoding.
    ///
    /// The label is a `'static` literal, so the call is allocation-free and
    /// usable in `const` contexts.
    #[must_use]
    pub const fn name(self) -> &'static str {
        const CL100K_BASE: &str = "cl100k_base";
        const O200K_BASE: &str = "o200k_base";

        match self {
            TokenizerId::Cl100kBase => CL100K_BASE,
            TokenizerId::O200kBase => O200K_BASE,
        }
    }
}
/// Holds the loaded BPE encoders and the table that maps model keys to them.
///
/// The derived `Default` yields a registry with no encoders and an empty
/// model map; use `with_defaults` to get a populated one. Encoders are held
/// behind `Arc`, so the derived `Clone` shares them instead of copying them.
#[derive(Clone, Default)]
pub struct TokenizerRegistry {
/// Encoder for `TokenizerId::Cl100kBase`, or `None` if it was not loaded.
cl100k: Option<Arc<CoreBPE>>,
/// Encoder for `TokenizerId::O200kBase`, or `None` if it was not loaded.
o200k: Option<Arc<CoreBPE>>,
/// Model key (e.g. `"Gpt4"`) to the encoding used for that model.
model_map: HashMap<String, TokenizerId>,
}
impl TokenizerRegistry {
    /// Builds a registry with both encodings loaded and the built-in model
    /// keys mapped to them.
    ///
    /// # Errors
    ///
    /// Returns [`TokenizerError::Library`] carrying the loader's error message
    /// if either encoding fails to load. `cl100k_base` is loaded first.
    pub fn with_defaults() -> Result<Self, TokenizerError> {
        // NOTE(review): the Claude entries reuse `cl100k_base` — confirm this
        // mapping is intended.
        const DEFAULT_MODELS: [(&str, TokenizerId); 6] = [
            ("ClaudeOpus47", TokenizerId::Cl100kBase),
            ("ClaudeSonnet47", TokenizerId::Cl100kBase),
            ("ClaudeHaiku47", TokenizerId::Cl100kBase),
            ("Gpt4", TokenizerId::Cl100kBase),
            ("Gpt4o", TokenizerId::O200kBase),
            ("Gpt5", TokenizerId::O200kBase),
        ];

        let cl100k = cl100k_base()
            .map(Arc::new)
            .map_err(|err| TokenizerError::Library(err.to_string()))?;
        let o200k = o200k_base()
            .map(Arc::new)
            .map_err(|err| TokenizerError::Library(err.to_string()))?;

        let model_map: HashMap<String, TokenizerId> = DEFAULT_MODELS
            .iter()
            .map(|&(key, id)| (key.to_owned(), id))
            .collect();

        Ok(Self {
            cl100k: Some(cl100k),
            o200k: Some(o200k),
            model_map,
        })
    }

    /// Maps `model_key` to the encoding `id`, replacing any existing entry for
    /// the same key.
    pub fn register(&mut self, model_key: &str, id: TokenizerId) {
        self.model_map.insert(String::from(model_key), id);
    }

    /// Returns the loaded encoder for `id`, or `None` if that encoding is not
    /// present in this registry.
    pub(crate) fn tokenizer_for(&self, id: TokenizerId) -> Option<&CoreBPE> {
        let slot = match id {
            TokenizerId::Cl100kBase => &self.cl100k,
            TokenizerId::O200kBase => &self.o200k,
        };
        slot.as_deref()
    }

    /// Returns the model-map key for `m`, or `None` for any variant that is
    /// not listed below.
    ///
    /// NOTE(review): because every other variant yields `None`, a key added
    /// through `register` only has an effect when it equals one of these
    /// literals — confirm that is the intended contract.
    fn model_key(m: &Model) -> Option<String> {
        let key: &'static str = match m {
            Model::ClaudeOpus47 => "ClaudeOpus47",
            Model::ClaudeSonnet47 => "ClaudeSonnet47",
            Model::ClaudeHaiku47 => "ClaudeHaiku47",
            Model::Gpt4 => "Gpt4",
            Model::Gpt4o => "Gpt4o",
            Model::Gpt5 => "Gpt5",
            _ => return None,
        };
        Some(key.to_owned())
    }
}
/// Token counter that runs the encoders held by a `TokenizerRegistry`.
#[derive(Clone)]
pub struct LocalMeasurer {
/// Source of the encoders and of the model-key-to-encoding mapping.
registry: TokenizerRegistry,
}
impl LocalMeasurer {
    /// Wraps an already-built registry.
    #[must_use]
    pub fn new(registry: TokenizerRegistry) -> Self {
        Self { registry }
    }

    /// Builds a measurer on top of [`TokenizerRegistry::with_defaults`].
    ///
    /// # Errors
    ///
    /// Propagates whatever error the default registry construction returns.
    pub fn with_defaults() -> Result<Self, TokenizerError> {
        TokenizerRegistry::with_defaults().map(Self::new)
    }

    /// Counts the tokens of `text` under every encoding present in the
    /// registry.
    ///
    /// Each entry is `(encoding name, token count)`, ordered `cl100k_base`
    /// then `o200k_base`. Encodings missing from the registry are skipped, and
    /// a count that does not fit in `u32` is reported as `u32::MAX`.
    #[must_use]
    pub fn cross_tokenize(&self, text: &str) -> Vec<(String, u32)> {
        let encodings = [TokenizerId::Cl100kBase, TokenizerId::O200kBase];

        encodings
            .iter()
            .filter_map(|&id| {
                let bpe = self.registry.tokenizer_for(id)?;
                let count = bpe.encode_with_special_tokens(text).len();
                let count = u32::try_from(count).unwrap_or(u32::MAX);
                Some((id.name().to_owned(), count))
            })
            .collect()
    }
}
impl Measurer for LocalMeasurer {
    /// Counts the tokens of `text` using the encoding registered for `model`.
    ///
    /// A count that does not fit in `u32` is reported as `u32::MAX`.
    ///
    /// # Errors
    ///
    /// Returns [`TokenizerError::NotRegistered`] (carrying a clone of `model`)
    /// when the model has no key, the key is absent from the model map, or the
    /// mapped encoding is not loaded in the registry.
    fn tokenize(&self, text: &str, model: &Model) -> Result<u32, TokenizerError> {
        // All three lookup failures surface as the same error, so resolve the
        // encoder in one chain and convert a miss once at the end.
        let encoder = TokenizerRegistry::model_key(model)
            .and_then(|key| self.registry.model_map.get(key.as_str()).copied())
            .and_then(|id| self.registry.tokenizer_for(id))
            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;

        let count = encoder.encode_with_special_tokens(text).len();
        Ok(u32::try_from(count).unwrap_or(u32::MAX))
    }

    /// Reports whether `tokenize` would find an encoder for `model`, i.e. the
    /// model has a key, the key is mapped, and the mapped encoding is loaded.
    fn supported(&self, model: &Model) -> bool {
        let key = match TokenizerRegistry::model_key(model) {
            Some(key) => key,
            None => return false,
        };
        match self.registry.model_map.get(key.as_str()) {
            Some(&id) => self.registry.tokenizer_for(id).is_some(),
            None => false,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a measurer backed by the default registry; panics if the
    /// encodings fail to load.
    fn m() -> LocalMeasurer {
        LocalMeasurer::with_defaults().expect("defaults load")
    }

    #[test]
    fn empty_string_is_zero_tokens() {
        let n = m().tokenize("", &Model::Gpt4).expect("ok");
        assert_eq!(n, 0);
    }

    #[test]
    fn ascii_hello_world_tokenizes() {
        let n = m().tokenize("hello world", &Model::Gpt4).expect("ok");
        assert!(n > 0 && n < 10);
    }

    #[test]
    fn cl100k_and_o200k_agree_on_plain_ascii_count() {
        let mm = m();
        let a = mm.tokenize("hello world", &Model::Gpt4).expect("ok");
        let b = mm.tokenize("hello world", &Model::Gpt4o).expect("ok");
        assert!(a.abs_diff(b) <= 2, "cl100k={a} o200k={b} differ too much");
    }

    #[test]
    fn multibyte_unicode_does_not_panic() {
        let n = m()
            .tokenize("中文 + English mix 🚀", &Model::Gpt4o)
            .expect("ok");
        assert!(n > 0);
    }

    #[test]
    fn long_string_tokenizes() {
        let s = "abcdef ".repeat(1000);
        let n = m().tokenize(&s, &Model::Gpt4).expect("ok");
        assert!(n > 500);
    }

    #[test]
    fn unregistered_model_returns_error_not_estimation() {
        let err = m()
            .tokenize("anything", &Model::Gemini25Pro)
            .expect_err("must refuse");
        match err {
            TokenizerError::NotRegistered(_) => {}
            other => panic!("expected NotRegistered, got {other:?}"),
        }
    }

    #[test]
    fn supported_true_iff_tokenize_ok() {
        let mm = m();
        // Every model key inserted by `TokenizerRegistry::with_defaults`.
        for model in [
            Model::ClaudeOpus47,
            Model::ClaudeSonnet47,
            Model::ClaudeHaiku47,
            Model::Gpt4,
            Model::Gpt4o,
            Model::Gpt5,
        ] {
            assert!(mm.supported(&model), "{model:?} should be supported");
            mm.tokenize("ok", &model).expect("ok");
        }
        for model in [
            Model::Gemini25Pro,
            Model::Gemini25Ultra,
            Model::Grok4,
            Model::Registered("custom".into()),
            Model::Llama3Custom("x".into()),
            Model::Qwen3Custom("y".into()),
        ] {
            assert!(!mm.supported(&model), "{model:?} must be unsupported");
            assert!(mm.tokenize("ok", &model).is_err());
        }
    }

    #[test]
    fn cross_tokenize_matches_per_model_counts() {
        let mm = m();
        let text = "hello world";
        // Gpt4 maps to cl100k_base and Gpt4o to o200k_base in the defaults, so
        // the cross-tokenizer output must equal the per-model counts, in order.
        let expected = vec![
            (
                TokenizerId::Cl100kBase.name().to_owned(),
                mm.tokenize(text, &Model::Gpt4).expect("ok"),
            ),
            (
                TokenizerId::O200kBase.name().to_owned(),
                mm.tokenize(text, &Model::Gpt4o).expect("ok"),
            ),
        ];
        assert_eq!(mm.cross_tokenize(text), expected);
    }

    #[test]
    fn empty_registry_supports_nothing() {
        // `Default` yields no encoders and an empty model map.
        let mm = LocalMeasurer::new(TokenizerRegistry::default());
        assert!(!mm.supported(&Model::Gpt4));
        assert!(mm.cross_tokenize("hello").is_empty());
        let err = mm
            .tokenize("hello", &Model::Gpt4)
            .expect_err("must refuse");
        assert!(
            matches!(err, TokenizerError::NotRegistered(_)),
            "expected NotRegistered, got {err:?}"
        );
    }

    #[test]
    fn register_overrides_default_mapping() {
        let mut reg = TokenizerRegistry::with_defaults().expect("defaults load");
        reg.register("Gpt4", TokenizerId::O200kBase);
        let mm = LocalMeasurer::new(reg);

        // After the override both models resolve to the same encoder, so the
        // counts must be identical for any input.
        let text = "中文 + English mix 🚀";
        let overridden = mm.tokenize(text, &Model::Gpt4).expect("ok");
        let o200k = mm.tokenize(text, &Model::Gpt4o).expect("ok");
        assert!(mm.supported(&Model::Gpt4));
        assert_eq!(overridden, o200k);
    }

    #[test]
    fn tokenize_latency_meets_dod_section_10() {
        use std::time::Instant;
        // NOTE(review): wall-clock assertion; the 50 ms ceiling is generous for
        // debug builds but may still be sensitive to a heavily loaded machine.
        let mm = m();
        let payload = "lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(80);
        for (label, model) in [("cl100k", Model::Gpt4), ("o200k", Model::Gpt4o)] {
            let mut samples = Vec::with_capacity(200);
            for _ in 0..200 {
                let t = Instant::now();
                mm.tokenize(&payload, &model).expect("tokenize ok");
                samples.push(t.elapsed().as_micros());
            }
            samples.sort_unstable();
            // Percentile positions within the 200 sorted samples.
            let p50 = samples[100];
            let p95 = samples[190];
            let p99 = samples[198];
            eprintln!(
                "[{label}] tokenize {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
                payload.len()
            );
            assert!(p95 < 50_000, "{label} p95 {p95}us breaches debug ceiling");
        }
    }
}