pithy-core 0.0.2

UltraCoS® symbolic token compression — 17-rule encoder for LLM prompts. PolyForm Noncommercial.
Documentation
//! Tokenizer registry and `Measurer` implementation.
//!
//! Wraps `tiktoken-rs` for cl100k_base and o200k_base. HuggingFace
//! tokenizers (Llama-3, Qwen-3, Mistral) are deferred to a follow-up PR
//! because the crate currently does not ship embedded tokenizer fixtures;
//! calling `tokenize` with an unmapped `Model` returns
//! `TokenizerError::NotRegistered` rather than silently approximating
//! (per DoR "no hardcoded ratios" rule).

use std::collections::HashMap;
use std::sync::Arc;

use tiktoken_rs::{cl100k_base, o200k_base, CoreBPE};

use crate::interfaces::{Measurer, Model, TokenizerError};

/// Stable identifier for a registered tokenizer.
///
/// Must be stable across releases so the measurement-record
/// `pricing_snapshot_id` and dialect-dispatch replay reproduce byte-for-byte.
///
/// The canonical string form is returned by [`TokenizerId::name`]; the
/// `Display` implementation writes exactly the same text, so the identifier
/// can be used directly in `format!`/`write!` without calling `name()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenizerId {
    /// OpenAI cl100k_base (GPT-4, GPT-3.5-turbo).
    Cl100kBase,
    /// OpenAI o200k_base (GPT-4o, GPT-5 approximation).
    O200kBase,
}

impl TokenizerId {
    /// Canonical string identifier.
    ///
    /// Returns `"cl100k_base"` for [`TokenizerId::Cl100kBase`] and
    /// `"o200k_base"` for [`TokenizerId::O200kBase`]. The match is
    /// exhaustive on purpose: adding a variant without naming it is a
    /// compile error rather than a silently wrong identifier.
    #[must_use]
    pub const fn name(self) -> &'static str {
        match self {
            Self::Cl100kBase => "cl100k_base",
            Self::O200kBase => "o200k_base",
        }
    }
}

impl std::fmt::Display for TokenizerId {
    /// Writes the canonical identifier (same text as [`TokenizerId::name`]).
    ///
    /// Uses `Formatter::pad` so width and alignment flags are honoured.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad(self.name())
    }
}

/// Registry of tokenizers, keyed by a registry-internal identifier.
///
/// Each loaded tokenizer is shared via `Arc<CoreBPE>` so clones are cheap
/// and concurrent callers never block each other during tokenization.
///
/// The derived `Default` yields an empty registry: no tokenizer is loaded
/// and no model is mapped. Use [`TokenizerRegistry::with_defaults`] to get a
/// registry that can actually tokenize.
#[derive(Clone, Default)]
pub struct TokenizerRegistry {
    /// Loaded cl100k_base tokenizer, if any.
    cl100k: Option<Arc<CoreBPE>>,
    /// Loaded o200k_base tokenizer, if any.
    o200k: Option<Arc<CoreBPE>>,
    /// Model key (the string produced by `model_key`) -> tokenizer to use.
    model_map: HashMap<String, TokenizerId>,
}

impl TokenizerRegistry {
    /// Load cl100k_base and o200k_base and install the built-in model
    /// mappings (every `Model` variant whose tokenizer is empirically known
    /// to this registry per F-Gram findings).
    ///
    /// `ClaudeOpus47`, `ClaudeSonnet47`, `ClaudeHaiku47` and `Gpt4` map to
    /// cl100k_base; `Gpt4o` and `Gpt5` map to o200k_base.
    ///
    /// # Errors
    /// Returns `TokenizerError::Library` (carrying the loader's message) if
    /// tiktoken-rs fails to load a tokenizer (should only happen on
    /// packaging error).
    pub fn with_defaults() -> Result<Self, TokenizerError> {
        // Keys here must stay in sync with the strings returned by `model_key`.
        const DEFAULT_MAPPINGS: [(&str, TokenizerId); 6] = [
            // Legacy BPE generation -> cl100k_base (per F-Gram)
            ("ClaudeOpus47", TokenizerId::Cl100kBase),
            ("ClaudeSonnet47", TokenizerId::Cl100kBase),
            ("ClaudeHaiku47", TokenizerId::Cl100kBase),
            ("Gpt4", TokenizerId::Cl100kBase),
            ("Gpt4o", TokenizerId::O200kBase),
            ("Gpt5", TokenizerId::O200kBase),
        ];

        // Load order matters only for which error surfaces first.
        let cl100k = cl100k_base().map_err(|err| TokenizerError::Library(err.to_string()))?;
        let o200k = o200k_base().map_err(|err| TokenizerError::Library(err.to_string()))?;

        let model_map: HashMap<String, TokenizerId> = DEFAULT_MAPPINGS
            .iter()
            .map(|&(key, id)| (key.to_owned(), id))
            .collect();

        Ok(Self {
            cl100k: Some(Arc::new(cl100k)),
            o200k: Some(Arc::new(o200k)),
            model_map,
        })
    }

    /// Register an additional Model-to-tokenizer mapping.
    ///
    /// An existing entry for `model_key` is overwritten. Note that
    /// `LocalMeasurer` only ever looks up keys produced by the private
    /// `model_key` helper, so a key outside that set is stored but never
    /// consulted by it.
    pub fn register(&mut self, model_key: &str, id: TokenizerId) {
        let key = String::from(model_key);
        self.model_map.insert(key, id);
    }

    /// Borrow the loaded tokenizer for `id`, or `None` if that slot is empty
    /// (as it is in a `Default`-constructed registry).
    pub(crate) fn tokenizer_for(&self, id: TokenizerId) -> Option<&CoreBPE> {
        let slot = match id {
            TokenizerId::Cl100kBase => &self.cl100k,
            TokenizerId::O200kBase => &self.o200k,
        };
        slot.as_deref()
    }

    /// Map a `Model` to the string key used in `model_map`.
    ///
    /// Returns `None` for every variant without a local tiktoken tokenizer;
    /// callers turn that into `TokenizerError::NotRegistered`.
    fn model_key(m: &Model) -> Option<String> {
        let key = match m {
            Model::ClaudeOpus47 => "ClaudeOpus47",
            Model::ClaudeSonnet47 => "ClaudeSonnet47",
            Model::ClaudeHaiku47 => "ClaudeHaiku47",
            Model::Gpt4 => "Gpt4",
            Model::Gpt4o => "Gpt4o",
            Model::Gpt5 => "Gpt5",
            // Llama3Custom, Qwen3Custom, Gemini25*, Grok4, Registered(_):
            // no local tiktoken tokenizer. Return None -> NotRegistered.
            _ => return None,
        };
        Some(key.to_owned())
    }
}

/// Adapter that implements `Measurer` over the registry.
#[derive(Clone)]
pub struct LocalMeasurer {
    registry: TokenizerRegistry,
}

impl LocalMeasurer {
    /// Build from an explicit registry.
    #[must_use]
    pub fn new(registry: TokenizerRegistry) -> Self {
        Self { registry }
    }

    /// Build with the default tokenizer set.
    ///
    /// # Errors
    /// Same as `TokenizerRegistry::with_defaults`.
    pub fn with_defaults() -> Result<Self, TokenizerError> {
        Ok(Self {
            registry: TokenizerRegistry::with_defaults()?,
        })
    }

    /// Tokenize the given text against EVERY registered tokenizer
    /// rather than the model-specific one. Returns `(name, count)`
    /// tuples in stable order so cross-tokenizer audit rows replay.
    ///
    /// This is the telemetry that powers tokenizer-resilient
    /// closed-loop tuning: when an upstream provider ships a new
    /// tokenizer, the per-tokenizer savings stop matching the
    /// model-specific savings and the auto-tuner sees the drift
    /// before users do.
    #[must_use]
    pub fn cross_tokenize(&self, text: &str) -> Vec<(String, u32)> {
        let mut out = Vec::with_capacity(2);
        for id in [TokenizerId::Cl100kBase, TokenizerId::O200kBase] {
            if let Some(tk) = self.registry.tokenizer_for(id) {
                let n =
                    u32::try_from(tk.encode_with_special_tokens(text).len()).unwrap_or(u32::MAX);
                out.push((id.name().to_owned(), n));
            }
        }
        out
    }
}

impl Measurer for LocalMeasurer {
    fn tokenize(&self, text: &str, model: &Model) -> Result<u32, TokenizerError> {
        let key = TokenizerRegistry::model_key(model)
            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
        let id = self
            .registry
            .model_map
            .get(&key)
            .copied()
            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
        let tk = self
            .registry
            .tokenizer_for(id)
            .ok_or_else(|| TokenizerError::NotRegistered(model.clone()))?;
        Ok(u32::try_from(tk.encode_with_special_tokens(text).len()).unwrap_or(u32::MAX))
    }

    fn supported(&self, model: &Model) -> bool {
        TokenizerRegistry::model_key(model)
            .and_then(|k| self.registry.model_map.get(&k).copied())
            .and_then(|id| self.registry.tokenizer_for(id))
            .is_some()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Measurer over the default registry; panics if the tokenizers fail to load.
    fn m() -> LocalMeasurer {
        LocalMeasurer::with_defaults().expect("defaults load")
    }

    #[test]
    fn empty_string_is_zero_tokens() {
        let n = m().tokenize("", &Model::Gpt4).expect("ok");
        assert_eq!(n, 0);
    }

    #[test]
    fn ascii_hello_world_tokenizes() {
        let n = m().tokenize("hello world", &Model::Gpt4).expect("ok");
        assert!(n > 0 && n < 10);
    }

    #[test]
    fn cl100k_and_o200k_agree_on_plain_ascii_count() {
        // For short ASCII, cl100k and o200k commonly agree or differ by 1.
        let mm = m();
        let a = mm.tokenize("hello world", &Model::Gpt4).expect("ok");
        let b = mm.tokenize("hello world", &Model::Gpt4o).expect("ok");
        assert!(a.abs_diff(b) <= 2, "cl100k={a} o200k={b} differ too much");
    }

    #[test]
    fn multibyte_unicode_does_not_panic() {
        let n = m()
            .tokenize("中文 + English mix 🚀", &Model::Gpt4o)
            .expect("ok");
        assert!(n > 0);
    }

    #[test]
    fn long_string_tokenizes() {
        let s = "abcdef ".repeat(1000);
        let n = m().tokenize(&s, &Model::Gpt4).expect("ok");
        assert!(n > 500);
    }

    #[test]
    fn unregistered_model_returns_error_not_estimation() {
        let err = m()
            .tokenize("anything", &Model::Gemini25Pro)
            .expect_err("must refuse");
        assert!(
            matches!(err, TokenizerError::NotRegistered(_)),
            "expected NotRegistered, got {err:?}"
        );
    }

    #[test]
    fn supported_true_iff_tokenize_ok() {
        let mm = m();
        // Every model that `with_defaults` maps must be covered here.
        for model in [
            Model::ClaudeOpus47,
            Model::ClaudeSonnet47,
            Model::ClaudeHaiku47,
            Model::Gpt4,
            Model::Gpt4o,
            Model::Gpt5,
        ] {
            assert!(mm.supported(&model), "{model:?} should be supported");
            mm.tokenize("ok", &model).expect("ok");
        }
        for model in [
            Model::Gemini25Pro,
            Model::Gemini25Ultra,
            Model::Grok4,
            Model::Registered("custom".into()),
            Model::Llama3Custom("x".into()),
            Model::Qwen3Custom("y".into()),
        ] {
            assert!(!mm.supported(&model), "{model:?} must be unsupported");
            assert!(mm.tokenize("ok", &model).is_err());
        }
    }

    /// `cross_tokenize` must report both tokenizers, in fixed order, with the
    /// same counts the model-specific path produces.
    #[test]
    fn cross_tokenize_reports_every_tokenizer_in_stable_order() {
        let mm = m();
        let text = "hello world, cross-tokenizer audit";
        let rows = mm.cross_tokenize(text);
        let names: Vec<&str> = rows.iter().map(|(name, _)| name.as_str()).collect();
        assert_eq!(names, ["cl100k_base", "o200k_base"]);
        assert_eq!(rows[0].1, mm.tokenize(text, &Model::Gpt4).expect("ok"));
        assert_eq!(rows[1].1, mm.tokenize(text, &Model::Gpt4o).expect("ok"));
    }

    /// Re-registering a known key switches the tokenizer used for that model.
    #[test]
    fn register_overrides_existing_mapping() {
        let text = "中文 + English mix 🚀";
        let mut registry = TokenizerRegistry::with_defaults().expect("defaults load");
        registry.register("Gpt4", TokenizerId::O200kBase);
        let remapped = LocalMeasurer::new(registry);
        let expected = m().tokenize(text, &Model::Gpt4o).expect("ok");
        assert_eq!(remapped.tokenize(text, &Model::Gpt4).expect("ok"), expected);
    }

    /// A `Default` registry has no tokenizers and no mappings, so nothing is
    /// supported and the cross-tokenizer audit is empty.
    #[test]
    fn default_registry_supports_nothing() {
        let empty = LocalMeasurer::new(TokenizerRegistry::default());
        assert!(!empty.supported(&Model::Gpt4));
        assert!(empty.tokenize("ok", &Model::Gpt4).is_err());
        assert!(empty.cross_tokenize("ok").is_empty());
    }

    /// DoD §10 perf evidence: tokenize p95 must be <5ms on a 4kB ASCII payload.
    ///
    /// Runs 200 iterations against both cl100k and o200k and prints
    /// p50/p95/p99. The test runs in every build profile, so the asserted
    /// ceiling is the relaxed debug-build value (50ms p95); the <5ms figure
    /// is the release target and is not asserted here.
    #[test]
    fn tokenize_latency_meets_dod_section_10() {
        use std::time::Instant;

        const ITERATIONS: usize = 200;

        let mm = m();
        let payload = "lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(80);
        for (label, model) in [("cl100k", Model::Gpt4), ("o200k", Model::Gpt4o)] {
            let mut samples = Vec::with_capacity(ITERATIONS);
            for _ in 0..ITERATIONS {
                let t = Instant::now();
                mm.tokenize(&payload, &model).expect("tokenize ok");
                samples.push(t.elapsed().as_micros());
            }
            samples.sort_unstable();
            // Index of the p-th percentile in the sorted samples
            // (200 samples -> 100 / 190 / 198 for p50 / p95 / p99).
            let percentile = |p: usize| samples[ITERATIONS * p / 100];
            let p50 = percentile(50);
            let p95 = percentile(95);
            let p99 = percentile(99);
            eprintln!(
                "[{label}] tokenize {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
                payload.len()
            );
            // Debug-build ceiling: 50ms p95 (release target is <5ms; tracked separately).
            assert!(p95 < 50_000, "{label} p95 {p95}us breaches debug ceiling");
        }
    }
}