use std::sync::OnceLock;
use tokenizers::Tokenizer;
/// Raw bytes of the tokenizer definition, embedded into the binary at compile
/// time from `assets/claude-tokenizer.json` under the crate's manifest directory.
static CLAUDE_TOKENIZER_JSON: &[u8] =
include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/assets/claude-tokenizer.json"));
/// Returns the process-wide tokenizer built from the embedded JSON bytes.
///
/// The tokenizer is parsed at most once, on first call, and the outcome
/// (success or failure) is cached for the lifetime of the process. When
/// parsing fails, a warning is logged once and every call returns `None`.
fn tokenizer() -> Option<&'static Tokenizer> {
    static TOKENIZER: OnceLock<Option<Tokenizer>> = OnceLock::new();

    let cached = TOKENIZER.get_or_init(|| {
        let parsed = Tokenizer::from_bytes(CLAUDE_TOKENIZER_JSON);
        // Log the failure here so it is reported only during the one-time init.
        if let Err(error) = &parsed {
            tracing::warn!("Failed to load embedded Claude tokenizer, using estimate: {error}");
        }
        parsed.ok()
    });

    cached.as_ref()
}
/// Counts the tokens in `text`.
///
/// Uses the number of IDs produced by [`encode_ids`] when encoding succeeds;
/// otherwise falls back to the heuristic in [`estimate_tokens`].
pub fn count_tokens(text: &str) -> usize {
    encode_ids(text).map_or_else(|| estimate_tokens(text), |ids| ids.len())
}
/// Encodes `text` into token IDs using the embedded tokenizer.
///
/// The second argument to `encode` is `false`, which in the `tokenizers` API
/// means special tokens are not added.
///
/// Returns `None` when the tokenizer is unavailable or encoding fails.
pub fn encode_ids(text: &str) -> Option<Vec<u32>> {
    let encoding = tokenizer()?.encode(text, false).ok()?;
    Some(encoding.get_ids().to_vec())
}
/// Decodes token `ids` back into text using the embedded tokenizer.
///
/// The second argument to `decode` is `false`, which in the `tokenizers` API
/// means special tokens are not skipped.
///
/// Returns `None` when the tokenizer is unavailable or decoding fails.
pub fn decode_ids(ids: &[u32]) -> Option<String> {
    tokenizer().and_then(|tok| tok.decode(ids, false).ok())
}
/// Heuristically estimates the token count of `text` without a tokenizer.
///
/// The estimate is the larger of two values: the number of Unicode scalar
/// values divided by four (rounded up), and the number of
/// whitespace-separated words. An empty string yields `0`.
pub fn estimate_tokens(text: &str) -> usize {
    let by_chars = text.chars().count().div_ceil(4);
    let by_words = text.split_whitespace().count();
    by_chars.max(by_words)
}
// Unit tests for the public token-counting helpers.
#[cfg(test)]
mod tests {
use super::*;
// Non-empty text must count as at least one token; the empty string must count as zero.
#[test]
fn counts_tokens_for_simple_text() {
assert!(count_tokens("hello world") >= 1);
assert_eq!(count_tokens(""), 0);
}
// Four whitespace-separated words: the word-count floor in `estimate_tokens`
// guarantees an estimate of at least 4 even though the text is only 7 characters.
#[test]
fn estimate_is_nonzero_for_words() {
assert!(estimate_tokens("a b c d") >= 4);
}
}