winx_code_agent/utils/
encoder.rs1use std::sync::OnceLock;
9use tokenizers::Tokenizer;
10
/// Raw bytes of the Claude tokenizer definition, embedded into the binary at
/// compile time from `assets/claude-tokenizer.json` under the crate's manifest
/// directory (`CARGO_MANIFEST_DIR`).
static CLAUDE_TOKENIZER_JSON: &[u8] =
    include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/assets/claude-tokenizer.json"));
14
15fn tokenizer() -> Option<&'static Tokenizer> {
16 static TOKENIZER: OnceLock<Option<Tokenizer>> = OnceLock::new();
17 TOKENIZER
18 .get_or_init(|| match Tokenizer::from_bytes(CLAUDE_TOKENIZER_JSON) {
19 Ok(tokenizer) => Some(tokenizer),
20 Err(error) => {
21 tracing::warn!("Failed to load embedded Claude tokenizer, using estimate: {error}");
22 None
23 }
24 })
25 .as_ref()
26}
27
28pub fn count_tokens(text: &str) -> usize {
30 match encode_ids(text) {
31 Some(ids) => ids.len(),
32 None => estimate_tokens(text),
33 }
34}
35
36pub fn encode_ids(text: &str) -> Option<Vec<u32>> {
39 let tokenizer = tokenizer()?;
40 tokenizer.encode(text, false).ok().map(|encoding| encoding.get_ids().to_vec())
41}
42
43pub fn decode_ids(ids: &[u32]) -> Option<String> {
45 let tokenizer = tokenizer()?;
46 tokenizer.decode(ids, false).ok()
47}
48
/// Estimates the token count of `text` without using a tokenizer.
///
/// Takes the larger of two figures: the number of `char`s divided by four
/// (rounded up), and the number of whitespace-separated words.
pub fn estimate_tokens(text: &str) -> usize {
    let by_chars = text.chars().count().div_ceil(4);
    let by_words = text.split_whitespace().count();
    usize::max(by_chars, by_words)
}
53
#[cfg(test)]
mod tests {
    use super::{count_tokens, estimate_tokens};

    /// Non-empty text yields at least one token; empty text yields none.
    #[test]
    fn counts_tokens_for_simple_text() {
        let non_empty = count_tokens("hello world");
        assert!(non_empty >= 1);

        let empty = count_tokens("");
        assert_eq!(empty, 0);
    }

    /// Four whitespace-separated words must estimate to at least four tokens.
    #[test]
    fn estimate_is_nonzero_for_words() {
        let estimate = estimate_tokens("a b c d");
        assert!(estimate >= 4);
    }
}