infiniloom_engine/tokenizer/mod.rs

//! Accurate token counting using real BPE tokenizers where available
//!
//! This module provides exact token counts via tiktoken for OpenAI models
//! and character-ratio estimation for all other supported vendors.
//!
//! # Supported Models
//!
//! ## OpenAI (Exact tokenization via tiktoken)
//! - **o200k_base**: GPT-5.2, GPT-5.1, GPT-5, GPT-4o, O1, O3, O4 (all latest models)
//! - **cl100k_base**: GPT-4, GPT-3.5-turbo (legacy models)
//!
//! ## Other Vendors (Estimation-based)
//! - Claude (Anthropic): ~3.5 chars/token
//! - Gemini (Google): ~3.8 chars/token
//! - Llama (Meta): ~3.5 chars/token
//! - Mistral: ~3.5 chars/token
//! - DeepSeek: ~3.5 chars/token
//! - Qwen (Alibaba): ~3.5 chars/token
//! - Cohere: ~3.6 chars/token
//! - Grok (xAI): ~3.5 chars/token
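//!
//! # Examples
//!
//! A minimal usage sketch, assuming this module is reachable as
//! `infiniloom_engine::tokenizer` (the exact crate path may differ); the
//! calls mirror those exercised in the tests below:
//!
//! ```no_run
//! use infiniloom_engine::tokenizer::{quick_estimate, TokenModel, Tokenizer};
//!
//! let tokenizer = Tokenizer::new();
//!
//! // Exact BPE count (o200k_base) for an OpenAI model.
//! let exact = tokenizer.count("Hello, world!", TokenModel::Gpt4o);
//! assert!(exact > 0);
//!
//! // Heuristic count for other vendors: roughly chars / ratio, so a
//! // 35-char string at ~3.5 chars/token estimates to ~10 tokens.
//! let approx = tokenizer.count("Hello, world!", TokenModel::Claude);
//! assert!(approx > 0);
//!
//! // Counts for every supported tokenizer at once.
//! let counts = tokenizer.count_all("Hello, world!");
//! assert!(counts.o200k > 0 && counts.claude > 0);
//!
//! // One-off estimate without constructing a `Tokenizer`.
//! let quick = quick_estimate("Hello, world!", TokenModel::Claude);
//! assert!(quick > 0);
//! ```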

mod core;
mod counts;
mod models;

pub use self::core::{quick_estimate, Tokenizer}; // `self::` disambiguates from the built-in `core` crate
pub use counts::TokenCounts;
pub use models::TokenModel;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_exact_gpt4o_counting() {
        let tokenizer = Tokenizer::new();
        let text = "Hello, world!";
        let count = tokenizer.count(text, TokenModel::Gpt4o);

        // o200k_base should give exact count
        assert!(count > 0);
        assert!(count < 10); // Should be around 3-4 tokens
    }

    #[test]
    fn test_exact_gpt5_counting() {
        let tokenizer = Tokenizer::new();
        let text = "def hello():\n    print('Hello, World!')\n";

        // All GPT-5 variants should use o200k_base and give same count
        let count_52 = tokenizer.count(text, TokenModel::Gpt52);
        let count_51 = tokenizer.count(text, TokenModel::Gpt51);
        let count_5 = tokenizer.count(text, TokenModel::Gpt5);
        let count_4o = tokenizer.count(text, TokenModel::Gpt4o);

        assert_eq!(count_52, count_51);
        assert_eq!(count_51, count_5);
        assert_eq!(count_5, count_4o);
        assert!(count_52 > 5);
        assert!(count_52 < 30);
    }

    #[test]
    fn test_exact_o_series_counting() {
        let tokenizer = Tokenizer::new();
        let text = "Solve this math problem: 2 + 2 = ?";

        // All O-series models should use o200k_base
        let count_o4 = tokenizer.count(text, TokenModel::O4Mini);
        let count_o3 = tokenizer.count(text, TokenModel::O3);
        let count_o1 = tokenizer.count(text, TokenModel::O1);
        let count_4o = tokenizer.count(text, TokenModel::Gpt4o);

        assert_eq!(count_o4, count_o3);
        assert_eq!(count_o3, count_o1);
        assert_eq!(count_o1, count_4o);
    }

    #[test]
    fn test_exact_gpt4_counting() {
        let tokenizer = Tokenizer::new();
        let text = "def hello():\n    print('Hello, World!')\n";
        let count = tokenizer.count(text, TokenModel::Gpt4);

        // cl100k_base should give exact count
        assert!(count > 5);
        assert!(count < 30);
    }

    #[test]
    fn test_estimation_claude() {
        let tokenizer = Tokenizer::new();
        let text = "This is a test string for token estimation.";
        let count = tokenizer.count(text, TokenModel::Claude);

        // Estimation should be reasonable
        assert!(count > 5);
        assert!(count < 30);
    }

    #[test]
    fn test_estimation_new_vendors() {
        let tokenizer = Tokenizer::new();
        let text = "This is a test string for new vendor token estimation.";

        // All estimation-based models should return reasonable counts
        let mistral = tokenizer.count(text, TokenModel::Mistral);
        let deepseek = tokenizer.count(text, TokenModel::DeepSeek);
        let qwen = tokenizer.count(text, TokenModel::Qwen);
        let cohere = tokenizer.count(text, TokenModel::Cohere);
        let grok = tokenizer.count(text, TokenModel::Grok);

        assert!(mistral > 5 && mistral < 50);
        assert!(deepseek > 5 && deepseek < 50);
        assert!(qwen > 5 && qwen < 50);
        assert!(cohere > 5 && cohere < 50);
        assert!(grok > 5 && grok < 50);
    }

    #[test]
    fn test_count_all() {
        let tokenizer = Tokenizer::new();
        let text = "function hello() { console.log('hello'); }";
        let counts = tokenizer.count_all(text);

        assert!(counts.o200k > 0);
        assert!(counts.cl100k > 0);
        assert!(counts.claude > 0);
        assert!(counts.gemini > 0);
        assert!(counts.llama > 0);
        assert!(counts.mistral > 0);
        assert!(counts.deepseek > 0);
        assert!(counts.qwen > 0);
        assert!(counts.cohere > 0);
        assert!(counts.grok > 0);
    }

    #[test]
    fn test_empty_string() {
        let tokenizer = Tokenizer::new();
        assert_eq!(tokenizer.count("", TokenModel::Claude), 0);
        assert_eq!(tokenizer.count("", TokenModel::Gpt4o), 0);
        assert_eq!(tokenizer.count("", TokenModel::Gpt52), 0);
        assert_eq!(tokenizer.count("", TokenModel::O3), 0);
    }

    #[test]
    fn test_truncate_to_budget() {
        let tokenizer = Tokenizer::new();
        let text = "This is a fairly long string that we want to truncate to fit within a smaller token budget for testing purposes.";

        let truncated = tokenizer.truncate_to_budget(text, TokenModel::Gpt4, 10);
        let count = tokenizer.count(truncated, TokenModel::Gpt4);

        assert!(count <= 10);
        assert!(truncated.len() < text.len());
    }

    #[test]
    fn test_quick_estimate() {
        let count = quick_estimate("Hello world", TokenModel::Claude);
        assert!(count > 0);
        assert!(count < 10);
    }

    #[test]
    fn test_token_counts_add() {
        let a = TokenCounts {
            o200k: 8,
            cl100k: 9,
            claude: 10,
            gemini: 8,
            llama: 10,
            mistral: 10,
            deepseek: 10,
            qwen: 10,
            cohere: 10,
            grok: 10,
        };
        let b = TokenCounts {
            o200k: 4,
            cl100k: 5,
            claude: 5,
            gemini: 4,
            llama: 5,
            mistral: 5,
            deepseek: 5,
            qwen: 5,
            cohere: 5,
            grok: 5,
        };
        let sum = a + b;

        assert_eq!(sum.o200k, 12);
        assert_eq!(sum.cl100k, 14);
        assert_eq!(sum.claude, 15);
    }

    #[test]
    fn test_token_counts_min_max() {
        let counts = TokenCounts {
            o200k: 100,
            cl100k: 110,
            claude: 95,
            gemini: 105,
            llama: 98,
            mistral: 97,
            deepseek: 96,
            qwen: 99,
            cohere: 102,
            grok: 101,
        };

        assert_eq!(counts.min(), 95);
        assert_eq!(counts.max(), 110);
    }

    #[test]
    fn test_most_efficient_model() {
        let tokenizer = Tokenizer::new();
        let text = "const x = 42;";
        let (_model, count) = tokenizer.most_efficient_model(text);

        // GPT-4o with o200k should usually be most efficient
        assert!(count > 0);
    }

    #[test]
    fn test_from_model_name_openai() {
        // GPT-5.2 variants
        assert_eq!(TokenModel::from_model_name("gpt-5.2"), Some(TokenModel::Gpt52));
        assert_eq!(TokenModel::from_model_name("GPT-5.2"), Some(TokenModel::Gpt52));
        assert_eq!(TokenModel::from_model_name("gpt-5.2-pro"), Some(TokenModel::Gpt52Pro));
        assert_eq!(TokenModel::from_model_name("gpt-5.2-2025-12-11"), Some(TokenModel::Gpt52));

        // GPT-5.1 variants
        assert_eq!(TokenModel::from_model_name("gpt-5.1"), Some(TokenModel::Gpt51));
        assert_eq!(TokenModel::from_model_name("gpt-5.1-mini"), Some(TokenModel::Gpt51Mini));
        assert_eq!(TokenModel::from_model_name("gpt-5.1-codex"), Some(TokenModel::Gpt51Codex));

        // GPT-5 variants
        assert_eq!(TokenModel::from_model_name("gpt-5"), Some(TokenModel::Gpt5));
        assert_eq!(TokenModel::from_model_name("gpt-5-mini"), Some(TokenModel::Gpt5Mini));
        assert_eq!(TokenModel::from_model_name("gpt-5-nano"), Some(TokenModel::Gpt5Nano));

        // O-series
        assert_eq!(TokenModel::from_model_name("o4-mini"), Some(TokenModel::O4Mini));
        assert_eq!(TokenModel::from_model_name("o3"), Some(TokenModel::O3));
        assert_eq!(TokenModel::from_model_name("o3-mini"), Some(TokenModel::O3Mini));
        assert_eq!(TokenModel::from_model_name("o1"), Some(TokenModel::O1));
        assert_eq!(TokenModel::from_model_name("o1-mini"), Some(TokenModel::O1Mini));
        assert_eq!(TokenModel::from_model_name("o1-preview"), Some(TokenModel::O1Preview));

        // GPT-4o
        assert_eq!(TokenModel::from_model_name("gpt-4o"), Some(TokenModel::Gpt4o));
        assert_eq!(TokenModel::from_model_name("gpt-4o-mini"), Some(TokenModel::Gpt4oMini));

        // Legacy
        assert_eq!(TokenModel::from_model_name("gpt-4"), Some(TokenModel::Gpt4));
        assert_eq!(TokenModel::from_model_name("gpt-3.5-turbo"), Some(TokenModel::Gpt35Turbo));
    }

    #[test]
    fn test_from_model_name_other_vendors() {
        // Claude
        assert_eq!(TokenModel::from_model_name("claude"), Some(TokenModel::Claude));
        assert_eq!(TokenModel::from_model_name("claude-sonnet"), Some(TokenModel::Claude));
        assert_eq!(TokenModel::from_model_name("claude-opus-4.5"), Some(TokenModel::Claude));

        // Gemini
        assert_eq!(TokenModel::from_model_name("gemini"), Some(TokenModel::Gemini));
        assert_eq!(TokenModel::from_model_name("gemini-2.5-pro"), Some(TokenModel::Gemini));

        // Llama
        assert_eq!(TokenModel::from_model_name("llama-4"), Some(TokenModel::Llama));
        assert_eq!(TokenModel::from_model_name("codellama"), Some(TokenModel::CodeLlama));

        // Mistral
        assert_eq!(TokenModel::from_model_name("mistral"), Some(TokenModel::Mistral));
        assert_eq!(TokenModel::from_model_name("codestral"), Some(TokenModel::Mistral));

        // DeepSeek
        assert_eq!(TokenModel::from_model_name("deepseek"), Some(TokenModel::DeepSeek));
        assert_eq!(TokenModel::from_model_name("deepseek-r1"), Some(TokenModel::DeepSeek));

        // Qwen
        assert_eq!(TokenModel::from_model_name("qwen3"), Some(TokenModel::Qwen));

        // Cohere
        assert_eq!(TokenModel::from_model_name("cohere"), Some(TokenModel::Cohere));
        assert_eq!(TokenModel::from_model_name("command-r+"), Some(TokenModel::Cohere));

        // Grok
        assert_eq!(TokenModel::from_model_name("grok-3"), Some(TokenModel::Grok));
    }

    #[test]
    fn test_from_model_name_unknown() {
        assert_eq!(TokenModel::from_model_name("unknown-model"), None);
        assert_eq!(TokenModel::from_model_name(""), None);
        assert_eq!(TokenModel::from_model_name("random"), None);
    }

    #[test]
    fn test_model_properties() {
        // Test uses_o200k
        assert!(TokenModel::Gpt52.uses_o200k());
        assert!(TokenModel::O3.uses_o200k());
        assert!(TokenModel::Gpt4o.uses_o200k());
        assert!(!TokenModel::Gpt4.uses_o200k());
        assert!(!TokenModel::Claude.uses_o200k());

        // Test uses_cl100k
        assert!(TokenModel::Gpt4.uses_cl100k());
        assert!(TokenModel::Gpt35Turbo.uses_cl100k());
        assert!(!TokenModel::Gpt52.uses_cl100k());
        assert!(!TokenModel::Claude.uses_cl100k());

        // Test has_exact_tokenizer
        assert!(TokenModel::Gpt52.has_exact_tokenizer());
        assert!(TokenModel::Gpt4.has_exact_tokenizer());
        assert!(!TokenModel::Claude.has_exact_tokenizer());
        assert!(!TokenModel::Mistral.has_exact_tokenizer());

        // Test vendor
        assert_eq!(TokenModel::Gpt52.vendor(), "OpenAI");
        assert_eq!(TokenModel::Claude.vendor(), "Anthropic");
        assert_eq!(TokenModel::Gemini.vendor(), "Google");
        assert_eq!(TokenModel::Llama.vendor(), "Meta");
        assert_eq!(TokenModel::Mistral.vendor(), "Mistral AI");
        assert_eq!(TokenModel::DeepSeek.vendor(), "DeepSeek");
        assert_eq!(TokenModel::Qwen.vendor(), "Alibaba");
        assert_eq!(TokenModel::Cohere.vendor(), "Cohere");
        assert_eq!(TokenModel::Grok.vendor(), "xAI");
    }

    #[test]
    fn test_all_models() {
        let all = TokenModel::all();
        assert_eq!(all.len(), 27); // 18 OpenAI (16 o200k_base + 2 cl100k_base) + 9 other vendors
        assert!(all.contains(&TokenModel::Gpt52));
        assert!(all.contains(&TokenModel::O3));
        assert!(all.contains(&TokenModel::Claude));
        assert!(all.contains(&TokenModel::Mistral));
    }

    #[test]
    fn test_tokenizer_caching() {
        let tokenizer = Tokenizer::new();
        let text = "This is a test string for caching verification.";

        // First call - computes and caches
        let count1 = tokenizer.count(text, TokenModel::Gpt4o);

        // Second call - should return cached value
        let count2 = tokenizer.count(text, TokenModel::Gpt4o);

        // Both should be equal
        assert_eq!(count1, count2);
        assert!(count1 > 0);

        // Different model should have different cache entry
        let count_claude = tokenizer.count(text, TokenModel::Claude);
        assert!(count_claude > 0);
    }

    #[test]
    fn test_tokenizer_without_cache() {
        let tokenizer = Tokenizer::without_cache();
        let text = "Test text for uncached counting.";

        // Should still work correctly, just without caching
        let count = tokenizer.count(text, TokenModel::Gpt4o);
        assert!(count > 0);
        assert!(count < 20);
    }

    // =========================================================================
    // Additional edge case tests for comprehensive coverage
    // =========================================================================

    #[test]
    fn test_all_models_return_nonzero_for_content() {
        let tokenizer = Tokenizer::new();
        let content = "fn main() { println!(\"Hello, world!\"); }";

        // Test every single model returns a non-zero count
        for model in TokenModel::all() {
            let count = tokenizer.count(content, *model);
            assert!(count > 0, "Model {:?} returned 0 tokens for non-empty content", model);
        }
    }

    #[test]
    fn test_unicode_content_handling() {
        let tokenizer = Tokenizer::new();

        // Test various Unicode content
        let unicode_samples = [
            "Hello, 世界! 🌍",         // Mixed ASCII, CJK, emoji
            "Привет мир",              // Cyrillic
            "مرحبا بالعالم",           // Arabic (RTL)
            "🦀🦀🦀 Rust 🦀🦀🦀",      // Emoji-heavy
            "const λ = (x) => x * 2;", // Greek letters in code
        ];

        for sample in unicode_samples {
            let count = tokenizer.count(sample, TokenModel::Gpt4o);
            assert!(count > 0, "Unicode sample '{}' returned 0 tokens", sample);

            // Verify truncation doesn't break UTF-8
            let truncated = tokenizer.truncate_to_budget(sample, TokenModel::Gpt4o, 3);
            // The cut must land on a char boundary of the original sample;
            // checking `truncated` against its own length is trivially true.
            assert!(sample.is_char_boundary(truncated.len()));
        }
    }

    #[test]
    fn test_very_long_content() {
        let tokenizer = Tokenizer::new();

        // Generate a few hundred KB of content (10,000 lines of ~38 bytes each)
        let long_content: String = (0..10000)
            .map(|i| format!("Line {}: some repeated content here\n", i))
            .collect();

        // Should handle large content without panicking
        let count = tokenizer.count(&long_content, TokenModel::Claude);
        assert!(count > 1000, "Long content should have many tokens");

        // Truncation should work efficiently
        let truncated = tokenizer.truncate_to_budget(&long_content, TokenModel::Claude, 100);
        let truncated_count = tokenizer.count(truncated, TokenModel::Claude);
        assert!(truncated_count <= 100, "Truncation should respect budget");
    }

    #[test]
    fn test_whitespace_only_content() {
        let tokenizer = Tokenizer::new();

        let whitespace_samples = [
            "   ",        // Spaces
            "\t\t\t",     // Tabs
            "\n\n\n",     // Newlines
            "  \t  \n  ", // Mixed
        ];

        for sample in whitespace_samples {
            // Should not panic and should return some count (even if small)
            let _count = tokenizer.count(sample, TokenModel::Gpt4o);
        }
    }

    #[test]
    fn test_special_characters_heavy_code() {
        let tokenizer = Tokenizer::new();

        // Code-heavy content with many special characters
        let code = r#"
            fn process<T: Clone + Debug>(items: &[T]) -> Result<Vec<T>, Error> {
                items.iter()
                    .filter(|x| x.is_valid())
                    .map(|x| x.clone())
                    .collect::<Result<Vec<_>, _>>()
            }
        "#;

        let count = tokenizer.count(code, TokenModel::CodeLlama);
        assert!(count > 10, "Code content should have meaningful token count");

        // Claude uses a different estimation ratio, so its count may differ
        // slightly from CodeLlama's, but both should land in a sensible range
        let claude_count = tokenizer.count(code, TokenModel::Claude);
        assert!(claude_count > 10);
    }

    #[test]
    fn test_model_get_consistency() {
        // Verify TokenCounts.get() returns correct values for all model families
        let counts = TokenCounts {
            o200k: 100,
            cl100k: 110,
            claude: 95,
            gemini: 105,
            llama: 98,
            mistral: 97,
            deepseek: 96,
            qwen: 99,
            cohere: 102,
            grok: 101,
        };

        // All o200k models should return the same count
        assert_eq!(counts.get(TokenModel::Gpt52), 100);
        assert_eq!(counts.get(TokenModel::Gpt4o), 100);
        assert_eq!(counts.get(TokenModel::O3), 100);

        // cl100k models
        assert_eq!(counts.get(TokenModel::Gpt4), 110);
        assert_eq!(counts.get(TokenModel::Gpt35Turbo), 110);

        // Individual vendors
        assert_eq!(counts.get(TokenModel::Claude), 95);
        assert_eq!(counts.get(TokenModel::Gemini), 105);
        assert_eq!(counts.get(TokenModel::Llama), 98);
        assert_eq!(counts.get(TokenModel::CodeLlama), 98); // Same as Llama
        assert_eq!(counts.get(TokenModel::Mistral), 97);
        assert_eq!(counts.get(TokenModel::DeepSeek), 96);
        assert_eq!(counts.get(TokenModel::Qwen), 99);
        assert_eq!(counts.get(TokenModel::Cohere), 102);
        assert_eq!(counts.get(TokenModel::Grok), 101);
    }

    #[test]
    fn test_budget_exactly_met() {
        let tokenizer = Tokenizer::new();
        let text = "Hello world!";
        let exact_budget = tokenizer.count(text, TokenModel::Gpt4o);

        // Content that exactly meets budget should not be truncated
        let truncated = tokenizer.truncate_to_budget(text, TokenModel::Gpt4o, exact_budget);
        assert_eq!(truncated, text);
    }

    #[test]
    fn test_exceeds_budget_check() {
        let tokenizer = Tokenizer::new();
        let text = "A fairly long string that should have a decent number of tokens.";

        assert!(tokenizer.exceeds_budget(text, TokenModel::Claude, 1));
        assert!(!tokenizer.exceeds_budget(text, TokenModel::Claude, 1000));
        assert!(!tokenizer.exceeds_budget("", TokenModel::Claude, 0));
    }
}