1mod core;
23mod counts;
24mod models;
25
26pub use core::{quick_estimate, Tokenizer};
27pub use counts::TokenCounts;
28pub use models::TokenModel;
29
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_exact_gpt4o_counting() {
        // A short greeting should tokenize to a handful of tokens.
        let tok = Tokenizer::new();
        let n = tok.count("Hello, world!", TokenModel::Gpt4o);

        assert!(n > 0);
        assert!(n < 10);
    }

    #[test]
    fn test_exact_gpt5_counting() {
        // Every GPT-5-family model shares the o200k vocabulary with GPT-4o,
        // so all four counts must be identical.
        let tok = Tokenizer::new();
        let snippet = "def hello():\n print('Hello, World!')\n";

        let c_52 = tok.count(snippet, TokenModel::Gpt52);
        let c_51 = tok.count(snippet, TokenModel::Gpt51);
        let c_5 = tok.count(snippet, TokenModel::Gpt5);
        let c_4o = tok.count(snippet, TokenModel::Gpt4o);

        assert_eq!(c_52, c_51);
        assert_eq!(c_51, c_5);
        assert_eq!(c_5, c_4o);
        assert!(c_52 > 5);
        assert!(c_52 < 30);
    }

    #[test]
    fn test_exact_o_series_counting() {
        // The o-series reasoning models also use o200k, so their counts
        // must line up with GPT-4o.
        let tok = Tokenizer::new();
        let prompt = "Solve this math problem: 2 + 2 = ?";

        let c_o4 = tok.count(prompt, TokenModel::O4Mini);
        let c_o3 = tok.count(prompt, TokenModel::O3);
        let c_o1 = tok.count(prompt, TokenModel::O1);
        let c_4o = tok.count(prompt, TokenModel::Gpt4o);

        assert_eq!(c_o4, c_o3);
        assert_eq!(c_o3, c_o1);
        assert_eq!(c_o1, c_4o);
    }

    #[test]
    fn test_exact_gpt4_counting() {
        // GPT-4 uses the older cl100k vocabulary; just sanity-check the range.
        let tok = Tokenizer::new();
        let snippet = "def hello():\n print('Hello, World!')\n";
        let n = tok.count(snippet, TokenModel::Gpt4);

        assert!(n > 5);
        assert!(n < 30);
    }

    #[test]
    fn test_estimation_claude() {
        // Claude has no public tokenizer; the estimate should still land in
        // a plausible range for a short English sentence.
        let tok = Tokenizer::new();
        let sentence = "This is a test string for token estimation.";
        let n = tok.count(sentence, TokenModel::Claude);

        assert!(n > 5);
        assert!(n < 30);
    }

    #[test]
    fn test_estimation_new_vendors() {
        // Each estimated vendor should produce a plausible, nonzero count.
        let tok = Tokenizer::new();
        let sentence = "This is a test string for new vendor token estimation.";

        let mistral = tok.count(sentence, TokenModel::Mistral);
        let deepseek = tok.count(sentence, TokenModel::DeepSeek);
        let qwen = tok.count(sentence, TokenModel::Qwen);
        let cohere = tok.count(sentence, TokenModel::Cohere);
        let grok = tok.count(sentence, TokenModel::Grok);

        assert!(mistral > 5 && mistral < 50);
        assert!(deepseek > 5 && deepseek < 50);
        assert!(qwen > 5 && qwen < 50);
        assert!(cohere > 5 && cohere < 50);
        assert!(grok > 5 && grok < 50);
    }

    #[test]
    fn test_count_all() {
        // count_all populates every per-vocabulary field with a nonzero count.
        let tok = Tokenizer::new();
        let js = "function hello() { console.log('hello'); }";
        let all = tok.count_all(js);

        assert!(all.o200k > 0);
        assert!(all.cl100k > 0);
        assert!(all.claude > 0);
        assert!(all.gemini > 0);
        assert!(all.llama > 0);
        assert!(all.mistral > 0);
        assert!(all.deepseek > 0);
        assert!(all.qwen > 0);
        assert!(all.cohere > 0);
        assert!(all.grok > 0);
    }

    #[test]
    fn test_empty_string() {
        // Empty input is zero tokens for exact and estimated models alike.
        let tok = Tokenizer::new();
        assert_eq!(tok.count("", TokenModel::Claude), 0);
        assert_eq!(tok.count("", TokenModel::Gpt4o), 0);
        assert_eq!(tok.count("", TokenModel::Gpt52), 0);
        assert_eq!(tok.count("", TokenModel::O3), 0);
    }

    #[test]
    fn test_truncate_to_budget() {
        // Truncation must both shrink the text and respect the token budget.
        let tok = Tokenizer::new();
        let long = "This is a fairly long string that we want to truncate to fit within a smaller token budget for testing purposes.";

        let cut = tok.truncate_to_budget(long, TokenModel::Gpt4, 10);
        let n = tok.count(cut, TokenModel::Gpt4);

        assert!(n <= 10);
        assert!(cut.len() < long.len());
    }

    #[test]
    fn test_quick_estimate() {
        // The free-function shortcut behaves like a small positive estimate.
        let n = quick_estimate("Hello world", TokenModel::Claude);
        assert!(n > 0);
        assert!(n < 10);
    }

    #[test]
    fn test_token_counts_add() {
        // TokenCounts implements Add field-by-field.
        let lhs = TokenCounts {
            o200k: 8,
            cl100k: 9,
            claude: 10,
            gemini: 8,
            llama: 10,
            mistral: 10,
            deepseek: 10,
            qwen: 10,
            cohere: 10,
            grok: 10,
        };
        let rhs = TokenCounts {
            o200k: 4,
            cl100k: 5,
            claude: 5,
            gemini: 4,
            llama: 5,
            mistral: 5,
            deepseek: 5,
            qwen: 5,
            cohere: 5,
            grok: 5,
        };
        let total = lhs + rhs;

        assert_eq!(total.o200k, 12);
        assert_eq!(total.cl100k, 14);
        assert_eq!(total.claude, 15);
    }

    #[test]
    fn test_token_counts_min_max() {
        // min()/max() scan across all vocabulary fields.
        let c = TokenCounts {
            o200k: 100,
            cl100k: 110,
            claude: 95,
            gemini: 105,
            llama: 98,
            mistral: 97,
            deepseek: 96,
            qwen: 99,
            cohere: 102,
            grok: 101,
        };

        assert_eq!(c.min(), 95);
        assert_eq!(c.max(), 110);
    }

    #[test]
    fn test_most_efficient_model() {
        // Whichever model wins, its count is nonzero for nonempty input.
        let tok = Tokenizer::new();
        let (_winner, n) = tok.most_efficient_model("const x = 42;");

        assert!(n > 0);
    }

    #[test]
    fn test_from_model_name_openai() {
        // GPT-5.2 family, including case-insensitivity and dated snapshots.
        assert_eq!(TokenModel::from_model_name("gpt-5.2"), Some(TokenModel::Gpt52));
        assert_eq!(TokenModel::from_model_name("GPT-5.2"), Some(TokenModel::Gpt52));
        assert_eq!(TokenModel::from_model_name("gpt-5.2-pro"), Some(TokenModel::Gpt52Pro));
        assert_eq!(TokenModel::from_model_name("gpt-5.2-2025-12-11"), Some(TokenModel::Gpt52));

        // GPT-5.1 family.
        assert_eq!(TokenModel::from_model_name("gpt-5.1"), Some(TokenModel::Gpt51));
        assert_eq!(TokenModel::from_model_name("gpt-5.1-mini"), Some(TokenModel::Gpt51Mini));
        assert_eq!(TokenModel::from_model_name("gpt-5.1-codex"), Some(TokenModel::Gpt51Codex));

        // GPT-5 family.
        assert_eq!(TokenModel::from_model_name("gpt-5"), Some(TokenModel::Gpt5));
        assert_eq!(TokenModel::from_model_name("gpt-5-mini"), Some(TokenModel::Gpt5Mini));
        assert_eq!(TokenModel::from_model_name("gpt-5-nano"), Some(TokenModel::Gpt5Nano));

        // o-series reasoning models.
        assert_eq!(TokenModel::from_model_name("o4-mini"), Some(TokenModel::O4Mini));
        assert_eq!(TokenModel::from_model_name("o3"), Some(TokenModel::O3));
        assert_eq!(TokenModel::from_model_name("o3-mini"), Some(TokenModel::O3Mini));
        assert_eq!(TokenModel::from_model_name("o1"), Some(TokenModel::O1));
        assert_eq!(TokenModel::from_model_name("o1-mini"), Some(TokenModel::O1Mini));
        assert_eq!(TokenModel::from_model_name("o1-preview"), Some(TokenModel::O1Preview));

        // GPT-4o family.
        assert_eq!(TokenModel::from_model_name("gpt-4o"), Some(TokenModel::Gpt4o));
        assert_eq!(TokenModel::from_model_name("gpt-4o-mini"), Some(TokenModel::Gpt4oMini));

        // Legacy models.
        assert_eq!(TokenModel::from_model_name("gpt-4"), Some(TokenModel::Gpt4));
        assert_eq!(TokenModel::from_model_name("gpt-3.5-turbo"), Some(TokenModel::Gpt35Turbo));
    }

    #[test]
    fn test_from_model_name_other_vendors() {
        // Anthropic: any claude-* string maps to the single Claude estimator.
        assert_eq!(TokenModel::from_model_name("claude"), Some(TokenModel::Claude));
        assert_eq!(TokenModel::from_model_name("claude-sonnet"), Some(TokenModel::Claude));
        assert_eq!(TokenModel::from_model_name("claude-opus-4.5"), Some(TokenModel::Claude));

        // Google.
        assert_eq!(TokenModel::from_model_name("gemini"), Some(TokenModel::Gemini));
        assert_eq!(TokenModel::from_model_name("gemini-2.5-pro"), Some(TokenModel::Gemini));

        // Meta.
        assert_eq!(TokenModel::from_model_name("llama-4"), Some(TokenModel::Llama));
        assert_eq!(TokenModel::from_model_name("codellama"), Some(TokenModel::CodeLlama));

        // Mistral AI (codestral is a Mistral model).
        assert_eq!(TokenModel::from_model_name("mistral"), Some(TokenModel::Mistral));
        assert_eq!(TokenModel::from_model_name("codestral"), Some(TokenModel::Mistral));

        // DeepSeek.
        assert_eq!(TokenModel::from_model_name("deepseek"), Some(TokenModel::DeepSeek));
        assert_eq!(TokenModel::from_model_name("deepseek-r1"), Some(TokenModel::DeepSeek));

        // Alibaba.
        assert_eq!(TokenModel::from_model_name("qwen3"), Some(TokenModel::Qwen));

        // Cohere.
        assert_eq!(TokenModel::from_model_name("cohere"), Some(TokenModel::Cohere));
        assert_eq!(TokenModel::from_model_name("command-r+"), Some(TokenModel::Cohere));

        // xAI.
        assert_eq!(TokenModel::from_model_name("grok-3"), Some(TokenModel::Grok));
    }

    #[test]
    fn test_from_model_name_unknown() {
        // Unrecognized names (and empty input) yield None, not a default.
        assert_eq!(TokenModel::from_model_name("unknown-model"), None);
        assert_eq!(TokenModel::from_model_name(""), None);
        assert_eq!(TokenModel::from_model_name("random"), None);
    }

    #[test]
    fn test_model_properties() {
        // Vocabulary membership: o200k for modern OpenAI models only.
        assert!(TokenModel::Gpt52.uses_o200k());
        assert!(TokenModel::O3.uses_o200k());
        assert!(TokenModel::Gpt4o.uses_o200k());
        assert!(!TokenModel::Gpt4.uses_o200k());
        assert!(!TokenModel::Claude.uses_o200k());

        // cl100k for the legacy OpenAI models only.
        assert!(TokenModel::Gpt4.uses_cl100k());
        assert!(TokenModel::Gpt35Turbo.uses_cl100k());
        assert!(!TokenModel::Gpt52.uses_cl100k());
        assert!(!TokenModel::Claude.uses_cl100k());

        // Exact tokenizers exist only for OpenAI vocabularies.
        assert!(TokenModel::Gpt52.has_exact_tokenizer());
        assert!(TokenModel::Gpt4.has_exact_tokenizer());
        assert!(!TokenModel::Claude.has_exact_tokenizer());
        assert!(!TokenModel::Mistral.has_exact_tokenizer());

        // Vendor display names.
        assert_eq!(TokenModel::Gpt52.vendor(), "OpenAI");
        assert_eq!(TokenModel::Claude.vendor(), "Anthropic");
        assert_eq!(TokenModel::Gemini.vendor(), "Google");
        assert_eq!(TokenModel::Llama.vendor(), "Meta");
        assert_eq!(TokenModel::Mistral.vendor(), "Mistral AI");
        assert_eq!(TokenModel::DeepSeek.vendor(), "DeepSeek");
        assert_eq!(TokenModel::Qwen.vendor(), "Alibaba");
        assert_eq!(TokenModel::Cohere.vendor(), "Cohere");
        assert_eq!(TokenModel::Grok.vendor(), "xAI");
    }

    #[test]
    fn test_all_models() {
        // all() enumerates the full model roster.
        let roster = TokenModel::all();
        assert_eq!(roster.len(), 27);
        assert!(roster.contains(&TokenModel::Gpt52));
        assert!(roster.contains(&TokenModel::O3));
        assert!(roster.contains(&TokenModel::Claude));
        assert!(roster.contains(&TokenModel::Mistral));
    }

    #[test]
    fn test_tokenizer_caching() {
        // Repeated counts of the same input agree, and a different model
        // (estimated, so uncached path) still works afterwards.
        let tok = Tokenizer::new();
        let input = "This is a test string for caching verification.";

        let first = tok.count(input, TokenModel::Gpt4o);
        let second = tok.count(input, TokenModel::Gpt4o);

        assert_eq!(first, second);
        assert!(first > 0);

        let claude = tok.count(input, TokenModel::Claude);
        assert!(claude > 0);
    }

    #[test]
    fn test_tokenizer_without_cache() {
        // The cache-free constructor produces the same kind of sane counts.
        let tok = Tokenizer::without_cache();
        let n = tok.count("Test text for uncached counting.", TokenModel::Gpt4o);

        assert!(n > 0);
        assert!(n < 20);
    }

    #[test]
    fn test_all_models_return_nonzero_for_content() {
        // No model — exact or estimated — may report zero for real content.
        let tok = Tokenizer::new();
        let content = "fn main() { println!(\"Hello, world!\"); }";

        for model in TokenModel::all() {
            let n = tok.count(content, *model);
            assert!(n > 0, "Model {:?} returned 0 tokens for non-empty content", model);
        }
    }

    #[test]
    fn test_unicode_content_handling() {
        // Multi-byte scripts and emoji must count as nonzero, and truncation
        // must never split the string inside a UTF-8 character.
        let tok = Tokenizer::new();

        let samples = [
            "Hello, 世界! 🌍",
            "Привет мир",
            "مرحبا بالعالم",
            "🦀🦀🦀 Rust 🦀🦀🦀",
            "const λ = (x) => x * 2;",
        ];

        for s in samples {
            let n = tok.count(s, TokenModel::Gpt4o);
            assert!(n > 0, "Unicode sample '{}' returned 0 tokens", s);

            let cut = tok.truncate_to_budget(s, TokenModel::Gpt4o, 3);
            // The end of the truncated slice must land on a char boundary.
            assert!(cut.is_char_boundary(cut.len()));
        }
    }

    #[test]
    fn test_very_long_content() {
        // A ~10k-line document: large count, and truncation stays in budget.
        let tok = Tokenizer::new();

        let big: String = (0..10000)
            .map(|i| format!("Line {}: some repeated content here\n", i))
            .collect();

        let n = tok.count(&big, TokenModel::Claude);
        assert!(n > 1000, "Long content should have many tokens");

        let cut = tok.truncate_to_budget(&big, TokenModel::Claude, 100);
        let cut_n = tok.count(cut, TokenModel::Claude);
        assert!(cut_n <= 100, "Truncation should respect budget");
    }

    #[test]
    fn test_whitespace_only_content() {
        // Whitespace-only inputs must not panic; the count itself is
        // tokenizer-dependent, so we only exercise the call.
        let tok = Tokenizer::new();

        let samples = [
            " ",
            "\t\t\t",
            "\n\n\n",
            " \t \n ",
        ];

        for s in samples {
            let _ = tok.count(s, TokenModel::Gpt4o);
        }
    }

    #[test]
    fn test_special_characters_heavy_code() {
        // Generic-heavy Rust is full of punctuation tokens; both the exact
        // CodeLlama path and the Claude estimator should handle it.
        let tok = Tokenizer::new();

        let code = r#"
fn process<T: Clone + Debug>(items: &[T]) -> Result<Vec<T>, Error> {
    items.iter()
        .filter(|x| x.is_valid())
        .map(|x| x.clone())
        .collect::<Result<Vec<_>, _>>()
}
"#;

        let n = tok.count(code, TokenModel::CodeLlama);
        assert!(n > 10, "Code content should have meaningful token count");

        let claude_n = tok.count(code, TokenModel::Claude);
        assert!(claude_n > 10);
    }

    #[test]
    fn test_model_get_consistency() {
        // get() must route each model to the field its vocabulary uses.
        let c = TokenCounts {
            o200k: 100,
            cl100k: 110,
            claude: 95,
            gemini: 105,
            llama: 98,
            mistral: 97,
            deepseek: 96,
            qwen: 99,
            cohere: 102,
            grok: 101,
        };

        // All o200k models share one field.
        assert_eq!(c.get(TokenModel::Gpt52), 100);
        assert_eq!(c.get(TokenModel::Gpt4o), 100);
        assert_eq!(c.get(TokenModel::O3), 100);

        // All cl100k models share another.
        assert_eq!(c.get(TokenModel::Gpt4), 110);
        assert_eq!(c.get(TokenModel::Gpt35Turbo), 110);

        // Estimated vendors each read their own field (CodeLlama shares llama).
        assert_eq!(c.get(TokenModel::Claude), 95);
        assert_eq!(c.get(TokenModel::Gemini), 105);
        assert_eq!(c.get(TokenModel::Llama), 98);
        assert_eq!(c.get(TokenModel::CodeLlama), 98);
        assert_eq!(c.get(TokenModel::Mistral), 97);
        assert_eq!(c.get(TokenModel::DeepSeek), 96);
        assert_eq!(c.get(TokenModel::Qwen), 99);
        assert_eq!(c.get(TokenModel::Cohere), 102);
        assert_eq!(c.get(TokenModel::Grok), 101);
    }

    #[test]
    fn test_budget_exactly_met() {
        // When the budget equals the exact count, nothing is trimmed.
        let tok = Tokenizer::new();
        let input = "Hello world!";
        let budget = tok.count(input, TokenModel::Gpt4o);

        let cut = tok.truncate_to_budget(input, TokenModel::Gpt4o, budget);
        assert_eq!(cut, input);
    }

    #[test]
    fn test_exceeds_budget_check() {
        // exceeds_budget is a strict over-budget test; empty text never exceeds.
        let tok = Tokenizer::new();
        let input = "A fairly long string that should have a decent number of tokens.";

        assert!(tok.exceeds_budget(input, TokenModel::Claude, 1));
        assert!(!tok.exceeds_budget(input, TokenModel::Claude, 1000));
        assert!(!tok.exceeds_budget("", TokenModel::Claude, 0));
    }
}