1mod core;
127mod counts;
128mod models;
129
130pub use core::{quick_estimate, Tokenizer};
131pub use counts::TokenCounts;
132pub use models::TokenModel;
133
#[cfg(test)]
mod tests {
    use super::*;

    /// A short greeting should tokenize to a small, non-zero count under
    /// the exact GPT-4o tokenizer.
    #[test]
    fn test_exact_gpt4o_counting() {
        let tokenizer = Tokenizer::new();
        let text = "Hello, world!";
        let count = tokenizer.count(text, TokenModel::Gpt4o);

        assert!(count > 0);
        assert!(count < 10);
    }

    /// GPT-5.x models share the same exact tokenizer as GPT-4o, so all
    /// four counts must be identical for the same input.
    #[test]
    fn test_exact_gpt5_counting() {
        let tokenizer = Tokenizer::new();
        let text = "def hello():\n print('Hello, World!')\n";

        let count_52 = tokenizer.count(text, TokenModel::Gpt52);
        let count_51 = tokenizer.count(text, TokenModel::Gpt51);
        let count_5 = tokenizer.count(text, TokenModel::Gpt5);
        let count_4o = tokenizer.count(text, TokenModel::Gpt4o);

        assert_eq!(count_52, count_51);
        assert_eq!(count_51, count_5);
        assert_eq!(count_5, count_4o);
        assert!(count_52 > 5);
        assert!(count_52 < 30);
    }

    /// The o-series reasoning models also share GPT-4o's exact tokenizer;
    /// their counts must agree with each other and with GPT-4o.
    #[test]
    fn test_exact_o_series_counting() {
        let tokenizer = Tokenizer::new();
        let text = "Solve this math problem: 2 + 2 = ?";

        let count_o4 = tokenizer.count(text, TokenModel::O4Mini);
        let count_o3 = tokenizer.count(text, TokenModel::O3);
        let count_o1 = tokenizer.count(text, TokenModel::O1);
        let count_4o = tokenizer.count(text, TokenModel::Gpt4o);

        assert_eq!(count_o4, count_o3);
        assert_eq!(count_o3, count_o1);
        assert_eq!(count_o1, count_4o);
    }

    /// GPT-4 uses its own exact tokenizer; sanity-check the count range
    /// for a small code snippet.
    #[test]
    fn test_exact_gpt4_counting() {
        let tokenizer = Tokenizer::new();
        let text = "def hello():\n print('Hello, World!')\n";
        let count = tokenizer.count(text, TokenModel::Gpt4);

        assert!(count > 5);
        assert!(count < 30);
    }

    /// Claude has no exact tokenizer here; estimation should still land in
    /// a plausible range for a short English sentence.
    #[test]
    fn test_estimation_claude() {
        let tokenizer = Tokenizer::new();
        let text = "This is a test string for token estimation.";
        let count = tokenizer.count(text, TokenModel::Claude);

        assert!(count > 5);
        assert!(count < 30);
    }

    /// Estimation for the newer vendor models should produce counts in a
    /// plausible range. Data-driven so a failure names the model.
    #[test]
    fn test_estimation_new_vendors() {
        let tokenizer = Tokenizer::new();
        let text = "This is a test string for new vendor token estimation.";

        let models = [
            TokenModel::Mistral,
            TokenModel::DeepSeek,
            TokenModel::Qwen,
            TokenModel::Cohere,
            TokenModel::Grok,
        ];
        for model in models {
            let count = tokenizer.count(text, model);
            assert!(
                count > 5 && count < 50,
                "estimate out of range for {:?}: {}",
                model,
                count
            );
        }
    }

    /// `count_all` must populate every per-vocabulary field with a
    /// non-zero count for non-empty content.
    #[test]
    fn test_count_all() {
        let tokenizer = Tokenizer::new();
        let text = "function hello() { console.log('hello'); }";
        let counts = tokenizer.count_all(text);

        assert!(counts.o200k > 0);
        assert!(counts.cl100k > 0);
        assert!(counts.claude > 0);
        assert!(counts.gemini > 0);
        assert!(counts.llama > 0);
        assert!(counts.mistral > 0);
        assert!(counts.deepseek > 0);
        assert!(counts.qwen > 0);
        assert!(counts.cohere > 0);
        assert!(counts.grok > 0);
    }

    /// The empty string is zero tokens for both exact and estimated models.
    #[test]
    fn test_empty_string() {
        let tokenizer = Tokenizer::new();
        let models = [
            TokenModel::Claude,
            TokenModel::Gpt4o,
            TokenModel::Gpt52,
            TokenModel::O3,
        ];
        for model in models {
            assert_eq!(
                tokenizer.count("", model),
                0,
                "empty string should be 0 tokens for {:?}",
                model
            );
        }
    }

    /// Truncation must respect the token budget and actually shorten text
    /// that exceeds it.
    #[test]
    fn test_truncate_to_budget() {
        let tokenizer = Tokenizer::new();
        let text = "This is a fairly long string that we want to truncate to fit within a smaller token budget for testing purposes.";

        let truncated = tokenizer.truncate_to_budget(text, TokenModel::Gpt4, 10);
        let count = tokenizer.count(truncated, TokenModel::Gpt4);

        assert!(count <= 10);
        assert!(truncated.len() < text.len());
    }

    /// The free-function convenience wrapper should agree with a sensible
    /// range for a tiny input.
    #[test]
    fn test_quick_estimate() {
        let count = quick_estimate("Hello world", TokenModel::Claude);
        assert!(count > 0);
        assert!(count < 10);
    }

    /// `TokenCounts` addition is field-wise.
    #[test]
    fn test_token_counts_add() {
        let a = TokenCounts {
            o200k: 8,
            cl100k: 9,
            claude: 10,
            gemini: 8,
            llama: 10,
            mistral: 10,
            deepseek: 10,
            qwen: 10,
            cohere: 10,
            grok: 10,
        };
        let b = TokenCounts {
            o200k: 4,
            cl100k: 5,
            claude: 5,
            gemini: 4,
            llama: 5,
            mistral: 5,
            deepseek: 5,
            qwen: 5,
            cohere: 5,
            grok: 5,
        };
        let sum = a + b;

        assert_eq!(sum.o200k, 12);
        assert_eq!(sum.cl100k, 14);
        assert_eq!(sum.claude, 15);
    }

    /// `min`/`max` scan across all per-vocabulary fields.
    #[test]
    fn test_token_counts_min_max() {
        let counts = TokenCounts {
            o200k: 100,
            cl100k: 110,
            claude: 95,
            gemini: 105,
            llama: 98,
            mistral: 97,
            deepseek: 96,
            qwen: 99,
            cohere: 102,
            grok: 101,
        };

        assert_eq!(counts.min(), 95);
        assert_eq!(counts.max(), 110);
    }

    /// The most efficient model for non-empty content must report a
    /// positive token count.
    #[test]
    fn test_most_efficient_model() {
        let tokenizer = Tokenizer::new();
        let text = "const x = 42;";
        let (_model, count) = tokenizer.most_efficient_model(text);

        assert!(count > 0);
    }

    /// Model-name resolution for OpenAI families: case-insensitive, with
    /// dated/suffixed variants mapping to the base model.
    #[test]
    fn test_from_model_name_openai() {
        let cases = [
            ("gpt-5.2", TokenModel::Gpt52),
            ("GPT-5.2", TokenModel::Gpt52),
            ("gpt-5.2-pro", TokenModel::Gpt52Pro),
            ("gpt-5.2-2025-12-11", TokenModel::Gpt52),
            ("gpt-5.1", TokenModel::Gpt51),
            ("gpt-5.1-mini", TokenModel::Gpt51Mini),
            ("gpt-5.1-codex", TokenModel::Gpt51Codex),
            ("gpt-5", TokenModel::Gpt5),
            ("gpt-5-mini", TokenModel::Gpt5Mini),
            ("gpt-5-nano", TokenModel::Gpt5Nano),
            ("o4-mini", TokenModel::O4Mini),
            ("o3", TokenModel::O3),
            ("o3-mini", TokenModel::O3Mini),
            ("o1", TokenModel::O1),
            ("o1-mini", TokenModel::O1Mini),
            ("o1-preview", TokenModel::O1Preview),
            ("gpt-4o", TokenModel::Gpt4o),
            ("gpt-4o-mini", TokenModel::Gpt4oMini),
            ("gpt-4", TokenModel::Gpt4),
            ("gpt-3.5-turbo", TokenModel::Gpt35Turbo),
        ];
        for (name, expected) in cases {
            assert_eq!(
                TokenModel::from_model_name(name),
                Some(expected),
                "wrong mapping for {}",
                name
            );
        }
    }

    /// Model-name resolution for non-OpenAI vendors: family prefixes and
    /// versioned variants collapse onto one model per vendor.
    #[test]
    fn test_from_model_name_other_vendors() {
        let cases = [
            ("claude", TokenModel::Claude),
            ("claude-sonnet", TokenModel::Claude),
            ("claude-opus-4.5", TokenModel::Claude),
            ("gemini", TokenModel::Gemini),
            ("gemini-2.5-pro", TokenModel::Gemini),
            ("llama-4", TokenModel::Llama),
            ("codellama", TokenModel::CodeLlama),
            ("mistral", TokenModel::Mistral),
            ("codestral", TokenModel::Mistral),
            ("deepseek", TokenModel::DeepSeek),
            ("deepseek-r1", TokenModel::DeepSeek),
            ("qwen3", TokenModel::Qwen),
            ("cohere", TokenModel::Cohere),
            ("command-r+", TokenModel::Cohere),
            ("grok-3", TokenModel::Grok),
        ];
        for (name, expected) in cases {
            assert_eq!(
                TokenModel::from_model_name(name),
                Some(expected),
                "wrong mapping for {}",
                name
            );
        }
    }

    /// Unrecognized names (including the empty string) resolve to `None`.
    #[test]
    fn test_from_model_name_unknown() {
        assert_eq!(TokenModel::from_model_name("unknown-model"), None);
        assert_eq!(TokenModel::from_model_name(""), None);
        assert_eq!(TokenModel::from_model_name("random"), None);
    }

    /// Vocabulary membership, exact-tokenizer availability, and vendor
    /// attribution for representative models.
    #[test]
    fn test_model_properties() {
        // o200k family: GPT-5.x, o-series, GPT-4o — but not GPT-4 or Claude.
        assert!(TokenModel::Gpt52.uses_o200k());
        assert!(TokenModel::O3.uses_o200k());
        assert!(TokenModel::Gpt4o.uses_o200k());
        assert!(!TokenModel::Gpt4.uses_o200k());
        assert!(!TokenModel::Claude.uses_o200k());

        // cl100k family: GPT-4 and GPT-3.5 only.
        assert!(TokenModel::Gpt4.uses_cl100k());
        assert!(TokenModel::Gpt35Turbo.uses_cl100k());
        assert!(!TokenModel::Gpt52.uses_cl100k());
        assert!(!TokenModel::Claude.uses_cl100k());

        // Only the OpenAI vocabularies have exact tokenizers here.
        assert!(TokenModel::Gpt52.has_exact_tokenizer());
        assert!(TokenModel::Gpt4.has_exact_tokenizer());
        assert!(!TokenModel::Claude.has_exact_tokenizer());
        assert!(!TokenModel::Mistral.has_exact_tokenizer());

        assert_eq!(TokenModel::Gpt52.vendor(), "OpenAI");
        assert_eq!(TokenModel::Claude.vendor(), "Anthropic");
        assert_eq!(TokenModel::Gemini.vendor(), "Google");
        assert_eq!(TokenModel::Llama.vendor(), "Meta");
        assert_eq!(TokenModel::Mistral.vendor(), "Mistral AI");
        assert_eq!(TokenModel::DeepSeek.vendor(), "DeepSeek");
        assert_eq!(TokenModel::Qwen.vendor(), "Alibaba");
        assert_eq!(TokenModel::Cohere.vendor(), "Cohere");
        assert_eq!(TokenModel::Grok.vendor(), "xAI");
    }

    /// `all()` enumerates every supported model exactly once.
    #[test]
    fn test_all_models() {
        let all = TokenModel::all();
        assert_eq!(all.len(), 27);
        assert!(all.contains(&TokenModel::Gpt52));
        assert!(all.contains(&TokenModel::O3));
        assert!(all.contains(&TokenModel::Claude));
        assert!(all.contains(&TokenModel::Mistral));
    }

    /// Repeated counts through the caching tokenizer must be stable, and
    /// mixing models must not poison the cache.
    #[test]
    fn test_tokenizer_caching() {
        let tokenizer = Tokenizer::new();
        let text = "This is a test string for caching verification.";

        let count1 = tokenizer.count(text, TokenModel::Gpt4o);
        let count2 = tokenizer.count(text, TokenModel::Gpt4o);

        assert_eq!(count1, count2);
        assert!(count1 > 0);

        let count_claude = tokenizer.count(text, TokenModel::Claude);
        assert!(count_claude > 0);
    }

    /// The uncached tokenizer produces sane counts too.
    #[test]
    fn test_tokenizer_without_cache() {
        let tokenizer = Tokenizer::without_cache();
        let text = "Test text for uncached counting.";

        let count = tokenizer.count(text, TokenModel::Gpt4o);
        assert!(count > 0);
        assert!(count < 20);
    }

    /// Every supported model — exact or estimated — must return a
    /// non-zero count for non-empty content.
    #[test]
    fn test_all_models_return_nonzero_for_content() {
        let tokenizer = Tokenizer::new();
        let content = "fn main() { println!(\"Hello, world!\"); }";

        for model in TokenModel::all() {
            let count = tokenizer.count(content, *model);
            assert!(
                count > 0,
                "Model {:?} returned 0 tokens for non-empty content",
                model
            );
        }
    }

    /// Multi-byte content must count correctly and truncation must never
    /// split a UTF-8 character.
    #[test]
    fn test_unicode_content_handling() {
        let tokenizer = Tokenizer::new();

        let unicode_samples = [
            "Hello, 世界! 🌍",
            "Привет мир",
            "مرحبا بالعالم",
            "🦀🦀🦀 Rust 🦀🦀🦀",
            "const λ = (x) => x * 2;",
        ];

        for sample in unicode_samples {
            let count = tokenizer.count(sample, TokenModel::Gpt4o);
            assert!(count > 0, "Unicode sample '{}' returned 0 tokens", sample);

            // A valid &str always ends on a char boundary; this asserts the
            // truncation produced a well-formed string rather than panicking
            // mid-character.
            let truncated = tokenizer.truncate_to_budget(sample, TokenModel::Gpt4o, 3);
            assert!(truncated.is_char_boundary(truncated.len()));
        }
    }

    /// Large inputs: counting should scale and truncation should still
    /// respect the budget.
    #[test]
    fn test_very_long_content() {
        let tokenizer = Tokenizer::new();

        let long_content: String = (0..10000)
            .map(|i| format!("Line {}: some repeated content here\n", i))
            .collect();

        let count = tokenizer.count(&long_content, TokenModel::Claude);
        assert!(count > 1000, "Long content should have many tokens");

        let truncated = tokenizer.truncate_to_budget(&long_content, TokenModel::Claude, 100);
        let truncated_count = tokenizer.count(truncated, TokenModel::Claude);
        assert!(truncated_count <= 100, "Truncation should respect budget");
    }

    /// Whitespace-only input must not panic, whatever count it yields.
    #[test]
    fn test_whitespace_only_content() {
        let tokenizer = Tokenizer::new();

        let whitespace_samples = [" ", "\t\t\t", "\n\n\n", " \t \n "];

        for sample in whitespace_samples {
            // No assertion on the value — the contract under test is only
            // "does not panic" for degenerate whitespace input.
            let _count = tokenizer.count(sample, TokenModel::Gpt4o);
        }
    }

    /// Symbol-dense generic Rust code should still yield meaningful counts
    /// from both exact and estimation paths.
    #[test]
    fn test_special_characters_heavy_code() {
        let tokenizer = Tokenizer::new();

        let code = r#"
fn process<T: Clone + Debug>(items: &[T]) -> Result<Vec<T>, Error> {
    items.iter()
        .filter(|x| x.is_valid())
        .map(|x| x.clone())
        .collect::<Result<Vec<_>, _>>()
}
"#;

        let count = tokenizer.count(code, TokenModel::CodeLlama);
        assert!(count > 10, "Code content should have meaningful token count");

        let claude_count = tokenizer.count(code, TokenModel::Claude);
        assert!(claude_count > 10);
    }

    /// `TokenCounts::get` must route every model to the field that backs it
    /// (e.g. all o200k models read `o200k`; CodeLlama shares `llama`).
    #[test]
    fn test_model_get_consistency() {
        let counts = TokenCounts {
            o200k: 100,
            cl100k: 110,
            claude: 95,
            gemini: 105,
            llama: 98,
            mistral: 97,
            deepseek: 96,
            qwen: 99,
            cohere: 102,
            grok: 101,
        };

        assert_eq!(counts.get(TokenModel::Gpt52), 100);
        assert_eq!(counts.get(TokenModel::Gpt4o), 100);
        assert_eq!(counts.get(TokenModel::O3), 100);

        assert_eq!(counts.get(TokenModel::Gpt4), 110);
        assert_eq!(counts.get(TokenModel::Gpt35Turbo), 110);

        assert_eq!(counts.get(TokenModel::Claude), 95);
        assert_eq!(counts.get(TokenModel::Gemini), 105);
        assert_eq!(counts.get(TokenModel::Llama), 98);
        assert_eq!(counts.get(TokenModel::CodeLlama), 98);
        assert_eq!(counts.get(TokenModel::Mistral), 97);
        assert_eq!(counts.get(TokenModel::DeepSeek), 96);
        assert_eq!(counts.get(TokenModel::Qwen), 99);
        assert_eq!(counts.get(TokenModel::Cohere), 102);
        assert_eq!(counts.get(TokenModel::Grok), 101);
    }

    /// When the budget exactly equals the text's token count, truncation
    /// must return the text unchanged.
    #[test]
    fn test_budget_exactly_met() {
        let tokenizer = Tokenizer::new();
        let text = "Hello world!";
        let exact_budget = tokenizer.count(text, TokenModel::Gpt4o);

        let truncated = tokenizer.truncate_to_budget(text, TokenModel::Gpt4o, exact_budget);
        assert_eq!(truncated, text);
    }

    /// `exceeds_budget` is strict: a generous budget passes, and an empty
    /// string never exceeds even a zero budget.
    #[test]
    fn test_exceeds_budget_check() {
        let tokenizer = Tokenizer::new();
        let text = "A fairly long string that should have a decent number of tokens.";

        assert!(tokenizer.exceeds_budget(text, TokenModel::Claude, 1));
        assert!(!tokenizer.exceeds_budget(text, TokenModel::Claude, 1000));
        assert!(!tokenizer.exceeds_budget("", TokenModel::Claude, 0));
    }
}