use ai::model::Model;
/// Verifies that `Model::GPT41` token counts land inside a tolerance band.
///
/// Each case pairs a sample text with a lower bound. The measured count must be
/// at least that bound and no greater than `2 * bound + 5`.
#[test]
fn test_token_counting_accuracy() {
    let model = Model::GPT41;

    // (sample text, minimum expected token count)
    let cases = [
        ("", 0),
        ("Hello", 1),
        ("The quick brown fox jumps over the lazy dog.", 10),
        (
            "Hello, world! This is a test. How are you doing today? I hope everything is going well.",
            20,
        ),
        ("fn main() { println!(\"Hello, world!\"); }", 11),
        ("Special chars: @#$%^&*()_+-=[]{}|;':\",./<>?", 20),
    ];

    for &(sample, lower) in cases.iter() {
        let tokens = model.count_tokens(sample).unwrap();
        // Upper edge of the accepted band, derived from the lower bound.
        let upper = lower * 2 + 5;

        assert!(
            tokens >= lower,
            "Token count for '{}' was {} but expected at least {}",
            sample,
            tokens,
            lower
        );
        assert!(
            tokens <= upper,
            "Token count for '{}' was {} but expected at most {}",
            sample,
            tokens,
            upper
        );
    }
}
/// Exercises `count_tokens` on inputs with dense punctuation, code, URLs and
/// single-letter words, and prints each count next to the `len / 4` estimate.
///
/// The comparison against the character-based estimate is informational only:
/// it is printed (visible with `cargo test -- --nocapture`) but not asserted.
/// The test does assert that every one of these non-empty inputs yields a
/// non-zero token count, so it can fail on something other than an `Err`.
#[test]
fn test_no_underestimation_for_context_limit() {
    let model = Model::GPT41;
    let tricky_texts = vec![
        "I a b c d e f g h i j k l m n o p q r s t u v w x y z",
        "```python\ndef hello():\n print('world')\n```",
        "https://github.com/user/repo/blob/main/src/lib.rs#L42",
        "Error: failed to compile `foo.rs` at line 42: unexpected token ';'",
    ];
    for text in tricky_texts {
        let tokens = model.count_tokens(text).unwrap();
        // Byte length divided by four: the heuristic this output is compared against.
        let char_estimate = text.len() / 4;
        println!(
            "Text: '{}' - Chars: {}, Tokens: {}, Old estimate would be: {}",
            text,
            text.len(),
            tokens,
            char_estimate
        );
        // Every input above is non-empty, so a zero count would mean the
        // counter silently dropped the text.
        assert!(
            tokens > 0,
            "Token count for non-empty text '{}' should be greater than 0",
            text
        );
    }
}
/// Confirms that counting tokens for the same text repeatedly gives the same
/// result on every call.
#[test]
fn test_token_counting_consistency() {
    let model = Model::GPT41;
    let test_text = "The quick brown fox jumps over the lazy dog. This is a test sentence with various words.";

    // Count the identical input five times.
    let counts: Vec<_> = (0..5)
        .map(|_| model.count_tokens(test_text).unwrap())
        .collect();

    // If every adjacent pair matches, all five runs agree.
    let all_runs_agree = counts
        .iter()
        .zip(counts.iter().skip(1))
        .all(|(previous, next)| previous == next);

    assert!(all_runs_agree, "Token counting is not consistent");
}
/// Counts tokens for a long, repetitive text and checks the result is positive
/// and smaller than the text's byte length.
///
/// The count is also printed next to the `len / 4` estimate for reference.
#[test]
fn test_long_text_token_counting() {
    let model = Model::GPT41;

    // Fifty repetitions of one sentence.
    let long_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(50);
    let char_count = long_text.len();

    let tokens = model.count_tokens(&long_text).unwrap();

    println!(
        "Long text - Chars: {}, Actual tokens: {}, Char-based estimate: {}",
        char_count,
        tokens,
        char_count / 4
    );

    assert!(tokens > 0, "Token count should be greater than 0");
    assert!(
        tokens < char_count,
        "Token count should be less than character count"
    );
}