#[cfg(test)]
mod tokenizer_contract_tests {
    use super::BPETokenizer;

    /// Builds a small fixture tokenizer: the ASCII alphabet, four special
    /// tokens, and three merged subwords backed by matching merge rules.
    fn make_test_tokenizer() -> BPETokenizer {
        let special_tokens = ["<unk>", "<|im_start|>", "<|im_end|>", "<|endoftext|>"];
        let merged_tokens = ["he", "ll", "lo"];

        // Vocab order matters for id assignment: a-z first, then specials,
        // then the merged subwords.
        let vocab: Vec<String> = (b'a'..=b'z')
            .map(|byte| String::from(byte as char))
            .chain(special_tokens.iter().map(|t| t.to_string()))
            .chain(merged_tokens.iter().map(|t| t.to_string()))
            .collect();

        // Merge rules corresponding to the merged subwords above.
        let merges: Vec<(String, String)> = [("h", "e"), ("l", "l"), ("l", "o")]
            .iter()
            .map(|&(left, right)| (left.to_string(), right.to_string()))
            .collect();

        BPETokenizer::new(vocab, merges, "<unk>").expect("test tokenizer")
    }

    /// F-TOK-004: two encodes of the same text must yield identical ids.
    #[test]
    fn falsify_tok_004_deterministic_encoding() {
        let tokenizer = make_test_tokenizer();
        let first_pass = tokenizer.encode("hello");
        let second_pass = tokenizer.encode("hello");
        assert_eq!(first_pass, second_pass, "F-TOK-004: encode must be deterministic");
    }

    /// F-TOK-005: the empty string encodes to an empty token sequence.
    #[test]
    fn falsify_tok_005_empty_input() {
        let tokenizer = make_test_tokenizer();
        let ids = tokenizer.encode("");
        assert!(ids.is_empty(), "F-TOK-005: empty input should produce empty tokens");
    }

    /// F-TOK-003: the reported vocabulary covers at least the base alphabet.
    #[test]
    fn falsify_tok_003_vocab_size() {
        let tokenizer = make_test_tokenizer();
        assert!(tokenizer.vocab_size() >= 26, "F-TOK-003: vocab must include at least a-z");
    }

    /// F-TOK-004 (extended): repeated encodes of several inputs — including
    /// the empty string — must all match the first result.
    #[test]
    fn falsify_tok_004b_encoding_stability() {
        let tokenizer = make_test_tokenizer();
        for input in ["a", "hello", "abc", ""] {
            let baseline = tokenizer.encode(input);
            for _repeat in 0..5 {
                assert_eq!(
                    tokenizer.encode(input),
                    baseline,
                    "F-TOK-004: encoding '{input}' must be stable across calls"
                );
            }
        }
    }

    /// Compile-time check that the tokenizer can be shared across threads.
    #[test]
    fn falsify_tok_thread_safety() {
        fn require_send_sync<T: Send + Sync>() {}
        require_send_sync::<BPETokenizer>();
    }
}