#![allow(clippy::unwrap_used)]
#[test]
fn falsify_tok_001_roundtrip_ascii() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-001: decode(encode(text)) must be the identity on ASCII input.
    let training_texts = &["hello world", "foo bar baz", "hello foo"];
    let tokenizer = BpeTokenizer::train(training_texts, 256).expect("train");

    let original = "hello world";
    let token_ids = tokenizer.encode(original).expect("encode");
    let roundtripped = tokenizer.decode(&token_ids).expect("decode");

    assert_eq!(
        roundtripped, original,
        "FALSIFY-TOK-001: roundtrip must recover original ASCII text"
    );
}
#[test]
fn falsify_tok_001b_roundtrip_multiword() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-001b: roundtrip must hold for every line the tokenizer was
    // trained on, including multi-word CLI-style strings with flags.
    let corpus = &[
        "cargo test --lib",
        "cargo build --release",
        "apr code --model qwen3",
    ];
    let tokenizer = BpeTokenizer::train(corpus, 256).expect("train");

    for text in corpus.iter() {
        let encoded = tokenizer.encode(text).expect("encode");
        let decoded = tokenizer.decode(&encoded).expect("decode");
        assert_eq!(&decoded, text, "FALSIFY-TOK-001b: roundtrip for '{text}'");
    }
}
#[test]
fn falsify_tok_003_vocab_size_bound() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-003: train(corpus, n) must yield a vocab with 0 < size <= n.
    let corpus = &["the quick brown fox jumps over the lazy dog"];
    let requested = 300;
    let tokenizer = BpeTokenizer::train(corpus, requested).expect("train");

    // Bind once instead of calling vocab_size() in each assertion.
    let actual = tokenizer.vocab_size();
    assert!(
        actual <= requested,
        "FALSIFY-TOK-003: vocab_size {} must be <= requested {}",
        actual,
        requested
    );
    assert!(actual > 0, "FALSIFY-TOK-003: vocab_size must be > 0");
}
#[test]
fn falsify_tok_004_deterministic_encoding() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-004: repeated encode() calls on identical input must agree.
    let corpus = &["hello world", "hello there", "world peace"];
    let tokenizer = BpeTokenizer::train(corpus, 256).expect("train");
    let input = "hello world";

    let first = tokenizer.encode(input).expect("encode 1");
    let second = tokenizer.encode(input).expect("encode 2");
    let third = tokenizer.encode(input).expect("encode 3");

    assert_eq!(
        first, second,
        "FALSIFY-TOK-004: encoding must be deterministic (run 1 vs 2)"
    );
    assert_eq!(
        second, third,
        "FALSIFY-TOK-004: encoding must be deterministic (run 2 vs 3)"
    );
}
#[test]
fn falsify_tok_005_empty_input() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-005: encoding the empty string must succeed and yield no tokens.
    let training_corpus = &["hello world"];
    let tokenizer = BpeTokenizer::train(training_corpus, 256).expect("train");

    let token_ids = tokenizer.encode("").expect("encode empty");
    assert!(
        token_ids.is_empty(),
        "FALSIFY-TOK-005: empty input should produce 0 tokens, got {}",
        token_ids.len()
    );
}
#[test]
fn falsify_tok_006_merge_order_preserved() {
    use aprender::text::tokenize::BpeTokenizer;
    use std::collections::HashSet;

    // Trains on pair-heavy text so BPE is forced to learn merges, then checks
    // two invariants: merge rules exist (TOK-006) and no rule appears twice
    // (TOK-007 — hence the second message's different label).
    let corpus = &["aaab aaab aaab", "bbba bbba bbba"];
    let tokenizer = BpeTokenizer::train(corpus, 300).expect("train");

    let merges = tokenizer.merges();
    assert!(
        !merges.is_empty(),
        "FALSIFY-TOK-006: trained tokenizer must have merge rules"
    );

    let mut seen = HashSet::new();
    for merge in merges {
        // HashSet::insert returns false on a duplicate.
        assert!(
            seen.insert(merge.clone()),
            "FALSIFY-TOK-007: duplicate merge rule found: {:?}",
            merge
        );
    }
}
#[test]
fn falsify_tok_008_byte_coverage_seen_chars() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-008: characters seen during training must encode to at least
    // one token and roundtrip exactly.
    let line = "abcdefghijklmnopqrstuvwxyz 0123456789";
    // Duplicate the line 20x so pair statistics are strong enough to merge.
    let corpus: Vec<&str> = vec![line; 20];
    let tokenizer = BpeTokenizer::train(&corpus, 512).expect("train");

    let ids = tokenizer.encode(line).expect("encode");
    assert!(
        !ids.is_empty(),
        "FALSIFY-TOK-008: training text must produce tokens"
    );

    let decoded = tokenizer.decode(&ids).expect("decode");
    assert_eq!(decoded, line, "FALSIFY-TOK-008: training text roundtrip");
}
#[test]
fn falsify_tok_009_ids_within_vocab() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-009: every token ID produced by encode() must be a valid
    // index into the vocabulary, i.e. strictly less than vocab_size().
    let corpus = &["the sovereign ai stack uses rust"];
    let tok = BpeTokenizer::train(corpus, 256).expect("train");
    // Checked conversion instead of `as u32`: a silent truncation here would
    // make the bound check below vacuously pass/fail for huge vocabularies.
    let vocab_size = u32::try_from(tok.vocab_size()).expect("vocab_size fits in u32");
    let ids = tok.encode("the sovereign ai stack").expect("encode");
    for &id in &ids {
        assert!(
            id < vocab_size,
            "FALSIFY-TOK-009: token ID {} >= vocab_size {}",
            id,
            vocab_size
        );
    }
}
#[test]
fn falsify_tok_010_thread_safety() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-TOK-010: concurrent encode() calls from 8 threads must all agree
    // with a single-threaded baseline (the tokenizer is shared immutably).
    let corpus = &["hello world testing concurrent"];
    let tok = BpeTokenizer::train(corpus, 256).expect("train");
    let text = "hello world";
    let expected = tok.encode(text).expect("baseline encode");

    // Scoped threads borrow `tok` and `expected` directly — no Arc and no
    // per-thread clone of the expected IDs — and are all joined before the
    // scope returns, so no thread can outlive the borrowed data.
    std::thread::scope(|s| {
        let handles: Vec<_> = (0..8)
            .map(|_| {
                s.spawn(|| {
                    let ids = tok.encode("hello world").expect("concurrent encode");
                    assert_eq!(
                        ids, expected,
                        "FALSIFY-TOK-010: concurrent encode must produce same result"
                    );
                })
            })
            .collect();
        for h in handles {
            h.join().expect("thread panicked");
        }
    });
}
#[test]
fn falsify_data_001_rejects_nonexistent_path() {
    use assert_cmd::Command;

    // FALSIFY-DATA-001: `apr tokenize plan` must exit non-zero when --data
    // points at a file that does not exist.
    let mut cmd = Command::cargo_bin("apr").expect("apr binary");
    cmd.args(["tokenize", "plan", "--data", "/nonexistent/corpus.txt"]);
    let output = cmd.output().expect("run apr");

    assert!(
        !output.status.success(),
        "FALSIFY-DATA-001: nonexistent data path must fail"
    );
}
#[test]
fn falsify_data_002_training_vocab_size_consistent() {
    use aprender::text::tokenize::BpeTokenizer;

    // FALSIFY-DATA-002: two trainings on the same corpus must agree on vocab
    // size, and each resulting tokenizer must roundtrip the same probe text.
    let corpus = &["the quick brown fox", "jumps over the lazy dog"];
    let tok1 = BpeTokenizer::train(corpus, 256).expect("train 1");
    let tok2 = BpeTokenizer::train(corpus, 256).expect("train 2");

    assert_eq!(
        tok1.vocab_size(),
        tok2.vocab_size(),
        "FALSIFY-DATA-002: vocab size must be identical across runs"
    );

    let text = "quick brown";

    let ids1 = tok1.encode(text).expect("encode 1");
    let dec1 = tok1.decode(&ids1).expect("decode 1");
    assert_eq!(dec1, text, "FALSIFY-DATA-002: roundtrip 1");

    let ids2 = tok2.encode(text).expect("encode 2");
    let dec2 = tok2.decode(&ids2).expect("decode 2");
    assert_eq!(dec2, text, "FALSIFY-DATA-002: roundtrip 2");
}
#[test]
fn falsify_data_003_wordpiece_roundtrip() {
    use aprender::text::tokenize::WordPieceTokenizer;

    // FALSIFY-DATA-003: WordPiece encode/decode must roundtrip trained text.
    let training = &["hello world", "hello there"];
    let wordpiece = WordPieceTokenizer::train(training, 256).expect("train");

    let input = "hello world";
    let encoded = wordpiece.encode(input).expect("encode");
    let restored = wordpiece.decode(&encoded).expect("decode");
    assert_eq!(restored, input, "FALSIFY-DATA-003: WordPiece roundtrip");
}