use super::*;
/// Host-dependent falsification check (tag `FALSIFY-BPE-UPSTREAM-002`).
///
/// Builds one encoder with `load_from_files` (vocab + merges) and one with `load_from_json`,
/// encodes the same snippet with both, and asserts that the fraction of ids equal to `151643`
/// differs by less than 0.05 between the two encodings.
///
/// The test returns early (and therefore passes) when any of the three hard-coded tokenizer
/// files is missing on the host.
#[test]
fn falsify_bpe_load_from_files_matches_load_from_json_encode() {
    let vocab_path = "/tmp/qwen-0.5b-tokenizer-extracted/vocab.json";
    let merges_path = "/tmp/qwen-0.5b-tokenizer-extracted/merges.txt";
    let json_path = "/home/noah/.cache/qwen2/tokenizer.json";

    // Skip rather than fail on hosts that do not have the fixture files.
    let fixtures_present = [vocab_path, merges_path, json_path]
        .iter()
        .all(|candidate| std::path::Path::new(candidate).exists());
    if !fixtures_present {
        eprintln!("[falsify-bpe-upstream-002] skipping: host lacks tokenizer files");
        return;
    }

    // Encoder A: separate vocab.json + merges.txt.
    let vocab_json = std::fs::read_to_string(vocab_path).expect("read vocab");
    let merges_txt = std::fs::read_to_string(merges_path).expect("read merges");
    let from_files = load_from_files(&vocab_json, &merges_txt).expect("load_from_files ok");

    // Encoder B: single tokenizer.json.
    let json = std::fs::read_to_string(json_path).expect("read tokenizer.json");
    let from_json = load_from_json(&json).expect("load_from_json ok");

    let text = "def fibonacci(n):\n return n\n";
    let ids_files = from_files.encode(text);
    let ids_json = from_json.encode(text);
    eprintln!("[upstream-002] from_files: {ids_files:?}");
    eprintln!("[upstream-002] from_json: {ids_json:?}");
    eprintln!(
        "[upstream-002] from_files vocab_size={}, from_json vocab_size={}",
        from_files.vocab_size(),
        from_json.vocab_size()
    );

    // Count how many emitted ids equal the hard-coded unk id in each encoding.
    let unk_id = 151643_u32;
    let unk_in_files = ids_files.iter().filter(|id| **id == unk_id).count();
    let unk_in_json = ids_json.iter().filter(|id| **id == unk_id).count();
    let total_files = ids_files.len();
    let total_json = ids_json.len();

    // Percentage for the diagnostics; an empty encoding reports 0 instead of dividing by zero.
    let percent = |unk: usize, total: usize| -> f32 {
        if total > 0 {
            unk as f32 / total as f32 * 100.0
        } else {
            0.0
        }
    };
    eprintln!(
        "[upstream-002] from_files: {total_files} tokens, {unk_in_files} unks ({:.3}%)",
        percent(unk_in_files, total_files)
    );
    eprintln!(
        "[upstream-002] from_json: {total_json} tokens, {unk_in_json} unks ({:.3}%)",
        percent(unk_in_json, total_json)
    );

    // `max(1)` keeps the ratio finite when an encoding is empty.
    let ratio_files = unk_in_files as f32 / total_files.max(1) as f32;
    let ratio_json = unk_in_json as f32 / total_json.max(1) as f32;
    let divergence = (ratio_files - ratio_json).abs();
    assert!(
        divergence < 0.05,
        "FALSIFY-BPE-UPSTREAM-002: load_from_files unk_ratio={ratio_files:.4} \
         diverges from load_from_json unk_ratio={ratio_json:.4} by {divergence:.4} \
         (>0.05). The two loaders SHOULD produce equivalent encoders for the \
         same vocab. Fix scope: align load_from_files setup with load_from_json \
         (likely added_tokens registration, merge format, or pretokenizer config)."
    );
}
/// Host-dependent falsification check (tag `FALSIFY-BPE-UPSTREAM-001`).
///
/// Loads the Qwen2 `tokenizer.json` from a hard-coded path, encodes a small Python snippet and
/// asserts that fewer than half of the produced ids are the id of
/// `BpeConfig::qwen2().unk_token`. If that token has no id, the unk count is treated as zero.
///
/// The test returns early (and therefore passes) when the tokenizer file is missing.
#[test]
fn falsify_bpe_qwen_encode_python_does_not_unk_99pct() {
    let path = "/home/noah/.cache/qwen2/tokenizer.json";
    if !std::path::Path::new(path).exists() {
        eprintln!(
            "[falsify_bpe_qwen_encode_python_does_not_unk_99pct] skipping: \
             host lacks {path} (test is host-dependent for upstream H1C \
             investigation)"
        );
        return;
    }

    let json = std::fs::read_to_string(path).expect("read tokenizer.json");
    let tokenizer = load_from_json(&json).expect("load_from_json succeeds on Qwen2");

    let text = "def fibonacci(n):\n if n < 2:\n return n\n \
                return fibonacci(n - 1) + fibonacci(n - 2)\n";
    let ids = tokenizer.encode(text);

    // Resolve the unk id through the tokenizer instead of hard-coding it.
    let unk_id = tokenizer.token_to_id(&BpeConfig::qwen2().unk_token);
    let unk_count = unk_id.map_or(0, |unk| ids.iter().filter(|&&id| id == unk).count());

    let total = ids.len();
    // An empty encoding counts as ratio 0 rather than dividing by zero.
    let unk_ratio = match total {
        0 => 0.0,
        n => unk_count as f32 / n as f32,
    };

    eprintln!(
        "[falsify-bpe-upstream-001] text_bytes={}, encoded_tokens={total}, \
         unk_count={unk_count}, unk_ratio={unk_ratio:.4}, unk_id={unk_id:?}",
        text.len()
    );
    assert!(
        unk_ratio < 0.50,
        "FALSIFY-BPE-UPSTREAM-001: BpeTokenizer::encode on Qwen2 vocab \
         produced unk_ratio={unk_ratio} (>{}); 99% `<unk>` defect class. \
         encoded_tokens={total}, unk_count={unk_count}, unk_id={unk_id:?}. \
         See evidence/section-60-5g-2-redispatch-2026-05-09/ + the §60 \
         val_loss=0.0008 root-cause cascade.",
        0.50
    );
}
/// A tokenizer built from the default config with `add_prefix_space` switched off must encode
/// `"test"` to an empty id sequence.
#[test]
fn test_encode_without_prefix_space() {
    let tokenizer = BpeTokenizer::new(BpeConfig {
        add_prefix_space: false,
        ..BpeConfig::default()
    });
    let ids = tokenizer.encode("test");
    assert!(ids.is_empty());
}
/// Decoding the single id `999999` with the GPT-2 base tokenizer must produce empty output.
/// (The id is presumably outside the vocabulary; this test only checks the empty result.)
#[test]
fn test_decode_with_unknown_id() {
    let gpt2 = BpeTokenizer::gpt2_base();
    let text = gpt2.decode(&[999999]);
    assert!(text.is_empty());
}
/// Decoding ids `[50256, 72]` with the GPT-2 base tokenizer must not emit the literal
/// `<|endoftext|>` marker in the output text.
#[test]
fn test_decode_skips_special_tokens() {
    let gpt2 = BpeTokenizer::gpt2_base();
    let text = gpt2.decode(&[50256, 72]);
    let leaked_marker = text.contains("<|endoftext|>");
    assert!(!leaked_marker);
}
/// `bytes_to_bpe_tokens("A")` on a tokenizer built from the default config must return a
/// non-empty result.
#[test]
fn test_bytes_to_bpe_tokens_unknown() {
    let bare = BpeTokenizer::new(BpeConfig::default());
    let pieces = bare.bytes_to_bpe_tokens("A");
    assert!(!pieces.is_empty());
}
/// `load_from_json` must honour the `special` flag of each `added_tokens` entry: `<special>`
/// (flagged `true`) is reported as a special token, `normal` (flagged `false`) is not, and
/// `normal` still resolves to its declared id `101`.
#[test]
fn test_load_from_json_with_special_added_tokens() {
    let json = r#"{
"model": {
"vocab": {"hello": 0, "world": 1},
"merges": []
},
"added_tokens": [
{"id": 100, "content": "<special>", "special": true},
{"id": 101, "content": "normal", "special": false}
]
}"#;
    // `expect` alone is enough: it fails the test on `Err` and, unlike a preceding
    // `assert!(result.is_ok())`, includes the error value in the panic message.
    let tokenizer = load_from_json(json).expect("load failed");
    assert!(tokenizer.is_special_token("<special>"));
    assert!(!tokenizer.is_special_token("normal"));
    assert_eq!(tokenizer.token_to_id("normal"), Some(101));
}
/// `load_from_json` must accept a synthetic tokenizer document whose vocab holds 51000 entries
/// (`tok0`..`tok50999`), with no merges and no added tokens.
#[test]
fn test_load_from_json_whisper_vocab_size() {
    // Render each entry as `"tok<i>": <i>` and join them into one JSON object body.
    let vocab_str = (0..51000)
        .map(|i| format!("\"tok{i}\": {i}"))
        .collect::<Vec<String>>()
        .join(", ");
    let json = format!(
        "{{\"model\": {{\"vocab\": {{ {} }}, \"merges\": []}}, \"added_tokens\": []}}",
        vocab_str
    );
    let result = load_from_json(&json);
    assert!(result.is_ok());
}
/// `load_from_json` must accept a synthetic tokenizer document whose vocab holds 151000 entries
/// (`tok0`..`tok150999`), with no merges and no added tokens.
#[test]
fn test_load_from_json_qwen2_vocab_size() {
    // Render each entry as `"tok<i>": <i>` and join them into one JSON object body.
    let vocab_str = (0..151000)
        .map(|i| format!("\"tok{i}\": {i}"))
        .collect::<Vec<String>>()
        .join(", ");
    let json = format!(
        "{{\"model\": {{\"vocab\": {{ {} }}, \"merges\": []}}, \"added_tokens\": []}}",
        vocab_str
    );
    let result = load_from_json(&json);
    assert!(result.is_ok());
}
/// `load_from_files` must accept a serialized vocab of 51000 entries (`tok0`..`tok50999`)
/// together with an empty merges text.
#[test]
fn test_load_from_files_whisper_vocab_size() {
    let vocab: HashMap<String, u32> = (0..51000u32).map(|i| (format!("tok{i}"), i)).collect();
    let vocab_json = serde_json::to_string(&vocab).expect("serialize");
    let result = load_from_files(&vocab_json, "");
    assert!(result.is_ok());
}
/// `load_from_files` must accept a serialized vocab of 41000 entries (`tok0`..`tok40999`)
/// together with an empty merges text.
#[test]
fn test_load_from_files_gpt2_vocab_size() {
    let vocab: HashMap<String, u32> = (0..41000u32).map(|i| (format!("tok{i}"), i)).collect();
    let vocab_json = serde_json::to_string(&vocab).expect("serialize");
    let result = load_from_files(&vocab_json, "");
    assert!(result.is_ok());
}
/// `load_from_files` must accept a serialized vocab of 151000 entries (`tok0`..`tok150999`)
/// together with an empty merges text.
#[test]
fn test_load_from_files_qwen2_vocab_size() {
    let vocab: HashMap<String, u32> = (0..151000u32).map(|i| (format!("tok{i}"), i)).collect();
    let vocab_json = serde_json::to_string(&vocab).expect("serialize");
    let result = load_from_files(&vocab_json, "");
    assert!(result.is_ok());
}
/// Blank lines in the merges text must be ignored: with an empty vocab and merges text
/// `"\n\na b\n\n"`, the loaded tokenizer holds exactly one merge.
#[test]
fn test_load_from_files_empty_lines() {
    let vocab = "{}";
    let merges = "\n\na b\n\n";
    // `expect` alone is enough: it fails the test on `Err` and, unlike a preceding
    // `assert!(result.is_ok())`, includes the error value in the panic message.
    let tokenizer = load_from_files(vocab, merges).expect("load failed");
    assert_eq!(tokenizer.merges.len(), 1);
}
/// `load_from_files` must return an error when the vocab text is not valid JSON.
#[test]
fn test_load_from_files_invalid_json() {
    let outcome = load_from_files("not valid json", "");
    assert!(outcome.is_err());
}
/// `Qwen2BpeTokenizer::from_json` must load a document that declares the three Qwen2 special
/// tokens, after which id `151645` is reported as EOS and id `151644` as BOS.
#[test]
fn test_qwen2_from_json() {
    let json = r#"{
"model": {
"vocab": {
"<|endoftext|>": 151643,
"<|im_start|>": 151644,
"<|im_end|>": 151645,
"hello": 0
},
"merges": []
},
"added_tokens": [
{"id": 151643, "content": "<|endoftext|>", "special": true},
{"id": 151644, "content": "<|im_start|>", "special": true},
{"id": 151645, "content": "<|im_end|>", "special": true}
]
}"#;
    // `expect` alone is enough: it fails the test on `Err` and, unlike a preceding
    // `assert!(result.is_ok())`, includes the error value in the panic message.
    let tokenizer = Qwen2BpeTokenizer::from_json(json).expect("load failed");
    assert!(tokenizer.is_eos(151645));
    assert!(tokenizer.is_bos(151644));
}
/// When the JSON declares no added tokens, `Qwen2BpeTokenizer::from_json` must still report
/// `im_start_id()` / `im_end_id()` equal to the `IM_START_ID` / `IM_END_ID` associated constants.
#[test]
fn test_qwen2_from_json_default_ids() {
    let json = r#"{
"model": {
"vocab": {"hello": 0, "world": 1},
"merges": []
},
"added_tokens": []
}"#;
    // `expect` alone is enough: it fails the test on `Err` and, unlike a preceding
    // `assert!(result.is_ok())`, includes the error value in the panic message.
    let tokenizer = Qwen2BpeTokenizer::from_json(json).expect("load failed");
    assert_eq!(tokenizer.im_start_id(), Qwen2BpeTokenizer::IM_START_ID);
    assert_eq!(tokenizer.im_end_id(), Qwen2BpeTokenizer::IM_END_ID);
}
/// `Qwen2BpeTokenizer::from_file` must return an error for the path
/// `/nonexistent/path/tokenizer.json`.
#[test]
fn test_qwen2_from_file_not_found() {
    let outcome = Qwen2BpeTokenizer::from_file("/nonexistent/path/tokenizer.json");
    assert!(outcome.is_err());
}
/// `MergeRule` must clone to an equal value and mention its type name in `Debug` output.
#[test]
fn test_merge_rule_debug_clone() {
    let original = MergeRule::new("a", "b");
    let duplicate = original.clone();
    assert_eq!(original, duplicate);

    let rendered = format!("{:?}", original);
    assert!(rendered.contains("MergeRule"));
}
/// A cloned `BpeTokenizer` must report the same `vocab_size()` as its source, and the `Debug`
/// output must mention the type name.
#[test]
fn test_bpe_tokenizer_debug_clone() {
    let original = BpeTokenizer::gpt2_base();
    let duplicate = original.clone();
    assert_eq!(original.vocab_size(), duplicate.vocab_size());

    let rendered = format!("{:?}", original);
    assert!(rendered.contains("BpeTokenizer"));
}
/// A cloned `Qwen2BpeTokenizer` must report the same `vocab_size()` as its source, and the
/// `Debug` output must mention the type name.
#[test]
fn test_qwen2_tokenizer_debug_clone() {
    let original = Qwen2BpeTokenizer::new();
    let duplicate = original.clone();
    assert_eq!(original.vocab_size(), duplicate.vocab_size());

    let rendered = format!("{:?}", original);
    assert!(rendered.contains("Qwen2BpeTokenizer"));
}
/// A cloned `BpeConfig` must keep the same `vocab_size` field, and the `Debug` output must
/// mention the type name.
#[test]
fn test_bpe_config_debug_clone() {
    let original = BpeConfig::default();
    let duplicate = original.clone();
    assert_eq!(original.vocab_size, duplicate.vocab_size);

    let rendered = format!("{:?}", original);
    assert!(rendered.contains("BpeConfig"));
}
/// After registering `<unk>` as a special token with id `0` on a default-config tokenizer,
/// encoding `"x"` must either produce no ids at all or include id `0`.
#[test]
fn test_encode_unk_token_fallback() {
    let mut tokenizer = BpeTokenizer::new(BpeConfig::default());
    tokenizer.add_special_token("<unk>", 0);

    let ids = tokenizer.encode("x");
    let has_unk = ids.iter().any(|&id| id == 0);
    assert!(ids.is_empty() || has_unk);
}
/// `pre_tokenize("hello world")` on a default-config tokenizer must yield at least two pieces.
// NOTE(review): the input literal contains a single space although the test name says
// "multiple spaces" — confirm the intended input.
#[test]
fn test_pre_tokenize_multiple_spaces() {
    let bare = BpeTokenizer::new(BpeConfig::default());
    let pieces = bare.pre_tokenize("hello world");
    assert!(pieces.len() >= 2);
}
/// `pre_tokenize(" hello")` on a default-config tokenizer must yield at least one piece, and the
/// first piece must start with a space character.
#[test]
fn test_pre_tokenize_leading_space() {
    let bare = BpeTokenizer::new(BpeConfig::default());
    let pieces = bare.pre_tokenize(" hello");
    // Guard the index below: the first assertion fails before `pieces[0]` could panic.
    assert!(!pieces.is_empty());
    assert!(pieces[0].starts_with(' '));
}
/// Smoke test: `bpe_tokens_to_bytes("αβγ")` on the GPT-2 base tokenizer must complete without
/// panicking. The returned value is intentionally not inspected.
#[test]
fn test_bpe_tokens_to_bytes_invalid_chars() {
    let gpt2 = BpeTokenizer::gpt2_base();
    let _ = gpt2.bpe_tokens_to_bytes("αβγ");
}
/// Encoding `"<|im_start|>user"` with a freshly constructed `Qwen2BpeTokenizer` must produce an
/// id sequence that contains `151644`.
#[test]
fn test_qwen2_encode_special_tokens() {
    let qwen = Qwen2BpeTokenizer::new();
    let ids = qwen.encode("<|im_start|>user");
    assert!(ids.contains(&151644));
}
/// With merges registered in the order `("x", "y")` then `("a", "b")`, running `bpe` over the
/// symbols `a b x y` must collapse them into exactly two tokens.
#[test]
fn test_bpe_merge_priority() {
    let mut tokenizer = BpeTokenizer::new(BpeConfig::default());
    tokenizer.add_merge("x", "y");
    tokenizer.add_merge("a", "b");

    let symbols: Vec<String> = ["a", "b", "x", "y"]
        .iter()
        .map(|&symbol| symbol.to_string())
        .collect();
    let merged = tokenizer.bpe(&symbols);
    assert_eq!(merged.len(), 2);
}