use super::*;
use std::collections::HashMap;
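// With add_prefix_space disabled, a freshly constructed tokenizer (default, empty vocab) should produce no tokens.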
#[test]
fn test_encode_without_prefix_space() {
let config = BpeConfig {
add_prefix_space: false,
..BpeConfig::default()
};
let tokenizer = BpeTokenizer::new(config);
let tokens = tokenizer.encode("test");
assert!(tokens.is_empty());
}
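// Decoding an ID that is not in the vocabulary should yield an empty string rather than panic.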
#[test]
fn test_decode_with_unknown_id() {
let tokenizer = BpeTokenizer::gpt2_base();
let decoded = tokenizer.decode(&[999999]);
assert!(decoded.is_empty());
}
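// 50256 is GPT-2's <|endoftext|>; decode should skip special tokens instead of rendering them.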
#[test]
fn test_decode_skips_special_tokens() {
let tokenizer = BpeTokenizer::gpt2_base();
let decoded = tokenizer.decode(&[50256, 72]);
assert!(!decoded.contains("<|endoftext|>"));
}
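// Byte-level pre-encoding should map "A" to at least one token string even without a trained vocab.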
#[test]
fn test_bytes_to_bpe_tokens_unknown() {
let tokenizer = BpeTokenizer::new(BpeConfig::default());
let result = tokenizer.bytes_to_bpe_tokens("A");
assert!(!result.is_empty());
}
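// added_tokens entries flagged "special": true register as special tokens; unflagged entries land in the plain vocab.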
#[test]
fn test_load_from_json_with_special_added_tokens() {
let json = r#"{
"model": {
"vocab": {"hello": 0, "world": 1},
"merges": []
},
"added_tokens": [
{"id": 100, "content": "<special>", "special": true},
{"id": 101, "content": "normal", "special": false}
]
}"#;
let result = load_from_json(json);
assert!(result.is_ok());
let tokenizer = result.expect("load failed");
assert!(tokenizer.is_special_token("<special>"));
assert!(!tokenizer.is_special_token("normal"));
assert_eq!(tokenizer.token_to_id("normal"), Some(101));
}
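// A ~51k-entry vocab approximates the Whisper tokenizer's vocabulary size.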
#[test]
fn test_load_from_json_whisper_vocab_size() {
let mut vocab_entries: Vec<String> = Vec::new();
for i in 0..51000 {
vocab_entries.push(format!("\"tok{i}\": {i}"));
}
let vocab_str = vocab_entries.join(", ");
let json = format!(
"{{\"model\": {{\"vocab\": {{ {} }}, \"merges\": []}}, \"added_tokens\": []}}",
vocab_str
);
let result = load_from_json(&json);
assert!(result.is_ok());
}
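// A ~151k-entry vocab approximates the Qwen2 tokenizer's vocabulary size.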
#[test]
fn test_load_from_json_qwen2_vocab_size() {
let mut vocab_entries: Vec<String> = Vec::new();
for i in 0..151000 {
vocab_entries.push(format!("\"tok{i}\": {i}"));
}
let vocab_str = vocab_entries.join(", ");
let json = format!(
"{{\"model\": {{\"vocab\": {{ {} }}, \"merges\": []}}, \"added_tokens\": []}}",
vocab_str
);
let result = load_from_json(&json);
assert!(result.is_ok());
}
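// Same Whisper-sized vocab, loaded through the split vocab/merges file path.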
#[test]
fn test_load_from_files_whisper_vocab_size() {
let mut vocab: HashMap<String, u32> = HashMap::new();
for i in 0..51000u32 {
vocab.insert(format!("tok{i}"), i);
}
let vocab_json = serde_json::to_string(&vocab).expect("serialize");
let merges = "";
let result = load_from_files(&vocab_json, merges);
assert!(result.is_ok());
}
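// A ~41k-entry vocab, below the Whisper threshold; presumably classified as GPT-2-sized.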
#[test]
fn test_load_from_files_gpt2_vocab_size() {
let mut vocab: HashMap<String, u32> = HashMap::new();
for i in 0..41000u32 {
vocab.insert(format!("tok{i}"), i);
}
let vocab_json = serde_json::to_string(&vocab).expect("serialize");
let merges = "";
let result = load_from_files(&vocab_json, merges);
assert!(result.is_ok());
}
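// Qwen2-sized (~151k) vocab through the split-file path.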
#[test]
fn test_load_from_files_qwen2_vocab_size() {
let mut vocab: HashMap<String, u32> = HashMap::new();
for i in 0..151000u32 {
vocab.insert(format!("tok{i}"), i);
}
let vocab_json = serde_json::to_string(&vocab).expect("serialize");
let merges = "";
let result = load_from_files(&vocab_json, merges);
assert!(result.is_ok());
}
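// Blank lines in the merges file should be ignored; only "a b" counts as a rule.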
#[test]
fn test_load_from_files_empty_lines() {
let vocab = "{}";
let merges = "\n\na b\n\n";
let result = load_from_files(vocab, merges);
assert!(result.is_ok());
let tokenizer = result.expect("load failed");
assert_eq!(tokenizer.merges.len(), 1);
}
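// Malformed vocab JSON should surface as an Err, not a panic.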
#[test]
fn test_load_from_files_invalid_json() {
let vocab = "not valid json";
let merges = "";
let result = load_from_files(vocab, merges);
assert!(result.is_err());
}
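// Qwen2 chat tokens: <|endoftext|> = 151643, <|im_start|> = 151644, <|im_end|> = 151645; im_start acts as BOS and im_end as EOS.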
#[test]
fn test_qwen2_from_json() {
let json = r#"{
"model": {
"vocab": {
"<|endoftext|>": 151643,
"<|im_start|>": 151644,
"<|im_end|>": 151645,
"hello": 0
},
"merges": []
},
"added_tokens": [
{"id": 151643, "content": "<|endoftext|>", "special": true},
{"id": 151644, "content": "<|im_start|>", "special": true},
{"id": 151645, "content": "<|im_end|>", "special": true}
]
}"#;
let result = Qwen2BpeTokenizer::from_json(json);
assert!(result.is_ok());
let tokenizer = result.expect("load failed");
assert!(tokenizer.is_eos(151645));
assert!(tokenizer.is_bos(151644));
}
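// With no added_tokens, the tokenizer should fall back to its built-in default chat-token IDs.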
#[test]
fn test_qwen2_from_json_default_ids() {
let json = r#"{
"model": {
"vocab": {"hello": 0, "world": 1},
"merges": []
},
"added_tokens": []
}"#;
let result = Qwen2BpeTokenizer::from_json(json);
assert!(result.is_ok());
let tokenizer = result.expect("load failed");
assert_eq!(tokenizer.im_start_id(), Qwen2BpeTokenizer::IM_START_ID);
assert_eq!(tokenizer.im_end_id(), Qwen2BpeTokenizer::IM_END_ID);
}
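// A missing tokenizer file should produce an Err, not a panic.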
#[test]
fn test_qwen2_from_file_not_found() {
let result = Qwen2BpeTokenizer::from_file("/nonexistent/path/tokenizer.json");
assert!(result.is_err());
}
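// The next four tests exercise the derived Debug and Clone implementations.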
#[test]
fn test_merge_rule_debug_clone() {
let rule = MergeRule::new("a", "b");
let cloned = rule.clone();
assert_eq!(rule, cloned);
let debug_str = format!("{:?}", rule);
assert!(debug_str.contains("MergeRule"));
}
#[test]
fn test_bpe_tokenizer_debug_clone() {
let tokenizer = BpeTokenizer::gpt2_base();
let cloned = tokenizer.clone();
assert_eq!(tokenizer.vocab_size(), cloned.vocab_size());
let debug_str = format!("{:?}", tokenizer);
assert!(debug_str.contains("BpeTokenizer"));
}
#[test]
fn test_qwen2_tokenizer_debug_clone() {
let tokenizer = Qwen2BpeTokenizer::new();
let cloned = tokenizer.clone();
assert_eq!(tokenizer.vocab_size(), cloned.vocab_size());
let debug_str = format!("{:?}", tokenizer);
assert!(debug_str.contains("Qwen2BpeTokenizer"));
}
#[test]
fn test_bpe_config_debug_clone() {
let config = BpeConfig::default();
let cloned = config.clone();
assert_eq!(config.vocab_size, cloned.vocab_size);
let debug_str = format!("{:?}", config);
assert!(debug_str.contains("BpeConfig"));
}
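// With <unk> registered at ID 0, encoding unknown input either maps to <unk> or yields nothing.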
#[test]
fn test_encode_unk_token_fallback() {
let mut tokenizer = BpeTokenizer::new(BpeConfig::default());
tokenizer.add_special_token("<unk>", 0);
let tokens = tokenizer.encode("x");
assert!(tokens.is_empty() || tokens.contains(&0));
}
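// Runs of spaces should still split into (at least) the two words.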
#[test]
fn test_pre_tokenize_multiple_spaces() {
let tokenizer = BpeTokenizer::new(BpeConfig::default());
let words = tokenizer.pre_tokenize("hello world");
assert!(words.len() >= 2);
}
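// GPT-2-style pre-tokenization attaches a leading space to the word that follows it.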
#[test]
fn test_pre_tokenize_leading_space() {
let tokenizer = BpeTokenizer::new(BpeConfig::default());
let words = tokenizer.pre_tokenize(" hello");
assert!(!words.is_empty());
assert!(words[0].starts_with(' '));
}
#[test]
fn test_bpe_tokens_to_bytes_invalid_chars() {
let tokenizer = BpeTokenizer::gpt2_base();
// Characters outside the byte-level alphabet must not panic; the output itself is unspecified.
let result = tokenizer.bpe_tokens_to_bytes("αβγ");
let _ = result;
}
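// Special tokens embedded in the input should encode to their reserved IDs (151644 = <|im_start|>).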
#[test]
fn test_qwen2_encode_special_tokens() {
let tokenizer = Qwen2BpeTokenizer::new();
let text = "<|im_start|>user";
let tokens = tokenizer.encode(text);
assert!(tokens.contains(&151644));
}
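// Both merges should apply regardless of insertion order, collapsing the four tokens to two.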
#[test]
fn test_bpe_merge_priority() {
let mut tokenizer = BpeTokenizer::new(BpeConfig::default());
tokenizer.add_merge("x", "y");
tokenizer.add_merge("a", "b");
let tokens = vec![
"a".to_string(),
"b".to_string(),
"x".to_string(),
"y".to_string(),
];
let result = tokenizer.bpe(&tokens);
assert_eq!(result.len(), 2);
}