use super::*;
#[test]
fn test_bpe_encode_multibyte_unicode() {
    // U+4E2D encodes as three UTF-8 bytes (E4 B8 AD); with the matching
    // byte-fallback tokens in the vocab, each byte gets its own id.
    let vocab = ["<unk>", "<0xE4>", "<0xB8>", "<0xAD>"]
        .map(String::from)
        .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let ids = tok.encode("\u{4E2D}");
    assert_eq!(ids, vec![1, 2, 3]);
}
#[test]
fn test_bpe_encode_multibyte_to_unk() {
    // Without byte-fallback tokens, each of the 3 UTF-8 bytes of U+4E2D
    // collapses to the <unk> id (0).
    let vocab = ["<unk>"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let ids = tok.encode("\u{4E2D}");
    assert_eq!(ids.len(), 3);
    assert!(ids.iter().all(|&id| id == 0));
}
#[test]
fn test_bpe_encode_emoji_fallback() {
    // U+1F600 is a 4-byte UTF-8 sequence; with only <unk> in the vocab,
    // every byte maps to id 0.
    let vocab = ["<unk>"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let ids = tok.encode("\u{1F600}");
    assert_eq!(ids.len(), 4);
    assert!(ids.iter().all(|&id| id == 0));
}
#[test]
fn test_bpe_encode_long_token_match() {
    // The longest matching vocab entry wins; a trailing unmatched
    // character falls back to <unk>.
    let vocab = ["<unk>", "a", "ab", "abc", "abcd", "abcde", "abcdefghij"]
        .map(String::from)
        .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");

    // Exact full-length match.
    assert_eq!(tok.encode("abcdefghij"), vec![6]);
    // Same prefix plus one unknown character.
    assert_eq!(tok.encode("abcdefghijk"), vec![6, 0]);
}
#[test]
fn test_bpe_encode_newline_and_carriage_return() {
    // '\n' and '\r' are expected to map to the remapped whitespace tokens
    // U+010A and U+1E02 respectively (per this tokenizer's byte remapping).
    let vocab = ["<unk>", "\u{010A}", "\u{1E02}", "a"]
        .map(String::from)
        .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");

    assert_eq!(tok.encode("a\na"), vec![3, 1, 3]);
    assert_eq!(tok.encode("a\ra"), vec![3, 2, 3]);
}
#[test]
fn test_bpe_encode_mixed_special_whitespace() {
    // Space, newline and carriage return interleaved with a regular
    // character: each whitespace byte resolves to its remapped token.
    let vocab = ["<unk>", "\u{0120}", "\u{010A}", "\u{1E02}", "x"]
        .map(String::from)
        .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let ids = tok.encode("x x\n\rx");
    assert_eq!(ids, vec![4, 1, 4, 2, 3, 4]);
}
#[test]
fn test_bpe_decode_all_special_tokens() {
    // Decoding skips every special/control token; only the ordinary
    // "hello" token survives in the output text.
    let vocab = [
        "<unk>",
        "<s>",
        "</s>",
        "<pad>",
        "<|user|>",
        "<|assistant|>",
        "hello",
    ]
    .map(String::from)
    .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[0, 1, 2, 3, 4, 5, 6]).expect("test");
    assert_eq!(text, "hello");
}
#[test]
fn test_bpe_decode_byte_token_invalid_length() {
    // Tokens that look like byte-fallbacks but have the wrong hex length
    // are not treated as bytes: they pass through to the output verbatim.
    let vocab = ["<unk>", "<0xE6E>", "<0xE>"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1, 2]).expect("test");
    assert!(text.contains("<0xE6E>"));
    assert!(text.contains("<0xE>"));
}
#[test]
fn test_bpe_decode_utf8_invalid_sequence() {
    // 0xFF and 0xFE can never appear in valid UTF-8, so decoding the byte
    // tokens must substitute the replacement character instead of failing.
    let vocab = ["<unk>", "<0xFF>", "<0xFE>"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1, 2]).expect("test");
    assert!(text.contains('\u{FFFD}'));
}
#[test]
fn test_bpe_decode_gpt2_control_chars() {
    // U+0100..U+010D fall in the GPT-2-style remapped control-character
    // range; decoding them should produce some (non-empty) output.
    let vocab = ["<unk>", "\u{0100}", "\u{0109}", "\u{010D}"]
        .map(String::from)
        .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1, 2, 3]).expect("test");
    assert!(!text.is_empty());
}
#[test]
fn test_bpe_decode_gpt2_del_and_extended() {
    // Tokens at the edge of and beyond the GPT-2-style remapped range
    // (U+017F, U+0180, U+01A0) should still decode to non-empty text.
    let vocab = ["<unk>", "\u{017F}", "\u{0180}", "\u{01A0}"]
        .map(String::from)
        .to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1, 2, 3]).expect("test");
    assert!(!text.is_empty());
}
#[test]
fn test_bpe_decode_gpt2_unmapped_range() {
    // U+0150 sits inside the 0x100..0x180 window but is presumably not a
    // remapped byte; decoding must still yield non-empty output.
    let vocab = ["<unk>", "\u{0150}"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1]).expect("test");
    assert!(!text.is_empty());
}
#[test]
fn test_bpe_decode_high_unicode_non_gpt2() {
    // Characters outside any byte-remapping window decode to themselves.
    let vocab = ["<unk>", "\u{0200}", "\u{1000}"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1, 2]).expect("test");
    assert!(text.contains('\u{0200}'));
    assert!(text.contains('\u{1000}'));
}
#[test]
fn test_bpe_merge_no_applicable_pairs() {
    // The (a, b) merge rule never fires on "xyz", so each character stays
    // its own token.
    let vocab = ["<unk>", "x", "y", "z", "ab"].map(String::from).to_vec();
    let merges = vec![("a".to_owned(), "b".to_owned())];
    let tok = BPETokenizer::new(vocab, merges, "<unk>").expect("test");
    assert_eq!(tok.encode("xyz"), vec![1, 2, 3]);
}
#[test]
fn test_bpe_merge_partial_sequence() {
    // "ab" merges into one token; the trailing "c" is emitted unmerged.
    let vocab = ["<unk>", "a", "b", "c", "ab"].map(String::from).to_vec();
    let merges = vec![("a".to_owned(), "b".to_owned())];
    let tok = BPETokenizer::new(vocab, merges, "<unk>").expect("test");
    assert_eq!(tok.encode("abc"), vec![4, 3]);
}
#[test]
fn test_bpe_merge_multiple_pairs_in_sequence() {
    // The same merge rule applies twice across "abab".
    let vocab = ["<unk>", "a", "b", "ab"].map(String::from).to_vec();
    let merges = vec![("a".to_owned(), "b".to_owned())];
    let tok = BPETokenizer::new(vocab, merges, "<unk>").expect("test");
    assert_eq!(tok.encode("abab"), vec![3, 3]);
}
#[test]
fn test_bpe_merge_trailing_unmerged() {
    // "aba" merges the leading pair; the final "a" has no partner left.
    let vocab = ["<unk>", "a", "b", "ab"].map(String::from).to_vec();
    let merges = vec![("a".to_owned(), "b".to_owned())];
    let tok = BPETokenizer::new(vocab, merges, "<unk>").expect("test");
    assert_eq!(tok.encode("aba"), vec![3, 1]);
}
#[test]
fn test_bpe_merge_leading_unmerged() {
    // A non-mergeable leading "x" must not prevent the (a, b) merge
    // that follows it.
    let vocab = ["<unk>", "x", "a", "b", "ab"].map(String::from).to_vec();
    let merges = vec![("a".to_owned(), "b".to_owned())];
    let tok = BPETokenizer::new(vocab, merges, "<unk>").expect("test");
    assert_eq!(tok.encode("xab"), vec![1, 4]);
}
#[test]
fn test_bpe_encode_single_char_no_merges() {
    // Minimal case: one character, no merge rules.
    let vocab = ["<unk>", "a"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    assert_eq!(tok.encode("a"), vec![1]);
}
#[test]
fn test_bpe_decode_printable_ascii() {
    // Printable ASCII tokens decode verbatim, preserving order.
    let vocab = ["<unk>", "!", "~", "A"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1, 2, 3]).expect("test");
    assert_eq!(text, "!~A");
}
#[test]
fn test_bpe_gpt2_soft_hyphen() {
    // U+01AD is presumably the GPT-2-style remapping of the soft hyphen
    // byte (0xAD); decoding it should produce non-empty output.
    let vocab = ["<unk>", "\u{01AD}"].map(String::from).to_vec();
    let tok = BPETokenizer::new(vocab, Vec::new(), "<unk>").expect("test");
    let text = tok.decode(&[1]).expect("test");
    assert!(!text.is_empty());
}