use super::*;
#[test]
fn test_sentencepiece_viterbi_all_unknown() {
    // With only <unk> in the vocab, every input character maps to id 0.
    let pieces = vec![("<unk>".to_string(), 0.0)];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    let ids = sp.encode("xyz");
    assert_eq!(ids.len(), 3);
    for &id in &ids {
        assert_eq!(id, 0);
    }
}
#[test]
fn test_sentencepiece_viterbi_partial_match() {
    // Outer characters are in the vocab; the middle one falls back to <unk>.
    let pieces = vec![
        ("<unk>".to_string(), 0.0),
        ("a".to_string(), -1.0),
        ("c".to_string(), -1.0),
    ];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    let ids = sp.encode("abc");
    assert_eq!(ids, vec![1, 0, 2]);
}
#[test]
fn test_sentencepiece_viterbi_long_unknown_sequence() {
    // Known chars at both ends with a run of unknowns in between.
    let pieces = vec![("<unk>".to_string(), 0.0), ("x".to_string(), -1.0)];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    let ids = sp.encode("xaaaax");
    assert_eq!(ids, vec![1, 0, 0, 0, 0, 1]);
}
#[test]
fn test_sentencepiece_score_ordering() {
    // The pair "hel"+"lo" (total -1.0) must beat the single piece "hello" (-10.0).
    let pieces = vec![
        ("<unk>".to_string(), 0.0),
        ("hello".to_string(), -10.0),
        ("hel".to_string(), -0.5),
        ("lo".to_string(), -0.5),
    ];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    assert_eq!(sp.encode("hello"), vec![2, 3]);
}
#[test]
fn test_sentencepiece_unicode_chars() {
    // The merged two-char piece scores better than the two singles combined,
    // so the multi-byte (hiragana) input encodes to a single id.
    let pieces = vec![
        ("<unk>".to_string(), 0.0),
        ("\u{3053}".to_string(), -1.0),
        ("\u{3093}".to_string(), -1.0),
        ("\u{3053}\u{3093}".to_string(), -0.5),
    ];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    assert_eq!(sp.encode("\u{3053}\u{3093}"), vec![3]);
}
#[test]
fn test_vocabulary_large_id() {
    // Id/token lookups must round-trip at the top end of a 1000-entry vocab.
    let names: Vec<String> = (0..1000).map(|i| format!("token_{i}")).collect();
    let v = Vocabulary::from_tokens(names).expect("test");
    assert_eq!(v.size(), 1000);
    assert_eq!(v.get_id("token_999"), Some(999));
    assert_eq!(v.get_token(999), Some("token_999"));
}
#[test]
fn test_tokenizer_multiple_unknowns() {
    // Three out-of-vocab words, then one known word.
    let words = vec!["<unk>".to_string(), "a".to_string()];
    let v = Vocabulary::from_tokens(words).expect("test");
    let tok = Tokenizer::new(v, "<unk>").expect("test");
    assert_eq!(tok.encode("x y z a"), vec![0, 0, 0, 1]);
}
#[test]
fn test_tokenizer_whitespace_only() {
    // Whitespace-only input produces no tokens at all.
    let words = vec!["<unk>".to_string()];
    let v = Vocabulary::from_tokens(words).expect("test");
    let tok = Tokenizer::new(v, "<unk>").expect("test");
    assert!(tok.encode("   ").is_empty());
}
#[test]
fn test_bpe_encode_byte_token_in_vocab() {
    // U+00E9 is two UTF-8 bytes (0xC3 0xA9); both byte-fallback tokens exist,
    // so the character encodes to the pair of byte ids.
    let vocab = vec![
        "<unk>".to_string(),
        "<0xC3>".to_string(),
        "<0xA9>".to_string(),
    ];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.encode("\u{00E9}"), vec![1, 2]);
}
#[test]
fn test_bpe_encode_mixed_known_unknown_bytes() {
    // Only the first UTF-8 byte of U+00E9 has a byte token; the second byte
    // must fall back to <unk>.
    let vocab = vec!["<unk>".to_string(), "<0xC3>".to_string()];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.encode("\u{00E9}"), vec![1, 0]);
}
#[test]
fn test_sentencepiece_roundtrip_with_unknown() {
    // Decoding an encoding that contains unknown spans keeps the known pieces
    // and renders the <unk> marker somewhere in the output.
    let pieces = vec![
        ("<unk>".to_string(), 0.0),
        ("hello".to_string(), -1.0),
        ("x".to_string(), -1.0),
    ];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    let ids = sp.encode("helloYworld");
    let text = sp.decode(&ids).expect("test");
    assert!(text.contains("hello"));
    assert!(text.contains("<unk>"));
}
#[test]
fn test_bpe_roundtrip_with_spaces() {
    // \u{0120} ("Ġ") marks a leading space in GPT-2-style vocabularies;
    // encode then decode must restore the original spacing.
    let vocab = vec![
        "<unk>".to_string(),
        "\u{0120}hello".to_string(),
        "\u{0120}world".to_string(),
    ];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    let ids = bpe.encode(" hello world");
    assert_eq!(bpe.decode(&ids).expect("test"), " hello world");
}
#[test]
fn test_bpe_decode_consecutive_special_chars() {
    // Doubled space (\u{0120}) and newline (\u{010A}) marker tokens decode
    // back into the corresponding runs of raw characters: two spaces, two
    // newlines.
    let vocab = vec![
        "<unk>".to_string(),
        "\u{0120}".to_string(),
        "\u{0120}\u{0120}".to_string(),
        "\u{010A}".to_string(),
        "\u{010A}\u{010A}".to_string(),
    ];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.decode(&[2, 4]).expect("test"), "  \n\n");
}
#[test]
fn test_bpe_encode_only_spaces() {
    // Three spaces encode to three \u{0120} ("Ġ") tokens.
    let vocab = vec!["<unk>".to_string(), "\u{0120}".to_string()];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.encode("   "), vec![1, 1, 1]);
}
#[test]
fn test_bpe_encode_only_newlines() {
    // Three newlines encode to three \u{010A} ("Ċ") tokens.
    let vocab = vec!["<unk>".to_string(), "\u{010A}".to_string()];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.encode("\n\n\n"), vec![1, 1, 1]);
}
#[test]
fn test_sentencepiece_equal_scores() {
    // "a"+"b" and "ab" tie at a total score of -2.0; either segmentation is
    // acceptable as long as decoding restores the input exactly.
    let pieces = vec![
        ("<unk>".to_string(), 0.0),
        ("a".to_string(), -1.0),
        ("b".to_string(), -1.0),
        ("ab".to_string(), -2.0),
    ];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    let ids = sp.encode("ab");
    assert!(!ids.is_empty());
    assert_eq!(sp.decode(&ids).expect("test"), "ab");
}
#[test]
fn test_sentencepiece_negative_infinity_unreachable() {
    // No piece covers the single char "a" ("ab" is too long), so the only
    // way to segment is a single <unk>.
    let pieces = vec![("<unk>".to_string(), 0.0), ("ab".to_string(), -1.0)];
    let sp = SentencePieceTokenizer::new(pieces, "<unk>").expect("test");
    assert_eq!(sp.encode("a"), vec![0]);
}
#[test]
fn test_bpe_very_long_token() {
    // A 30-char token matches as a single id; a 35-char run of the same
    // character must split into 6 tokens instead.
    let long_token = "a".repeat(30);
    let vocab = vec!["<unk>".to_string(), long_token.clone()];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.encode(&long_token), vec![1]);
    let very_long = "a".repeat(35);
    assert_eq!(bpe.encode(&very_long).len(), 6);
}
#[test]
fn test_vocabulary_special_chars_in_tokens() {
    // Tabs, newlines, and spaces inside a token string must not break lookup.
    let tokens = vec![
        "<unk>".to_string(),
        "hello\tworld".to_string(),
        "foo\nbar".to_string(),
        "a b c".to_string(),
    ];
    let v = Vocabulary::from_tokens(tokens).expect("test");
    assert_eq!(v.get_id("hello\tworld"), Some(1));
    assert_eq!(v.get_id("foo\nbar"), Some(2));
    assert_eq!(v.get_id("a b c"), Some(3));
}
#[test]
fn test_bpe_decode_unk_token_skipped() {
    // <unk> ids are dropped from decoded output rather than rendered.
    let vocab = vec!["<unk>".to_string(), "hello".to_string()];
    let bpe = BPETokenizer::new(vocab, vec![], "<unk>").expect("test");
    assert_eq!(bpe.decode(&[0, 1, 0]).expect("test"), "hello");
}