use super::*;
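// A well-formed byte-fallback token such as `<0x0A>` decodes to the raw byte it names.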
#[test]
fn test_decode_byte_token() {
    let tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "<0x0A>".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    let decoded = tokenizer.decode(&[3]);
    assert_eq!(decoded, "\n");
}
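// A token that merely resembles byte fallback but carries invalid hex is emitted verbatim.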
#[test]
fn test_decode_invalid_byte_token() {
    let tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "<0xZZ>".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    let decoded = tokenizer.decode(&[3]);
    assert_eq!(decoded, "<0xZZ>");
}
#[test]
fn test_encode_gpt2_mode() {
    let mut tokenizer = create_test_tokenizer();
    tokenizer.set_model(TokenizerModel::Gpt2);
    let tokens = tokenizer.encode("Hello World");
    assert!(!tokens.is_empty());
}
#[test]
fn test_from_gguf_too_short() {
let short_data = b"GGUF";
let result = LlamaTokenizer::from_gguf_bytes(short_data);
assert!(result.is_err());
}
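// A valid GGUF v3 header with zero metadata entries lacks the vocabulary key and must fail.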
#[test]
fn test_from_gguf_missing_tokens() {
    let mut data = Vec::new();
    data.extend_from_slice(b"GGUF");
    data.extend_from_slice(&3u32.to_le_bytes()); // version
    data.extend_from_slice(&0u64.to_le_bytes()); // tensor count
    data.extend_from_slice(&0u64.to_le_bytes()); // metadata KV count
    let result = LlamaTokenizer::from_gguf_bytes(&data);
    assert!(result.is_err());
}
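// Smoke-test Debug formatting and Clone across every GGUFValue variant.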
#[test]
fn test_gguf_value_variants() {
    let values = vec![
        GGUFValue::UInt8(1),
        GGUFValue::Int8(-1),
        GGUFValue::UInt16(100),
        GGUFValue::Int16(-100),
        GGUFValue::UInt32(1000),
        GGUFValue::Int32(-1000),
        GGUFValue::Float32(1.5),
        GGUFValue::Bool(true),
        GGUFValue::String("test".to_string()),
        GGUFValue::Array(vec![GGUFValue::UInt8(1)]),
        GGUFValue::UInt64(10000),
        GGUFValue::Int64(-10000),
        GGUFValue::Float64(3.14),
    ];
    for val in &values {
        let debug = format!("{:?}", val);
        assert!(!debug.is_empty());
    }
    let cloned = values.clone();
    assert_eq!(values.len(), cloned.len());
}
#[test]
fn test_decode_gpt2_non_mapped_char() {
    let bytes = decode_gpt2_token("日本語");
    let text = String::from_utf8_lossy(&bytes);
    assert!(!text.is_empty());
}
#[test]
fn test_gpt2_decode_full_sentence() {
    let mut tokenizer = create_test_tokenizer();
    tokenizer.set_model(TokenizerModel::Gpt2);
    let decoded = tokenizer.decode(&[
        tokenizer.bos_token_id(),
        3,
        tokenizer.eos_token_id(),
    ]);
    assert!(!decoded.contains("<s>"));
    assert!(!decoded.contains("</s>"));
}
#[test]
fn test_vocab_size_accessor() {
    let tokenizer = create_test_tokenizer();
    assert!(tokenizer.vocab_size() > 0);
}
#[test]
fn test_constants() {
    assert_eq!(LLAMA_VOCAB_SIZE, 32000);
    assert_eq!(BOS_TOKEN, "<s>");
    assert_eq!(EOS_TOKEN, "</s>");
    assert_eq!(UNK_TOKEN, "<unk>");
}
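// With the full <0x00>..<0xFF> fallback table present, multi-byte UTF-8 input (an emoji) must still encode.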
#[test]
fn test_encode_with_byte_fallback_for_emoji() {
    let mut tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "▁Hello".to_string(),
    ];
    for i in 0u8..=255 {
        tokens.push(format!("<0x{i:02X}>"));
    }
    let scores = vec![0.0; tokens.len()];
    let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    let encoded = tokenizer.encode("🎉");
    assert!(!encoded.is_empty());
}
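// Out-of-vocabulary ids are skipped during decoding rather than panicking.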
#[test]
fn test_decode_skips_unknown_token_id() {
    let tokenizer = create_test_tokenizer();
    let decoded = tokenizer.decode(&[9999]);
    assert!(decoded.is_empty());
}
#[test]
fn test_gpt2_byte_decoder_size() {
    let decoder = build_gpt2_byte_decoder();
    assert_eq!(decoder.len(), 256);
}
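// A GPT-2 style "Ġ" space marker appearing in a SentencePiece vocab is still handled during decoding.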
#[test]
fn test_decode_sentencepiece_handles_hybrid_space() {
    let tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "Ġworld".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    let decoded = tokenizer.decode(&[3]);
    assert_eq!(decoded, "world");
}
#[test]
fn test_pad_token_accessors() {
    let tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "<pad>".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    assert!(tokenizer.pad_token_id.is_none());
}
#[test]
fn test_encode_long_text() {
    let tokenizer = create_test_tokenizer();
    let long_text = "Hello World Hello World Hello World Hello World";
    let tokens = tokenizer.encode(long_text);
    assert!(!tokens.is_empty());
}
#[test]
fn test_encode_special_chars() {
    let tokenizer = create_test_tokenizer();
    let text_with_special = "Hello\tWorld\nNew Line";
    let tokens = tokenizer.encode(text_with_special);
    assert!(!tokens.is_empty());
}
#[test]
fn test_encode_numbers() {
    let tokenizer = create_test_tokenizer();
    let numeric = "12345 67890";
    let tokens = tokenizer.encode(numeric);
    assert!(!tokens.is_empty());
}
#[test]
fn test_decode_empty() {
    let tokenizer = create_test_tokenizer();
    let decoded = tokenizer.decode(&[]);
    assert!(decoded.is_empty());
}
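// In GPT-2 mode, special tokens such as <|endoftext|> and </s> are stripped from decoded output.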
#[test]
fn test_decode_gpt2_with_all_special_tokens() {
    let tokens = vec![
        "<unk>".to_string(),
        "<|endoftext|>".to_string(),
        "</s>".to_string(),
        "test".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let mut tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    tokenizer.set_model(TokenizerModel::Gpt2);
    let decoded = tokenizer.decode(&[1, 2]);
    assert!(decoded.is_empty());
}
#[test]
fn test_decode_gpt2_mixed_tokens() {
    let tokens = vec![
        "<unk>".to_string(),
        "<|endoftext|>".to_string(),
        "</s>".to_string(),
        "Hello".to_string(),
        "Ġworld".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let mut tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    tokenizer.set_model(TokenizerModel::Gpt2);
    let decoded = tokenizer.decode(&[1, 3, 4, 2]);
    assert_eq!(decoded, "Hello world");
}
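// "Ċ" is GPT-2's newline marker; encoding text containing '\n' must still succeed.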
#[test]
fn test_encode_gpt2_with_newlines() {
    let tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "Hello".to_string(),
        "Ċworld".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let mut tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    tokenizer.set_model(TokenizerModel::Gpt2);
    let encoded = tokenizer.encode("test\nline");
    assert!(!encoded.is_empty());
}
#[test]
fn test_token_to_id_all_tokens() {
    let tokenizer = create_test_tokenizer();
    assert!(tokenizer.token_to_id("<unk>").is_some());
    assert!(tokenizer.token_to_id("<s>").is_some());
    assert!(tokenizer.token_to_id("</s>").is_some());
}
#[test]
fn test_id_to_token_boundary() {
    let tokenizer = create_test_tokenizer();
    assert!(tokenizer.id_to_token(0).is_some());
    assert!(tokenizer
        .id_to_token(tokenizer.vocab_size() as u32 - 1)
        .is_some());
    assert!(tokenizer
        .id_to_token(tokenizer.vocab_size() as u32)
        .is_none());
}
#[test]
fn test_eos_token_id_accessor() {
    let tokenizer = create_test_tokenizer();
    assert_eq!(tokenizer.eos_token_id(), 2);
}
#[test]
fn test_unk_token_id_accessor() {
    let tokenizer = create_test_tokenizer();
    assert_eq!(tokenizer.unk_token_id(), 0);
}
#[test]
fn test_bos_token_id_accessor() {
    let tokenizer = create_test_tokenizer();
    assert_eq!(tokenizer.bos_token_id(), 1);
}
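// encode_with_bos always prepends the BOS id, even for empty input.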
#[test]
fn test_encode_with_bos_empty() {
    let tokenizer = create_test_tokenizer();
    let tokens = tokenizer.encode_with_bos("");
    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0], tokenizer.bos_token_id());
}
#[test]
fn test_encode_with_bos_single_word() {
    let tokenizer = create_test_tokenizer();
    let tokens = tokenizer.encode_with_bos("Hello");
    assert!(tokens.len() >= 2);
    assert_eq!(tokens[0], tokenizer.bos_token_id());
}
#[test]
fn test_decode_sentencepiece_byte_fallback_invalid() {
    let tokens = vec![
        "<unk>".to_string(),
        "<s>".to_string(),
        "</s>".to_string(),
        "<0x".to_string(),
    ];
    let scores = vec![0.0; tokens.len()];
    let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
    let decoded = tokenizer.decode(&[3]);
    assert_eq!(decoded, "<0x");
}
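// Exercises the GGUF v1 header path; no assertion on the outcome since the header carries no vocabulary.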
#[test]
fn test_from_gguf_bytes_v1_format() {
    let mut data = Vec::new();
    data.extend_from_slice(b"GGUF");
    data.extend_from_slice(&1u32.to_le_bytes());
    data.extend_from_slice(&0u64.to_le_bytes());
    data.extend_from_slice(&0u64.to_le_bytes());
    let result = LlamaTokenizer::from_gguf_bytes(&data);
    let _ = result.is_ok();
}
#[test]
fn test_gguf_value_array_nested() {
    let value = GGUFValue::Array(vec![
        GGUFValue::Array(vec![GGUFValue::UInt8(1), GGUFValue::UInt8(2)]),
        GGUFValue::Array(vec![GGUFValue::UInt8(3), GGUFValue::UInt8(4)]),
    ]);
    let debug = format!("{:?}", value);
    assert!(debug.contains("Array"));
}
#[test]
fn test_tokenizer_model_values() {
    let sp = TokenizerModel::SentencePiece;
    let gpt2 = TokenizerModel::Gpt2;
    assert_ne!(sp, gpt2);
    assert_eq!(sp, TokenizerModel::default());
}
#[test]
fn test_encode_all_unk() {
let tokens = vec!["<unk>".to_string(), "<s>".to_string(), "</s>".to_string()];
let scores = vec![0.0; tokens.len()];
let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
let encoded = tokenizer.encode("xyz123");
assert!(!encoded.is_empty());
}
#[test]
fn test_decode_multiple_byte_tokens() {
let mut tokens = vec!["<unk>".to_string(), "<s>".to_string(), "</s>".to_string()];
for i in 0u8..10 {
tokens.push(format!("<0x{:02X}>", i));
}
let scores = vec![0.0; tokens.len()];
let tokenizer = LlamaTokenizer::new(tokens, scores, 1, 2, 0).unwrap();
let decoded = tokenizer.decode(&[3, 4, 5]); assert_eq!(decoded.len(), 3);
}
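// The GPT-2 byte decoder covers all 256 bytes; printable ASCII must appear among its values.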
#[test]
fn test_gpt2_byte_decoder_all_chars() {
    let decoder = build_gpt2_byte_decoder();
    assert_eq!(decoder.len(), 256);
    for b in b'!'..=b'~' {
        assert!(decoder.values().any(|&v| v == b));
    }
}
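// "Ġ" and "Ċ" inside a single token expand to a space and a newline respectively.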
#[test]
fn test_decode_gpt2_token_complex() {
let token = "HelloĠworldĊnewline";
let bytes = decode_gpt2_token(token);
let text = String::from_utf8_lossy(&bytes);
assert!(text.contains(" ")); assert!(text.contains("\n")); }