use oxibonsai_runtime::native_tokenizer::{NativeTokenizerBridge, NativeTokenizerError};
/// Encoding `"hello"` with the `char_level_fallback()` bridge and decoding the
/// resulting IDs must reproduce `"hello"` exactly.
#[test]
fn char_level_fallback_encode_decode() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();

    let token_ids = tokenizer.encode("hello").expect("encode should succeed");
    let round_tripped = tokenizer
        .decode(&token_ids)
        .expect("decode should succeed");

    assert_eq!(round_tripped, "hello");
}
/// Encoding the non-empty input `"hello"` must produce at least one token ID.
#[test]
fn char_level_fallback_nonempty() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();
    let token_ids = tokenizer.encode("hello").expect("encode should succeed");

    let produced_tokens = !token_ids.is_empty();
    assert!(
        produced_tokens,
        "encoding a non-empty string must yield tokens"
    );
}
/// Round-trips a longer word through `encode` / `decode`.
///
/// Checks two things: the encoder emits at least one token per character of
/// the input, and decoding those tokens reproduces the input exactly.
#[test]
fn char_level_fallback_roundtrip_long() {
    let bridge = NativeTokenizerBridge::char_level_fallback();
    let original = "thequickbrownfox";
    let ids = bridge.encode(original).expect("encode should succeed");

    // `str::len` counts bytes, not characters. Count `char`s so the check
    // matches its own message even if the sample text is later changed to
    // include multi-byte characters.
    let char_count = original.chars().count();
    assert!(
        ids.len() >= char_count,
        "must encode at least one token per char"
    );

    let decoded = bridge.decode(&ids).expect("decode should succeed");
    assert_eq!(decoded, original);
}
/// The bridge returned by `char_level_fallback()` must report a vocabulary
/// size greater than zero.
#[test]
fn native_tokenizer_vocab_size_positive() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();

    // Query once and reuse the value for both the check and the message.
    let vocab = tokenizer.vocab_size();
    assert!(vocab > 0, "vocab_size must be positive, got {}", vocab);
}
/// Encoding the same text twice with one bridge must give identical ID
/// sequences.
#[test]
fn char_level_encode_consistent() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();
    let sample = "consistent";

    let first_pass = tokenizer.encode(sample).expect("first encode");
    let second_pass = tokenizer.encode(sample).expect("second encode");

    assert_eq!(
        first_pass, second_pass,
        "encoding the same text twice must yield equal IDs"
    );
}
/// Inputs containing a space and a newline, and an input containing a
/// non-ASCII character, must both encode successfully to a non-empty ID list.
#[test]
fn char_level_special_chars() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();

    // ASCII text with whitespace characters.
    let ascii_text = "hello world\nhow are you";
    let ids_ascii = tokenizer
        .encode(ascii_text)
        .expect("encode ascii with space+newline");
    assert!(!ids_ascii.is_empty());

    // Text with a non-ASCII character.
    let unicode_text = "café";
    let ids_unicode = tokenizer.encode(unicode_text).expect("encode unicode");
    assert!(!ids_unicode.is_empty());
}
/// Decoding an empty ID slice must succeed and return the empty string.
#[test]
fn native_tokenizer_decode_empty() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();

    let decoded = tokenizer.decode(&[]);
    let decoded = decoded.expect("decoding empty slice should succeed");

    assert_eq!(decoded, "", "decoding [] must return an empty string");
}
/// `format_chat` on the plain `char_level_fallback()` bridge must fail with
/// `NativeTokenizerError::NoChatTemplate`.
#[test]
fn native_tokenizer_format_chat_no_template() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();
    let outcome = tokenizer.format_chat(&[("user", "Hello!")]);

    // Any other outcome, whether `Ok` or a different error, is a failure.
    assert!(
        matches!(outcome, Err(NativeTokenizerError::NoChatTemplate)),
        "expected NoChatTemplate, got {:?}",
        outcome
    );
}
/// `format_chat` on the `char_level_fallback_with_chatml()` bridge must
/// succeed and produce a prompt containing the `<|im_start|>` / `<|im_end|>`
/// markers, the role names and the message contents.
///
/// Covers a single user message and a system + user conversation.
#[test]
fn native_tokenizer_with_chatml_format() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback_with_chatml();

    // Single-message conversation.
    let single_turn = [("user", "Hello!")];
    let prompt = tokenizer
        .format_chat(&single_turn)
        .expect("format_chat should succeed");
    assert!(
        prompt.contains("<|im_start|>user"),
        "prompt must contain im_start+role; got: {prompt:?}"
    );
    assert!(
        prompt.contains("Hello!"),
        "prompt must contain the message content; got: {prompt:?}"
    );
    assert!(
        prompt.contains("<|im_end|>"),
        "prompt must contain im_end; got: {prompt:?}"
    );

    // System message followed by a user message.
    let two_turns = [
        ("system", "You are a helpful assistant."),
        ("user", "What is 2+2?"),
    ];
    let multi = tokenizer
        .format_chat(&two_turns)
        .expect("multi-turn format_chat should succeed");
    assert!(multi.contains("<|im_start|>system"));
    assert!(multi.contains("You are a helpful assistant."));
    assert!(multi.contains("<|im_start|>user"));
    assert!(multi.contains("What is 2+2?"));
}
/// Encoding the empty string must succeed and yield at most one token.
///
/// The assertion message attributes the one permitted token to a BOS marker.
#[test]
fn native_tokenizer_encode_empty() {
    let tokenizer = NativeTokenizerBridge::char_level_fallback();
    let token_ids = tokenizer
        .encode("")
        .expect("encoding empty string should succeed");

    let token_count = token_ids.len();
    assert!(
        token_count <= 1,
        "encoding empty string should yield 0 or 1 tokens (BOS), got {}",
        token_count
    );
}