#![cfg(any(
feature = "tokenizer-spm",
feature = "tokenizer-bpe",
feature = "tokenizer-deepseek-v32",
feature = "tokenizer-tools"
))]
/// Builds SPM streaming detokenizers whose vocabularies contain very large,
/// sparse token ids (up to `u32::MAX`) and checks the decoded text.
///
/// As the test name suggests, the concern is presumably that construction must
/// not allocate proportionally to the largest id; this test can only observe
/// that construction and decoding complete and yield the expected strings.
#[cfg(feature = "tokenizer-spm")]
#[test]
fn spm_huge_sparse_vocab_id_no_oom_and_detok_correct() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::SpmStreamingDetokenizer};

    // Small helper so each vocabulary entry reads as (piece, id).
    let entry = |piece: &str, id: u32| (piece.to_owned(), id);

    // Scenario 1: dense low ids decoded while huge ids merely exist in the vocab.
    {
        let mixed_vocab = vec![
            entry("\u{2581}Hello", 0u32),
            entry("\u{2581}world", 1u32),
            entry("!", 2u32),
            entry("\u{2581}sparse", u32::MAX),
            entry("\u{2581}mid", 9_999_999u32),
        ];
        let mut detok = SpmStreamingDetokenizer::new(mixed_vocab, true);
        detok.reset();
        for id in 0u32..3 {
            detok.add_token(id);
        }
        detok.finalize();
        // With the flag set to `true` the leading word-boundary marker yields no
        // leading space in the output.
        assert_eq!(detok.text(), "Hello world!");
    }

    // Scenario 2: the huge ids themselves are the ones being decoded.
    {
        let sparse_vocab = vec![
            entry("\u{2581}far", u32::MAX),
            entry("\u{2581}away", 4_000_000_000u32),
        ];
        let mut detok = SpmStreamingDetokenizer::new(sparse_vocab, true);
        detok.reset();
        for id in [u32::MAX, 4_000_000_000u32] {
            detok.add_token(id);
        }
        detok.finalize();
        assert_eq!(detok.text(), "far away");
    }

    // Scenario 3: same as above but with the flag set to `false`; the leading
    // space is then kept in the output.
    {
        let single_vocab = vec![entry("\u{2581}x", u32::MAX)];
        let mut detok = SpmStreamingDetokenizer::new(single_vocab, false);
        detok.reset();
        detok.add_token(u32::MAX);
        detok.finalize();
        assert_eq!(detok.text(), " x");
    }
}
/// Constructs an SPM streaming detokenizer from a vocabulary that holds a
/// byte-token look-alike containing a non-ASCII character (`<0x€`, no closing
/// `>`), then decodes the well-formed `<0x41>` token.
///
/// The test passes when construction does not panic and `<0x41>` decodes to `A`.
#[cfg(feature = "tokenizer-spm")]
#[test]
fn spm_byte_token_with_non_ascii_does_not_panic() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::SpmStreamingDetokenizer};

    // Starts like a byte token but is followed by a multi-byte character.
    let malformed_byte_piece = ("<0x\u{20AC}".to_owned(), 0u32);
    // Well-formed byte token for 0x41 ('A').
    let valid_byte_piece = ("<0x41>".to_owned(), 1u32);

    let mut detok =
        SpmStreamingDetokenizer::new(vec![malformed_byte_piece, valid_byte_piece], false);
    detok.reset();
    // Only the valid byte token is fed; the malformed one just has to be
    // tolerated in the vocabulary.
    detok.add_token(1);
    detok.finalize();

    assert_eq!(detok.text(), "A");
}
/// Builds BPE streaming detokenizers whose vocabularies contain very large,
/// sparse token ids (up to `u32::MAX`) and checks the decoded text.
///
/// The second scenario also feeds an id that is absent from the vocabulary and
/// asserts it contributes nothing to the output.
#[cfg(feature = "tokenizer-bpe")]
#[test]
fn bpe_huge_sparse_vocab_id_no_oom_and_detok_correct() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::BpeStreamingDetokenizer};

    // Small helper so each vocabulary entry reads as (piece, id).
    let entry = |piece: &str, id: u32| (piece.to_owned(), id);

    // Scenario 1: low ids are decoded while huge ids merely exist in the vocab.
    {
        let mixed_vocab = vec![
            entry("Hello", 0u32),
            entry("\u{0120}world", 1u32),
            entry("\u{0120}sparse", u32::MAX),
            entry("\u{0120}mid", 3_000_000_000u32),
        ];
        let mut detok = BpeStreamingDetokenizer::new(mixed_vocab, false);
        detok.reset();
        detok.add_token(0);
        detok.add_token(1);
        detok.finalize();
        assert_eq!(detok.text(), "Hello world");
    }

    // Scenario 2: decode the `u32::MAX` id, then an id (123_456) that is not in
    // the vocabulary at all.
    {
        let sparse_vocab = vec![entry("\u{0120}far", u32::MAX)];
        let mut detok = BpeStreamingDetokenizer::new(sparse_vocab, false);
        detok.reset();
        detok.add_token(u32::MAX);
        detok.add_token(123_456u32);
        detok.finalize();
        // Expected output has no leading space and nothing from the unknown id.
        assert_eq!(detok.text(), "far");
    }
}
/// Applies the `DeepseekV32` chat template to conversations that open with a
/// `developer` message and checks the rendered prompt.
///
/// The first conversation (`[developer, user]`) is compared against the exact
/// expected prompt; the second (`[developer, user, assistant]`) is only checked
/// for its prefix and two key fragments.
#[cfg(feature = "tokenizer-deepseek-v32")]
#[test]
fn deepseek_v32_thinking_developer_user_no_panic() {
    use mlxrs::tokenizer::chat::{ChatTemplateOverride, DeepseekV32};
    use serde_json::json;

    let bos = "<\u{ff5c}begin\u{2581}of\u{2581}sentence\u{ff5c}>";
    let think_start = "<think>";
    let think_end = "</think>";

    // --- [developer, user] ---------------------------------------------------
    let two_turns = json!([
        {"role": "developer", "content": "be terse"},
        {"role": "user", "content": "hi"},
    ]);
    let rendered = DeepseekV32
        .apply(two_turns.as_array().unwrap(), None, true, false, true)
        .expect("must not panic / error on [developer, user]");

    // Expected prompt assembled piecewise: the developer turn is rendered as a
    // user turn closed with `think_end`, and the final assistant turn is opened
    // with `think_start`.
    let expected = [
        bos,
        "<\u{ff5c}User\u{ff5c}>\n\n# The user's message is: be terse<\u{ff5c}Assistant\u{ff5c}>",
        think_end,
        "<\u{ff5c}User\u{ff5c}>hi<\u{ff5c}Assistant\u{ff5c}>",
        think_start,
    ]
    .concat();
    assert_eq!(rendered, expected);

    // --- [developer, user, assistant] ----------------------------------------
    let three_turns = json!([
        {"role": "developer", "content": "ctx"},
        {"role": "user", "content": "q"},
        {"role": "assistant", "content": "a", "reasoning_content": "r"},
    ]);
    let rendered_with_reply = DeepseekV32
        .apply(three_turns.as_array().unwrap(), None, true, false, true)
        .expect("must not panic / error on [developer, user, assistant]");

    assert!(rendered_with_reply.starts_with(bos));
    assert!(rendered_with_reply.contains("# The user's message is: ctx"));
    assert!(rendered_with_reply.contains("<\u{ff5c}User\u{ff5c}>q<\u{ff5c}Assistant\u{ff5c}>"));
}
/// Feeds the `Pythonic` tool-call parser inputs whose argument values contain
/// multi-byte characters, including right after a space following `=`, and
/// checks that parsing succeeds with the expected names and arguments.
///
/// A plain ASCII call is parsed last as a sanity check.
#[cfg(feature = "tokenizer-tools")]
#[test]
fn pythonic_unicode_value_after_space_no_panic() {
    use mlxrs::tokenizer::tools::{Pythonic, ToolParser};
    use serde_json::json;

    // Quoted non-ASCII value preceded by a space after `=`.
    let spaced_unicode = "<|tool_call_start|>[f(city= \"\u{e9}\", n= 2)]<|tool_call_end|>";
    let parsed = Pythonic
        .parse(spaced_unicode, None)
        .expect("unicode-after-space must parse, not panic");
    assert_eq!(parsed.len(), 1);
    let call = &parsed[0];
    assert_eq!(call.name(), "f");
    assert_eq!(call.arguments()["city"], json!("\u{e9}"));
    assert_eq!(call.arguments()["n"], json!(2));

    // Unquoted emoji value plus a quoted CJK value, without the start/end markers.
    let multibyte = "[g(a= \u{1f600}\u{1f680}, b=\"\u{4e2d}\u{6587}\")]";
    let parsed_multibyte = Pythonic
        .parse(multibyte, None)
        .expect("multibyte unquoted/quoted values must not panic");
    let call = &parsed_multibyte[0];
    assert_eq!(call.name(), "g");
    assert_eq!(call.arguments()["b"], json!("\u{4e2d}\u{6587}"));

    // ASCII-only input as a sanity check.
    let ascii_only = "<|tool_call_start|>[get_weather(city=\"Paris\", days=3)]<|tool_call_end|>";
    let parsed_ascii = Pythonic.parse(ascii_only, None).unwrap();
    let call = &parsed_ascii[0];
    assert_eq!(call.name(), "get_weather");
    assert_eq!(call.arguments()["city"], json!("Paris"));
    assert_eq!(call.arguments()["days"], json!(3));
}
/// Checks that the `FunctionGemma` parser does not panic when a multi-byte
/// character directly follows the closing `<escape>` of a value, and that a
/// well-formed call is still parsed correctly.
///
/// The outcome of the first parse (success or error) is deliberately not
/// inspected; only the absence of a panic matters there.
#[cfg(feature = "tokenizer-tools")]
#[test]
fn function_gemma_escape_unicode_after_value_no_panic() {
    use mlxrs::tokenizer::tools::{FunctionGemma, ToolParser};
    use serde_json::json;

    // Either `Ok` or `Err` is acceptable here; the binding is kept until the
    // end of the test and never examined.
    let _unchecked_outcome = FunctionGemma.parse(
        "<start_function_call>call:f{k:<escape>v<escape>\u{e9}}<end_function_call>",
        None,
    );

    // Well-formed input must still parse to one string and one integer argument.
    let parsed = FunctionGemma
        .parse("call:greet{name:<escape>Bob<escape>,count:3}", None)
        .expect("valid function_gemma must still parse");
    let call = &parsed[0];
    assert_eq!(call.name(), "greet");
    assert_eq!(call.arguments()["name"], json!("Bob"));
    assert_eq!(call.arguments()["count"], json!(3));
}
/// Checks that the `Gemma4` parser handles string values containing multi-byte
/// characters (accented Latin, emoji, CJK) inside the braced argument list,
/// and that an ASCII-only call still parses.
#[cfg(feature = "tokenizer-tools")]
#[test]
fn gemma4_balanced_brace_non_ascii_inside_no_panic() {
    use mlxrs::tokenizer::tools::{Gemma4, ToolParser};
    use serde_json::json;

    // Two string arguments delimited by `<|"|>`, both non-ASCII.
    let non_ascii_input =
        "call:f{city:<|\"|>\u{e9}\u{1f600}<|\"|>,note:<|\"|>\u{4e2d}\u{6587}<|\"|>}";
    let parsed = Gemma4
        .parse(non_ascii_input, None)
        .expect("non-ASCII gemma4 string values must parse, not panic");
    let call = &parsed[0];
    assert_eq!(call.name(), "f");
    assert_eq!(call.arguments()["city"], json!("\u{e9}\u{1f600}"));
    assert_eq!(call.arguments()["note"], json!("\u{4e2d}\u{6587}"));

    // ASCII-only input with one string and one integer argument.
    let ascii_input = "call:f{name:<|\"|>Bob<|\"|>,n:2}";
    let parsed_ascii = Gemma4
        .parse(ascii_input, None)
        .expect("valid gemma4 must still parse");
    let call = &parsed_ascii[0];
    assert_eq!(call.name(), "f");
    assert_eq!(call.arguments()["name"], json!("Bob"));
    assert_eq!(call.arguments()["n"], json!(2));
}