#![cfg(feature = "tokenizer-stream")]
/// Exercises `NaiveStreamingDetokenizer` with closure-based decoders.
///
/// Tokens are fed in groups; after each group the test checks the full
/// `text()` and then the piece returned by `last_segment()`. After
/// `finalize()` the text and the recorded `tokens()` are checked once more.
/// A second detokenizer is used to assert that `combined_text()` equals
/// `text()`.
#[cfg(feature = "tokenizer-stream")]
#[test]
fn naive_text_and_last_segment_expose_partial_before_newline() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::NaiveStreamingDetokenizer};

    // Toy decoder: ids 1..=4 map to fixed fragments, any other id decodes to "".
    let fragments = |ids: &[u32]| {
        let mut out = String::new();
        for &id in ids {
            out.push_str(match id {
                1 => "Hel",
                2 => "lo",
                3 => " wor",
                4 => "ld",
                _ => "",
            });
        }
        out
    };
    let mut detok = NaiveStreamingDetokenizer::new(fragments, false);
    detok.reset();

    // (tokens fed in this step, expected full text, expected last segment)
    let steps: [(&[u32], &str, &str); 3] = [
        (&[1], "Hel", "Hel"),
        (&[2], "Hello", "lo"),
        (&[3, 4], "Hello world", " world"),
    ];
    for (ids, full, delta) in steps {
        for &id in ids {
            detok.add_token(id);
        }
        // `text()` is checked before `last_segment()` in every step.
        assert_eq!(detok.text(), full);
        assert_eq!(detok.last_segment(), delta);
    }

    detok.finalize();
    assert_eq!(detok.text(), "Hello world");
    assert_eq!(detok.tokens(), &[1u32, 2, 3, 4]);

    // Second decoder renders every id as "x<id>".
    let tagged = |ids: &[u32]| {
        let mut out = String::new();
        for i in ids {
            out.push_str(&format!("x{i}"));
        }
        out
    };
    let mut tagged_detok = NaiveStreamingDetokenizer::new(tagged, false);
    tagged_detok.reset();
    tagged_detok.add_token(7);
    assert_eq!(tagged_detok.combined_text(), tagged_detok.text());
    assert_eq!(tagged_detok.combined_text(), "x7");
}
/// Streams three tokens through `SpmStreamingDetokenizer`, collecting every
/// `last_segment()` (including the one after `finalize()`), and asserts that
/// both the collected pieces and `text()` read "Hello world!".
#[cfg(feature = "tokenizer-spm")]
#[test]
fn spm_streaming_unchanged_after_cow_text() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::SpmStreamingDetokenizer};

    // Pieces are numbered 0, 1, 2 in listing order.
    let pieces: Vec<(String, u32)> = ["\u{2581}Hello", "\u{2581}world", "!"]
        .iter()
        .zip(0u32..)
        .map(|(piece, id)| (piece.to_string(), id))
        .collect();

    let mut detok = SpmStreamingDetokenizer::new(pieces, true);
    detok.reset();

    let mut collected = String::new();
    for id in 0u32..=2 {
        detok.add_token(id);
        collected.push_str(&detok.last_segment());
    }
    // Pick up whatever segment is reported after finalization.
    detok.finalize();
    collected.push_str(&detok.last_segment());

    assert_eq!(detok.text(), "Hello world!");
    assert_eq!(collected, "Hello world!");
}
/// Streams two tokens through `BpeStreamingDetokenizer`, collecting every
/// `last_segment()` (including the one after `finalize()`), and asserts that
/// both the collected pieces and `text()` read "Hello world".
#[cfg(feature = "tokenizer-bpe")]
#[test]
fn bpe_streaming_unchanged_after_cow_text() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::BpeStreamingDetokenizer};

    let entries = vec![
        (String::from("Hello"), 0u32),
        (String::from("\u{0120}world"), 1u32),
    ];
    let mut detok = BpeStreamingDetokenizer::new(entries, false);
    detok.reset();

    let mut collected = String::new();
    for id in [0u32, 1] {
        detok.add_token(id);
        collected.push_str(&detok.last_segment());
    }
    // Pick up whatever segment is reported after finalization.
    detok.finalize();
    collected.push_str(&detok.last_segment());

    assert_eq!(detok.text(), "Hello world");
    assert_eq!(collected, "Hello world");
}
/// Asserts that a vocabulary entry spelled U+0100 is emitted by
/// `BpeStreamingDetokenizer` as the single byte 0x00, both when followed by
/// another token and when it is the only token.
#[cfg(feature = "tokenizer-bpe")]
#[test]
fn bpe_byte_zero_token_streams_as_nul_not_u0100() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::BpeStreamingDetokenizer};

    // Case 1: the U+0100 entry followed by a plain "A".
    let entries = vec![(String::from("\u{0100}"), 0u32), (String::from("A"), 1u32)];
    let mut detok = BpeStreamingDetokenizer::new(entries, false);
    detok.reset();
    for id in [0u32, 1] {
        detok.add_token(id);
    }
    detok.finalize();

    let streamed = detok.text().into_owned();
    assert_eq!(streamed.as_bytes(), b"\0A");
    assert!(
        streamed.chars().all(|c| c != '\u{0100}'),
        "byte 0x00 must not surface as U+0100 text"
    );

    // Case 2: the U+0100 entry on its own.
    let mut lone = BpeStreamingDetokenizer::new(vec![(String::from("\u{0100}"), 0u32)], false);
    lone.reset();
    lone.add_token(0);
    lone.finalize();
    let lone_text = lone.text().into_owned();
    assert_eq!(lone_text.as_bytes(), b"\0");
}
/// Checks the text `BpeStreamingDetokenizer` produces when it is fed token
/// ids that have no vocabulary entry, using three small vocabularies.
#[cfg(feature = "tokenizer-bpe")]
#[test]
fn bpe_sparse_inrange_hole_is_empty_out_of_range_is_bang() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::BpeStreamingDetokenizer};

    // Vocabulary with ids 0 and 5 only: id 3 lies between them, id 9 is above both.
    let sparse = vec![
        (String::from("Hi"), 0u32),
        (String::from("\u{0120}there"), 5u32),
    ];
    let mut gaps = BpeStreamingDetokenizer::new(sparse, false);
    gaps.reset();
    for id in [0u32, 3, 5, 9] {
        gaps.add_token(id);
    }
    gaps.finalize();
    assert_eq!(gaps.text(), "Hi there!");

    // Single entry at u32::MAX; 123_456 is an unassigned id below it.
    let mut huge =
        BpeStreamingDetokenizer::new(vec![(String::from("\u{0120}far"), u32::MAX)], false);
    huge.reset();
    for id in [u32::MAX, 123_456u32] {
        huge.add_token(id);
    }
    huge.finalize();
    assert_eq!(huge.text(), "far");

    // Single entry at id 0; id 1 is above the only assigned id.
    let mut solo = BpeStreamingDetokenizer::new(vec![(String::from("Hello"), 0u32)], false);
    solo.reset();
    for id in [0u32, 1] {
        solo.add_token(id);
    }
    solo.finalize();
    assert_eq!(solo.text(), "Hello!");
}
/// Streams 2048 single-word tokens through `SpmStreamingDetokenizer` and
/// checks that every `last_segment()` result is only the text added by that
/// step, never the accumulated output.
///
/// Per token the test asserts that the segment length equals the growth of
/// `text()` and stays within the byte length of `"\u{2581}w0000"`. Once `t`
/// exceeds 16 it also asserts that the segment is shorter than everything
/// collected so far and does not contain the first half of it. At the end the
/// concatenated segments must equal `text()`, and the longest segment must be
/// under a quarter of the final text length.
#[cfg(feature = "tokenizer-spm")]
#[test]
fn last_segment_allocates_only_the_per_step_delta_not_the_whole_buffer() {
    use mlxrs::tokenizer::{StreamingDetokenizer, stream::SpmStreamingDetokenizer};

    const N: u32 = 2048;

    // One distinct word piece per id.
    let mut pieces: Vec<(String, u32)> = Vec::new();
    for i in 0..N {
        pieces.push((format!("\u{2581}w{i}"), i));
    }
    let mut detok = SpmStreamingDetokenizer::new(pieces, true);
    detok.reset();

    // Byte budget a single segment may never exceed.
    let word_cap = "\u{2581}w0000".len();
    let mut collected = String::new();
    let mut max_seg_len = 0usize;
    let mut seen_len = 0usize;

    for t in 0..N {
        detok.add_token(t);
        // Text length is sampled before `last_segment()` is called for this step.
        let text_len = detok.text().len();
        let seg = detok.last_segment();
        let delta = text_len - seen_len;

        assert_eq!(
            seg.len(),
            delta,
            "token {t}: segment length must equal the per-step text delta, \
             not the cumulative buffer"
        );
        assert!(
            seg.len() <= word_cap,
            "token {t}: per-step segment {} bytes — must stay bounded by one \
             word, never grow with total output (a full-buffer clone regression)",
            seg.len()
        );

        if t > 16 {
            assert!(
                seg.len() < collected.len(),
                "token {t}: segment ({} bytes) must be far shorter than the \
                 {}-byte accumulated text — the old `into_owned()` cloned the \
                 whole buffer",
                seg.len(),
                collected.len()
            );
            let first_half = &collected[..collected.len() / 2];
            assert!(
                !seg.contains(first_half),
                "token {t}: segment must not contain a prefix of the whole prior \
                 output (would indicate a full-buffer copy was returned)"
            );
        }

        if seg.len() > max_seg_len {
            max_seg_len = seg.len();
        }
        seen_len = text_len;
        collected.push_str(&seg);
    }

    // Append whatever segment is reported after finalization.
    detok.finalize();
    collected.push_str(&detok.last_segment());
    assert_eq!(collected, detok.text());

    let final_len = detok.text().len();
    assert!(
        max_seg_len < final_len / 4,
        "max single segment ({max_seg_len} bytes) must be tiny vs the final \
         {final_len}-byte buffer — proves no per-call full-buffer copy"
    );
}
/// Parses the fixture tokenizer, wraps a `NaiveHfDetokenizer` built from it in
/// `Detokenizer::Naive`, and checks that the value matches that variant and
/// that `tokens()` yields a `&[u32]`.
#[cfg(feature = "tokenizer-stream")]
#[test]
fn detokenizer_factory_returns_typed_variant_for_naive() {
    use mlxrs::tokenizer::{Detokenizer, NaiveHfDetokenizer, StreamingDetokenizer};

    // The fixture path is resolved relative to this source file by `include_str!`.
    let fixture = include_str!("fixtures/tokenizer.json")
        .parse::<tokenizers::Tokenizer>()
        .expect("parse fixture tokenizer");

    let naive = NaiveHfDetokenizer::new(fixture, false);
    let detok = Detokenizer::Naive(Box::new(naive));
    assert!(matches!(detok, Detokenizer::Naive(_)));

    // Type-level check only: the slice itself is not inspected.
    let _tokens: &[u32] = detok.tokens();
}
/// Wraps a minimal user-defined `StreamingDetokenizer` in
/// `Detokenizer::Custom` and checks that `add_token`, `tokens` and `text`
/// called on the enum reflect the wrapped implementation.
#[cfg(feature = "tokenizer-stream")]
#[test]
fn detokenizer_custom_escape_hatch() {
    use mlxrs::tokenizer::{Detokenizer, StreamingDetokenizer};
    use std::borrow::Cow;

    /// Records tokens and an offset; always reports empty text.
    #[derive(Default)]
    struct NullDetok {
        tokens: Vec<u32>,
        offset: usize,
    }

    impl StreamingDetokenizer for NullDetok {
        fn reset(&mut self) {
            self.offset = 0;
            self.tokens.clear();
        }

        fn add_token(&mut self, token: u32) {
            self.tokens.push(token);
        }

        fn finalize(&mut self) {}

        fn text(&self) -> Cow<'_, str> {
            Cow::Borrowed("")
        }

        fn tokens(&self) -> &[u32] {
            &self.tokens
        }

        fn offset(&self) -> usize {
            self.offset
        }

        fn set_offset(&mut self, new_offset: usize) {
            self.offset = new_offset;
        }
    }

    let mut detok = Detokenizer::Custom(Box::new(NullDetok::default()));
    assert!(matches!(detok, Detokenizer::Custom(_)));

    for id in [42, 43] {
        detok.add_token(id);
    }
    assert_eq!(detok.tokens(), &[42, 43]);
    assert_eq!(detok.text().as_ref(), "");
}
/// Builds `Detokenizer::Spm` directly from an `SpmStreamingDetokenizer` and
/// checks that it matches that variant and that `tokens()` yields a `&[u32]`.
#[cfg(all(feature = "tokenizer-stream", feature = "tokenizer-spm"))]
#[test]
fn detokenizer_spm_variant_exists() {
    use mlxrs::tokenizer::{Detokenizer, StreamingDetokenizer, stream::SpmStreamingDetokenizer};

    let entries = vec![(String::from("\u{2581}foo"), 0u32)];
    let inner = SpmStreamingDetokenizer::new(entries, false);
    let detok = Detokenizer::Spm(inner);
    assert!(matches!(detok, Detokenizer::Spm(_)));

    // Type-level check only: the slice itself is not inspected.
    let _tokens: &[u32] = detok.tokens();
}
/// Builds `Detokenizer::Bpe` directly from a `BpeStreamingDetokenizer` and
/// checks that it matches that variant and that `tokens()` yields a `&[u32]`.
#[cfg(all(feature = "tokenizer-stream", feature = "tokenizer-bpe"))]
#[test]
fn detokenizer_bpe_variant_exists() {
    use mlxrs::tokenizer::{Detokenizer, StreamingDetokenizer, stream::BpeStreamingDetokenizer};

    let entries = vec![(String::from("\u{0120}foo"), 0u32)];
    let inner = BpeStreamingDetokenizer::new(entries, false);
    let detok = Detokenizer::Bpe(inner);
    assert!(matches!(detok, Detokenizer::Bpe(_)));

    // Type-level check only: the slice itself is not inspected.
    let _tokens: &[u32] = detok.tokens();
}