1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
//! Bare `tokenizer-stream` naive-detokenizer tests.
//!
//! Mirrors mlx-swift-lm's `NaiveStreamingDetokenizer`: the only detokenizer
//! available without `tokenizer-spm` / `tokenizer-bpe`. Re-decodes the
//! accumulated segment via the `tokenizers` crate's own configured decoder.
//! No SPM/BPE class inference, no tool parsers / override registry, no
//! `serde_json` on the streaming path.
#![cfg(all(feature = "tokenizer-stream", not(feature = "tokenizer-config")))]
use std::io::Write;
// #111: `Tokenizer::detokenizer()` returns the enum-unified
// [`Detokenizer`]; methods like `add_token` / `text` / `last_segment`
// require the [`StreamingDetokenizer`] trait in scope.
use mlxrs::tokenizer::{StreamingDetokenizer, Tokenizer};
const TOKENIZER_JSON: &str = include_str!("fixtures/tokenizer.json");
/// Returns the directory containing the on-disk `tokenizer.json` fixture,
/// writing it at most once per test process.
///
/// The fixture bytes come from [`TOKENIZER_JSON`] and never change, so the
/// file only has to be produced a single time. `cargo test` executes the
/// tests of this binary in parallel; if every caller re-created (and thereby
/// truncated) the file, a concurrent `Tokenizer::from_path` could observe a
/// partially written file. Guarding the write with a process-wide `OnceLock`
/// means every test reads the same fully written file, and the tests do not
/// have to be serialized.
///
/// # Panics
///
/// Panics if the directory or the file cannot be created or written.
fn fixture_dir() -> std::path::PathBuf {
    /// Creates the per-process fixture directory, writes `tokenizer.json`
    /// into it and hands back the directory path.
    fn materialize() -> std::path::PathBuf {
        // The process id keeps concurrently running test binaries apart.
        let dir_name = format!("mlxrs-tok-stream-fixture-{}", std::process::id());
        let target = std::env::temp_dir().join(dir_name);
        std::fs::create_dir_all(&target).unwrap();

        let json_path = target.join("tokenizer.json");
        let mut file = std::fs::File::create(json_path).unwrap();
        file.write_all(TOKENIZER_JSON.as_bytes()).unwrap();

        target
    }

    static FIXTURE: std::sync::OnceLock<std::path::PathBuf> = std::sync::OnceLock::new();

    // Only the first caller runs `materialize`; everyone gets a copy of the path.
    FIXTURE.get_or_init(materialize).clone()
}
/// Feeding the token ids one by one through the detokenizer must yield the
/// same text as a single batch `decode` of all ids.
#[test]
fn naive_detokenizer_streams_and_reconstructs_full_decode() {
    const PROMPT: &str = "hello world the quick brown fox";

    let tokenizer = Tokenizer::from_path(fixture_dir(), None).unwrap();
    let token_ids = tokenizer.encode(PROMPT, false).unwrap();
    let expected = tokenizer.decode(&token_ids, false).unwrap();

    // With neither `tokenizer-spm` nor `tokenizer-bpe` enabled, the factory
    // always hands out the naive re-decode detokenizer (graceful default).
    let mut detok = tokenizer.detokenizer();
    detok.reset();

    // Push the tokens one at a time, collecting each readable segment as it
    // becomes available.
    let mut collected = String::new();
    for token in token_ids.iter().copied() {
        detok.add_token(token);
        collected.push_str(&detok.last_segment());
    }

    // Finalize, then pick up the segment reported after finalization.
    detok.finalize();
    collected.push_str(&detok.last_segment());

    assert_eq!(detok.text(), expected);
    assert_eq!(detok.tokens(), token_ids.as_slice());
    assert_eq!(collected, expected);
}
/// Encode → decode → re-encode must round-trip on the bare stream build, and
/// the thinking markers must be detected.
#[test]
fn encode_decode_round_trip_serde_json_free_core() {
    const PROMPT: &str = "hello world the quick brown fox";

    // Bare stream build (no `tokenizer-config`): encoding and decoding are
    // driven by `tokenizer.json` alone, with no `serde_json` linked on this
    // path.
    let tok = Tokenizer::from_path(fixture_dir(), None).unwrap();

    let token_ids = tok.encode(PROMPT, false).unwrap();
    assert_eq!(token_ids, vec![3, 4, 5, 6, 7, 8]);

    let text = tok.decode(&token_ids, false).unwrap();
    assert!(text.contains("hello"));
    assert!(text.contains("fox"));

    // Re-encoding the decoded text has to give back the very same ids.
    let reencoded = tok.encode(&text, false).unwrap();
    assert_eq!(reencoded, token_ids);

    // Thinking support is inferred from the vocabulary (no config file and
    // no `serde_json` involved).
    assert!(tok.has_thinking());
    assert_eq!(tok.think_start(), Some("<think>"));
}