//! Tokenizer bridge wrapping HuggingFace tokenizers.
//!
//! On WASM targets, the `tokenizers` crate is unavailable (its default build
//! pulls in native dependencies that do not compile to wasm32).
//! A stub implementation is provided that returns errors for all operations.
use crate::error::{RuntimeError, RuntimeResult};
/// Thin wrapper around `tokenizers::Tokenizer`.
///
/// On non-WASM targets, delegates to the full HuggingFace tokenizers library.
/// On WASM targets, all methods return a `RuntimeError::Tokenizer` error.
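///
/// A minimal round-trip sketch (the tokenizer path is illustrative, not a file
/// shipped by this crate; exact round-tripping assumes a byte-level BPE
/// tokenizer such as the bundled Qwen3 one):
///
/// ```ignore
/// let tok = TokenizerBridge::from_file("path/to/tokenizer.json")?;
/// let ids = tok.encode("Hello, world!")?;
/// assert_eq!(tok.decode(&ids)?, "Hello, world!");
/// ```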
pub struct TokenizerBridge {
#[cfg(not(target_arch = "wasm32"))]
inner: tokenizers::Tokenizer,
#[cfg(target_arch = "wasm32")]
_phantom: (),
}
/// Per-stream UTF-8-safe decode state. Owned by the caller.
///
/// BPE / byte-level tokenizers (Qwen3, GPT-2, etc.) sometimes emit a single
/// token that carries only **part** of a multi-byte UTF-8 character (e.g. one
/// byte of a CJK ideograph or emoji). Decoding tokens one-at-a-time without
/// buffering breaks those multi-byte sequences and produces `U+FFFD`
/// replacement characters in the output stream. This state mirrors what the
/// HuggingFace `tokenizers::DecodeStream` keeps internally so that we can own
/// it externally and feed tokens through `TokenizerBridge::step_decode`.
///
/// Use one `DecodeStreamState` per generation request; reset (or drop &
/// re-create) it between independent requests.
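///
/// A minimal streaming sketch (tokenizer path illustrative; an `Ok(None)` step
/// emits nothing and simply waits for the next token):
///
/// ```ignore
/// let tok = TokenizerBridge::from_file("path/to/tokenizer.json")?;
/// let mut state = tok.new_decode_stream(true);
/// for id in tok.encode("日本語処理")? {
///     if let Some(chunk) = tok.step_decode(&mut state, id)? {
///         print!("{chunk}"); // only complete UTF-8 chunks arrive here
///     }
/// }
/// state.reset(); // reuse the state for the next independent request
/// ```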
#[derive(Default)]
#[cfg_attr(target_arch = "wasm32", allow(dead_code))]
pub struct DecodeStreamState {
ids: Vec<u32>,
prefix: String,
prefix_index: usize,
skip_special_tokens: bool,
}
impl DecodeStreamState {
/// Construct a fresh decode-stream state.
///
/// `skip_special_tokens` matches the existing `decode()` behavior — pass
/// `true` to drop sentinel tokens (e.g. `<|im_end|>`) from the output.
pub fn new(skip_special_tokens: bool) -> Self {
Self {
ids: Vec::new(),
prefix: String::new(),
prefix_index: 0,
skip_special_tokens,
}
}
/// Reset the state, preserving the original `skip_special_tokens` flag.
pub fn reset(&mut self) {
*self = Self::new(self.skip_special_tokens);
}
}
impl TokenizerBridge {
/// Load a tokenizer from a JSON file.
#[cfg(not(target_arch = "wasm32"))]
pub fn from_file(path: &str) -> RuntimeResult<Self> {
let inner = tokenizers::Tokenizer::from_file(path)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))?;
Ok(Self { inner })
}
/// Load a tokenizer from a JSON file.
///
/// On WASM targets, always returns an error since the tokenizers library
/// requires native code not available in WebAssembly.
#[cfg(target_arch = "wasm32")]
pub fn from_file(_path: &str) -> RuntimeResult<Self> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
/// Encode text to token IDs.
#[cfg(not(target_arch = "wasm32"))]
pub fn encode(&self, text: &str) -> RuntimeResult<Vec<u32>> {
let encoding = self
.inner
.encode(text, false)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))?;
Ok(encoding.get_ids().to_vec())
}
/// Encode text to token IDs.
///
/// On WASM targets, always returns an error.
#[cfg(target_arch = "wasm32")]
pub fn encode(&self, _text: &str) -> RuntimeResult<Vec<u32>> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
/// Decode token IDs to text.
#[cfg(not(target_arch = "wasm32"))]
pub fn decode(&self, ids: &[u32]) -> RuntimeResult<String> {
self.inner
.decode(ids, true)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))
}
/// Decode token IDs to text.
///
/// On WASM targets, always returns an error.
#[cfg(target_arch = "wasm32")]
pub fn decode(&self, _ids: &[u32]) -> RuntimeResult<String> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
/// Get the vocabulary size.
#[cfg(not(target_arch = "wasm32"))]
pub fn vocab_size(&self) -> usize {
self.inner.get_vocab_size(true)
}
/// Get the vocabulary size.
///
/// On WASM targets, returns 0 since no tokenizer is available.
#[cfg(target_arch = "wasm32")]
pub fn vocab_size(&self) -> usize {
0
}
/// Get the internal tokenizer reference.
#[cfg(not(target_arch = "wasm32"))]
pub fn inner(&self) -> &tokenizers::Tokenizer {
&self.inner
}
/// Create a fresh decode-stream state for one generation request.
///
/// See [`DecodeStreamState`] and [`Self::step_decode`] for the streaming
/// decode protocol. Use this instead of repeatedly calling
/// [`Self::decode`] with single-token slices, which mishandles tokens that
/// straddle UTF-8 codepoint boundaries.
#[cfg(not(target_arch = "wasm32"))]
pub fn new_decode_stream(&self, skip_special_tokens: bool) -> DecodeStreamState {
DecodeStreamState::new(skip_special_tokens)
}
/// Create a fresh decode-stream state.
///
/// On WASM targets this returns a state object, but [`Self::step_decode`]
/// will always error. The state itself is harmless to construct.
#[cfg(target_arch = "wasm32")]
pub fn new_decode_stream(&self, skip_special_tokens: bool) -> DecodeStreamState {
DecodeStreamState::new(skip_special_tokens)
}
/// Advance the decode stream by one token.
///
/// Returns `Ok(Some(text))` only when the buffered bytes form a complete
/// UTF-8 chunk (which may span several previous tokens for CJK / emoji);
/// returns `Ok(None)` when more tokens are needed before any well-formed
/// text can be emitted. Callers should emit nothing for an `Ok(None)` step and
/// simply wait for the next token.
#[cfg(not(target_arch = "wasm32"))]
pub fn step_decode(
&self,
state: &mut DecodeStreamState,
id: u32,
) -> RuntimeResult<Option<String>> {
tokenizers::step_decode_stream(
&*self.inner,
id,
state.skip_special_tokens,
&mut state.ids,
&mut state.prefix,
&mut state.prefix_index,
)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))
}
/// Advance the decode stream by one token.
///
/// On WASM targets this always returns an error since the tokenizers
/// library is unavailable.
#[cfg(target_arch = "wasm32")]
pub fn step_decode(
&self,
_state: &mut DecodeStreamState,
_id: u32,
) -> RuntimeResult<Option<String>> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
}
#[cfg(all(test, not(target_arch = "wasm32")))]
mod tests {
use super::*;
use std::path::Path;
/// Path to the project's bundled Qwen3 tokenizer. Tests that need a real
/// BPE tokenizer skip themselves when this fixture is missing so that
/// freshly-cloned working trees still pass `cargo test`.
const FIXTURE_TOKENIZER: &str = "../../models/tokenizer.json";
fn maybe_load_fixture() -> Option<TokenizerBridge> {
if !Path::new(FIXTURE_TOKENIZER).exists() {
eprintln!(
"skipped: tokenizer fixture not found at {FIXTURE_TOKENIZER} \
(run scripts/download_tokenizer.sh to enable)",
);
return None;
}
match TokenizerBridge::from_file(FIXTURE_TOKENIZER) {
Ok(t) => Some(t),
Err(e) => {
eprintln!("skipped: failed to load tokenizer fixture: {e}");
None
}
}
}
/// Drive every id through `step_decode` and concatenate the well-formed
/// chunks. Mirrors what the CLI / SSE code paths do.
fn stream_through(tok: &TokenizerBridge, ids: &[u32]) -> RuntimeResult<String> {
let mut state = tok.new_decode_stream(true);
let mut out = String::new();
for &id in ids {
if let Some(chunk) = tok.step_decode(&mut state, id)? {
out.push_str(&chunk);
}
}
Ok(out)
}
#[test]
fn streaming_decode_cjk_no_replacement_chars() -> RuntimeResult<()> {
let Some(tok) = maybe_load_fixture() else {
return Ok(());
};
// Mix of Japanese ideographs and hiragana exercising multi-byte UTF-8
// (3 bytes per char) that BPE byte-level tokenization typically splits
// across two or three tokens.
let input = "日本語処理を専門";
let ids = tok.encode(input)?;
assert!(!ids.is_empty(), "encoding yielded no token ids");
let streamed = stream_through(&tok, &ids)?;
assert!(
!streamed.contains('\u{FFFD}'),
"streaming decode produced U+FFFD replacement char(s); output: {streamed:?}",
);
assert_eq!(
streamed, input,
"streaming decode did not reconstruct the original CJK input",
);
Ok(())
}
#[test]
fn streaming_decode_ascii_passes_through() -> RuntimeResult<()> {
let Some(tok) = maybe_load_fixture() else {
return Ok(());
};
let input = "Hello, world! Streaming ASCII works fine.";
let ids = tok.encode(input)?;
let streamed = stream_through(&tok, &ids)?;
assert!(!streamed.contains('\u{FFFD}'));
assert_eq!(streamed, input);
Ok(())
}
#[test]
fn streaming_decode_handles_empty_input() -> RuntimeResult<()> {
let Some(tok) = maybe_load_fixture() else {
return Ok(());
};
// Driving zero ids must yield no output and must not panic.
let streamed = stream_through(&tok, &[])?;
assert!(
streamed.is_empty(),
"empty token stream should yield empty output, got {streamed:?}",
);
// Resetting a fresh state is a no-op and the state stays usable
// afterwards: drive a real input through it and check the round trip.
let mut state = tok.new_decode_stream(true);
state.reset();
let input = "reset then decode";
let mut after_reset = String::new();
for id in tok.encode(input)? {
if let Some(chunk) = tok.step_decode(&mut state, id)? {
after_reset.push_str(&chunk);
}
}
assert_eq!(after_reset, input, "reset state failed to decode correctly");
Ok(())
}
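/// Exercises 4-byte UTF-8 (emoji), which byte-level BPE tokenizers typically
/// split across several tokens. Like the CJK test above, this assumes the
/// fixture is the bundled byte-level Qwen3 tokenizer and skips itself when the
/// fixture is missing.
#[test]
fn streaming_decode_emoji_no_replacement_chars() -> RuntimeResult<()> {
let Some(tok) = maybe_load_fixture() else {
return Ok(());
};
let input = "🎉 streaming emoji 🚀 works";
let ids = tok.encode(input)?;
assert!(!ids.is_empty(), "encoding yielded no token ids");
let streamed = stream_through(&tok, &ids)?;
assert!(
!streamed.contains('\u{FFFD}'),
"streaming decode produced U+FFFD replacement char(s); output: {streamed:?}",
);
assert_eq!(
streamed, input,
"streaming decode did not reconstruct the original emoji input",
);
Ok(())
}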
}