//! Native tokenizer bridge using oxibonsai-tokenizer (pure Rust BPE).
//!
//! This bridge allows the inference engine to use the project-native tokenizer
//! without any C/FFI dependencies, making it fully WASM-compatible.
//!
//! ## Overview
//!
//! [`NativeTokenizerBridge`] wraps an [`OxiTokenizer`] instance and optionally
//! a [`ChatTemplate`] to provide a unified encode/decode/chat-format API that
//! mirrors [`crate::tokenizer_bridge::TokenizerBridge`] but requires zero
//! C extensions and compiles to `wasm32-unknown-unknown`.
//!
//! ## Quick start
//!
//! ```rust
//! use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
//!
//! // Character-level fallback — no vocab file needed, great for testing.
//! let bridge = NativeTokenizerBridge::char_level_fallback();
//! let ids = bridge.encode("hello").expect("encode should succeed");
//! assert!(!ids.is_empty());
//! let text = bridge.decode(&ids).expect("decode should succeed");
//! assert_eq!(text, "hello");
//! ```
use oxibonsai_tokenizer::utils::ChatTemplate;
use oxibonsai_tokenizer::{OxiTokenizer, TokenizerConfig};
// ── Error type ────────────────────────────────────────────────────────────────
/// Errors that can arise from [`NativeTokenizerBridge`] operations.
#[derive(Debug, thiserror::Error)]
pub enum NativeTokenizerError {
/// Wraps an error propagated from the underlying [`OxiTokenizer`].
#[error("tokenizer error: {0}")]
Tokenizer(#[from] oxibonsai_tokenizer::TokenizerError),
/// Returned when [`NativeTokenizerBridge::format_chat`] is called but no
/// chat template was configured at construction time.
#[error("no chat template configured")]
NoChatTemplate,
/// Encoding failed for a reason not covered by the underlying tokenizer
/// error type (e.g. an internal invariant violation).
#[error("encode failed: {0}")]
EncodeFailed(String),
}
// ── NativeTokenizerBridge ─────────────────────────────────────────────────────
/// Bridge between the inference engine and [`OxiTokenizer`].
///
/// Provides encode/decode/chat-format operations backed by the project-native
/// pure Rust BPE tokenizer. The bridge is `Send + Sync` and holds no mutable
/// state after construction.
pub struct NativeTokenizerBridge {
tokenizer: OxiTokenizer,
chat_template: Option<ChatTemplate>,
}
impl NativeTokenizerBridge {
// ── Constructors ──────────────────────────────────────────────────────
/// Create a bridge wrapping the provided [`OxiTokenizer`], with no chat
/// template.
///
/// Use [`NativeTokenizerBridge::with_chatml`] if you need ChatML
/// formatting (e.g. for Qwen3 models).
pub fn new(tokenizer: OxiTokenizer) -> Self {
Self {
tokenizer,
chat_template: None,
}
}
/// Create a minimal char-level fallback tokenizer.
///
/// This uses [`OxiTokenizer::char_level_stub`] with a 512-token vocabulary
/// and attaches no chat template. Useful for unit tests and smoke-checks
/// where a real vocab file is not required.
pub fn char_level_fallback() -> Self {
Self::new(OxiTokenizer::char_level_stub(512))
}
/// Create a bridge with a ChatML template pre-configured.
///
/// This is the correct constructor for Qwen3 / OxiBonsai models, which
/// use the `<|im_start|>role\ncontent<|im_end|>` format.
pub fn with_chatml(tokenizer: OxiTokenizer) -> Self {
Self {
tokenizer,
chat_template: Some(ChatTemplate::chatml()),
}
}
/// Create a char-level fallback tokenizer with a ChatML template.
///
/// Convenience constructor that combines `char_level_fallback` and
/// `with_chatml` — handy for tests that exercise the chat-formatting
/// path without a real vocab file.
pub fn char_level_fallback_with_chatml() -> Self {
Self::with_chatml(OxiTokenizer::char_level_stub(512))
}
/// Create a bridge from a JSON-serialized vocabulary and merge table,
/// using the supplied configuration.
///
/// `vocab_json`: `{ "token": id, … }`
/// `merges_json`: `[["a", "b"], …]` (highest-priority merge first)
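/// # Example
///
/// Illustrative sketch only: the vocab and merge contents below are made up,
/// and a default [`TokenizerConfig`] is assumed to be available.
///
/// ```rust,ignore
/// use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
/// use oxibonsai_tokenizer::TokenizerConfig;
///
/// // Hypothetical toy vocabulary and a single merge rule.
/// let vocab_json = r#"{"<unk>": 0, "<s>": 1, "</s>": 2, "<pad>": 3, "hel": 4, "lo": 5, "hello": 6}"#;
/// let merges_json = r#"[["hel", "lo"]]"#;
/// let bridge = NativeTokenizerBridge::from_json(vocab_json, merges_json, TokenizerConfig::default())
///     .expect("valid vocab/merges should parse");
/// assert!(bridge.vocab_size() > 0);
/// ```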
pub fn from_json(
vocab_json: &str,
merges_json: &str,
config: TokenizerConfig,
) -> Result<Self, NativeTokenizerError> {
let tokenizer = OxiTokenizer::from_json(vocab_json, merges_json, config)?;
Ok(Self::new(tokenizer))
}
// ── Core encode / decode ──────────────────────────────────────────────
/// Encode a text string into a sequence of token IDs.
///
/// Delegates directly to [`OxiTokenizer::encode`].
pub fn encode(&self, text: &str) -> Result<Vec<u32>, NativeTokenizerError> {
self.tokenizer
.encode(text)
.map_err(NativeTokenizerError::Tokenizer)
}
/// Decode a sequence of token IDs back into a UTF-8 string.
///
/// Special tokens (BOS, EOS, PAD, UNK) are silently skipped.
/// Unknown IDs produce `\u{FFFD}` (replacement character).
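/// # Example
///
/// A small sketch of the skipping behavior, using the char-level fallback:
///
/// ```rust
/// use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
///
/// let bridge = NativeTokenizerBridge::char_level_fallback();
/// let mut ids = vec![bridge.bos_id()];
/// ids.extend(bridge.encode("hello").expect("encode should succeed"));
/// ids.push(bridge.eos_id());
/// // BOS and EOS are dropped during decoding, so only the text survives.
/// assert_eq!(bridge.decode(&ids).expect("decode should succeed"), "hello");
/// ```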
pub fn decode(&self, ids: &[u32]) -> Result<String, NativeTokenizerError> {
self.tokenizer
.decode(ids)
.map_err(NativeTokenizerError::Tokenizer)
}
/// Decode a single token ID to its string representation.
pub fn decode_token(&self, id: u32) -> Result<String, NativeTokenizerError> {
self.tokenizer
.decode_token(id)
.map_err(NativeTokenizerError::Tokenizer)
}
/// Encode a batch of texts, returning one `Vec<u32>` per input.
pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<Vec<u32>>, NativeTokenizerError> {
self.tokenizer
.encode_batch(texts)
.map_err(NativeTokenizerError::Tokenizer)
}
// ── Chat template ─────────────────────────────────────────────────────
/// Format a list of `(role, content)` pairs into a single prompt string
/// using the configured chat template.
///
/// Returns [`NativeTokenizerError::NoChatTemplate`] if no template was
/// provided at construction time.
///
/// # Example
///
/// ```rust
/// use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
///
/// let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
/// let prompt = bridge
/// .format_chat(&[("user", "Hello!")])
/// .expect("format_chat should succeed");
/// assert!(prompt.contains("<|im_start|>user"));
/// ```
pub fn format_chat(&self, messages: &[(&str, &str)]) -> Result<String, NativeTokenizerError> {
match &self.chat_template {
Some(tmpl) => Ok(tmpl.format(messages)),
None => Err(NativeTokenizerError::NoChatTemplate),
}
}
// ── Informational helpers ─────────────────────────────────────────────
/// Return the total vocabulary size.
pub fn vocab_size(&self) -> usize {
self.tokenizer.vocab_size()
}
/// Return the BOS token ID from the underlying tokenizer configuration.
pub fn bos_id(&self) -> u32 {
self.tokenizer.bos_id()
}
/// Return the EOS token ID from the underlying tokenizer configuration.
pub fn eos_id(&self) -> u32 {
self.tokenizer.eos_id()
}
/// Return `true` if the given token ID is a special token (BOS/EOS/PAD/UNK).
pub fn is_special(&self, id: u32) -> bool {
self.tokenizer.is_special(id)
}
/// Return a reference to the underlying [`OxiTokenizer`].
pub fn inner(&self) -> &OxiTokenizer {
&self.tokenizer
}
/// Return a reference to the configured [`ChatTemplate`], if any.
pub fn chat_template(&self) -> Option<&ChatTemplate> {
self.chat_template.as_ref()
}
}
// ── std::fmt::Debug ───────────────────────────────────────────────────────────
impl std::fmt::Debug for NativeTokenizerBridge {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("NativeTokenizerBridge")
.field("vocab_size", &self.vocab_size())
.field("has_chat_template", &self.chat_template.is_some())
.finish()
}
}
// ── Unit tests ────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn char_level_fallback_encode_decode() {
let bridge = NativeTokenizerBridge::char_level_fallback();
let ids = bridge.encode("hello").expect("encode should succeed");
let text = bridge.decode(&ids).expect("decode should succeed");
assert_eq!(text, "hello");
}
#[test]
fn char_level_fallback_nonempty() {
let bridge = NativeTokenizerBridge::char_level_fallback();
let ids = bridge.encode("hello").expect("encode should succeed");
assert!(!ids.is_empty());
}
#[test]
fn char_level_fallback_roundtrip_long() {
let bridge = NativeTokenizerBridge::char_level_fallback();
// Single word — no GPT-2 space-prefix complications.
let original = "thequickbrownfox";
let ids = bridge.encode(original).expect("encode should succeed");
let decoded = bridge.decode(&ids).expect("decode should succeed");
assert_eq!(decoded, original);
}
#[test]
fn native_tokenizer_vocab_size_positive() {
let bridge = NativeTokenizerBridge::char_level_fallback();
assert!(bridge.vocab_size() > 0);
}
#[test]
fn char_level_encode_consistent() {
let bridge = NativeTokenizerBridge::char_level_fallback();
let ids1 = bridge.encode("consistent").expect("first encode");
let ids2 = bridge.encode("consistent").expect("second encode");
assert_eq!(ids1, ids2);
}
#[test]
fn char_level_special_chars() {
let bridge = NativeTokenizerBridge::char_level_fallback();
// Spaces, newlines, and basic Unicode should not panic.
let ids = bridge.encode("hello world\nhow are you").expect("encode");
assert!(!ids.is_empty());
}
#[test]
fn native_tokenizer_decode_empty() {
let bridge = NativeTokenizerBridge::char_level_fallback();
let text = bridge.decode(&[]).expect("decode empty should succeed");
assert_eq!(text, "");
}
#[test]
fn native_tokenizer_format_chat_no_template() {
let bridge = NativeTokenizerBridge::char_level_fallback();
let result = bridge.format_chat(&[("user", "Hello!")]);
assert!(matches!(result, Err(NativeTokenizerError::NoChatTemplate)));
}
#[test]
fn native_tokenizer_with_chatml_format() {
let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
let prompt = bridge
.format_chat(&[("user", "Hello!")])
.expect("format_chat should succeed");
assert!(prompt.contains("<|im_start|>user"));
assert!(prompt.contains("Hello!"));
assert!(prompt.contains("<|im_end|>"));
}
#[test]
fn native_tokenizer_encode_empty() {
let bridge = NativeTokenizerBridge::char_level_fallback();
// Encoding an empty string should succeed (may be empty or BOS-only).
let ids = bridge.encode("").expect("encode empty should succeed");
// No assertion on content — just that it does not error.
let _ = ids;
}
#[test]
fn debug_impl_shows_vocab_size() {
let bridge = NativeTokenizerBridge::char_level_fallback();
let dbg = format!("{bridge:?}");
assert!(dbg.contains("vocab_size"));
assert!(dbg.contains("has_chat_template"));
}
#[test]
fn bos_eos_ids_accessible() {
let bridge = NativeTokenizerBridge::char_level_fallback();
assert_eq!(bridge.bos_id(), 1);
assert_eq!(bridge.eos_id(), 2);
}
#[test]
fn special_token_detection() {
let bridge = NativeTokenizerBridge::char_level_fallback();
assert!(bridge.is_special(0)); // UNK
assert!(bridge.is_special(1)); // BOS
assert!(bridge.is_special(2)); // EOS
assert!(bridge.is_special(3)); // PAD
assert!(!bridge.is_special(4)); // first real token
}
#[test]
fn inner_ref_returns_tokenizer() {
let bridge = NativeTokenizerBridge::char_level_fallback();
// We can call inner() and get a consistent vocab_size.
assert_eq!(bridge.inner().vocab_size(), bridge.vocab_size());
}
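#[test]
fn bridge_is_send_and_sync() {
// The struct docs state the bridge is `Send + Sync`; this is a compile-time
// assertion of that claim rather than a runtime behavior test.
fn assert_send_sync<T: Send + Sync>() {}
assert_send_sync::<NativeTokenizerBridge>();
}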
}