oxibonsai_runtime/native_tokenizer.rs
//! Native tokenizer bridge using oxibonsai-tokenizer (pure Rust BPE).
//!
//! This bridge allows the inference engine to use the project-native tokenizer
//! without any C/FFI dependencies, making it fully WASM-compatible.
//!
//! ## Overview
//!
//! [`NativeTokenizerBridge`] wraps an [`OxiTokenizer`] instance and optionally
//! a [`ChatTemplate`] to provide a unified encode/decode/chat-format API that
//! mirrors [`crate::tokenizer_bridge::TokenizerBridge`] but requires zero
//! C extensions and compiles to `wasm32-unknown-unknown`.
//!
//! ## Quick start
//!
//! ```rust
//! use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
//!
//! // Character-level fallback — no vocab file needed, great for testing.
//! let bridge = NativeTokenizerBridge::char_level_fallback();
//! let ids = bridge.encode("hello").expect("encode should succeed");
//! assert!(!ids.is_empty());
//! let text = bridge.decode(&ids).expect("decode should succeed");
//! assert_eq!(text, "hello");
//! ```
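//!
//! For chat-style prompts, construct the bridge with the ChatML template
//! attached (see [`NativeTokenizerBridge::with_chatml`]):
//!
//! ```rust
//! use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
//!
//! let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
//! let prompt = bridge
//!     .format_chat(&[("user", "Hello!")])
//!     .expect("format_chat should succeed");
//! assert!(prompt.contains("<|im_start|>user"));
//! ```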

use oxibonsai_tokenizer::utils::ChatTemplate;
use oxibonsai_tokenizer::{OxiTokenizer, TokenizerConfig};

// ── Error type ────────────────────────────────────────────────────────────────

/// Errors that can arise from [`NativeTokenizerBridge`] operations.
#[derive(Debug, thiserror::Error)]
pub enum NativeTokenizerError {
    /// Wraps an error propagated from the underlying [`OxiTokenizer`].
    #[error("tokenizer error: {0}")]
    Tokenizer(#[from] oxibonsai_tokenizer::TokenizerError),

    /// Returned when [`NativeTokenizerBridge::format_chat`] is called but no
    /// chat template was configured at construction time.
    #[error("no chat template configured")]
    NoChatTemplate,

    /// Encoding failed for a reason not covered by the underlying tokenizer
    /// error type (e.g. an internal invariant violation).
    #[error("encode failed: {0}")]
    EncodeFailed(String),
}

// ── NativeTokenizerBridge ─────────────────────────────────────────────────────

/// Bridge between the inference engine and [`OxiTokenizer`].
///
/// Provides encode/decode/chat-format operations backed by the project-native
/// pure Rust BPE tokenizer. The bridge is `Send + Sync` and holds no mutable
/// state after construction.
pub struct NativeTokenizerBridge {
    tokenizer: OxiTokenizer,
    chat_template: Option<ChatTemplate>,
}

impl NativeTokenizerBridge {
    // ── Constructors ──────────────────────────────────────────────────────

    /// Create a bridge wrapping the provided [`OxiTokenizer`], with no chat
    /// template.
    ///
    /// Use [`NativeTokenizerBridge::with_chatml`] if you need ChatML
    /// formatting (e.g. for Qwen3 models).
    pub fn new(tokenizer: OxiTokenizer) -> Self {
        Self {
            tokenizer,
            chat_template: None,
        }
    }

    /// Create a minimal char-level fallback tokenizer.
    ///
    /// This uses [`OxiTokenizer::char_level_stub`] with a 512-token vocabulary
    /// and attaches no chat template. Useful for unit tests and smoke-checks
    /// where a real vocab file is not required.
    pub fn char_level_fallback() -> Self {
        Self::new(OxiTokenizer::char_level_stub(512))
    }

    /// Create a bridge with a ChatML template pre-configured.
    ///
    /// This is the correct constructor for Qwen3 / OxiBonsai models, which
    /// use the `<|im_start|>role\ncontent<|im_end|>` format.
    pub fn with_chatml(tokenizer: OxiTokenizer) -> Self {
        Self {
            tokenizer,
            chat_template: Some(ChatTemplate::chatml()),
        }
    }

    /// Create a char-level fallback tokenizer with a ChatML template.
    ///
    /// Convenience constructor that combines `char_level_fallback` and
    /// `with_chatml` — handy for tests that exercise the chat-formatting
    /// path without a real vocab file.
    pub fn char_level_fallback_with_chatml() -> Self {
        Self::with_chatml(OxiTokenizer::char_level_stub(512))
    }

    /// Create a bridge from a JSON-serialized vocabulary and merge table,
    /// using the supplied configuration.
    ///
    /// * `vocab_json`: `{ "token": id, … }`
    /// * `merges_json`: `[["a", "b"], …]` (highest-priority merge first)
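    ///
    /// # Example
    ///
    /// A minimal sketch of the expected shapes. The token entries and the
    /// `config` value are illustrative; pass whatever [`TokenizerConfig`] your
    /// model actually ships with.
    ///
    /// ```ignore
    /// let vocab_json = r#"{"<unk>": 0, "<s>": 1, "</s>": 2, "<pad>": 3, "he": 4, "llo": 5, "hello": 6}"#;
    /// let merges_json = r#"[["he", "llo"]]"#;
    /// let bridge = NativeTokenizerBridge::from_json(vocab_json, merges_json, config)?;
    /// assert!(bridge.vocab_size() > 0);
    /// ```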
    pub fn from_json(
        vocab_json: &str,
        merges_json: &str,
        config: TokenizerConfig,
    ) -> Result<Self, NativeTokenizerError> {
        let tokenizer = OxiTokenizer::from_json(vocab_json, merges_json, config)?;
        Ok(Self::new(tokenizer))
    }

    // ── Core encode / decode ──────────────────────────────────────────────

    /// Encode a text string into a sequence of token IDs.
    ///
    /// Delegates directly to [`OxiTokenizer::encode`].
    pub fn encode(&self, text: &str) -> Result<Vec<u32>, NativeTokenizerError> {
        self.tokenizer
            .encode(text)
            .map_err(NativeTokenizerError::Tokenizer)
    }

    /// Decode a sequence of token IDs back into a UTF-8 string.
    ///
    /// Special tokens (BOS, EOS, PAD, UNK) are silently skipped.
    /// Unknown IDs produce `\u{FFFD}` (replacement character).
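    ///
    /// For example, following those rules a BOS-only sequence decodes to an
    /// empty string and an out-of-vocabulary ID becomes the replacement
    /// character:
    ///
    /// ```rust
    /// # use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
    /// let bridge = NativeTokenizerBridge::char_level_fallback();
    /// assert_eq!(bridge.decode(&[bridge.bos_id()]).unwrap(), "");
    /// assert_eq!(bridge.decode(&[100_000]).unwrap(), "\u{FFFD}");
    /// ```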
    pub fn decode(&self, ids: &[u32]) -> Result<String, NativeTokenizerError> {
        self.tokenizer
            .decode(ids)
            .map_err(NativeTokenizerError::Tokenizer)
    }

    /// Decode a single token ID to its string representation.
    pub fn decode_token(&self, id: u32) -> Result<String, NativeTokenizerError> {
        self.tokenizer
            .decode_token(id)
            .map_err(NativeTokenizerError::Tokenizer)
    }

    /// Encode a batch of texts, returning one `Vec<u32>` per input.
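    ///
    /// For example (one ID sequence per input text):
    ///
    /// ```rust
    /// # use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
    /// let bridge = NativeTokenizerBridge::char_level_fallback();
    /// let batches = bridge.encode_batch(&["hello", "world"]).expect("encode_batch");
    /// assert_eq!(batches.len(), 2);
    /// ```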
    pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<Vec<u32>>, NativeTokenizerError> {
        self.tokenizer
            .encode_batch(texts)
            .map_err(NativeTokenizerError::Tokenizer)
    }

    // ── Chat template ─────────────────────────────────────────────────────

    /// Format a list of `(role, content)` pairs into a single prompt string
    /// using the configured chat template.
    ///
    /// Returns [`NativeTokenizerError::NoChatTemplate`] if no template was
    /// provided at construction time.
    ///
    /// # Example
    ///
    /// ```rust
    /// use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
    ///
    /// let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
    /// let prompt = bridge
    ///     .format_chat(&[("user", "Hello!")])
    ///     .expect("format_chat should succeed");
    /// assert!(prompt.contains("<|im_start|>user"));
    /// ```
    pub fn format_chat(&self, messages: &[(&str, &str)]) -> Result<String, NativeTokenizerError> {
        match &self.chat_template {
            Some(tmpl) => Ok(tmpl.format(messages)),
            None => Err(NativeTokenizerError::NoChatTemplate),
        }
    }

    // ── Informational helpers ─────────────────────────────────────────────

    /// Return the total vocabulary size.
    pub fn vocab_size(&self) -> usize {
        self.tokenizer.vocab_size()
    }

    /// Return the BOS token ID from the underlying tokenizer configuration.
    pub fn bos_id(&self) -> u32 {
        self.tokenizer.bos_id()
    }

    /// Return the EOS token ID from the underlying tokenizer configuration.
    pub fn eos_id(&self) -> u32 {
        self.tokenizer.eos_id()
    }

    /// Return `true` if the given token ID is a special token (BOS/EOS/PAD/UNK).
    pub fn is_special(&self, id: u32) -> bool {
        self.tokenizer.is_special(id)
    }

    /// Return a reference to the underlying [`OxiTokenizer`].
    pub fn inner(&self) -> &OxiTokenizer {
        &self.tokenizer
    }

    /// Return a reference to the configured [`ChatTemplate`], if any.
    pub fn chat_template(&self) -> Option<&ChatTemplate> {
        self.chat_template.as_ref()
    }
}

// ── std::fmt::Debug ───────────────────────────────────────────────────────────

impl std::fmt::Debug for NativeTokenizerBridge {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("NativeTokenizerBridge")
            .field("vocab_size", &self.vocab_size())
            .field("has_chat_template", &self.chat_template.is_some())
            .finish()
    }
}

// ── Unit tests ────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn char_level_fallback_encode_decode() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        let ids = bridge.encode("hello").expect("encode should succeed");
        let text = bridge.decode(&ids).expect("decode should succeed");
        assert_eq!(text, "hello");
    }

    #[test]
    fn char_level_fallback_nonempty() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        let ids = bridge.encode("hello").expect("encode should succeed");
        assert!(!ids.is_empty());
    }

    #[test]
    fn char_level_fallback_roundtrip_long() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        // Single word — no GPT-2 space-prefix complications.
        let original = "thequickbrownfox";
        let ids = bridge.encode(original).expect("encode should succeed");
        let decoded = bridge.decode(&ids).expect("decode should succeed");
        assert_eq!(decoded, original);
    }

    #[test]
    fn native_tokenizer_vocab_size_positive() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        assert!(bridge.vocab_size() > 0);
    }

    #[test]
    fn char_level_encode_consistent() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        let ids1 = bridge.encode("consistent").expect("first encode");
        let ids2 = bridge.encode("consistent").expect("second encode");
        assert_eq!(ids1, ids2);
    }

    #[test]
    fn char_level_special_chars() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        // Spaces, newlines, and basic Unicode should not panic.
        let ids = bridge.encode("hello world\nhow are you").expect("encode");
        assert!(!ids.is_empty());
    }

    #[test]
    fn native_tokenizer_decode_empty() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        let text = bridge.decode(&[]).expect("decode empty should succeed");
        assert_eq!(text, "");
    }

    #[test]
    fn native_tokenizer_format_chat_no_template() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        let result = bridge.format_chat(&[("user", "Hello!")]);
        assert!(matches!(result, Err(NativeTokenizerError::NoChatTemplate)));
    }

    #[test]
    fn native_tokenizer_with_chatml_format() {
        let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
        let prompt = bridge
            .format_chat(&[("user", "Hello!")])
            .expect("format_chat should succeed");
        assert!(prompt.contains("<|im_start|>user"));
        assert!(prompt.contains("Hello!"));
        assert!(prompt.contains("<|im_end|>"));
    }

    #[test]
    fn native_tokenizer_encode_empty() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        // Encoding an empty string should succeed (may be empty or BOS-only).
        let ids = bridge.encode("").expect("encode empty should succeed");
        // No assertion on content — just that it does not error.
        let _ = ids;
    }

    #[test]
    fn debug_impl_shows_vocab_size() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        let dbg = format!("{bridge:?}");
        assert!(dbg.contains("vocab_size"));
        assert!(dbg.contains("has_chat_template"));
    }

    #[test]
    fn bos_eos_ids_accessible() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        assert_eq!(bridge.bos_id(), 1);
        assert_eq!(bridge.eos_id(), 2);
    }

    #[test]
    fn special_token_detection() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        assert!(bridge.is_special(0)); // UNK
        assert!(bridge.is_special(1)); // BOS
        assert!(bridge.is_special(2)); // EOS
        assert!(bridge.is_special(3)); // PAD
        assert!(!bridge.is_special(4)); // first real token
    }

    #[test]
    fn inner_ref_returns_tokenizer() {
        let bridge = NativeTokenizerBridge::char_level_fallback();
        // We can call inner() and get a consistent vocab_size.
        assert_eq!(bridge.inner().vocab_size(), bridge.vocab_size());
    }
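
    // Added sketches covering the chat_template() accessor and multi-message
    // ChatML formatting. The role names ("system", "user") are illustrative;
    // the assertions rely only on the `<|im_start|>role ... <|im_end|>`
    // structure documented on `with_chatml`.
    #[test]
    fn chat_template_accessor_reflects_construction() {
        assert!(NativeTokenizerBridge::char_level_fallback()
            .chat_template()
            .is_none());
        assert!(NativeTokenizerBridge::char_level_fallback_with_chatml()
            .chat_template()
            .is_some());
    }

    #[test]
    fn format_chat_multiple_messages() {
        let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
        let prompt = bridge
            .format_chat(&[("system", "You are helpful."), ("user", "Hi!")])
            .expect("format_chat should succeed");
        assert!(prompt.contains("<|im_start|>system"));
        assert!(prompt.contains("<|im_start|>user"));
        assert!(prompt.contains("Hi!"));
    }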
}