// oxibonsai_runtime/native_tokenizer.rs

1//! Native tokenizer bridge using oxibonsai-tokenizer (pure Rust BPE).
2//!
3//! This bridge allows the inference engine to use the project-native tokenizer
4//! without any C/FFI dependencies, making it fully WASM-compatible.
5//!
6//! ## Overview
7//!
8//! [`NativeTokenizerBridge`] wraps an [`OxiTokenizer`] instance and optionally
9//! a [`ChatTemplate`] to provide a unified encode/decode/chat-format API that
10//! mirrors [`crate::tokenizer_bridge::TokenizerBridge`] but requires zero
11//! C extensions and compiles to `wasm32-unknown-unknown`.
12//!
13//! ## Quick start
14//!
15//! ```rust
16//! use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
17//!
18//! // Character-level fallback — no vocab file needed, great for testing.
19//! let bridge = NativeTokenizerBridge::char_level_fallback();
20//! let ids = bridge.encode("hello").expect("encode should succeed");
21//! assert!(!ids.is_empty());
22//! let text = bridge.decode(&ids).expect("decode should succeed");
23//! assert_eq!(text, "hello");
24//! ```
25
26use oxibonsai_tokenizer::utils::ChatTemplate;
27use oxibonsai_tokenizer::{OxiTokenizer, TokenizerConfig};
28
29// ── Error type ────────────────────────────────────────────────────────────────
30
/// Errors that can arise from [`NativeTokenizerBridge`] operations.
///
/// Derives [`thiserror::Error`]; the `#[from]` attribute on
/// [`NativeTokenizerError::Tokenizer`] lets bridge methods propagate
/// underlying tokenizer errors with `?`.
#[derive(Debug, thiserror::Error)]
pub enum NativeTokenizerError {
    /// Wraps an error propagated from the underlying [`OxiTokenizer`].
    /// `#[from]` provides the automatic conversion used by `?`.
    #[error("tokenizer error: {0}")]
    Tokenizer(#[from] oxibonsai_tokenizer::TokenizerError),

    /// Returned when [`NativeTokenizerBridge::format_chat`] is called but no
    /// chat template was configured at construction time.
    #[error("no chat template configured")]
    NoChatTemplate,

    /// Encoding failed for a reason not covered by the underlying tokenizer
    /// error type (e.g. an internal invariant violation).
    // NOTE(review): not constructed anywhere in this file — presumably kept
    // for callers elsewhere or future use; confirm before removing.
    #[error("encode failed: {0}")]
    EncodeFailed(String),
}
48
49// ── NativeTokenizerBridge ─────────────────────────────────────────────────────
50
/// Bridge between the inference engine and [`OxiTokenizer`].
///
/// Provides encode/decode/chat-format operations backed by the project-native
/// pure Rust BPE tokenizer.  The bridge holds no mutable state after
/// construction — all methods take `&self`.
// NOTE(review): the original docs claimed `Send + Sync`; that holds only if
// `OxiTokenizer` and `ChatTemplate` are themselves `Send + Sync` — confirm
// against the tokenizer crate before relying on it across threads.
pub struct NativeTokenizerBridge {
    // Underlying pure-Rust BPE tokenizer; all encode/decode calls delegate here.
    tokenizer: OxiTokenizer,
    // Optional chat-prompt formatter; `None` makes `format_chat` return
    // `NativeTokenizerError::NoChatTemplate`.
    chat_template: Option<ChatTemplate>,
}
60
61impl NativeTokenizerBridge {
62    // ── Constructors ──────────────────────────────────────────────────────
63
64    /// Create a bridge wrapping the provided [`OxiTokenizer`], with no chat
65    /// template.
66    ///
67    /// Use [`NativeTokenizerBridge::with_chatml`] if you need ChatML
68    /// formatting (e.g. for Qwen3 models).
69    pub fn new(tokenizer: OxiTokenizer) -> Self {
70        Self {
71            tokenizer,
72            chat_template: None,
73        }
74    }
75
76    /// Create a minimal char-level fallback tokenizer.
77    ///
78    /// This uses [`OxiTokenizer::char_level_stub`] with a 512-token vocabulary
79    /// and attaches no chat template.  Useful for unit tests and smoke-checks
80    /// where a real vocab file is not required.
81    pub fn char_level_fallback() -> Self {
82        Self::new(OxiTokenizer::char_level_stub(512))
83    }
84
85    /// Create a bridge with a ChatML template pre-configured.
86    ///
87    /// This is the correct constructor for Qwen3 / OxiBonsai models, which
88    /// use the `<|im_start|>role\ncontent<|im_end|>` format.
89    pub fn with_chatml(tokenizer: OxiTokenizer) -> Self {
90        Self {
91            tokenizer,
92            chat_template: Some(ChatTemplate::chatml()),
93        }
94    }
95
96    /// Create a char-level fallback tokenizer with a ChatML template.
97    ///
98    /// Convenience constructor that combines `char_level_fallback` and
99    /// `with_chatml` — handy for tests that exercise the chat-formatting
100    /// path without a real vocab file.
101    pub fn char_level_fallback_with_chatml() -> Self {
102        Self::with_chatml(OxiTokenizer::char_level_stub(512))
103    }
104
105    /// Create a bridge from a JSON-serialized vocabulary and merge table,
106    /// using the supplied configuration.
107    ///
108    /// `vocab_json`: `{ "token": id, … }`
109    /// `merges_json`: `[["a", "b"], …]` (highest-priority merge first)
110    pub fn from_json(
111        vocab_json: &str,
112        merges_json: &str,
113        config: TokenizerConfig,
114    ) -> Result<Self, NativeTokenizerError> {
115        let tokenizer = OxiTokenizer::from_json(vocab_json, merges_json, config)?;
116        Ok(Self::new(tokenizer))
117    }
118
119    // ── Core encode / decode ──────────────────────────────────────────────
120
121    /// Encode a text string into a sequence of token IDs.
122    ///
123    /// Delegates directly to [`OxiTokenizer::encode`].
124    pub fn encode(&self, text: &str) -> Result<Vec<u32>, NativeTokenizerError> {
125        self.tokenizer
126            .encode(text)
127            .map_err(NativeTokenizerError::Tokenizer)
128    }
129
130    /// Decode a sequence of token IDs back into a UTF-8 string.
131    ///
132    /// Special tokens (BOS, EOS, PAD, UNK) are silently skipped.
133    /// Unknown IDs produce `\u{FFFD}` (replacement character).
134    pub fn decode(&self, ids: &[u32]) -> Result<String, NativeTokenizerError> {
135        self.tokenizer
136            .decode(ids)
137            .map_err(NativeTokenizerError::Tokenizer)
138    }
139
140    /// Decode a single token ID to its string representation.
141    pub fn decode_token(&self, id: u32) -> Result<String, NativeTokenizerError> {
142        self.tokenizer
143            .decode_token(id)
144            .map_err(NativeTokenizerError::Tokenizer)
145    }
146
147    /// Encode a batch of texts, returning one `Vec<u32>` per input.
148    pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<Vec<u32>>, NativeTokenizerError> {
149        self.tokenizer
150            .encode_batch(texts)
151            .map_err(NativeTokenizerError::Tokenizer)
152    }
153
154    // ── Chat template ─────────────────────────────────────────────────────
155
156    /// Format a list of `(role, content)` pairs into a single prompt string
157    /// using the configured chat template.
158    ///
159    /// Returns [`NativeTokenizerError::NoChatTemplate`] if no template was
160    /// provided at construction time.
161    ///
162    /// # Example
163    ///
164    /// ```rust
165    /// use oxibonsai_runtime::native_tokenizer::NativeTokenizerBridge;
166    ///
167    /// let bridge = NativeTokenizerBridge::char_level_fallback_with_chatml();
168    /// let prompt = bridge
169    ///     .format_chat(&[("user", "Hello!")])
170    ///     .expect("format_chat should succeed");
171    /// assert!(prompt.contains("<|im_start|>user"));
172    /// ```
173    pub fn format_chat(&self, messages: &[(&str, &str)]) -> Result<String, NativeTokenizerError> {
174        match &self.chat_template {
175            Some(tmpl) => Ok(tmpl.format(messages)),
176            None => Err(NativeTokenizerError::NoChatTemplate),
177        }
178    }
179
180    // ── Informational helpers ─────────────────────────────────────────────
181
182    /// Return the total vocabulary size.
183    pub fn vocab_size(&self) -> usize {
184        self.tokenizer.vocab_size()
185    }
186
187    /// Return the BOS token ID from the underlying tokenizer configuration.
188    pub fn bos_id(&self) -> u32 {
189        self.tokenizer.bos_id()
190    }
191
192    /// Return the EOS token ID from the underlying tokenizer configuration.
193    pub fn eos_id(&self) -> u32 {
194        self.tokenizer.eos_id()
195    }
196
197    /// Return `true` if the given token ID is a special token (BOS/EOS/PAD/UNK).
198    pub fn is_special(&self, id: u32) -> bool {
199        self.tokenizer.is_special(id)
200    }
201
202    /// Return a reference to the underlying [`OxiTokenizer`].
203    pub fn inner(&self) -> &OxiTokenizer {
204        &self.tokenizer
205    }
206
207    /// Return a reference to the configured [`ChatTemplate`], if any.
208    pub fn chat_template(&self) -> Option<&ChatTemplate> {
209        self.chat_template.as_ref()
210    }
211}
212
213// ── std::fmt::Debug ───────────────────────────────────────────────────────────
214
215impl std::fmt::Debug for NativeTokenizerBridge {
216    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
217        f.debug_struct("NativeTokenizerBridge")
218            .field("vocab_size", &self.vocab_size())
219            .field("has_chat_template", &self.chat_template.is_some())
220            .finish()
221    }
222}
223
224// ── Unit tests ────────────────────────────────────────────────────────────────
225
#[cfg(test)]
mod tests {
    use super::*;

    /// "hello" survives an encode → decode round trip on the char-level stub.
    #[test]
    fn char_level_fallback_encode_decode() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let tokens = b.encode("hello").expect("encode should succeed");
        assert_eq!(b.decode(&tokens).expect("decode should succeed"), "hello");
    }

    /// Encoding non-empty text yields at least one token.
    #[test]
    fn char_level_fallback_nonempty() {
        let b = NativeTokenizerBridge::char_level_fallback();
        assert!(!b.encode("hello").expect("encode should succeed").is_empty());
    }

    /// Round trip of a longer single word (no GPT-2 space-prefix complications).
    #[test]
    fn char_level_fallback_roundtrip_long() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let input = "thequickbrownfox";
        let tokens = b.encode(input).expect("encode should succeed");
        let roundtripped = b.decode(&tokens).expect("decode should succeed");
        assert_eq!(roundtripped, input);
    }

    /// The fallback vocabulary is non-empty.
    #[test]
    fn native_tokenizer_vocab_size_positive() {
        assert!(NativeTokenizerBridge::char_level_fallback().vocab_size() > 0);
    }

    /// Encoding is deterministic: same input, same token IDs.
    #[test]
    fn char_level_encode_consistent() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let first = b.encode("consistent").expect("first encode");
        let second = b.encode("consistent").expect("second encode");
        assert_eq!(first, second);
    }

    /// Spaces, newlines, and basic Unicode neither panic nor produce empty output.
    #[test]
    fn char_level_special_chars() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let tokens = b.encode("hello world\nhow are you").expect("encode");
        assert!(!tokens.is_empty());
    }

    /// Decoding an empty ID slice yields the empty string.
    #[test]
    fn native_tokenizer_decode_empty() {
        let b = NativeTokenizerBridge::char_level_fallback();
        assert_eq!(
            b.decode(&[]).expect("decode empty should succeed"),
            ""
        );
    }

    /// Without a template, `format_chat` reports `NoChatTemplate`.
    #[test]
    fn native_tokenizer_format_chat_no_template() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let outcome = b.format_chat(&[("user", "Hello!")]);
        assert!(matches!(outcome, Err(NativeTokenizerError::NoChatTemplate)));
    }

    /// With ChatML configured, the prompt carries role markers and content.
    #[test]
    fn native_tokenizer_with_chatml_format() {
        let b = NativeTokenizerBridge::char_level_fallback_with_chatml();
        let prompt = b
            .format_chat(&[("user", "Hello!")])
            .expect("format_chat should succeed");
        for needle in ["<|im_start|>user", "Hello!", "<|im_end|>"] {
            assert!(prompt.contains(needle));
        }
    }

    /// Encoding "" must not error (result may be empty or BOS-only).
    #[test]
    fn native_tokenizer_encode_empty() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let _ = b.encode("").expect("encode empty should succeed");
    }

    /// The Debug representation exposes both summary fields.
    #[test]
    fn debug_impl_shows_vocab_size() {
        let b = NativeTokenizerBridge::char_level_fallback();
        let rendered = format!("{b:?}");
        for needle in ["vocab_size", "has_chat_template"] {
            assert!(rendered.contains(needle));
        }
    }

    /// BOS and EOS IDs match the stub's fixed layout.
    #[test]
    fn bos_eos_ids_accessible() {
        let b = NativeTokenizerBridge::char_level_fallback();
        assert_eq!((b.bos_id(), b.eos_id()), (1, 2));
    }

    /// IDs 0–3 are special (UNK/BOS/EOS/PAD); ID 4 is the first real token.
    #[test]
    fn special_token_detection() {
        let b = NativeTokenizerBridge::char_level_fallback();
        for id in 0..=3 {
            assert!(b.is_special(id));
        }
        assert!(!b.is_special(4));
    }

    /// `inner()` exposes the same tokenizer the bridge delegates to.
    #[test]
    fn inner_ref_returns_tokenizer() {
        let b = NativeTokenizerBridge::char_level_fallback();
        assert_eq!(b.inner().vocab_size(), b.vocab_size());
    }
}