Skip to main content

oxillama_runtime/
tokenizer_bridge.rs

1//! Tokenizer bridge — wraps HuggingFace `tokenizers` crate.
2//!
3//! Provides encoding (text → token IDs) and decoding (token IDs → text)
4//! using the tokenizer configuration embedded in GGUF model files or
5//! loaded from separate tokenizer files.
6//!
//! When neither `tokenizer-onig` nor `tokenizer-wasm` is enabled (e.g. for
//! bare no_std-like WASM targets), a stub is compiled instead: its
//! constructors and `encode`/`decode` return
//! `RuntimeError::TokenizerNotAvailable`, and the remaining accessors report
//! an empty vocabulary, so that the crate still compiles without any regex
//! or C dependencies.
11//!
12//! Feature matrix:
13//! - `tokenizer-onig`  — uses HuggingFace tokenizers with Oniguruma (C regex, native only)
14//! - `tokenizer-wasm`  — uses HuggingFace tokenizers with fancy-regex (pure Rust, wasm32-safe)
15//! - neither           — stub that returns `TokenizerNotAvailable`
16
17use crate::error::{RuntimeError, RuntimeResult};
18
19// ─── Full implementation when tokenizer-onig OR tokenizer-wasm is enabled ────
20
/// Thin wrapper around a HuggingFace `tokenizers::Tokenizer`.
///
/// Only compiled when the `tokenizer-onig` or `tokenizer-wasm` feature is on.
#[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
pub struct TokenizerBridge {
    /// Backing tokenizer used for encoding, decoding and vocabulary lookups.
    tokenizer: tokenizers::Tokenizer,
    /// Lazily-filled `(token_id, bytes)` table; initialised at most once, by
    /// the first `vocab_bytes_cached()` call.
    cached_vocab: std::sync::OnceLock<Vec<(u32, Vec<u8>)>>,
}
29
#[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
impl TokenizerBridge {
    /// Wrap an already-built tokenizer, starting with an empty vocab cache.
    fn from_tokenizer(tokenizer: tokenizers::Tokenizer) -> Self {
        Self {
            tokenizer,
            cached_vocab: std::sync::OnceLock::new(),
        }
    }

    /// Load a tokenizer from a JSON file path.
    ///
    /// # Errors
    ///
    /// Returns `RuntimeError::TokenizerError` (its message carries the path
    /// and the underlying cause) when the file cannot be loaded.
    pub fn from_file(path: &str) -> RuntimeResult<Self> {
        tokenizers::Tokenizer::from_file(path)
            .map(Self::from_tokenizer)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("failed to load tokenizer from {path}: {e}"),
            })
    }

    /// Create a tokenizer from JSON bytes (e.g., from GGUF metadata).
    ///
    /// # Errors
    ///
    /// Returns `RuntimeError::TokenizerError` when the bytes cannot be parsed
    /// as a tokenizer definition.
    pub fn from_bytes(json: &[u8]) -> RuntimeResult<Self> {
        tokenizers::Tokenizer::from_bytes(json)
            .map(Self::from_tokenizer)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("failed to parse tokenizer JSON: {e}"),
            })
    }

    /// Encode text to token IDs.
    ///
    /// The tokenizer is invoked with `add_special_tokens = false`, so the
    /// caller is responsible for adding BOS/EOS ids where required.
    ///
    /// # Errors
    ///
    /// Returns `RuntimeError::TokenizerError` when the tokenizer rejects the input.
    pub fn encode(&self, text: &str) -> RuntimeResult<Vec<u32>> {
        self.tokenizer
            .encode(text, false)
            .map(|encoding| encoding.get_ids().to_vec())
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("encoding failed: {e}"),
            })
    }

    /// Decode token IDs back to text.
    ///
    /// The tokenizer is invoked with `skip_special_tokens = true`, so special
    /// tokens do not appear in the returned string.
    ///
    /// # Errors
    ///
    /// Returns `RuntimeError::TokenizerError` when decoding fails.
    pub fn decode(&self, tokens: &[u32]) -> RuntimeResult<String> {
        self.tokenizer
            .decode(tokens, true)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("decoding failed: {e}"),
            })
    }

    /// Get the vocabulary size, counting added tokens.
    pub fn vocab_size(&self) -> usize {
        self.tokenizer.get_vocab_size(true)
    }

    /// Get the BOS (beginning of sequence) token ID, if any.
    ///
    /// Probes the literal tokens `<s>` and `<|begin_of_text|>`, in that order.
    pub fn bos_token_id(&self) -> Option<u32> {
        self.tokenizer
            .token_to_id("<s>")
            .or_else(|| self.tokenizer.token_to_id("<|begin_of_text|>"))
    }

    /// Get the EOS (end of sequence) token ID, if any.
    ///
    /// Probes the literal tokens `</s>`, `<|end_of_text|>` and
    /// `<|endoftext|>`, in that order.
    pub fn eos_token_id(&self) -> Option<u32> {
        self.tokenizer
            .token_to_id("</s>")
            .or_else(|| self.tokenizer.token_to_id("<|end_of_text|>"))
            .or_else(|| self.tokenizer.token_to_id("<|endoftext|>"))
    }

    /// Get the string representation of a single token ID.
    ///
    /// Returns `None` if the id is not in the vocabulary.
    pub fn id_to_token(&self, id: u32) -> Option<String> {
        self.tokenizer.id_to_token(id)
    }

    /// Get the byte representation of a single token ID.
    ///
    /// The bytes are the UTF-8 encoding of the single-token decode performed
    /// with `skip_special_tokens = false`. Returns `None` if the id is unknown
    /// or decoding fails.
    ///
    /// NOTE(review): a token that is only part of a multi-byte UTF-8 sequence
    /// may not round-trip through a `String` — confirm this is acceptable for
    /// the grammar-masking consumer.
    pub fn token_to_bytes(&self, id: u32) -> Option<Vec<u8>> {
        // `decode` silently drops ids it cannot resolve and returns an empty
        // string, which would surface here as `Some(vec![])`. Check membership
        // explicitly so unknown ids really produce `None`.
        self.tokenizer.id_to_token(id)?;
        self.tokenizer
            .decode(&[id], false)
            .ok()
            .map(String::into_bytes)
    }

    /// Build the full vocab as `(token_id, byte_representation)` pairs.
    ///
    /// This is used to pre-compute the vocabulary for grammar masking.
    /// The result is sorted by token id and recomputed on every call; use
    /// `vocab_bytes_cached()` to share one computation across generation steps.
    pub fn vocab_bytes(&self) -> Vec<(u32, Vec<u8>)> {
        let mut pairs: Vec<(u32, Vec<u8>)> = self
            .tokenizer
            .get_vocab(true)
            .into_values()
            .filter_map(|id| self.token_to_bytes(id).map(|bytes| (id, bytes)))
            .collect();
        // The vocab map has no stable iteration order; sort by id for
        // determinism and cache-friendly iteration.
        pairs.sort_unstable_by_key(|&(id, _)| id);
        pairs
    }

    /// Get cached vocabulary bytes. Computes on first call, returns cached thereafter.
    pub fn vocab_bytes_cached(&self) -> &[(u32, Vec<u8>)] {
        self.cached_vocab.get_or_init(|| self.vocab_bytes())
    }
}
134
135// ─── Stub when neither tokenizer feature is active ───────────────────────────
136
#[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
/// Tokenizer bridge stub — compiled when neither `tokenizer-onig` nor
/// `tokenizer-wasm` is enabled.
///
/// The constructors and `encode`/`decode` return
/// [`RuntimeError::TokenizerNotAvailable`] so callers can detect the missing
/// functionality at runtime rather than at link time; the remaining accessors
/// report an empty vocabulary (`0`, `None` or an empty collection).
///
/// `Debug` is derived so that a `RuntimeResult<TokenizerBridge>` can be
/// debug-formatted (e.g. inside assertion messages).
#[derive(Debug)]
pub struct TokenizerBridge;
143
144#[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
145impl TokenizerBridge {
146    /// Always returns `Err(TokenizerNotAvailable)` — no C tokenizer available.
147    pub fn from_file(_path: &str) -> RuntimeResult<Self> {
148        Err(RuntimeError::TokenizerNotAvailable)
149    }
150
151    /// Always returns `Err(TokenizerNotAvailable)` — no C tokenizer available.
152    pub fn from_bytes(_json: &[u8]) -> RuntimeResult<Self> {
153        Err(RuntimeError::TokenizerNotAvailable)
154    }
155
156    /// Always returns `Err(TokenizerNotAvailable)`.
157    pub fn encode(&self, _text: &str) -> RuntimeResult<Vec<u32>> {
158        Err(RuntimeError::TokenizerNotAvailable)
159    }
160
161    /// Always returns `Err(TokenizerNotAvailable)`.
162    pub fn decode(&self, _tokens: &[u32]) -> RuntimeResult<String> {
163        Err(RuntimeError::TokenizerNotAvailable)
164    }
165
166    /// Returns 0 — stub has no vocabulary.
167    pub fn vocab_size(&self) -> usize {
168        0
169    }
170
171    /// Returns `None` — stub has no BOS token.
172    pub fn bos_token_id(&self) -> Option<u32> {
173        None
174    }
175
176    /// Returns `None` — stub has no EOS token.
177    pub fn eos_token_id(&self) -> Option<u32> {
178        None
179    }
180
181    /// Returns `None` — stub has no vocabulary.
182    pub fn id_to_token(&self, _id: u32) -> Option<String> {
183        None
184    }
185
186    /// Returns `None` — stub has no vocabulary.
187    pub fn token_to_bytes(&self, _id: u32) -> Option<Vec<u8>> {
188        None
189    }
190
191    /// Returns an empty vec — stub has no vocabulary.
192    pub fn vocab_bytes(&self) -> Vec<(u32, Vec<u8>)> {
193        Vec::new()
194    }
195
196    /// Get cached vocabulary bytes. Returns empty — stub has no vocabulary.
197    pub fn vocab_bytes_cached(&self) -> &[(u32, Vec<u8>)] {
198        &[]
199    }
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// Loading from a non-existent file must return an error in all configs.
    #[test]
    fn test_from_file_nonexistent_errors() {
        let result = TokenizerBridge::from_file("/nonexistent/path/tokenizer_test.json");
        assert!(result.is_err(), "missing tokenizer file should error");
    }

    /// When neither `tokenizer-onig` nor `tokenizer-wasm` is active, the stub
    /// constructors return `TokenizerNotAvailable`.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_from_file_returns_not_available() {
        let result = TokenizerBridge::from_file("/any/path.json");
        // Only the error side is formatted so the assertion does not require
        // `TokenizerBridge: Debug`.
        assert!(
            matches!(result, Err(RuntimeError::TokenizerNotAvailable)),
            "stub should return TokenizerNotAvailable, got {:?}",
            result.as_ref().err()
        );
    }

    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_from_bytes_returns_not_available() {
        let result = TokenizerBridge::from_bytes(b"{}");
        // Only the error side is formatted so the assertion does not require
        // `TokenizerBridge: Debug`.
        assert!(
            matches!(result, Err(RuntimeError::TokenizerNotAvailable)),
            "stub should return TokenizerNotAvailable, got {:?}",
            result.as_ref().err()
        );
    }

    /// When any tokenizer backend is enabled, invalid JSON must error.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_invalid_json_errors() {
        let result = TokenizerBridge::from_bytes(b"not valid json {{{{");
        assert!(
            result.is_err(),
            "invalid tokenizer JSON should return an error"
        );
    }

    /// vocab_size returns 0 for the no-tokenizer stub.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_vocab_size_is_zero() {
        // The stub is a unit struct, so it can be built directly even though
        // both of its constructors fail.
        let stub = TokenizerBridge;
        assert_eq!(stub.vocab_size(), 0, "stub must report an empty vocabulary");
    }

    // ─── Full tokenizer tests (require tokenizer-onig or tokenizer-wasm) ──────

    /// Minimal BPE tokenizer JSON that the tokenizers crate can parse.
    /// Uses a very small vocabulary sufficient for round-trip tests.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    const MINIMAL_TOKENIZER_JSON: &str = r#"{
      "version": "1.0",
      "truncation": null,
      "padding": null,
      "added_tokens": [
        {"id": 0, "special": true, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false},
        {"id": 1, "special": true, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false},
        {"id": 2, "special": true, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false}
      ],
      "normalizer": null,
      "pre_tokenizer": null,
      "post_processor": null,
      "decoder": null,
      "model": {
        "type": "BPE",
        "dropout": null,
        "unk_token": "<unk>",
        "continuing_subword_prefix": null,
        "end_of_word_suffix": null,
        "fuse_unk": false,
        "byte_fallback": false,
        "vocab": {
          "<unk>": 0,
          "<s>": 1,
          "</s>": 2,
          "h": 3,
          "e": 4,
          "l": 5,
          "o": 6,
          " ": 7,
          "w": 8,
          "r": 9,
          "d": 10,
          "a": 11,
          "b": 12
        },
        "merges": []
      }
    }"#;

    /// Build a bridge from `MINIMAL_TOKENIZER_JSON`, panicking if it fails to parse.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    fn minimal_bridge() -> TokenizerBridge {
        TokenizerBridge::from_bytes(MINIMAL_TOKENIZER_JSON.as_bytes())
            .expect("test: valid tokenizer JSON should parse")
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_valid_json_succeeds() {
        let bridge = minimal_bridge();
        assert!(
            bridge.vocab_size() > 0,
            "vocab_size should be positive after loading"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_bos_token_id_found() {
        let bridge = minimal_bridge();
        // <s> is id=1 in our minimal vocab
        assert_eq!(
            bridge.bos_token_id(),
            Some(1),
            "BOS token <s> should have id=1"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_eos_token_id_found() {
        let bridge = minimal_bridge();
        // </s> is id=2 in our minimal vocab
        assert_eq!(
            bridge.eos_token_id(),
            Some(2),
            "EOS token </s> should have id=2"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_encode_produces_tokens() {
        let bridge = minimal_bridge();
        // With no merges and character-level vocab, "hello" → individual char tokens
        let tokens = bridge.encode("hello").expect("test: encode should succeed");
        assert!(
            !tokens.is_empty(),
            "encoding 'hello' should produce at least one token"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_decode_empty_slice_returns_empty_string() {
        let bridge = minimal_bridge();
        let decoded = bridge
            .decode(&[])
            .expect("test: decoding empty slice should succeed");
        assert_eq!(
            decoded, "",
            "decoding empty token list should return empty string"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_encode_decode_roundtrip() {
        let bridge = minimal_bridge();
        // All chars in "hello" are in our single-char vocab, so encode→decode should work
        let tokens = bridge.encode("hello").expect("test: encode should succeed");
        let decoded = bridge.decode(&tokens).expect("test: decode should succeed");
        // The tokenizers crate may add spaces; just verify no panic and non-empty result
        assert!(
            !decoded.is_empty() || tokens.is_empty(),
            "decoded output consistency"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_vocab_size_matches_json() {
        let bridge = minimal_bridge();
        // We defined 13 tokens in the vocab (0..=12)
        assert_eq!(
            bridge.vocab_size(),
            13,
            "vocab_size should match the number of defined tokens"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_token_to_bytes_special_token() {
        let bridge = minimal_bridge();
        // Token id 0 = <unk>; token_to_bytes returns bytes of the decoded string
        // The result depends on skip_special_tokens=false behaviour
        let _bytes = bridge.token_to_bytes(0); // just verify no panic
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_vocab_bytes_is_sorted() {
        let bridge = minimal_bridge();
        let pairs = bridge.vocab_bytes();
        for window in pairs.windows(2) {
            assert!(
                window[0].0 <= window[1].0,
                "vocab_bytes should be sorted by token id, got {} > {}",
                window[0].0,
                window[1].0
            );
        }
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_invalid_json_structure_errors() {
        // Valid JSON but not a tokenizer schema
        let result = TokenizerBridge::from_bytes(b"{\"not\": \"a tokenizer\"}");
        assert!(result.is_err(), "non-tokenizer JSON should return an error");
    }
}