oxillama-runtime 0.1.3

Inference engine — KV cache, sampling, tokenizer bridge
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
//! Tokenizer bridge — wraps HuggingFace `tokenizers` crate.
//!
//! Provides encoding (text → token IDs) and decoding (token IDs → text)
//! using the tokenizer configuration embedded in GGUF model files or
//! loaded from separate tokenizer files.
//!
//! When neither `tokenizer-onig` nor `tokenizer-wasm` is enabled (e.g. for
//! bare no_std-like WASM targets), all methods return
//! `RuntimeError::TokenizerNotAvailable` so that the crate still compiles
//! without any regex or C dependencies.
//!
//! Feature matrix:
//! - `tokenizer-onig`  — uses HuggingFace tokenizers with Oniguruma (C regex, native only)
//! - `tokenizer-wasm`  — uses HuggingFace tokenizers with fancy-regex (pure Rust, wasm32-safe)
//! - neither           — stub that returns `TokenizerNotAvailable`

use crate::error::{RuntimeError, RuntimeResult};

// ─── Full implementation when tokenizer-onig OR tokenizer-wasm is enabled ────

/// Bridge type that owns a HuggingFace `tokenizers::Tokenizer`.
///
/// Only compiled when the `tokenizer-onig` or `tokenizer-wasm` feature is
/// enabled.
#[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
pub struct TokenizerBridge {
    /// Lazily built `(token_id, bytes)` table: filled by the first call to
    /// `vocab_bytes_cached()` and reused by every later call.
    cached_vocab: std::sync::OnceLock<Vec<(u32, Vec<u8>)>>,
    /// The wrapped HuggingFace tokenizer that does the actual work.
    tokenizer: tokenizers::Tokenizer,
}

#[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
impl TokenizerBridge {
    /// Wrap an already-built tokenizer together with an empty vocab cache.
    fn from_tokenizer(tokenizer: tokenizers::Tokenizer) -> Self {
        Self {
            tokenizer,
            cached_vocab: std::sync::OnceLock::new(),
        }
    }

    /// Load a tokenizer from a JSON file path.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] (with the path and the
    /// underlying cause in the message) if the file cannot be loaded.
    pub fn from_file(path: &str) -> RuntimeResult<Self> {
        tokenizers::Tokenizer::from_file(path)
            .map(Self::from_tokenizer)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("failed to load tokenizer from {path}: {e}"),
            })
    }

    /// Create a tokenizer from JSON bytes (e.g., from GGUF metadata).
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] if `json` is not a valid
    /// tokenizer definition.
    pub fn from_bytes(json: &[u8]) -> RuntimeResult<Self> {
        tokenizers::Tokenizer::from_bytes(json)
            .map(Self::from_tokenizer)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("failed to parse tokenizer JSON: {e}"),
            })
    }

    /// Encode text to token IDs.
    ///
    /// The underlying `encode` call is made with its boolean flag set to
    /// `false` (no special tokens are added by the tokenizer), so callers
    /// that need BOS/EOS must add them themselves.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] if the tokenizer rejects the
    /// input.
    pub fn encode(&self, text: &str) -> RuntimeResult<Vec<u32>> {
        let encoding =
            self.tokenizer
                .encode(text, false)
                .map_err(|e| RuntimeError::TokenizerError {
                    message: format!("encoding failed: {e}"),
                })?;
        Ok(encoding.get_ids().to_vec())
    }

    /// Decode token IDs back to text, skipping special tokens.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] if decoding fails.
    pub fn decode(&self, tokens: &[u32]) -> RuntimeResult<String> {
        self.tokenizer
            .decode(tokens, true)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("decoding failed: {e}"),
            })
    }

    /// Get the vocabulary size, counting added tokens.
    pub fn vocab_size(&self) -> usize {
        self.tokenizer.get_vocab_size(true)
    }

    /// Get the BOS (beginning of sequence) token ID, if any.
    ///
    /// Only the literal spellings `<s>` and `<|begin_of_text|>` are probed,
    /// in that order; a tokenizer using any other BOS spelling yields `None`.
    pub fn bos_token_id(&self) -> Option<u32> {
        self.tokenizer
            .token_to_id("<s>")
            .or_else(|| self.tokenizer.token_to_id("<|begin_of_text|>"))
    }

    /// Get the EOS (end of sequence) token ID, if any.
    ///
    /// Probes `</s>`, `<|end_of_text|>` and `<|endoftext|>` in that order and
    /// returns the first one present in the vocabulary.
    pub fn eos_token_id(&self) -> Option<u32> {
        self.tokenizer
            .token_to_id("</s>")
            .or_else(|| self.tokenizer.token_to_id("<|end_of_text|>"))
            .or_else(|| self.tokenizer.token_to_id("<|endoftext|>"))
    }

    /// Get the string representation of a single token ID.
    ///
    /// Returns `None` if the id is not in the vocabulary.
    pub fn id_to_token(&self, id: u32) -> Option<String> {
        self.tokenizer.id_to_token(id)
    }

    /// Get the byte representation of a single token ID.
    ///
    /// Uses decode (skip_special_tokens=false) and returns the UTF-8 bytes of
    /// the decoded string. Returns `None` if the id is unknown or decoding
    /// fails.
    ///
    /// NOTE(review): the bytes come from a decoded `String`, so a token that
    /// is not valid UTF-8 on its own may not round-trip to its raw bytes —
    /// confirm against the decoder in use.
    pub fn token_to_bytes(&self, id: u32) -> Option<Vec<u8>> {
        // `decode` drops ids it cannot resolve and still returns `Ok("")`,
        // so an unknown id would otherwise surface as `Some(vec![])`.
        // Check membership first to honour the documented `None` contract.
        self.tokenizer.id_to_token(id)?;
        self.tokenizer
            .decode(&[id], false)
            .ok()
            .map(String::into_bytes)
    }

    /// Build the full vocab as `(token_id, byte_representation)` pairs.
    ///
    /// This is used to pre-compute the vocabulary for grammar masking.
    /// Every call recomputes the table; use [`Self::vocab_bytes_cached`] to
    /// compute it once and share it across generation steps.
    pub fn vocab_bytes(&self) -> Vec<(u32, Vec<u8>)> {
        let vocab = self.tokenizer.get_vocab(true);
        let mut result: Vec<(u32, Vec<u8>)> = vocab
            .into_values()
            .filter_map(|id| self.token_to_bytes(id).map(|bytes| (id, bytes)))
            .collect();
        // Sort by id for determinism and cache-friendly iteration.
        result.sort_unstable_by_key(|&(id, _)| id);
        result
    }

    /// Get cached vocabulary bytes. Computes on first call, returns cached thereafter.
    pub fn vocab_bytes_cached(&self) -> &[(u32, Vec<u8>)] {
        self.cached_vocab.get_or_init(|| self.vocab_bytes())
    }
}

// ─── Stub when neither tokenizer feature is active ───────────────────────────

#[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
/// Tokenizer bridge stub — not available without `tokenizer-onig` or `tokenizer-wasm`.
///
/// The fallible methods return [`RuntimeError::TokenizerNotAvailable`] so
/// callers can detect the missing functionality at runtime rather than at
/// link time; the infallible accessors report an empty vocabulary.
///
/// `Debug` is derived so that `RuntimeResult<TokenizerBridge>` can be
/// formatted with `{:?}` (e.g. in assertion messages).
#[derive(Debug)]
pub struct TokenizerBridge;

#[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
impl TokenizerBridge {
    /// Single source of the "no tokenizer backend compiled in" failure.
    fn unavailable<T>() -> RuntimeResult<T> {
        Err(RuntimeError::TokenizerNotAvailable)
    }

    /// Never succeeds: no tokenizer backend is compiled in, so this yields
    /// `Err(TokenizerNotAvailable)` regardless of the path.
    pub fn from_file(_path: &str) -> RuntimeResult<Self> {
        Self::unavailable()
    }

    /// Never succeeds: no tokenizer backend is compiled in, so this yields
    /// `Err(TokenizerNotAvailable)` regardless of the JSON supplied.
    pub fn from_bytes(_json: &[u8]) -> RuntimeResult<Self> {
        Self::unavailable()
    }

    /// Encoding is unsupported in the stub; yields `Err(TokenizerNotAvailable)`.
    pub fn encode(&self, _text: &str) -> RuntimeResult<Vec<u32>> {
        Self::unavailable()
    }

    /// Decoding is unsupported in the stub; yields `Err(TokenizerNotAvailable)`.
    pub fn decode(&self, _tokens: &[u32]) -> RuntimeResult<String> {
        Self::unavailable()
    }

    /// The stub carries no vocabulary, so its size is always 0.
    pub fn vocab_size(&self) -> usize {
        0
    }

    /// The stub knows no BOS token; always `None`.
    pub fn bos_token_id(&self) -> Option<u32> {
        None
    }

    /// The stub knows no EOS token; always `None`.
    pub fn eos_token_id(&self) -> Option<u32> {
        None
    }

    /// No id maps to a token string in the stub; always `None`.
    pub fn id_to_token(&self, _id: u32) -> Option<String> {
        None
    }

    /// No id maps to bytes in the stub; always `None`.
    pub fn token_to_bytes(&self, _id: u32) -> Option<Vec<u8>> {
        None
    }

    /// The stub's vocabulary table is always an empty vec.
    pub fn vocab_bytes(&self) -> Vec<(u32, Vec<u8>)> {
        vec![]
    }

    /// Cached counterpart of `vocab_bytes`; always the empty slice in the stub.
    pub fn vocab_bytes_cached(&self) -> &[(u32, Vec<u8>)] {
        &[]
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for both build configurations of `TokenizerBridge`:
    //! the stub (no tokenizer feature) and the full HuggingFace-backed bridge.

    use super::*;

    /// Loading from a non-existent file must return an error in all configs.
    #[test]
    fn test_from_file_nonexistent_errors() {
        let result = TokenizerBridge::from_file("/nonexistent/path/tokenizer_test.json");
        assert!(result.is_err(), "missing tokenizer file should error");
    }

    // ─── Stub tests (neither tokenizer feature active) ────────────────────────

    /// Assert that a stub constructor failed with `TokenizerNotAvailable`.
    ///
    /// Matches on the result instead of formatting it with `{:?}` so the
    /// check does not require `TokenizerBridge` to implement `Debug`.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    fn assert_not_available(result: RuntimeResult<TokenizerBridge>) {
        match result {
            Err(RuntimeError::TokenizerNotAvailable) => {}
            Err(other) => panic!("stub should return TokenizerNotAvailable, got {other:?}"),
            Ok(_) => panic!("stub should return TokenizerNotAvailable, got Ok(_)"),
        }
    }

    /// When neither `tokenizer-onig` nor `tokenizer-wasm` is active, the stub
    /// constructors return `TokenizerNotAvailable`.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_from_file_returns_not_available() {
        assert_not_available(TokenizerBridge::from_file("/any/path.json"));
    }

    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_from_bytes_returns_not_available() {
        assert_not_available(TokenizerBridge::from_bytes(b"{}"));
    }

    /// vocab_size returns 0 for the no-tokenizer stub.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_vocab_size_is_zero() {
        // The stub is a unit struct, so it can be named directly even though
        // both constructors refuse to hand one out.
        let stub = TokenizerBridge;
        assert_eq!(stub.vocab_size(), 0, "stub vocab_size should be 0");
        assert!(
            stub.vocab_bytes().is_empty(),
            "stub vocab_bytes should be empty"
        );
        assert!(
            stub.vocab_bytes_cached().is_empty(),
            "stub vocab_bytes_cached should be empty"
        );
    }

    // ─── Full tokenizer tests (require tokenizer-onig or tokenizer-wasm) ──────

    /// When any tokenizer backend is enabled, invalid JSON must error.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_invalid_json_errors() {
        let result = TokenizerBridge::from_bytes(b"not valid json {{{{");
        assert!(
            result.is_err(),
            "invalid tokenizer JSON should return an error"
        );
    }

    /// Minimal BPE tokenizer JSON that the tokenizers crate can parse.
    /// Uses a very small vocabulary sufficient for round-trip tests.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    const MINIMAL_TOKENIZER_JSON: &str = r#"{
      "version": "1.0",
      "truncation": null,
      "padding": null,
      "added_tokens": [
        {"id": 0, "special": true, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false},
        {"id": 1, "special": true, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false},
        {"id": 2, "special": true, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false}
      ],
      "normalizer": null,
      "pre_tokenizer": null,
      "post_processor": null,
      "decoder": null,
      "model": {
        "type": "BPE",
        "dropout": null,
        "unk_token": "<unk>",
        "continuing_subword_prefix": null,
        "end_of_word_suffix": null,
        "fuse_unk": false,
        "byte_fallback": false,
        "vocab": {
          "<unk>": 0,
          "<s>": 1,
          "</s>": 2,
          "h": 3,
          "e": 4,
          "l": 5,
          "o": 6,
          " ": 7,
          "w": 8,
          "r": 9,
          "d": 10,
          "a": 11,
          "b": 12
        },
        "merges": []
      }
    }"#;

    /// Build a bridge from [`MINIMAL_TOKENIZER_JSON`], panicking if it fails
    /// to parse (which would itself be a test failure).
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    fn minimal_bridge() -> TokenizerBridge {
        TokenizerBridge::from_bytes(MINIMAL_TOKENIZER_JSON.as_bytes())
            .expect("test: valid tokenizer JSON should parse")
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_valid_json_succeeds() {
        let bridge = minimal_bridge();
        assert!(
            bridge.vocab_size() > 0,
            "vocab_size should be positive after loading"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_bos_token_id_found() {
        let bridge = minimal_bridge();
        // <s> is id=1 in our minimal vocab
        assert_eq!(
            bridge.bos_token_id(),
            Some(1),
            "BOS token <s> should have id=1"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_eos_token_id_found() {
        let bridge = minimal_bridge();
        // </s> is id=2 in our minimal vocab
        assert_eq!(
            bridge.eos_token_id(),
            Some(2),
            "EOS token </s> should have id=2"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_encode_produces_tokens() {
        let bridge = minimal_bridge();
        // With no merges and character-level vocab, "hello" → individual char tokens
        let tokens = bridge.encode("hello").expect("test: encode should succeed");
        assert!(
            !tokens.is_empty(),
            "encoding 'hello' should produce at least one token"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_decode_empty_slice_returns_empty_string() {
        let bridge = minimal_bridge();
        let decoded = bridge
            .decode(&[])
            .expect("test: decoding empty slice should succeed");
        assert_eq!(
            decoded, "",
            "decoding empty token list should return empty string"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_encode_decode_roundtrip() {
        let bridge = minimal_bridge();
        // All chars in "hello" are in our single-char vocab, so encode→decode should work
        let tokens = bridge.encode("hello").expect("test: encode should succeed");
        let decoded = bridge.decode(&tokens).expect("test: decode should succeed");
        // The tokenizers crate may add spaces; just verify no panic and non-empty result
        assert!(
            !decoded.is_empty() || tokens.is_empty(),
            "decoded output consistency"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_vocab_size_matches_json() {
        let bridge = minimal_bridge();
        // We defined 13 tokens in the vocab (0..=12)
        assert_eq!(
            bridge.vocab_size(),
            13,
            "vocab_size should match the number of defined tokens"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_token_to_bytes_special_token() {
        let bridge = minimal_bridge();
        // Token id 0 = <unk>; token_to_bytes returns bytes of the decoded string
        // The result depends on skip_special_tokens=false behaviour
        let _bytes = bridge.token_to_bytes(0); // just verify no panic
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_vocab_bytes_is_sorted() {
        let bridge = minimal_bridge();
        let pairs = bridge.vocab_bytes();
        for window in pairs.windows(2) {
            assert!(
                window[0].0 <= window[1].0,
                "vocab_bytes should be sorted by token id, got {} > {}",
                window[0].0,
                window[1].0
            );
        }
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_invalid_json_structure_errors() {
        // Valid JSON but not a tokenizer schema
        let result = TokenizerBridge::from_bytes(b"{\"not\": \"a tokenizer\"}");
        assert!(result.is_err(), "non-tokenizer JSON should return an error");
    }
}