1use crate::error::{RuntimeError, RuntimeResult};
18
#[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
/// Wrapper around a [`tokenizers::Tokenizer`] plus a lazily filled byte
/// vocabulary cache.
///
/// This definition is compiled only when the `tokenizer-onig` or
/// `tokenizer-wasm` feature is enabled.
pub struct TokenizerBridge {
    /// The wrapped tokenizer; every method on the bridge delegates to it.
    tokenizer: tokenizers::Tokenizer,

    /// `(token id, bytes)` pairs, populated on the first call to
    /// `vocab_bytes_cached` and reused afterwards.
    cached_vocab: std::sync::OnceLock<Vec<(u32, Vec<u8>)>>,
}
29
#[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
impl TokenizerBridge {
    /// Loads a tokenizer definition from the file at `path`.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] (with the path and the
    /// underlying cause in the message) when the file cannot be loaded.
    pub fn from_file(path: &str) -> RuntimeResult<Self> {
        let tokenizer =
            tokenizers::Tokenizer::from_file(path).map_err(|e| RuntimeError::TokenizerError {
                message: format!("failed to load tokenizer from {path}: {e}"),
            })?;
        Ok(Self {
            tokenizer,
            cached_vocab: std::sync::OnceLock::new(),
        })
    }

    /// Builds a tokenizer from an in-memory tokenizer JSON document.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] when `json` cannot be parsed
    /// as a tokenizer definition.
    pub fn from_bytes(json: &[u8]) -> RuntimeResult<Self> {
        let tokenizer =
            tokenizers::Tokenizer::from_bytes(json).map_err(|e| RuntimeError::TokenizerError {
                message: format!("failed to parse tokenizer JSON: {e}"),
            })?;
        Ok(Self {
            tokenizer,
            cached_vocab: std::sync::OnceLock::new(),
        })
    }

    /// Encodes `text` into token ids.
    ///
    /// The underlying encoder is called with `add_special_tokens = false`.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] when encoding fails.
    pub fn encode(&self, text: &str) -> RuntimeResult<Vec<u32>> {
        let encoding =
            self.tokenizer
                .encode(text, false)
                .map_err(|e| RuntimeError::TokenizerError {
                    message: format!("encoding failed: {e}"),
                })?;
        Ok(encoding.get_ids().to_vec())
    }

    /// Decodes a sequence of token ids back into text.
    ///
    /// The underlying decoder is called with `skip_special_tokens = true`.
    ///
    /// # Errors
    ///
    /// Returns [`RuntimeError::TokenizerError`] when decoding fails.
    pub fn decode(&self, tokens: &[u32]) -> RuntimeResult<String> {
        self.tokenizer
            .decode(tokens, true)
            .map_err(|e| RuntimeError::TokenizerError {
                message: format!("decoding failed: {e}"),
            })
    }

    /// Returns the vocabulary size reported by the tokenizer, counting added
    /// tokens.
    pub fn vocab_size(&self) -> usize {
        self.tokenizer.get_vocab_size(true)
    }

    /// Looks up the beginning-of-sequence token id.
    ///
    /// Tries `<s>` first and falls back to `<|begin_of_text|>`; returns `None`
    /// when neither token is known to the tokenizer.
    pub fn bos_token_id(&self) -> Option<u32> {
        self.tokenizer
            .token_to_id("<s>")
            .or_else(|| self.tokenizer.token_to_id("<|begin_of_text|>"))
    }

    /// Looks up the end-of-sequence token id.
    ///
    /// Tries `</s>`, then `<|end_of_text|>`, then `<|endoftext|>`; returns
    /// `None` when none of them is known to the tokenizer.
    pub fn eos_token_id(&self) -> Option<u32> {
        self.tokenizer
            .token_to_id("</s>")
            .or_else(|| self.tokenizer.token_to_id("<|end_of_text|>"))
            .or_else(|| self.tokenizer.token_to_id("<|endoftext|>"))
    }

    /// Returns the token string for `id`, or `None` when the id is unknown.
    pub fn id_to_token(&self, id: u32) -> Option<String> {
        self.tokenizer.id_to_token(id)
    }

    /// Returns the UTF-8 bytes produced by decoding the single token `id`
    /// (special tokens are not skipped).
    ///
    /// Returns `None` when `id` is not part of the vocabulary or when
    /// decoding fails.
    pub fn token_to_bytes(&self, id: u32) -> Option<Vec<u8>> {
        // `Tokenizer::decode` drops ids it cannot resolve and would report an
        // unknown id as an empty string; reject such ids explicitly so the
        // result agrees with `id_to_token`.
        self.tokenizer.id_to_token(id)?;
        self.decode_single(id)
    }

    /// Decodes one token id to bytes without checking that the id exists.
    fn decode_single(&self, id: u32) -> Option<Vec<u8>> {
        self.tokenizer
            .decode(&[id], false)
            .ok()
            .map(String::into_bytes)
    }

    /// Builds the `(token id, bytes)` table for the whole vocabulary
    /// (including added tokens), sorted by ascending token id.
    ///
    /// Tokens whose decoding fails are omitted. The table is rebuilt on every
    /// call; use [`Self::vocab_bytes_cached`] to compute it only once.
    pub fn vocab_bytes(&self) -> Vec<(u32, Vec<u8>)> {
        let mut pairs: Vec<(u32, Vec<u8>)> = self
            .tokenizer
            .get_vocab(true)
            .into_values()
            // Ids come straight from the vocabulary, so the existence check
            // done by `token_to_bytes` would be redundant here.
            .filter_map(|id| self.decode_single(id).map(|bytes| (id, bytes)))
            .collect();
        // The vocabulary is a hash map, so its iteration order is arbitrary.
        pairs.sort_unstable_by_key(|&(id, _)| id);
        pairs
    }

    /// Returns the table produced by [`Self::vocab_bytes`], computing it on
    /// the first call and serving the stored copy afterwards.
    pub fn vocab_bytes_cached(&self) -> &[(u32, Vec<u8>)] {
        self.cached_vocab.get_or_init(|| self.vocab_bytes())
    }
}
134
#[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
/// Placeholder used when neither the `tokenizer-onig` nor the
/// `tokenizer-wasm` feature is enabled.
///
/// `Debug` is derived so that a `Result` carrying this type can be formatted
/// with `{:?}` (for example in assertion messages).
#[derive(Debug)]
pub struct TokenizerBridge;
143
144#[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
145impl TokenizerBridge {
146 pub fn from_file(_path: &str) -> RuntimeResult<Self> {
148 Err(RuntimeError::TokenizerNotAvailable)
149 }
150
151 pub fn from_bytes(_json: &[u8]) -> RuntimeResult<Self> {
153 Err(RuntimeError::TokenizerNotAvailable)
154 }
155
156 pub fn encode(&self, _text: &str) -> RuntimeResult<Vec<u32>> {
158 Err(RuntimeError::TokenizerNotAvailable)
159 }
160
161 pub fn decode(&self, _tokens: &[u32]) -> RuntimeResult<String> {
163 Err(RuntimeError::TokenizerNotAvailable)
164 }
165
166 pub fn vocab_size(&self) -> usize {
168 0
169 }
170
171 pub fn bos_token_id(&self) -> Option<u32> {
173 None
174 }
175
176 pub fn eos_token_id(&self) -> Option<u32> {
178 None
179 }
180
181 pub fn id_to_token(&self, _id: u32) -> Option<String> {
183 None
184 }
185
186 pub fn token_to_bytes(&self, _id: u32) -> Option<Vec<u8>> {
188 None
189 }
190
191 pub fn vocab_bytes(&self) -> Vec<(u32, Vec<u8>)> {
193 Vec::new()
194 }
195
196 pub fn vocab_bytes_cached(&self) -> &[(u32, Vec<u8>)] {
198 &[]
199 }
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// Loading from a path that does not exist must fail in every build
    /// configuration (real tokenizer or stub).
    #[test]
    fn test_from_file_nonexistent_errors() {
        let result = TokenizerBridge::from_file("/nonexistent/path/tokenizer_test.json");
        assert!(result.is_err(), "missing tokenizer file should error");
    }

    /// The stub constructor reports that no tokenizer backend is compiled in.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_from_file_returns_not_available() {
        let result = TokenizerBridge::from_file("/any/path.json");
        assert!(
            matches!(result, Err(RuntimeError::TokenizerNotAvailable)),
            "stub should return TokenizerNotAvailable, got {result:?}"
        );
    }

    /// Same as above, for the in-memory constructor.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_from_bytes_returns_not_available() {
        let result = TokenizerBridge::from_bytes(b"{}");
        assert!(
            matches!(result, Err(RuntimeError::TokenizerNotAvailable)),
            "stub should return TokenizerNotAvailable, got {result:?}"
        );
    }

    /// Bytes that are not JSON at all must be rejected.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_invalid_json_errors() {
        let result = TokenizerBridge::from_bytes(b"not valid json {{{{");
        assert!(
            result.is_err(),
            "invalid tokenizer JSON should return an error"
        );
    }

    /// The stub reports an empty vocabulary.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_vocab_size_is_zero() {
        // The stub constructors always fail, so build the unit struct directly
        // in order to exercise `vocab_size` itself.
        let bridge = TokenizerBridge;
        assert_eq!(bridge.vocab_size(), 0, "stub vocab_size should be zero");
    }

    /// Every other stub accessor yields "nothing" and every fallible stub
    /// operation reports `TokenizerNotAvailable`.
    #[cfg(not(any(feature = "tokenizer-onig", feature = "tokenizer-wasm")))]
    #[test]
    fn test_stub_accessors_are_empty() {
        let bridge = TokenizerBridge;
        assert!(matches!(
            bridge.encode("hello"),
            Err(RuntimeError::TokenizerNotAvailable)
        ));
        assert!(matches!(
            bridge.decode(&[1, 2, 3]),
            Err(RuntimeError::TokenizerNotAvailable)
        ));
        assert_eq!(bridge.bos_token_id(), None);
        assert_eq!(bridge.eos_token_id(), None);
        assert_eq!(bridge.id_to_token(0), None);
        assert_eq!(bridge.token_to_bytes(0), None);
        assert!(bridge.vocab_bytes().is_empty());
        assert!(bridge.vocab_bytes_cached().is_empty());
    }

    /// Smallest tokenizer definition used by the tests below: a BPE model
    /// with three special tokens, ten single-character tokens and no merges.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    const MINIMAL_TOKENIZER_JSON: &str = r#"{
        "version": "1.0",
        "truncation": null,
        "padding": null,
        "added_tokens": [
            {"id": 0, "special": true, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false},
            {"id": 1, "special": true, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false},
            {"id": 2, "special": true, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false}
        ],
        "normalizer": null,
        "pre_tokenizer": null,
        "post_processor": null,
        "decoder": null,
        "model": {
            "type": "BPE",
            "dropout": null,
            "unk_token": "<unk>",
            "continuing_subword_prefix": null,
            "end_of_word_suffix": null,
            "fuse_unk": false,
            "byte_fallback": false,
            "vocab": {
                "<unk>": 0,
                "<s>": 1,
                "</s>": 2,
                "h": 3,
                "e": 4,
                "l": 5,
                "o": 6,
                " ": 7,
                "w": 8,
                "r": 9,
                "d": 10,
                "a": 11,
                "b": 12
            },
            "merges": []
        }
    }"#;

    /// Parses [`MINIMAL_TOKENIZER_JSON`], panicking with a clear message if
    /// the fixture itself is rejected.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    fn minimal_bridge() -> TokenizerBridge {
        TokenizerBridge::from_bytes(MINIMAL_TOKENIZER_JSON.as_bytes())
            .expect("test: valid tokenizer JSON should parse")
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_valid_json_succeeds() {
        let bridge = minimal_bridge();
        assert!(
            bridge.vocab_size() > 0,
            "vocab_size should be positive after loading"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_bos_token_id_found() {
        let bridge = minimal_bridge();
        assert_eq!(
            bridge.bos_token_id(),
            Some(1),
            "BOS token <s> should have id=1"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_eos_token_id_found() {
        let bridge = minimal_bridge();
        assert_eq!(
            bridge.eos_token_id(),
            Some(2),
            "EOS token </s> should have id=2"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_encode_produces_tokens() {
        let bridge = minimal_bridge();
        let tokens = bridge.encode("hello").expect("test: encode should succeed");
        assert!(
            !tokens.is_empty(),
            "encoding 'hello' should produce at least one token"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_decode_empty_slice_returns_empty_string() {
        let bridge = minimal_bridge();
        let decoded = bridge
            .decode(&[])
            .expect("test: decoding empty slice should succeed");
        assert_eq!(
            decoded, "",
            "decoding empty token list should return empty string"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_encode_decode_roundtrip() {
        let bridge = minimal_bridge();
        let tokens = bridge.encode("hello").expect("test: encode should succeed");
        let decoded = bridge.decode(&tokens).expect("test: decode should succeed");
        // Only checks consistency: a non-empty token list must not decode to
        // an empty string.
        assert!(
            !decoded.is_empty() || tokens.is_empty(),
            "decoded output consistency"
        );
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_vocab_size_matches_json() {
        let bridge = minimal_bridge();
        assert_eq!(
            bridge.vocab_size(),
            13,
            "vocab_size should match the number of defined tokens"
        );
    }

    /// Smoke test: asking for the bytes of a special token must not panic.
    /// The returned value is deliberately not inspected.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_token_to_bytes_special_token() {
        let bridge = minimal_bridge();
        let _bytes = bridge.token_to_bytes(0);
    }

    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_vocab_bytes_is_sorted() {
        let bridge = minimal_bridge();
        let pairs = bridge.vocab_bytes();
        for window in pairs.windows(2) {
            assert!(
                window[0].0 <= window[1].0,
                "vocab_bytes should be sorted by token id, got {} > {}",
                window[0].0,
                window[1].0
            );
        }
    }

    /// Well-formed JSON that does not describe a tokenizer must be rejected.
    #[cfg(any(feature = "tokenizer-onig", feature = "tokenizer-wasm"))]
    #[test]
    fn test_from_bytes_invalid_json_structure_errors() {
        let result = TokenizerBridge::from_bytes(b"{\"not\": \"a tokenizer\"}");
        assert!(result.is_err(), "non-tokenizer JSON should return an error");
    }
}