1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
//! Tokenizer bridge wrapping HuggingFace tokenizers.
//!
//! On WASM targets, the `tokenizers` crate is unavailable (requires native C extensions).
//! A stub implementation is provided instead: its fallible operations always return an
//! error, and `vocab_size` reports 0.
use crate::error::{RuntimeError, RuntimeResult};
/// Bridge type exposing a target-independent tokenizer API.
///
/// * On non-WASM targets it owns a `tokenizers::Tokenizer` and forwards calls to it.
/// * On `wasm32` targets it carries no tokenizer: `from_file`, `encode` and `decode`
///   return a `RuntimeError::Tokenizer` error, and `vocab_size` reports 0.
pub struct TokenizerBridge {
    /// Placeholder field so the struct is non-empty when no tokenizer is compiled in.
    #[cfg(target_arch = "wasm32")]
    _phantom: (),
    /// The wrapped tokenizer that performs the actual work on native targets.
    #[cfg(not(target_arch = "wasm32"))]
    inner: tokenizers::Tokenizer,
}
impl TokenizerBridge {
/// Load a tokenizer from a JSON file.
#[cfg(not(target_arch = "wasm32"))]
pub fn from_file(path: &str) -> RuntimeResult<Self> {
let inner = tokenizers::Tokenizer::from_file(path)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))?;
Ok(Self { inner })
}
/// Load a tokenizer from a JSON file.
///
/// On WASM targets, always returns an error since the tokenizers library
/// requires native code not available in WebAssembly.
#[cfg(target_arch = "wasm32")]
pub fn from_file(_path: &str) -> RuntimeResult<Self> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
/// Encode text to token IDs.
#[cfg(not(target_arch = "wasm32"))]
pub fn encode(&self, text: &str) -> RuntimeResult<Vec<u32>> {
let encoding = self
.inner
.encode(text, false)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))?;
Ok(encoding.get_ids().to_vec())
}
/// Encode text to token IDs.
///
/// On WASM targets, always returns an error.
#[cfg(target_arch = "wasm32")]
pub fn encode(&self, _text: &str) -> RuntimeResult<Vec<u32>> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
/// Decode token IDs to text.
#[cfg(not(target_arch = "wasm32"))]
pub fn decode(&self, ids: &[u32]) -> RuntimeResult<String> {
self.inner
.decode(ids, true)
.map_err(|e| RuntimeError::Tokenizer(e.to_string()))
}
/// Decode token IDs to text.
///
/// On WASM targets, always returns an error.
#[cfg(target_arch = "wasm32")]
pub fn decode(&self, _ids: &[u32]) -> RuntimeResult<String> {
Err(RuntimeError::Tokenizer(
"tokenizers library is not available on wasm32 targets".to_string(),
))
}
/// Get the vocabulary size.
#[cfg(not(target_arch = "wasm32"))]
pub fn vocab_size(&self) -> usize {
self.inner.get_vocab_size(true)
}
/// Get the vocabulary size.
///
/// On WASM targets, returns 0 since no tokenizer is available.
#[cfg(target_arch = "wasm32")]
pub fn vocab_size(&self) -> usize {
0
}
/// Get the internal tokenizer reference.
#[cfg(not(target_arch = "wasm32"))]
pub fn inner(&self) -> &tokenizers::Tokenizer {
&self.inner
}
}