xgrammar/tokenizer_info.rs

use autocxx::prelude::*;

use crate::{CxxUniquePtr, FFITokenizerInfo, VocabType, cxx_utils};

type StopTokenIds = Option<Box<[i32]>>;

/// TokenizerInfo contains the vocabulary, its type, and metadata used by
/// grammar-guided generation.
///
/// Notes:
/// - Tokens may be encoded differently depending on `VocabType` (e.g. ByteFallback
///   uses "<0x1B>", ByteLevel uses unicode mappings). This wrapper exposes the
///   decoded vocabulary as raw bytes, matching the original text form, via
///   `decoded_vocab`.
/// - Some models pad their vocab size to a multiple of 32 or similar. If your
///   model's vocab size differs from `encoded_vocab.len()`, use
///   `new_with_vocab_size` to pass the model's vocab size so bitmask sizes are
///   computed correctly.
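///
/// # Example
///
/// An illustrative sketch of basic construction and the accessors (not a
/// compiled doctest; the crate path `xgrammar` and the toy vocabulary are
/// assumptions):
///
/// ```ignore
/// use xgrammar::{TokenizerInfo, VocabType};
///
/// let encoded_vocab = ["<unk>", "<s>", "</s>", "hi", "there"];
/// let stop_token_ids: Option<Box<[i32]>> = Some(vec![2].into_boxed_slice());
/// let info = TokenizerInfo::new(
///     &encoded_vocab,
///     VocabType::RAW,
///     &stop_token_ids,
///     true,
/// );
/// assert_eq!(info.vocab_size(), encoded_vocab.len());
/// assert_eq!(&*info.stop_token_ids(), &[2]);
/// ```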
pub struct TokenizerInfo {
    inner: CxxUniquePtr<FFITokenizerInfo>,
}

impl TokenizerInfo {
    /// Construct a TokenizerInfo with vocab size derived from `encoded_vocab`.
    ///
    /// If the model's vocab size differs from `encoded_vocab.len()`, prefer
    /// `new_with_vocab_size`.
    pub fn new<T: AsRef<str>>(
        encoded_vocab: &[T],
        vocab_type: VocabType,
        stop_token_ids: &StopTokenIds,
        add_prefix_space: bool,
    ) -> Self {
        Self::new_with_vocab_size(
            encoded_vocab,
            vocab_type,
            Some(encoded_vocab.len()),
            stop_token_ids,
            add_prefix_space,
        )
    }

    /// Construct a TokenizerInfo with an explicit model `vocab_size`.
    ///
    /// Use this when the model's vocab size (e.g., padded to a multiple of 32)
    /// differs from the tokenizer's `encoded_vocab.len()`. Indices in the range
    /// `[encoded_vocab.len(), vocab_size)` are treated as special/reserved.
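    ///
    /// # Example
    ///
    /// An illustrative sketch (not a compiled doctest); the crate path and the
    /// padded size of 8 are assumptions:
    ///
    /// ```ignore
    /// use xgrammar::{TokenizerInfo, VocabType};
    ///
    /// let encoded_vocab = ["<unk>", "<s>", "</s>", "hello", "world"];
    /// // The model reports 8 embedding rows even though the tokenizer only
    /// // defines 5 tokens; ids 5..8 are treated as special/reserved.
    /// let info = TokenizerInfo::new_with_vocab_size(
    ///     &encoded_vocab,
    ///     VocabType::RAW,
    ///     Some(8),
    ///     &None,
    ///     false,
    /// );
    /// assert_eq!(info.vocab_size(), 8);
    /// ```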
    pub fn new_with_vocab_size<T: AsRef<str>>(
        encoded_vocab: &[T],
        vocab_type: VocabType,
        vocab_size: Option<usize>,
        stop_token_ids: &StopTokenIds,
        add_prefix_space: bool,
    ) -> Self {
        // Copy the encoded vocabulary into a C++ std::vector<std::string>.
        let mut cxx_vec = cxx_utils::new_string_vector();
        {
            let mut cxx_vec_pin = cxx_vec.pin_mut();
            cxx_utils::string_vec_reserve(
                cxx_vec_pin.as_mut(),
                encoded_vocab.len(),
            );
            for string in encoded_vocab.iter() {
                let bytes = string.as_ref().as_bytes();
                // SAFETY: `bytes` is a valid pointer/length pair for the
                // duration of the call.
                unsafe {
                    cxx_utils::string_vec_push_bytes(
                        cxx_vec_pin.as_mut(),
                        bytes.as_ptr() as *const i8,
                        bytes.len(),
                    );
                }
            }
        }

        // Flatten the optional arguments into (flag, value) pairs for the FFI.
        let (has_vocab_size, vocab_size_i32) = match vocab_size {
            Some(sz) => (true, sz as i32),
            None => (false, 0i32),
        };

        let (has_stop_ids, stop_ptr, stop_len) = match stop_token_ids.as_ref() {
            Some(slice) if !slice.is_empty() => {
                (true, slice.as_ptr(), slice.len())
            },
            _ => (false, std::ptr::null(), 0usize),
        };

        // SAFETY: `stop_ptr`/`stop_len` either describe a live slice or are
        // (null, 0) with `has_stop_ids == false`.
        let inner = unsafe {
            cxx_utils::make_tokenizer_info(
                cxx_vec.as_ref().unwrap(),
                vocab_type,
                has_vocab_size,
                vocab_size_i32,
                has_stop_ids,
                stop_ptr,
                stop_len,
                add_prefix_space,
            )
        };

        Self { inner }
    }

    /// Construct TokenizerInfo from encoded vocab (bytes) and a metadata JSON
    /// string produced by `dump_metadata`.
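    ///
    /// # Example
    ///
    /// A minimal sketch of a dump/rebuild round trip using this type's own API
    /// (not a compiled doctest; the crate path is assumed):
    ///
    /// ```ignore
    /// use xgrammar::{TokenizerInfo, VocabType};
    ///
    /// let vocab = ["<s>", "</s>", "hello"];
    /// let original = TokenizerInfo::new(&vocab, VocabType::RAW, &None, false);
    ///
    /// // Persist the metadata alongside the raw vocabulary bytes...
    /// let metadata = original.dump_metadata();
    ///
    /// // ...and later rebuild the TokenizerInfo from bytes + metadata.
    /// let rebuilt = TokenizerInfo::from_vocab_and_metadata_bytes(
    ///     vocab.iter().map(|s| s.as_bytes()),
    ///     &metadata,
    /// );
    /// assert_eq!(rebuilt.vocab_size(), original.vocab_size());
    /// ```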
    pub fn from_vocab_and_metadata_bytes<I, B>(
        encoded_vocab: I,
        metadata: &str,
    ) -> Self
    where
        I: IntoIterator<Item = B>,
        B: AsRef<[u8]>,
    {
        // Copy the encoded vocabulary into a C++ std::vector<std::string>.
        let mut cxx_vec = cxx_utils::new_string_vector();
        {
            let mut cxx_vec_pin = cxx_vec.pin_mut();
            for string in encoded_vocab.into_iter() {
                let bytes = string.as_ref();
                // SAFETY: `bytes` is a valid pointer/length pair for the
                // duration of the call.
                unsafe {
                    cxx_utils::string_vec_push_bytes(
                        cxx_vec_pin.as_mut(),
                        bytes.as_ptr() as *const i8,
                        bytes.len(),
                    );
                }
            }
        }

        cxx::let_cxx_string!(metadata_cxx = metadata);
        let ffi_ptr = FFITokenizerInfo::FromVocabAndMetadata(
            cxx_vec.as_ref().unwrap(),
            &metadata_cxx,
        )
        .within_unique_ptr();
        Self { inner: ffi_ptr }
    }

    /// The type of the vocabulary.
    pub fn vocab_type(&self) -> VocabType {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetVocabType()
    }

    /// The size of the vocabulary.
    pub fn vocab_size(&self) -> usize {
        usize::try_from(
            self.inner
                .as_ref()
                .expect("FFITokenizerInfo UniquePtr was null")
                .GetVocabSize()
                .0,
        )
        .expect("GetVocabSize returned a negative value")
    }

    /// Whether the tokenizer prepends a space to the text during tokenization.
    pub fn add_prefix_space(&self) -> bool {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetAddPrefixSpace()
    }

    /// The decoded vocabulary of the tokenizer. This converts tokens in the
    /// LLM's vocabulary back to their original byte form (e.g., ByteFallback
    /// "<0x1B>" -> the escape byte 0x1B). Entries are returned as raw bytes
    /// because decoded tokens are not guaranteed to be valid UTF-8.
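    ///
    /// # Example
    ///
    /// A hedged sketch of the byte-fallback decoding described above (not a
    /// compiled doctest; the crate path and toy vocabulary are assumptions):
    ///
    /// ```ignore
    /// use xgrammar::{TokenizerInfo, VocabType};
    ///
    /// let vocab = ["<unk>", "<0x1B>", "▁hello"];
    /// let info = TokenizerInfo::new(&vocab, VocabType::BYTE_FALLBACK, &None, true);
    ///
    /// let decoded = info.decoded_vocab();
    /// // "<0x1B>" is expected to decode back to the single escape byte 0x1B.
    /// assert_eq!(&*decoded[1], &[0x1Bu8]);
    /// ```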
    pub fn decoded_vocab(&self) -> Box<[Box<[u8]>]> {
        let cxx_vec = self
            .inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetDecodedVocab();
        let mut result: Vec<Box<[u8]>> = Vec::with_capacity(cxx_vec.len());
        for cxx_string in cxx_vec.iter() {
            // Copy the raw bytes; decoded tokens may not be valid UTF-8, so a
            // lossy string conversion would corrupt them.
            result.push(cxx_string.as_bytes().to_vec().into_boxed_slice());
        }
        result.into_boxed_slice()
    }

    /// Stop token ids.
    pub fn stop_token_ids(&self) -> Box<[i32]> {
        let cxx_vec = self
            .inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetStopTokenIds();
        cxx_vec.iter().copied().collect::<Vec<_>>().into_boxed_slice()
    }

    /// The special token ids. Special tokens include control tokens, reserved
    /// tokens, padded tokens, etc. They are detected automatically from the
    /// vocabulary.
    pub fn special_token_ids(&self) -> Box<[i32]> {
        let cxx_vec = self
            .inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetSpecialTokenIds();
        cxx_vec.iter().copied().collect::<Vec<_>>().into_boxed_slice()
    }

    /// Dump the metadata of the tokenizer to a JSON string. It can be used
    /// together with the vocabulary to reconstruct the tokenizer info via
    /// `from_vocab_and_metadata_bytes`.
    pub fn dump_metadata(&self) -> String {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .DumpMetadata()
            .to_string()
    }

    /// Serialize the tokenizer info to a JSON string.
    pub fn serialize_json(&self) -> String {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .SerializeJSON()
            .to_string()
    }

    /// Deserialize a `TokenizerInfo` from a JSON string.
    ///
    /// Returns
    /// - `Ok(TokenizerInfo)` on success
    /// - `Err(String)` when deserialization fails due to any of the following:
    ///   - invalid JSON syntax
    ///   - schema/format mismatch with `TokenizerInfo` serialization
    ///   - serialization version mismatch (via the `__VERSION__` field)
    ///
    /// The error string mirrors the C++ exception message.
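    ///
    /// # Example
    ///
    /// Illustrative round trip through `serialize_json` (not a compiled
    /// doctest; the crate path is assumed):
    ///
    /// ```ignore
    /// use xgrammar::{TokenizerInfo, VocabType};
    ///
    /// let info = TokenizerInfo::new(&["<s>", "</s>", "hi"], VocabType::RAW, &None, false);
    /// let json = info.serialize_json();
    ///
    /// let restored = TokenizerInfo::deserialize_json(&json)
    ///     .expect("round-tripped JSON should deserialize");
    /// assert_eq!(restored.vocab_size(), info.vocab_size());
    ///
    /// // Malformed input surfaces the C++ error message instead of panicking.
    /// assert!(TokenizerInfo::deserialize_json("not json").is_err());
    /// ```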
    pub fn deserialize_json(json: &str) -> Result<Self, String> {
        cxx::let_cxx_string!(json_cxx = json);
        cxx::let_cxx_string!(error_out_cxx = "");
        // SAFETY: `error_out_cxx` is a pinned, stack-allocated CxxString that
        // outlives the call; the helper writes the error message into it.
        let uptr = unsafe {
            cxx_utils::tokenizer_info_deserialize_json_or_error(
                &json_cxx,
                error_out_cxx.as_mut().get_unchecked_mut(),
            )
        };
        if uptr.is_null() {
            return Err(error_out_cxx.to_string());
        }
        Ok(Self { inner: uptr })
    }

    pub(crate) fn ffi_ref(&self) -> &FFITokenizerInfo {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
    }

    pub(crate) fn from_unique_ptr(inner: cxx::UniquePtr<FFITokenizerInfo>) -> Self {
        Self { inner }
    }

    // No from_pinned_ffi is needed with UniquePtr ownership.
}

// No explicit Drop impl is needed: dropping the owning UniquePtr releases the
// underlying C++ FFITokenizerInfo.

// ---- Hugging Face tokenizers integration (feature-gated) ----
//
// The following helpers mirror Python's TokenizerInfo utilities:
// - _is_tiktoken_tokenizer
// - _is_sentencepiece_tokenizer
// - from_huggingface
//
// They are adapted to Rust and the `tokenizers` crate. Detection is based on
// heuristics over the vocabulary content, since Rust does not expose the
// Python runtime types.

#[cfg(feature = "tokenizers")]
impl TokenizerInfo {
    #[inline]
    fn extract_ordered_vocab(tokenizer: &tokenizers::Tokenizer) -> Box<[String]> {
        // Sort the (token, id) map by id so the slice index matches the token
        // id. This assumes the ids are contiguous, which holds for typical
        // Hugging Face tokenizers.
        let mut pairs: Vec<(usize, String)> = tokenizer
            .get_vocab(true)
            .into_iter()
            .map(|(tok, id)| (id as usize, tok))
            .collect();
        pairs.sort_by_key(|(id, _)| *id);
        pairs
            .into_iter()
            .map(|(_, tok)| tok)
            .collect::<Vec<_>>()
            .into_boxed_slice()
    }

    /// Heuristically detect whether a tokenizer resembles a tiktoken-style tokenizer.
    ///
    /// In Python this checks `isinstance(tokenizer.tokenizer, tiktoken.Encoding)` or whether
    /// the vocab filename contains "tiktoken". In Rust we do not have those runtime types,
    /// so we approximate: if the vocabulary does NOT contain typical markers of
    /// SentencePiece ("▁"), byte-level GPT-2 ("Ġ"), or ByteFallback (tokens like "<0x1B>"),
    /// we consider it RAW (tiktoken-like).
    pub fn _is_tiktoken_tokenizer(tokenizer: &tokenizers::Tokenizer) -> bool {
        let vocab = tokenizer.get_vocab(true);
        let mut has_sentencepiece_marker = false; // '▁'
        let mut has_bytelevel_marker = false; // 'Ġ'
        let mut has_bytefallback_marker = false; // tokens like "<0x..>"
        for token in vocab.keys() {
            if !has_sentencepiece_marker && token.contains('▁') {
                has_sentencepiece_marker = true;
            }
            if !has_bytelevel_marker && token.contains('Ġ') {
                has_bytelevel_marker = true;
            }
            if !has_bytefallback_marker
                && token.starts_with("<0x")
                && token.ends_with('>')
            {
                has_bytefallback_marker = true;
            }
            // Stop scanning as soon as any marker has been found.
            if has_sentencepiece_marker
                || has_bytelevel_marker
                || has_bytefallback_marker
            {
                break;
            }
        }
        !(has_sentencepiece_marker
            || has_bytelevel_marker
            || has_bytefallback_marker)
    }

    /// Heuristically detect whether a tokenizer is SentencePiece-based.
    ///
    /// In Python this checks for a `sentencepiece.SentencePieceProcessor`. Here we look for
    /// the typical SentencePiece marker "▁" in the vocabulary. This is a best-effort
    /// heuristic and may not be correct for all models.
    pub fn _is_sentencepiece_tokenizer(
        tokenizer: &tokenizers::Tokenizer,
    ) -> bool {
        let vocab = tokenizer.get_vocab(true);
        vocab.keys().any(|tok| tok.contains('▁'))
    }

    /// Construct from a `tokenizers::Tokenizer` with explicit options, preserving
    /// tokenizer indexing.
    ///
    /// This matches Python's constructor path where `encoded_vocab` is built in id order and
    /// `vocab_size` may be larger than the tokenizer's vocab (model padding), with special ids
    /// reserved in the tail range.
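    ///
    /// # Example
    ///
    /// A sketch assuming the `tokenizers` feature, a local `tokenizer.json`,
    /// and an illustrative padded vocab size and EOS id (not a compiled
    /// doctest; the crate path is assumed):
    ///
    /// ```ignore
    /// use xgrammar::{TokenizerInfo, VocabType};
    ///
    /// let tokenizer = tokenizers::Tokenizer::from_file("tokenizer.json").unwrap();
    /// let info = TokenizerInfo::from_tokenizers_with_options(
    ///     &tokenizer,
    ///     VocabType::BYTE_LEVEL,
    ///     Some(32_064),   // model vocab padded past the tokenizer vocab
    ///     Some(&[2]),     // explicit EOS id
    ///     false,
    /// );
    /// assert_eq!(info.vocab_size(), 32_064);
    /// ```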
    pub fn from_tokenizers_with_options(
        tokenizer: &tokenizers::Tokenizer,
        vocab_type: VocabType,
        vocab_size: Option<usize>,
        stop_token_ids: Option<&[i32]>,
        add_prefix_space: bool,
    ) -> Self {
        let ordered = Self::extract_ordered_vocab(tokenizer);
        let stop: Option<Box<[i32]>> =
            stop_token_ids.map(|s| s.to_vec().into_boxed_slice());
        Self::new_with_vocab_size(
            &ordered,
            vocab_type,
            vocab_size,
            &stop,
            add_prefix_space,
        )
    }

    /// Convenience constructor: RAW vocab, detected size, no stop ids, no prefix space.
    pub fn from_tokenizers_simple(tokenizer: &tokenizers::Tokenizer) -> Self {
        Self::from_tokenizers_with_options(
            tokenizer,
            VocabType::RAW,
            None,
            None,
            false,
        )
    }

    /// Construct the tokenizer info from a Hugging Face `tokenizers::Tokenizer`.
    ///
    /// This mirrors Python's `TokenizerInfo.from_huggingface` and automatically detects
    /// the vocab type and `add_prefix_space` using vocabulary heuristics. Provide
    /// `vocab_size` if the model's vocab differs from the tokenizer's (padding or reduced
    /// vocab). Pass `stop_token_ids` to override auto-detection (the Rust `tokenizers`
    /// crate does not expose an EOS id consistently across models).
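    ///
    /// # Example
    ///
    /// A hedged sketch: load a Hugging Face `tokenizer.json` and let the
    /// heuristics pick the vocab type (not a compiled doctest; the crate path,
    /// file name, and stop id are assumptions):
    ///
    /// ```ignore
    /// use xgrammar::TokenizerInfo;
    ///
    /// let tokenizer = tokenizers::Tokenizer::from_file("tokenizer.json").unwrap();
    /// // Pass the model's (possibly padded) vocab size and its EOS id as needed.
    /// let info = TokenizerInfo::from_huggingface(&tokenizer, None, Some(&[2]));
    /// println!("detected vocab size: {}", info.vocab_size());
    /// ```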
    pub fn from_huggingface(
        tokenizer: &tokenizers::Tokenizer,
        vocab_size: Option<usize>,
        stop_token_ids: Option<&[i32]>,
    ) -> Self {
        // Heuristics for vocab type and prefix-space behavior.
        let vocab = tokenizer.get_vocab(true);
        let has_bytefallback_marker =
            vocab.keys().any(|t| t.starts_with("<0x") && t.ends_with('>'));
        let has_sentencepiece_marker = vocab.keys().any(|t| t.contains('▁'));
        let has_bytelevel_marker = vocab.keys().any(|t| t.contains('Ġ'));

        let (vocab_type, add_prefix_space) = if has_bytefallback_marker {
            (VocabType::BYTE_FALLBACK, true)
        } else if has_sentencepiece_marker {
            // Some SentencePiece tokenizers can still be RAW; however, Python defaults
            // to add_prefix_space=true for SentencePiece. Vocabularies that also contain
            // "<0x..>" tokens were already categorized as BYTE_FALLBACK above.
            (VocabType::RAW, true)
        } else if has_bytelevel_marker {
            (VocabType::BYTE_LEVEL, false)
        } else {
            (VocabType::RAW, false)
        };

        // Build with explicit options, preserving token id ordering.
        Self::from_tokenizers_with_options(
            tokenizer,
            vocab_type,
            vocab_size,
            stop_token_ids,
            add_prefix_space,
        )
    }
417}