use autocxx::prelude::*;

use crate::{CxxUniquePtr, FFITokenizerInfo, VocabType, cxx_utils};

type StopTokenIds = Option<Box<[i32]>>;

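/// Safe wrapper around the C++ `FFITokenizerInfo` object, owning it through a
/// `UniquePtr` and exposing its accessors with Rust types.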
pub struct TokenizerInfo {
    inner: CxxUniquePtr<FFITokenizerInfo>,
}

impl TokenizerInfo {
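    /// Builds a `TokenizerInfo` from an encoded vocabulary, using the
    /// vocabulary's length as the vocab size.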
    pub fn new<T: AsRef<str>>(
        encoded_vocab: &[T],
        vocab_type: VocabType,
        stop_token_ids: &StopTokenIds,
        add_prefix_space: bool,
    ) -> Self {
        Self::new_with_vocab_size(
            encoded_vocab,
            vocab_type,
            Some(encoded_vocab.len()),
            stop_token_ids,
            add_prefix_space,
        )
    }

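    /// Builds a `TokenizerInfo`, optionally forwarding an explicit vocab size
    /// to the C++ side; with `None`, the C++ constructor picks the default.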
    pub fn new_with_vocab_size<T: AsRef<str>>(
        encoded_vocab: &[T],
        vocab_type: VocabType,
        vocab_size: Option<usize>,
        stop_token_ids: &StopTokenIds,
        add_prefix_space: bool,
    ) -> Self {
        let mut cxx_vec = cxx_utils::new_string_vector();
        {
            let mut cxx_vec_pin = cxx_vec.pin_mut();
            cxx_utils::string_vec_reserve(cxx_vec_pin.as_mut(), encoded_vocab.len());
            for string in encoded_vocab.iter() {
                let bytes = string.as_ref().as_bytes();
                // Push the raw bytes so each token is copied verbatim into
                // the std::string vector.
                unsafe {
                    cxx_utils::string_vec_push_bytes(
                        cxx_vec_pin.as_mut(),
                        bytes.as_ptr() as *const i8,
                        bytes.len(),
                    );
                }
            }
        }
        let (has_vocab_size, vocab_size_i32) = match vocab_size {
            Some(sz) => (true, sz as i32),
            None => (false, 0i32),
        };

        let (has_stop_ids, stop_ptr, stop_len) = match stop_token_ids.as_ref() {
            Some(slice) if !slice.is_empty() => (true, slice.as_ptr(), slice.len()),
            _ => (false, std::ptr::null(), 0usize),
        };

        // SAFETY: `stop_ptr`/`stop_len` describe a live slice (or are
        // null/0), and the string vector outlives the call.
        let ffi_obj = unsafe {
            cxx_utils::make_tokenizer_info(
                cxx_vec.as_ref().unwrap(),
                vocab_type,
                has_vocab_size,
                vocab_size_i32,
                has_stop_ids,
                stop_ptr,
                stop_len,
                add_prefix_space,
            )
        };

        Self { inner: ffi_obj }
    }

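    /// Rebuilds a `TokenizerInfo` from a previously decoded vocabulary (as
    /// raw byte strings) plus the metadata string produced by
    /// [`Self::dump_metadata`].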
    pub fn from_vocab_and_metadata_bytes<I, B>(
        encoded_vocab: I,
        metadata: &str,
    ) -> Self
    where
        I: IntoIterator<Item = B>,
        B: AsRef<[u8]>,
    {
        let mut cxx_vec = cxx_utils::new_string_vector();
        {
            let mut cxx_vec_pin = cxx_vec.pin_mut();
            for string in encoded_vocab.into_iter() {
                let bytes = string.as_ref();
                unsafe {
                    cxx_utils::string_vec_push_bytes(
                        cxx_vec_pin.as_mut(),
                        bytes.as_ptr() as *const i8,
                        bytes.len(),
                    );
                }
            }
        }

        cxx::let_cxx_string!(metadata_cxx = metadata);
        let ffi_ptr = FFITokenizerInfo::FromVocabAndMetadata(
            cxx_vec.as_ref().unwrap(),
            &metadata_cxx,
        )
        .within_unique_ptr();
        Self { inner: ffi_ptr }
    }

    pub fn vocab_type(&self) -> VocabType {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetVocabType()
    }

    pub fn vocab_size(&self) -> usize {
        usize::try_from(
            self.inner
                .as_ref()
                .expect("FFITokenizerInfo UniquePtr was null")
                .GetVocabSize()
                .0,
        )
        .expect("GetVocabSize returned a negative value")
    }

    pub fn add_prefix_space(&self) -> bool {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetAddPrefixSpace()
    }

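    /// Returns the decoded vocabulary as raw byte strings; tokens are not
    /// required to be valid UTF-8.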
    pub fn decoded_vocab(&self) -> Box<[Box<[u8]>]> {
        let cxx_vec = self
            .inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetDecodedVocab();
        let mut result: Vec<Box<[u8]>> = Vec::with_capacity(cxx_vec.len());
        for cxx_string in cxx_vec.iter() {
            // Copy the bytes directly; going through `to_string_lossy` would
            // replace non-UTF-8 sequences and corrupt byte-level tokens.
            result.push(cxx_string.as_bytes().to_vec().into_boxed_slice());
        }
        result.into_boxed_slice()
    }

    pub fn stop_token_ids(&self) -> Box<[i32]> {
        let cxx_vec = self
            .inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetStopTokenIds();
        cxx_vec.iter().copied().collect::<Vec<_>>().into_boxed_slice()
    }

    pub fn special_token_ids(&self) -> Box<[i32]> {
        let cxx_vec = self
            .inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .GetSpecialTokenIds();
        cxx_vec.iter().copied().collect::<Vec<_>>().into_boxed_slice()
    }

    pub fn dump_metadata(&self) -> String {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .DumpMetadata()
            .to_string()
    }

    pub fn serialize_json(&self) -> String {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
            .SerializeJSON()
            .to_string()
    }

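    /// Deserializes a `TokenizerInfo` from the JSON produced by
    /// [`Self::serialize_json`], returning the C++ error message on failure.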
    pub fn deserialize_json(json: &str) -> Result<Self, String> {
        cxx::let_cxx_string!(json_cxx = json);
        cxx::let_cxx_string!(error_out_cxx = "");
        // SAFETY: `error_out_cxx` is a live, pinned C++ string for the
        // duration of the call; the helper writes any error message into it.
        let uptr = unsafe {
            cxx_utils::tokenizer_info_deserialize_json_or_error(
                &json_cxx,
                error_out_cxx.as_mut().get_unchecked_mut(),
            )
        };
        if uptr.is_null() {
            return Err(error_out_cxx.to_string());
        }
        Ok(Self { inner: uptr })
    }

    pub(crate) fn ffi_ref(&self) -> &FFITokenizerInfo {
        self.inner
            .as_ref()
            .expect("FFITokenizerInfo UniquePtr was null")
    }

    pub(crate) fn from_unique_ptr(inner: cxx::UniquePtr<FFITokenizerInfo>) -> Self {
        Self { inner }
    }
}

#[cfg(feature = "tokenizers")]
impl TokenizerInfo {
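    /// Collects the tokenizer vocabulary (including added tokens) sorted by
    /// token id. Note that slice positions equal token ids only when the id
    /// space is contiguous.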
    #[inline]
    fn extract_ordered_vocab(tokenizer: &tokenizers::Tokenizer) -> Box<[String]> {
        let mut pairs: Vec<(usize, String)> = tokenizer
            .get_vocab(true)
            .into_iter()
            .map(|(tok, id)| (id as usize, tok))
            .collect();
        pairs.sort_by_key(|(id, _)| *id);
        pairs
            .into_iter()
            .map(|(_, tok)| tok)
            .collect::<Vec<_>>()
            .into_boxed_slice()
    }

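    /// Heuristic: a vocabulary with no SentencePiece marker (`▁`), no
    /// byte-level marker (`Ġ`), and no byte-fallback tokens (`<0xXX>`) is
    /// assumed to be tiktoken-style.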
    pub fn _is_tiktoken_tokenizer(tokenizer: &tokenizers::Tokenizer) -> bool {
        let vocab = tokenizer.get_vocab(true);
        let mut has_sentencepiece_marker = false;
        let mut has_bytelevel_marker = false;
        let mut has_bytefallback_marker = false;
        for token in vocab.keys() {
            if !has_sentencepiece_marker && token.contains('▁') {
                has_sentencepiece_marker = true;
            }
            if !has_bytelevel_marker && token.contains('Ġ') {
                has_bytelevel_marker = true;
            }
            if !has_bytefallback_marker
                && token.starts_with("<0x")
                && token.ends_with('>')
            {
                has_bytefallback_marker = true;
            }
            if has_sentencepiece_marker
                || has_bytelevel_marker
                || has_bytefallback_marker
            {
                break;
            }
        }
        !(has_sentencepiece_marker
            || has_bytelevel_marker
            || has_bytefallback_marker)
    }

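    /// Heuristic: treats the tokenizer as SentencePiece-style if any token
    /// contains the `▁` whitespace marker.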
    pub fn _is_sentencepiece_tokenizer(tokenizer: &tokenizers::Tokenizer) -> bool {
        let vocab = tokenizer.get_vocab(true);
        vocab.keys().any(|tok| tok.contains('▁'))
    }

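    /// Builds a `TokenizerInfo` from a `tokenizers::Tokenizer` with explicit
    /// vocab type, vocab size, stop tokens, and prefix-space behavior.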
    pub fn from_tokenizers_with_options(
        tokenizer: &tokenizers::Tokenizer,
        vocab_type: VocabType,
        vocab_size: Option<usize>,
        stop_token_ids: Option<&[i32]>,
        add_prefix_space: bool,
    ) -> Self {
        let ordered = Self::extract_ordered_vocab(tokenizer);
        let stop: Option<Box<[i32]>> =
            stop_token_ids.map(|s| s.to_vec().into_boxed_slice());
        Self::new_with_vocab_size(
            &ordered,
            vocab_type,
            vocab_size,
            &stop,
            add_prefix_space,
        )
    }

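    /// Convenience constructor: raw vocab type, default vocab size, no stop
    /// tokens, and no prefix space.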
    pub fn from_tokenizers_simple(tokenizer: &tokenizers::Tokenizer) -> Self {
        Self::from_tokenizers_with_options(
            tokenizer,
            VocabType::RAW,
            None,
            None,
            false,
        )
    }

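    /// Infers the vocab type and prefix-space behavior from marker tokens in
    /// the vocabulary: byte-fallback tokens (`<0xXX>`) win over the
    /// SentencePiece marker (`▁`), which wins over the byte-level marker (`Ġ`).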
    pub fn from_huggingface(
        tokenizer: &tokenizers::Tokenizer,
        vocab_size: Option<usize>,
        stop_token_ids: Option<&[i32]>,
    ) -> Self {
        let vocab = tokenizer.get_vocab(true);
        let has_bytefallback_marker =
            vocab.keys().any(|t| t.starts_with("<0x") && t.ends_with('>'));
        let has_sentencepiece_marker = vocab.keys().any(|t| t.contains('▁'));
        let has_bytelevel_marker = vocab.keys().any(|t| t.contains('Ġ'));

        let (vocab_type, add_prefix_space) = if has_bytefallback_marker {
            (VocabType::BYTE_FALLBACK, true)
        } else if has_sentencepiece_marker {
            (VocabType::RAW, true)
        } else if has_bytelevel_marker {
            (VocabType::BYTE_LEVEL, false)
        } else {
            (VocabType::RAW, false)
        };

        Self::from_tokenizers_with_options(
            tokenizer,
            vocab_type,
            vocab_size,
            stop_token_ids,
            add_prefix_space,
        )
    }
}
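
// A minimal round-trip sketch (not from the original source): it assumes the
// linked C++ library accepts an arbitrary small vocabulary, and exercises
// construction, JSON serialization, and deserialization.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn serialize_roundtrip_smoke() {
        let vocab = ["<s>", "</s>", "a", "b"];
        let stop: StopTokenIds = Some(vec![1].into_boxed_slice());
        let info = TokenizerInfo::new(&vocab, VocabType::RAW, &stop, false);
        assert_eq!(info.vocab_size(), 4);
        assert_eq!(&*info.stop_token_ids(), &[1]);

        // Round-trip through JSON and compare the re-serialized form.
        let json = info.serialize_json();
        let restored = TokenizerInfo::deserialize_json(&json)
            .expect("deserialization should succeed on freshly serialized JSON");
        assert_eq!(restored.serialize_json(), json);
    }
}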