1use std::{
2 collections::HashMap,
3 path::{Path, PathBuf},
4};
5
6use anyhow::{Error, Result};
7use base64::{engine::general_purpose::STANDARD, Engine as _};
8use rustc_hash::FxHashMap;
9use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
10
11use crate::{
12 chat_template::{
13 load_chat_template_from_file, ChatTemplateContentFormat, ChatTemplateParams,
14 ChatTemplateState,
15 },
16 factory::discover_chat_template_in_dir,
17 traits::{Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait},
18};
19
/// Regex split pattern handed to `CoreBPE::new` when a tokenizer is built from
/// a tiktoken vocabulary file on disk.
///
/// NOTE(review): the name suggests this mirrors the cl100k_base pre-tokenization
/// pattern; confirm it is appropriate for every file-based vocabulary loaded here.
const CL100K_BASE_PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";

/// Integer rank stored as the value of the BPE encoder map and of the
/// special-token map passed to `CoreBPE::new`.
type Rank = u32;
31
/// Settings extracted from a `tokenizer_config.json` file.
///
/// The `Default` value (no special tokens, no added tokens, no chat template)
/// is used when the config file does not exist.
#[derive(Default)]
struct TiktokenConfig {
    /// Named special tokens (`bos_token`, `eos_token`, ...) read from the config.
    special_tokens: SpecialTokens,
    /// Token text -> token id, taken from the `added_tokens_decoder` section.
    added_tokens: HashMap<String, TokenIdType>,
    /// Value of the `chat_template` key when it is present and a string.
    chat_template: Option<String>,
}
44
45fn load_tiktoken_config(config_path: &Path) -> Result<TiktokenConfig> {
47 let content = std::fs::read_to_string(config_path)?;
48 let config: serde_json::Value = serde_json::from_str(&content)?;
49
50 let added_tokens = parse_added_tokens_decoder(&config);
51 let special_tokens = parse_special_tokens(&config);
52
53 let chat_template = config
54 .get("chat_template")
55 .and_then(|v| v.as_str())
56 .map(String::from);
57
58 Ok(TiktokenConfig {
59 special_tokens,
60 added_tokens,
61 chat_template,
62 })
63}
64
65fn parse_added_tokens_decoder(config: &serde_json::Value) -> HashMap<String, TokenIdType> {
69 let mut tokens = HashMap::new();
70 if let Some(added) = config
71 .get("added_tokens_decoder")
72 .and_then(|v| v.as_object())
73 {
74 for (id_str, token_info) in added {
75 if let (Ok(id), Some(content)) = (
76 id_str.parse::<TokenIdType>(),
77 token_info.get("content").and_then(|v| v.as_str()),
78 ) {
79 tokens.insert(content.to_string(), id);
80 }
81 }
82 }
83 tokens
84}
85
86fn parse_special_tokens(config: &serde_json::Value) -> SpecialTokens {
91 let get_str = |key: &str| {
92 config.get(key).and_then(|v| {
93 v.as_str()
94 .map(String::from)
95 .or_else(|| v.get("content").and_then(|c| c.as_str()).map(String::from))
96 })
97 };
98
99 let additional: Vec<String> = config
100 .get("additional_special_tokens")
101 .and_then(|v| v.as_array())
102 .map(|arr| {
103 arr.iter()
104 .filter_map(|v| {
105 v.as_str()
106 .map(String::from)
107 .or_else(|| v.get("content").and_then(|c| c.as_str()).map(String::from))
108 })
109 .collect()
110 })
111 .unwrap_or_default();
112
113 SpecialTokens {
114 bos_token: get_str("bos_token"),
115 eos_token: get_str("eos_token"),
116 unk_token: get_str("unk_token"),
117 sep_token: get_str("sep_token"),
118 pad_token: get_str("pad_token"),
119 cls_token: get_str("cls_token"),
120 mask_token: get_str("mask_token"),
121 additional_special_tokens: additional,
122 }
123}
124
/// Tokenizer backed by a `tiktoken_rs` [`CoreBPE`], built either from a bundled
/// encoding or from a tiktoken vocabulary file on disk.
pub struct TiktokenTokenizer {
    /// BPE engine used for encoding and decoding.
    tokenizer: CoreBPE,
    /// Special tokens reported by `get_special_tokens`.
    special_tokens: SpecialTokens,
    /// Token text -> id lookup used by `token_to_id`; left empty when the
    /// tokenizer is created from a bundled encoding.
    vocab: HashMap<String, TokenIdType>,
    /// Id -> token text lookup used by `id_to_token`; left empty when the
    /// tokenizer is created from a bundled encoding.
    reverse_vocab: HashMap<TokenIdType, String>,
    /// Value reported by `vocab_size`.
    vocab_size: usize,
    /// Chat template state used by the chat-template trait methods.
    chat_template: ChatTemplateState,
}
134
/// Encodings bundled with `tiktoken_rs` that [`TiktokenTokenizer::new`] can load.
///
/// The enum is a plain tag, so it also derives `PartialEq`, `Eq` and `Hash`,
/// letting callers compare variants directly or use them as map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TiktokenModel {
    /// The `cl100k_base` encoding.
    Cl100kBase,
    /// The `p50k_base` encoding.
    P50kBase,
    /// The `p50k_edit` encoding.
    P50kEdit,
    /// The `r50k_base` encoding.
    R50kBase,
}
147
148impl TiktokenTokenizer {
149 pub fn new(model: TiktokenModel) -> Result<Self> {
151 let tokenizer = match model {
152 TiktokenModel::Cl100kBase => {
153 cl100k_base().map_err(|e| Error::msg(format!("Failed to load cl100k_base: {e}")))?
154 }
155 TiktokenModel::P50kBase => {
156 p50k_base().map_err(|e| Error::msg(format!("Failed to load p50k_base: {e}")))?
157 }
158 TiktokenModel::P50kEdit => {
159 p50k_edit().map_err(|e| Error::msg(format!("Failed to load p50k_edit: {e}")))?
160 }
161 TiktokenModel::R50kBase => {
162 r50k_base().map_err(|e| Error::msg(format!("Failed to load r50k_base: {e}")))?
163 }
164 };
165
166 let special_tokens = Self::get_special_tokens_for_model(model);
167
168 let vocab_size = match model {
169 TiktokenModel::Cl100kBase => 100256,
170 TiktokenModel::P50kBase | TiktokenModel::P50kEdit => 50281,
171 TiktokenModel::R50kBase => 50257,
172 };
173
174 Ok(TiktokenTokenizer {
175 tokenizer,
176 special_tokens,
177 vocab: HashMap::new(),
178 reverse_vocab: HashMap::new(),
179 vocab_size,
180 chat_template: ChatTemplateState::empty(),
181 })
182 }
183
184 pub fn from_dir(dir: &Path) -> Result<Self> {
186 Self::from_dir_with_chat_template(dir, None)
187 }
188
189 pub fn from_dir_with_chat_template(
192 dir: &Path,
193 chat_template_path: Option<&str>,
194 ) -> Result<Self> {
195 let tiktoken_path = find_tiktoken_file(dir)?;
196 Self::load_from_path(&tiktoken_path, chat_template_path)
197 }
198
199 pub fn from_file(tiktoken_path: &Path) -> Result<Self> {
202 Self::from_file_with_chat_template(tiktoken_path, None)
203 }
204
205 pub fn from_file_with_chat_template(
207 tiktoken_path: &Path,
208 chat_template_path: Option<&str>,
209 ) -> Result<Self> {
210 Self::load_from_path(tiktoken_path, chat_template_path)
211 }
212
213 fn load_from_path(tiktoken_path: &Path, chat_template_path: Option<&str>) -> Result<Self> {
215 let tiktoken_path_str = tiktoken_path
217 .to_str()
218 .ok_or_else(|| Error::msg("Tiktoken file path is not valid UTF-8"))?;
219 let encoder = load_tiktoken_bpe(tiktoken_path_str)?;
220
221 let dir = tiktoken_path
223 .parent()
224 .ok_or_else(|| Error::msg("Cannot determine parent directory of tiktoken file"))?;
225 let config_path = dir.join("tokenizer_config.json");
226 let config = if config_path.exists() {
227 load_tiktoken_config(&config_path)?
228 } else {
229 TiktokenConfig::default()
230 };
231
232 let special_tokens_encoder: FxHashMap<String, Rank> = config
234 .added_tokens
235 .iter()
236 .map(|(k, &v)| (k.clone(), v))
237 .collect();
238
239 let vocab_size = encoder
242 .values()
243 .copied()
244 .chain(special_tokens_encoder.values().copied())
245 .max()
246 .map(|id| id as usize + 1)
247 .unwrap_or(0);
248 let (vocab, reverse_vocab) = build_vocab_maps(&encoder, &config.added_tokens);
249 let tokenizer = CoreBPE::new(encoder, special_tokens_encoder, CL100K_BASE_PATTERN)?;
250
251 let chat_template = if let Some(p) = chat_template_path {
254 load_chat_template_from_file(p)?
255 } else {
256 config.chat_template.or_else(|| {
257 discover_chat_template_in_dir(dir)
258 .and_then(|p| load_chat_template_from_file(&p).ok().flatten())
259 })
260 };
261
262 Ok(TiktokenTokenizer {
263 tokenizer,
264 special_tokens: config.special_tokens,
265 vocab,
266 reverse_vocab,
267 vocab_size,
268 chat_template: ChatTemplateState::new(chat_template)?,
269 })
270 }
271
272 pub fn from_model_name(model_name: &str) -> Result<Self> {
274 let model = Self::model_from_name(model_name)?;
275 Self::new(model)
276 }
277
278 fn model_from_name(model_name: &str) -> Result<TiktokenModel> {
280 if model_name.contains("gpt-4")
281 || model_name.contains("gpt-3.5")
282 || model_name.contains("turbo")
283 {
284 Ok(TiktokenModel::Cl100kBase)
285 } else if model_name.contains("davinci-002")
286 || model_name.contains("davinci-003")
287 || model_name.contains("codex")
288 {
289 Ok(TiktokenModel::P50kBase)
290 } else if model_name.contains("edit") {
291 Ok(TiktokenModel::P50kEdit)
292 } else if model_name.contains("davinci")
293 || model_name.contains("curie")
294 || model_name.contains("babbage")
295 || model_name.contains("ada")
296 {
297 Ok(TiktokenModel::R50kBase)
298 } else {
299 Err(anyhow::anyhow!(
300 "Unrecognized OpenAI model name: '{model_name}'. Expected GPT-3, GPT-3.5, GPT-4, or related model names"
301 ))
302 }
303 }
304
305 fn get_special_tokens_for_model(model: TiktokenModel) -> SpecialTokens {
307 match model {
308 TiktokenModel::Cl100kBase => SpecialTokens {
309 bos_token: Some("<|endoftext|>".to_string()),
310 eos_token: Some("<|endoftext|>".to_string()),
311 unk_token: None,
312 sep_token: None,
313 pad_token: Some("<|endoftext|>".to_string()),
314 cls_token: None,
315 mask_token: None,
316 additional_special_tokens: vec![
317 "<|fim_prefix|>".to_string(),
318 "<|fim_middle|>".to_string(),
319 "<|fim_suffix|>".to_string(),
320 "<|endofprompt|>".to_string(),
321 ],
322 },
323 _ => SpecialTokens {
324 bos_token: Some("<|endoftext|>".to_string()),
325 eos_token: Some("<|endoftext|>".to_string()),
326 unk_token: None,
327 sep_token: None,
328 pad_token: Some("<|endoftext|>".to_string()),
329 cls_token: None,
330 mask_token: None,
331 additional_special_tokens: vec![],
332 },
333 }
334 }
335}
336
337fn load_tiktoken_bpe(path: &str) -> Result<FxHashMap<Vec<u8>, Rank>> {
341 let content = std::fs::read_to_string(path)?;
342 let mut encoder =
343 FxHashMap::with_capacity_and_hasher(content.lines().count(), Default::default());
344 for line in content.lines() {
345 if line.is_empty() {
346 continue;
347 }
348 let mut parts = line.split_whitespace();
349 let token_b64 = parts
350 .next()
351 .ok_or_else(|| Error::msg("missing token in tiktoken file"))?;
352 let rank_str = parts
353 .next()
354 .ok_or_else(|| Error::msg("missing rank in tiktoken file"))?;
355 let token_bytes = STANDARD.decode(token_b64)?;
356 let rank: Rank = rank_str.parse()?;
357 encoder.insert(token_bytes, rank);
358 }
359 Ok(encoder)
360}
361
362fn build_vocab_maps(
364 encoder: &FxHashMap<Vec<u8>, Rank>,
365 added_tokens: &HashMap<String, TokenIdType>,
366) -> (HashMap<String, TokenIdType>, HashMap<TokenIdType, String>) {
367 let capacity = encoder.len() + added_tokens.len();
368 let mut vocab = HashMap::with_capacity(capacity);
369 let mut reverse_vocab = HashMap::with_capacity(capacity);
370
371 for (token_bytes, &rank) in encoder {
373 if let Ok(token_str) = std::str::from_utf8(token_bytes) {
374 vocab.insert(token_str.to_string(), rank);
375 reverse_vocab.insert(rank, token_str.to_string());
376 }
377 }
378
379 for (token_str, &id) in added_tokens {
381 vocab.insert(token_str.clone(), id);
382 reverse_vocab.insert(id, token_str.clone());
383 }
384
385 (vocab, reverse_vocab)
386}
387
388fn find_tiktoken_file(dir: &Path) -> Result<PathBuf> {
392 let tiktoken_model = dir.join("tiktoken.model");
393 if tiktoken_model.exists() {
394 return Ok(tiktoken_model);
395 }
396
397 if let Ok(entries) = std::fs::read_dir(dir) {
399 for entry in entries.flatten() {
400 if let Some(name) = entry.file_name().to_str() {
401 if name.ends_with(".tiktoken") {
402 return Ok(entry.path());
403 }
404 }
405 }
406 }
407
408 Err(Error::msg(format!(
409 "No tiktoken model file found in '{}'",
410 dir.display()
411 )))
412}
413
/// Reports whether `dir` contains a tiktoken vocabulary file.
///
/// That is either a `tiktoken.model` entry or any entry whose name ends with
/// `.tiktoken`. Returns `false` when the directory cannot be listed.
pub fn has_tiktoken_file(dir: &Path) -> bool {
    if dir.join("tiktoken.model").exists() {
        return true;
    }

    let Ok(entries) = std::fs::read_dir(dir) else {
        return false;
    };

    // Entries that fail to read, or whose names are not UTF-8, never match.
    for entry in entries.flatten() {
        let file_name = entry.file_name();
        if let Some(name) = file_name.to_str() {
            if name.ends_with(".tiktoken") {
                return true;
            }
        }
    }
    false
}
430
/// Reports whether `path` names a tiktoken vocabulary file.
///
/// Only the final path component is inspected: it must be exactly
/// `tiktoken.model` or end with `.tiktoken`. Paths without a final component,
/// or whose final component is not valid UTF-8, yield `false`.
pub fn is_tiktoken_file(path: &Path) -> bool {
    match path.file_name().and_then(|component| component.to_str()) {
        Some("tiktoken.model") => true,
        Some(name) => name.ends_with(".tiktoken"),
        None => false,
    }
}
437
438impl Encoder for TiktokenTokenizer {
439 fn encode(&self, input: &str, _add_special_tokens: bool) -> Result<Encoding> {
440 let tokens = self.tokenizer.encode_ordinary(input);
441 Ok(Encoding::Tiktoken(tokens))
442 }
443
444 fn encode_batch(&self, inputs: &[&str], add_special_tokens: bool) -> Result<Vec<Encoding>> {
445 inputs
446 .iter()
447 .map(|input| self.encode(input, add_special_tokens))
448 .collect()
449 }
450}
451
452impl Decoder for TiktokenTokenizer {
453 fn decode(&self, token_ids: &[TokenIdType], _skip_special_tokens: bool) -> Result<String> {
454 self.tokenizer
455 .decode(token_ids.to_vec())
456 .map_err(|e| Error::msg(format!("Decoding failed: {e}")))
457 }
458}
459
impl TokenizerTrait for TiktokenTokenizer {
    /// Returns the vocabulary size computed when the tokenizer was constructed.
    fn vocab_size(&self) -> usize {
        self.vocab_size
    }

    /// Returns the special tokens associated with this tokenizer.
    fn get_special_tokens(&self) -> &SpecialTokens {
        &self.special_tokens
    }

    /// Looks up the id of `token` in the `vocab` map; `None` if it is not present.
    fn token_to_id(&self, token: &str) -> Option<TokenIdType> {
        self.vocab.get(token).copied()
    }

    /// Looks up the text for `id` in the `reverse_vocab` map; `None` if it is
    /// not present.
    fn id_to_token(&self, id: TokenIdType) -> Option<String> {
        self.reverse_vocab.get(&id).cloned()
    }

    /// Exposes the tokenizer as `Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Renders `messages` by delegating to the stored chat template state.
    fn apply_chat_template(
        &self,
        messages: &[serde_json::Value],
        params: ChatTemplateParams,
    ) -> Result<String> {
        self.chat_template.apply(messages, params)
    }

    /// Returns the content format reported by the stored chat template state.
    fn chat_template_content_format(&self) -> ChatTemplateContentFormat {
        self.chat_template.content_format()
    }

    /// Replaces the chat template by delegating to the stored chat template state.
    fn set_chat_template(&mut self, template: String) -> Result<()> {
        self.chat_template.set(template)
    }
}
497
#[cfg(test)]
mod tests {
    use super::*;
    use crate::traits::{Decoder, Encoder, Tokenizer};

    /// Builds the bundled cl100k_base tokenizer shared by several tests.
    fn cl100k() -> TiktokenTokenizer {
        TiktokenTokenizer::new(TiktokenModel::Cl100kBase).unwrap()
    }

    #[test]
    fn test_tiktoken_creation() {
        assert_eq!(cl100k().vocab_size(), 100256);
    }

    #[test]
    fn test_model_from_name() {
        let resolve = |name: &str| TiktokenTokenizer::model_from_name(name).unwrap();

        assert!(matches!(resolve("gpt-4"), TiktokenModel::Cl100kBase));
        assert!(matches!(
            resolve("gpt-3.5-turbo"),
            TiktokenModel::Cl100kBase
        ));
        assert!(matches!(
            resolve("text-davinci-003"),
            TiktokenModel::P50kBase
        ));
        assert!(matches!(
            resolve("text-davinci-edit-001"),
            TiktokenModel::P50kEdit
        ));
        assert!(matches!(resolve("davinci"), TiktokenModel::R50kBase));
    }

    #[test]
    fn test_encode_decode() {
        let tokenizer = cl100k();
        let original = "Hello, world!";

        let encoding = tokenizer.encode(original, false).unwrap();
        let round_tripped = tokenizer.decode(encoding.token_ids(), false).unwrap();

        assert_eq!(round_tripped, original);
    }

    #[test]
    fn test_batch_encode() {
        let tokenizer = cl100k();
        let inputs = vec!["Hello", "World", "Test"];

        let encodings = tokenizer.encode_batch(&inputs, false).unwrap();
        assert_eq!(encodings.len(), 3);

        for (encoding, expected) in encodings.iter().zip(inputs.iter().copied()) {
            let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
            assert_eq!(decoded, expected);
        }
    }

    #[test]
    fn test_special_tokens() {
        let tokenizer = cl100k();
        let eos = &tokenizer.get_special_tokens().eos_token;

        assert!(eos.is_some());
        assert_eq!(eos.as_deref(), Some("<|endoftext|>"));
    }

    #[test]
    fn test_unrecognized_model_name_returns_error() {
        for name in ["distilgpt-2", "bert-base-uncased", "llama-7b"] {
            match TiktokenTokenizer::from_model_name(name) {
                Ok(_) => panic!("expected '{name}' to be rejected"),
                Err(e) => assert!(e.to_string().contains("Unrecognized OpenAI model name")),
            }
        }
    }

    #[test]
    fn test_recognized_model_names() {
        let names = [
            "gpt-4",
            "gpt-3.5-turbo",
            "text-davinci-003",
            "code-davinci-002",
            "text-curie-001",
            "text-babbage-001",
            "text-ada-001",
        ];
        for name in names {
            assert!(TiktokenTokenizer::from_model_name(name).is_ok());
        }
    }

    #[test]
    fn test_builtin_tokenizer_has_empty_vocab_maps() {
        let tokenizer = cl100k();
        assert_eq!(tokenizer.token_to_id("hello"), None);
        assert_eq!(tokenizer.id_to_token(0), None);
    }

    #[test]
    fn test_load_tiktoken_bpe() {
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("test.tiktoken");
        // "IQ==" and "Ig==" are the base64 encodings of bytes 0x21 and 0x22.
        std::fs::write(&file_path, "IQ== 0\nIg== 1\n").unwrap();

        let encoder = load_tiktoken_bpe(file_path.to_str().unwrap()).unwrap();

        assert_eq!(encoder.len(), 2);
        assert_eq!(encoder.get(&[0x21u8][..]), Some(&0));
        assert_eq!(encoder.get(&[0x22u8][..]), Some(&1));
    }

    #[test]
    fn test_build_vocab_maps() {
        let mut encoder = FxHashMap::default();
        encoder.insert(b"hello".to_vec(), 42u32);
        // Not valid UTF-8, so it must be left out of the maps.
        encoder.insert(vec![0xFF, 0xFE], 99u32);
        let added = HashMap::from([("<|special|>".to_string(), 1000u32)]);

        let (vocab, reverse_vocab) = build_vocab_maps(&encoder, &added);

        assert_eq!(vocab.get("hello"), Some(&42));
        assert_eq!(reverse_vocab.get(&42), Some(&"hello".to_string()));

        assert!(!vocab.contains_key("\u{FFFD}"));

        assert_eq!(vocab.get("<|special|>"), Some(&1000));
        assert_eq!(reverse_vocab.get(&1000), Some(&"<|special|>".to_string()));
    }

    #[test]
    fn test_has_tiktoken_file() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        assert!(!has_tiktoken_file(root));

        std::fs::write(root.join("tiktoken.model"), "test").unwrap();
        assert!(has_tiktoken_file(root));
    }

    #[test]
    fn test_find_tiktoken_file_model() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), "test").unwrap();

        let found = find_tiktoken_file(dir.path()).unwrap();

        assert_eq!(found.file_name().unwrap(), "tiktoken.model");
    }

    #[test]
    fn test_find_tiktoken_file_extension() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("vocab.tiktoken"), "test").unwrap();

        let found = find_tiktoken_file(dir.path()).unwrap();
        let name = found.file_name().unwrap().to_str().unwrap();

        assert!(name.ends_with(".tiktoken"));
    }

    #[test]
    fn test_is_tiktoken_file() {
        for accepted in ["tiktoken.model", "vocab.tiktoken"] {
            assert!(is_tiktoken_file(Path::new(accepted)));
        }
        for rejected in ["tokenizer.json", "model.bin"] {
            assert!(!is_tiktoken_file(Path::new(rejected)));
        }
    }

    #[test]
    fn test_parse_added_tokens_decoder() {
        let config: serde_json::Value = serde_json::json!({
            "added_tokens_decoder": {
                "163584": { "content": "[BOS]", "special": true },
                "163585": { "content": "[EOS]", "special": true },
                "163586": { "content": "<|im_end|>", "special": true }
            }
        });

        let tokens = parse_added_tokens_decoder(&config);

        for (content, id) in [("[BOS]", 163584), ("[EOS]", 163585), ("<|im_end|>", 163586)] {
            assert_eq!(tokens.get(content), Some(&id));
        }
    }

    #[test]
    fn test_parse_special_tokens() {
        let config: serde_json::Value = serde_json::json!({
            "bos_token": "[BOS]",
            "eos_token": "[EOS]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "additional_special_tokens": ["<|im_end|>", "<|im_user|>"]
        });

        let special = parse_special_tokens(&config);

        assert_eq!(special.bos_token.as_deref(), Some("[BOS]"));
        assert_eq!(special.eos_token.as_deref(), Some("[EOS]"));
        assert_eq!(special.unk_token.as_deref(), Some("[UNK]"));
        assert_eq!(special.pad_token.as_deref(), Some("[PAD]"));
        assert_eq!(special.additional_special_tokens.len(), 2);
    }

    #[test]
    fn test_parse_special_tokens_object_valued() {
        let config: serde_json::Value = serde_json::json!({
            "bos_token": {"content": "<s>", "lstrip": false, "rstrip": false, "single_word": false, "special": true},
            "eos_token": "</s>",
            "unk_token": {"content": "<unk>", "special": true}
        });

        let special = parse_special_tokens(&config);

        assert_eq!(special.bos_token.as_deref(), Some("<s>"));
        assert_eq!(special.eos_token.as_deref(), Some("</s>"));
        assert_eq!(special.unk_token.as_deref(), Some("<unk>"));
    }

    #[test]
    fn test_tiktoken_config_default() {
        let TiktokenConfig {
            special_tokens,
            added_tokens,
            chat_template,
        } = TiktokenConfig::default();

        assert!(special_tokens.bos_token.is_none());
        assert!(added_tokens.is_empty());
        assert!(chat_template.is_none());
    }
}