tiktoken 3.1.4

A high-performance pure-Rust implementation of OpenAI's tiktoken BPE tokenizer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
// edge case integration tests for tiktoken
// covers scenarios not exercised by oracle, proptest, parallel, or unit tests

// encoding names the tests in this file loop over; each entry is passed to
// `tiktoken::get_encoding` and is also a key understood by `special_tokens_for`
const ALL_ENCODINGS: &[&str] = &[
    "cl100k_base",
    "o200k_base",
    "p50k_base",
    "p50k_edit",
    "r50k_base",
    "llama3",
    "deepseek_v3",
    "qwen2",
    "mistral_v3",
];

// helper: the special token strings these tests use for the encoding `name`;
// a name that is not listed here yields an empty vector
fn special_tokens_for(name: &str) -> Vec<&'static str> {
    // pick a static table first, then copy it into the owned vector callers expect
    let table: &[&'static str] = match name {
        "p50k_base" | "r50k_base" => &["<|endoftext|>"],
        "o200k_base" => &["<|endoftext|>", "<|endofprompt|>"],
        "p50k_edit" => &[
            "<|endoftext|>",
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
        ],
        "cl100k_base" => &[
            "<|endoftext|>",
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
            "<|endofprompt|>",
        ],
        "llama3" => &[
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|finetune_right_pad_id|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|eom_id|>",
            "<|eot_id|>",
            "<|python_tag|>",
        ],
        "deepseek_v3" => &[
            "<\u{ff5c}begin\u{2581}of\u{2581}sentence\u{ff5c}>",
            "<\u{ff5c}end\u{2581}of\u{2581}sentence\u{ff5c}>",
            "<\u{ff5c}\u{2581}pad\u{2581}\u{ff5c}>",
            "<|EOT|>",
        ],
        "qwen2" => &[
            "<|endoftext|>",
            "<|im_start|>",
            "<|im_end|>",
            "<|object_ref_start|>",
            "<|object_ref_end|>",
            "<|box_start|>",
            "<|box_end|>",
            "<|quad_start|>",
            "<|quad_end|>",
            "<|vision_start|>",
            "<|vision_end|>",
            "<|vision_pad|>",
            "<|image_pad|>",
            "<|video_pad|>",
        ],
        "mistral_v3" => &[
            "<unk>",
            "<s>",
            "</s>",
            "[INST]",
            "[/INST]",
            "[AVAILABLE_TOOLS]",
            "[/AVAILABLE_TOOLS]",
            "[TOOL_RESULTS]",
            "[/TOOL_RESULTS]",
            "[TOOL_CALLS]",
            "[IMG]",
            "[IMG_BREAK]",
            "[IMG_END]",
            "[PREFIX]",
            "[MIDDLE]",
            "[SUFFIX]",
        ],
        _ => &[],
    };
    table.to_vec()
}

// --- 1. multiple adjacent special tokens ---

#[test]
fn adjacent_special_tokens_roundtrip() {
    // special tokens placed back to back (no separator) must survive
    // encode_with_special_tokens -> decode_to_string unchanged
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let specials = special_tokens_for(name);
        let Some(&first) = specials.first() else {
            continue;
        };

        // the first special token immediately followed by itself
        let doubled = [first, first].concat();
        let ids = enc.encode_with_special_tokens(&doubled);
        let roundtripped = enc.decode_to_string(&ids).unwrap();
        assert_eq!(roundtripped, doubled, "[{name}] two adjacent special tokens failed");

        // every special token of this encoding joined into one string
        let joined: String = specials.concat();
        let ids = enc.encode_with_special_tokens(&joined);
        let roundtripped = enc.decode_to_string(&ids).unwrap();
        assert_eq!(roundtripped, joined, "[{name}] all adjacent special tokens failed");
    }
}

#[test]
fn many_adjacent_special_tokens_count_consistency() {
    // count_with_special_tokens must agree with the length of the token
    // vector produced by encode_with_special_tokens
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let specials = special_tokens_for(name);

        if let Some(first) = specials.first() {
            // ten back-to-back copies of the first special token
            let run = first.repeat(10);
            let encoded_len = enc.encode_with_special_tokens(&run).len();
            let counted = enc.count_with_special_tokens(&run);
            assert_eq!(
                counted, encoded_len,
                "[{name}] count mismatch for 10 adjacent special tokens"
            );
        }
    }
}

// --- 2. binary-like data: all 256 byte values via latin-1 ---

#[test]
fn all_256_byte_values_roundtrip() {
    // for every value 0x00..=0xFF, build the one-character string whose
    // code point equals that value and check that encode -> decode gives
    // back the string's bytes, for every encoding
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        for b in u8::MIN..=u8::MAX {
            // char::from(u8) maps the value to the code point of the same number
            let text = String::from(char::from(b));
            let ids = enc.encode(&text);
            let bytes_back = enc.decode(&ids);
            assert_eq!(
                bytes_back,
                text.as_bytes(),
                "[{name}] byte {b:#04x} roundtrip failed"
            );
        }
    }
}

// --- 3. cross-encoding comparison: decode(encode(text)) identical across encodings ---

#[test]
fn cross_encoding_decode_consistency() {
    // every sample must decode back to itself under every encoding, so the
    // decoded output is the same no matter which encoding was used
    let samples = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "日本語テスト",
        "emoji: \u{1f600}\u{1f680}\u{2764}\u{fe0f}",
        "fn main() { println!(\"hello\"); }",
        "混合 mixed 内容 content 123",
    ];

    for text in samples {
        for &name in ALL_ENCODINGS {
            let enc = tiktoken::get_encoding(name).unwrap();
            let ids = enc.encode(text);
            let roundtripped = enc.decode_to_string(&ids).unwrap();
            assert_eq!(
                roundtripped, text,
                "[{name}] cross-encoding decode mismatch for {text:?}"
            );
        }
    }
}

// --- 4. stress test: text with every special token for each encoding ---

#[test]
fn stress_all_special_tokens_interleaved() {
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let specials = special_tokens_for(name);
        if !specials.is_empty() {
            // "word0<special0>word1<special1>...final": every special token
            // of the encoding, each preceded by a distinct plain word
            let mut text: String = specials
                .iter()
                .enumerate()
                .map(|(i, special)| format!("word{i}{special}"))
                .collect();
            text.push_str("final");

            // roundtrip with special token recognition enabled
            let ids = enc.encode_with_special_tokens(&text);
            let roundtripped = enc.decode_to_string(&ids).unwrap();
            assert_eq!(
                roundtripped, text,
                "[{name}] interleaved special tokens roundtrip failed"
            );

            // the counting API must report the same number of tokens
            let counted = enc.count_with_special_tokens(&text);
            assert_eq!(
                counted,
                ids.len(),
                "[{name}] count mismatch for interleaved special tokens"
            );
        }
    }
}

// --- 5. vocab size sanity checks ---

#[test]
fn vocab_size_sanity() {
    // verify vocab sizes via special token ids known from encoding.rs
    // (encoding, special_token_text, expected_id)
    let checks: &[(&str, &str, u32)] = &[
        ("cl100k_base", "<|endoftext|>", 100257), // regular: 100,256
        ("o200k_base", "<|endoftext|>", 199999),  // regular: 199,998
        ("p50k_base", "<|endoftext|>", 50256),    // regular: 50,256
        ("p50k_edit", "<|endoftext|>", 50256),    // same base as p50k
        ("r50k_base", "<|endoftext|>", 50256),    // regular: 50,256
        ("llama3", "<|begin_of_text|>", 128000),  // regular: 128,000
        ("deepseek_v3", "<|EOT|>", 128805),       // regular: 128,000 + specials
        ("qwen2", "<|endoftext|>", 151643),       // regular: 151,643
        ("mistral_v3", "[INST]", 3),              // mistral has low special ids
    ];

    for &(name, special, expected_id) in checks {
        let enc = tiktoken::get_encoding(name).unwrap();
        let tokens = enc.encode_with_special_tokens(special);
        assert_eq!(
            tokens.len(),
            1,
            "[{name}] {special:?} should encode to exactly 1 token"
        );
        assert_eq!(
            tokens[0], expected_id,
            "[{name}] {special:?} should have token id {expected_id}, got {}",
            tokens[0]
        );
    }

    // additionally encode a sample of diverse strings -- the 256
    // single-character strings U+0000..=U+00FF plus "word0".."word99" --
    // and make sure the largest regular token id seen is not tiny
    let diverse_texts: Vec<String> = (0u8..=255)
        .map(|b| char::from(b).to_string())
        .chain((0..100).map(|i| format!("word{i}")))
        .collect();

    // encodings covered by the sampled-id check. no per-encoding threshold
    // is listed: a small sample cannot reach the top of the vocab, so only a
    // loose lower bound is asserted here and the special token id checks
    // above are what confirm the vocab reaches the expected range
    let sampled_encodings: &[&str] = &[
        "cl100k_base",
        "o200k_base",
        "p50k_base",
        "r50k_base",
        "llama3",
        "qwen2",
        "mistral_v3",
    ];

    for &name in sampled_encodings {
        let enc = tiktoken::get_encoding(name).unwrap();
        // largest id over all sampled texts; 0 if nothing was produced at all
        let max_id: u32 = diverse_texts
            .iter()
            .flat_map(|text| enc.encode(text))
            .max()
            .unwrap_or(0);
        assert!(
            max_id >= 200,
            "[{name}] max token id seen {max_id} is suspiciously low"
        );
    }
}

// --- 6. encode single special token text ---

#[test]
fn encode_single_special_token_only() {
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();

        for special in special_tokens_for(name) {
            // recognised as a special token, the whole text is a single token
            let special_ids = enc.encode_with_special_tokens(special);
            let produced = special_ids.len();
            assert_eq!(
                produced,
                1,
                "[{name}] encode_with_special_tokens({special:?}) should produce 1 token, got {}",
                produced
            );

            // and that single token decodes back to the original text
            let roundtripped = enc.decode_to_string(&special_ids).unwrap();
            assert_eq!(
                roundtripped, special,
                "[{name}] single special token roundtrip failed for {special:?}"
            );

            // treated as ordinary text (plain `encode`), it must still yield
            // at least one token ...
            let plain_ids = enc.encode(special);
            assert!(
                !plain_ids.is_empty(),
                "[{name}] encode({special:?}) should produce tokens"
            );
            // ... and still decode back to the same text
            let plain_roundtripped = enc.decode_to_string(&plain_ids).unwrap();
            assert_eq!(
                plain_roundtripped, special,
                "[{name}] plain encode roundtrip failed for {special:?}"
            );
        }
    }
}

// --- 7. mixed special and non-special tokens in various orders ---

#[test]
fn mixed_special_nonspecial_orders() {
    // (prefix, suffix) pairs placed around the first special token
    let patterns = [
        ("", ""),             // special only (covered above but different assertion)
        ("hello ", ""),       // text before
        ("", " world"),       // text after
        ("hello ", " world"), // text both sides
        ("\n", "\n"),         // newlines around
        ("123 ", " 456"),     // numbers around
    ];

    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let specials = special_tokens_for(name);
        let Some(&special) = specials.first() else {
            continue;
        };

        for &(prefix, suffix) in &patterns {
            // prefix + special token + suffix must roundtrip unchanged
            let text = [prefix, special, suffix].concat();
            let ids = enc.encode_with_special_tokens(&text);
            let roundtripped = enc.decode_to_string(&ids).unwrap();
            assert_eq!(
                roundtripped, text,
                "[{name}] mixed order failed for prefix={prefix:?} suffix={suffix:?}"
            );
        }
    }
}

#[test]
fn special_between_identical_words() {
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let specials = special_tokens_for(name);

        if let Some(&special) = specials.first() {
            // the same plain word on both sides of the special token
            let text = ["test", special, "test"].concat();
            let ids = enc.encode_with_special_tokens(&text);
            let roundtripped = enc.decode_to_string(&ids).unwrap();
            assert_eq!(
                roundtripped, text,
                "[{name}] special between identical words failed"
            );
        }
    }
}

// --- 8. very long text (100KB+) roundtrip ---

#[test]
fn long_text_100kb_roundtrip() {
    // size the generated text must reach, in bytes
    const TARGET_LEN: usize = 100 * 1024;

    // one line of mixed content (ASCII, CJK, emoji, digits, punctuation)
    let base = concat!(
        "The quick brown fox jumps over the lazy dog. ",
        "Hello, 世界! Rust is fast. 🚀 Numbers: 12345. ",
        "Special chars: @#$%^&*(). Newline:\n",
    );

    // integer division rounds down, so one extra copy guarantees the target
    let copies = TARGET_LEN / base.len() + 1;
    let text = base.repeat(copies);
    assert!(text.len() >= TARGET_LEN, "text should be >= 100KB");

    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let ids = enc.encode(&text);
        let roundtripped = enc.decode_to_string(&ids).unwrap();
        assert_eq!(
            roundtripped,
            text,
            "[{name}] 100KB roundtrip failed (text len={})",
            text.len()
        );

        // the counting API must report the same number of tokens
        let counted = enc.count(&text);
        assert_eq!(
            counted,
            ids.len(),
            "[{name}] count mismatch for 100KB text"
        );
    }
}

// --- 9. decode_to_string with valid unicode ---

#[test]
fn decode_to_string_multilingual() {
    // samples across several scripts plus emoji, diacritics and math symbols
    let texts = [
        "English text",
        "日本語テスト",
        "한국어 테스트",
        "العربية",
        "Ελληνικά",
        "Кириллица",
        "ไทย",
        "emoji mix: \u{1f600}\u{1f389}\u{1f4a1}\u{2728}",
        "diacritics: \u{00e9}\u{00e8}\u{00ea}\u{00eb}\u{00f1}\u{00fc}\u{00e4}\u{00f6}",
        "math: \u{221e} \u{2200}x \u{2203}y \u{2208} \u{2124}",
    ];

    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        for &text in &texts {
            let ids = enc.encode(text);
            match enc.decode_to_string(&ids) {
                // decoding succeeded: the string must equal the input
                Ok(decoded) => assert_eq!(
                    decoded, text,
                    "[{name}] decode_to_string mismatch for {text:?}"
                ),
                // decoding itself failed: report the error alongside the input
                Err(err) => panic!(
                    "[{name}] decode_to_string failed for {text:?}: {:?}",
                    Some(err)
                ),
            }
        }
    }
}

// --- 10. empty string for all encodings (integration test form) ---

#[test]
fn empty_string_all_encodings_integration() {
    // empty input must give empty output from every public entry point
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();

        // encoding "" yields no tokens, with or without special token handling
        assert!(
            enc.encode("").is_empty(),
            "[{name}] encode empty should be empty"
        );
        assert!(
            enc.encode_with_special_tokens("").is_empty(),
            "[{name}] encode_with_special_tokens empty should be empty"
        );

        // both counting variants report zero tokens
        let plain_count = enc.count("");
        assert_eq!(plain_count, 0, "[{name}] count empty should be 0");
        let special_count = enc.count_with_special_tokens("");
        assert_eq!(
            special_count, 0,
            "[{name}] count_with_special_tokens empty should be 0"
        );

        // decoding an empty token slice yields no bytes ...
        assert!(
            enc.decode(&[]).is_empty(),
            "[{name}] decode empty should be empty"
        );

        // ... and an empty string
        let empty_text = enc.decode_to_string(&[]).unwrap();
        assert_eq!(
            empty_text, "",
            "[{name}] decode_to_string empty should be empty string"
        );
    }
}

// --- bonus: encode_with_special_tokens vs encode for text without specials ---

#[test]
fn encode_with_special_tokens_matches_encode_for_plain_text() {
    // none of these samples contains a special token string, so both
    // encode entry points must return the same token ids
    let texts = [
        "hello world",
        "The quick brown fox.",
        "日本語テスト 🚀",
        "fn main() {}",
        "1234567890",
    ];

    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        for &text in &texts {
            let without_specials = enc.encode(text);
            let with_specials = enc.encode_with_special_tokens(text);
            assert_eq!(
                without_specials, with_specials,
                "[{name}] encode vs encode_with_special_tokens mismatch for plain text {text:?}"
            );
        }
    }
}

// --- bonus: repeated special tokens produce correct token count ---

#[test]
fn repeated_special_tokens_correct_count() {
    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        let specials = special_tokens_for(name);
        let Some(&special) = specials.first() else {
            continue;
        };

        for n in [1, 2, 5, 20] {
            // n back-to-back copies of the special token must give n tokens,
            // one per occurrence
            let ids = enc.encode_with_special_tokens(&special.repeat(n));
            let produced = ids.len();
            assert_eq!(
                produced,
                n,
                "[{name}] {n} repeated {special:?} should produce {n} tokens, got {}",
                produced
            );
        }
    }
}

// --- bonus: characters at unicode range boundaries (ASCII, latin-1, BMP, last scalar) ---

#[test]
fn bmp_boundary_characters_roundtrip() {
    // single characters that sit on interesting unicode boundaries
    let edge_chars = [
        '\u{007F}',   // DEL (last ASCII)
        '\u{0080}',   // first latin-1 supplement
        '\u{00FF}',   // last latin-1 supplement
        '\u{0100}',   // first latin extended
        '\u{FFFD}',   // replacement character
        '\u{FFFE}',   // noncharacter (but valid in Rust strings)
        '\u{10000}',  // first supplementary character
        '\u{10FFFF}', // last valid unicode scalar
    ];

    for &name in ALL_ENCODINGS {
        let enc = tiktoken::get_encoding(name).unwrap();
        for &ch in &edge_chars {
            // compare at the byte level: decode must return the exact UTF-8
            // bytes of the one-character input
            let text = String::from(ch);
            let ids = enc.encode(&text);
            let bytes_back = enc.decode(&ids);
            assert_eq!(
                bytes_back,
                text.as_bytes(),
                "[{name}] BMP boundary char U+{:04X} roundtrip failed",
                u32::from(ch)
            );
        }
    }
}