tiktoken 3.5.1 - Docs.rs

//! Encoding definitions and data parsing for tiktoken-compatible BPE vocabularies.
//!
//! Each encoding consists of:
//! - A `.tiktoken.zst` data file (zstd-compressed, base64-encoded token → rank lines, embedded at compile time)
//! - A regex pattern that splits input text into pieces before BPE processing
//! - A set of special tokens (e.g. `<|endoftext|>`) with designated token ids
//!
//! Pattern source: <https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py>

use base64::Engine;
use rustc_hash::FxHashMap;

use crate::bpe::CoreBpe;
use crate::pretokenize::FastPath;

// embedded encoding data files — zstd-compressed, decompressed on first use via OnceLock in lib.rs
const CL100K_BASE_DATA: &[u8] = include_bytes!("encodings/cl100k_base.tiktoken.zst");
const O200K_BASE_DATA: &[u8] = include_bytes!("encodings/o200k_base.tiktoken.zst");
const P50K_BASE_DATA: &[u8] = include_bytes!("encodings/p50k_base.tiktoken.zst");
const R50K_BASE_DATA: &[u8] = include_bytes!("encodings/r50k_base.tiktoken.zst");
const LLAMA3_DATA: &[u8] = include_bytes!("encodings/llama3.tiktoken.zst");
const DEEPSEEK_V3_DATA: &[u8] = include_bytes!("encodings/deepseek_v3.tiktoken.zst");
const QWEN2_DATA: &[u8] = include_bytes!("encodings/qwen2.tiktoken.zst");
const MISTRAL_V3_DATA: &[u8] = include_bytes!("encodings/mistral_v3.tiktoken.zst");

// cl100k pattern: handles English contractions, Unicode letters/numbers, punctuation, whitespace.
// original tiktoken uses `\s+(?!\S)|\s+` but we use plain `\s+` and emulate the negative
// lookahead in bpe.rs::adjust_whitespace_end — this lets us use the `regex` crate's DFA engine
// instead of a slower backtracking engine like fancy-regex or pcre2.
pub(crate) const CL100K_PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+";

// o200k pattern: similar to cl100k but with finer Unicode category distinctions
// (Lu/Lt/Lm/Lo/M vs plain \p{L}), supporting better CamelCase and mixed-script splitting
pub(crate) const O200K_PATTERN: &str = concat!(
    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+",
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
    r"|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*",
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
    r"|\p{N}{1,3}",
    r"| ?[^\s\p{L}\p{N}]+[\r\n]*",
    r"|\s*[\r\n]+",
    r"|\s+",
);

// p50k/r50k pattern: simpler, older pattern used by GPT-3 era models
pub(crate) const P50K_PATTERN: &str =
    r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+";

// llama3 pattern: same structure as cl100k (contractions, letters, numbers, punctuation, whitespace)
// original uses `\s+(?!\S)|\s+` — we emulate the lookahead in pretokenize.rs
const LLAMA3_PATTERN: &str = CL100K_PATTERN;

// deepseek v3 pattern: 3 sequential splits combined into one alternation.
// priority: numbers (1-3 digits) > CJK/Japanese > general pattern
// final catch-all `[\s\S]` ensures format chars (ZWJ etc.) are not skipped,
// matching HF's Split/Isolated behavior where non-matching text is kept.
pub(crate) const DEEPSEEK_V3_PATTERN: &str = concat!(
    r"\p{N}{1,3}",
    r"|[一-龥\x{3040}-\x{309F}\x{30A0}-\x{30FF}]+",
    r"|[!-/:-@\[-`{-~][A-Za-z]+",
    r"|[^\r\n\p{L}\p{P}\p{S}]?[\p{L}\p{M}]+",
    r"| ?[\p{P}\p{S}]+[\r\n]*",
    r"|\s*[\r\n]+",
    r"|\s+",
    r"|[\s\S]",
);

// qwen2 pattern: similar to cl100k but \p{N} matches single digits (not 1-3)
pub(crate) const QWEN2_PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+";

// mistral v3 (tekken) pattern: same as cl100k
const MISTRAL_V3_PATTERN: &str = CL100K_PATTERN;

/// Parse a zstd-compressed `.tiktoken` file into a rank map.
///
/// The compressed data is first decompressed, then parsed line by line.
/// Each line is: `<base64-encoded token bytes> <integer rank>`
/// The rank determines merge priority in the BPE algorithm (lower = merged first).
pub(crate) fn parse_tiktoken_data(compressed: &[u8]) -> FxHashMap<Vec<u8>, u32> {
    let mut decoder =
        ruzstd::decoding::StreamingDecoder::new(compressed).expect("zstd decompression failed");
    let mut data = Vec::new();
    std::io::Read::read_to_end(&mut decoder, &mut data).expect("zstd decompression failed");
    parse_tiktoken_lines(&data)
}

/// Parse raw (uncompressed) `.tiktoken` lines into a rank map.
fn parse_tiktoken_lines(data: &[u8]) -> FxHashMap<Vec<u8>, u32> {
    let engine = base64::engine::general_purpose::STANDARD;
    let content = std::str::from_utf8(data).expect("tiktoken data must be valid UTF-8");

    let mut ranks = FxHashMap::default();
    ranks.reserve(data.len() / 20); // rough estimate: ~20 bytes per line

    for line in content.lines() {
        let line = line.trim();
        if line.is_empty() {
            continue;
        }
        let mut parts = line.splitn(2, ' ');
        let token_b64 = parts.next().expect("missing token");
        let rank_str = parts.next().expect("missing rank");
        let token_bytes = engine
            .decode(token_b64)
            .expect("invalid base64 in tiktoken data");
        let rank: u32 = rank_str.parse().expect("invalid rank in tiktoken data");
        ranks.insert(token_bytes, rank);
    }

    ranks
}

/// Build a special token map from `(text, id)` pairs.
fn special_tokens(pairs: &[(&str, u32)]) -> FxHashMap<Vec<u8>, u32> {
    pairs
        .iter()
        .map(|&(s, v)| (s.as_bytes().to_vec(), v))
        .collect()
}

/// Construct the cl100k_base encoding (GPT-4, GPT-3.5 Turbo, embeddings).
/// Vocabulary size: 100,256 regular tokens + 5 special tokens.
pub fn cl100k_base() -> CoreBpe {
    let encoder = parse_tiktoken_data(CL100K_BASE_DATA);
    let special = special_tokens(&[
        ("<|endoftext|>", 100257),
        ("<|fim_prefix|>", 100258),
        ("<|fim_middle|>", 100259),
        ("<|fim_suffix|>", 100260),
        ("<|endofprompt|>", 100276),
    ]);
    CoreBpe::new(encoder, special, CL100K_PATTERN, FastPath::Cl100k)
}

/// Construct the p50k_base encoding (text-davinci-002/003, code-davinci, code-cushman).
/// Vocabulary size: 50,256 regular tokens + 1 special token.
pub fn p50k_base() -> CoreBpe {
    let encoder = parse_tiktoken_data(P50K_BASE_DATA);
    let special = special_tokens(&[("<|endoftext|>", 50256)]);
    CoreBpe::new(encoder, special, P50K_PATTERN, FastPath::None)
}

/// Construct the p50k_edit encoding (text-davinci-edit, code-davinci-edit).
/// Same merge ranks as p50k_base but with additional FIM (fill-in-middle) special tokens.
pub fn p50k_edit() -> CoreBpe {
    let encoder = parse_tiktoken_data(P50K_BASE_DATA);
    let special = special_tokens(&[
        ("<|endoftext|>", 50256),
        ("<|fim_prefix|>", 50281),
        ("<|fim_middle|>", 50282),
        ("<|fim_suffix|>", 50283),
    ]);
    CoreBpe::new(encoder, special, P50K_PATTERN, FastPath::None)
}

/// Construct the o200k_base encoding (GPT-4o, o1, o3, o4-mini).
/// Vocabulary size: 199,998 regular tokens + 2 special tokens.
pub fn o200k_base() -> CoreBpe {
    let encoder = parse_tiktoken_data(O200K_BASE_DATA);
    let special = special_tokens(&[("<|endoftext|>", 199999), ("<|endofprompt|>", 200018)]);
    CoreBpe::new(encoder, special, O200K_PATTERN, FastPath::O200k)
}

/// Construct the o200k_harmony encoding (gpt-oss family / harmony chat format).
///
/// Shares merge ranks and regex with [`o200k_base`]; the only delta is the
/// special-token table — 15 named tokens (199998..=200012) plus 1075 reserved
/// placeholders (`<|reserved_200013|>`..=`<|reserved_201087|>`).
///
/// Note: `<|reserved_200018|>` shadows `<|endofprompt|>` from o200k_base
/// at the same id; this mirrors the upstream Python implementation.
pub fn o200k_harmony() -> CoreBpe {
    let encoder = parse_tiktoken_data(O200K_BASE_DATA);
    let mut special: FxHashMap<Vec<u8>, u32> = FxHashMap::default();
    for (name, id) in [
        ("<|startoftext|>", 199998_u32),
        ("<|endoftext|>", 199999),
        ("<|reserved_200000|>", 200000),
        ("<|reserved_200001|>", 200001),
        ("<|return|>", 200002),
        ("<|constrain|>", 200003),
        ("<|reserved_200004|>", 200004),
        ("<|channel|>", 200005),
        ("<|start|>", 200006),
        ("<|end|>", 200007),
        ("<|message|>", 200008),
        ("<|reserved_200009|>", 200009),
        ("<|reserved_200010|>", 200010),
        ("<|reserved_200011|>", 200011),
        ("<|call|>", 200012),
    ] {
        special.insert(name.as_bytes().to_vec(), id);
    }
    for id in 200013..=201087_u32 {
        special.insert(format!("<|reserved_{id}|>").into_bytes(), id);
    }
    CoreBpe::new(encoder, special, O200K_PATTERN, FastPath::O200k)
}

/// Construct the r50k_base encoding (GPT-3 era: davinci, curie, babbage, ada).
/// Vocabulary size: 50,256 regular tokens + 1 special token.
/// Uses the same merge ranks and regex pattern as p50k_base.
pub fn r50k_base() -> CoreBpe {
    let encoder = parse_tiktoken_data(R50K_BASE_DATA);
    let special = special_tokens(&[("<|endoftext|>", 50256)]);
    CoreBpe::new(encoder, special, P50K_PATTERN, FastPath::None)
}

/// Construct the `gpt2` encoding (GPT-2 BPE).
///
/// Byte-for-byte identical to [`r50k_base`] — same merge ranks, regex, and
/// single special token (`<|endoftext|>` at 50256). Exposed as a distinct
/// name for parity with upstream `openai/tiktoken`; the runtime shares
/// `r50k_base`'s cached instance.
pub fn gpt2() -> CoreBpe {
    r50k_base()
}

/// Construct the llama3 encoding (Llama 3 / 3.1 / 3.2 / 3.3).
/// Vocabulary size: 128,000 regular tokens + 256 special tokens.
pub fn llama3() -> CoreBpe {
    let encoder = parse_tiktoken_data(LLAMA3_DATA);
    let special = special_tokens(&[
        ("<|begin_of_text|>", 128000),
        ("<|end_of_text|>", 128001),
        ("<|finetune_right_pad_id|>", 128004),
        ("<|start_header_id|>", 128006),
        ("<|end_header_id|>", 128007),
        ("<|eom_id|>", 128008),
        ("<|eot_id|>", 128009),
        ("<|python_tag|>", 128010),
    ]);
    CoreBpe::new(encoder, special, LLAMA3_PATTERN, FastPath::Cl100k)
}

/// Construct the deepseek_v3 encoding (DeepSeek V3, R1).
/// Vocabulary size: 128,000 regular tokens + 804 special tokens.
pub fn deepseek_v3() -> CoreBpe {
    let encoder = parse_tiktoken_data(DEEPSEEK_V3_DATA);
    let special = special_tokens(&[
        ("<｜begin▁of▁sentence｜>", 0),
        ("<｜end▁of▁sentence｜>", 1),
        ("<｜▁pad▁｜>", 2),
        ("<|EOT|>", 128805),
    ]);
    CoreBpe::new(encoder, special, DEEPSEEK_V3_PATTERN, FastPath::Deepseek)
}

/// Construct the qwen2 encoding (Qwen 2.5 / 3).
/// Vocabulary size: 151,643 regular tokens + 14 special tokens.
pub fn qwen2() -> CoreBpe {
    let encoder = parse_tiktoken_data(QWEN2_DATA);
    let special = special_tokens(&[
        ("<|endoftext|>", 151643),
        ("<|im_start|>", 151644),
        ("<|im_end|>", 151645),
        ("<|object_ref_start|>", 151646),
        ("<|object_ref_end|>", 151647),
        ("<|box_start|>", 151648),
        ("<|box_end|>", 151649),
        ("<|quad_start|>", 151650),
        ("<|quad_end|>", 151651),
        ("<|vision_start|>", 151652),
        ("<|vision_end|>", 151653),
        ("<|vision_pad|>", 151654),
        ("<|image_pad|>", 151655),
        ("<|video_pad|>", 151656),
    ]);
    CoreBpe::new(encoder, special, QWEN2_PATTERN, FastPath::Qwen2)
}

/// Construct the mistral_v3 encoding (Mistral, Mixtral with Tekken tokenizer).
/// Vocabulary size: 131,072 regular tokens + 1000 special tokens.
pub fn mistral_v3() -> CoreBpe {
    let encoder = parse_tiktoken_data(MISTRAL_V3_DATA);
    let special = special_tokens(&[
        ("<unk>", 0),
        ("<s>", 1),
        ("</s>", 2),
        ("[INST]", 3),
        ("[/INST]", 4),
        ("[AVAILABLE_TOOLS]", 5),
        ("[/AVAILABLE_TOOLS]", 6),
        ("[TOOL_RESULTS]", 7),
        ("[/TOOL_RESULTS]", 8),
        ("[TOOL_CALLS]", 9),
        ("[IMG]", 10),
        ("[IMG_BREAK]", 12),
        ("[IMG_END]", 13),
        ("[PREFIX]", 14),
        ("[MIDDLE]", 15),
        ("[SUFFIX]", 16),
    ]);
    CoreBpe::new(encoder, special, MISTRAL_V3_PATTERN, FastPath::Cl100k)
}

/// Expose cl100k rank map for internal tests (e.g. Vocab equivalence)
#[cfg(test)]
pub(crate) fn parse_tiktoken_data_for_test() -> FxHashMap<Vec<u8>, u32> {
    parse_tiktoken_data(CL100K_BASE_DATA)
}