// tiktoken_rs/tokenizer.rs

/*!
 * Lists the available tokenizers for different OpenAI models.
 */
4
5use std::collections::HashMap;
6
7use lazy_static::lazy_static;
8
/// The set of tokenizer encodings used by OpenAI models.
///
/// Each variant names one of the byte-pair-encoding vocabularies OpenAI
/// ships; `get_tokenizer` maps a model name to the variant that model uses.
///
/// # Example
///
/// ```
/// use tiktoken_rs::tokenizer::Tokenizer;
///
/// let tokenizer = Tokenizer::Cl100kBase;
/// ```
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
pub enum Tokenizer {
    O200kHarmony,
    O200kBase,
    Cl100kBase,
    P50kBase,
    R50kBase,
    P50kEdit,
    Gpt2,
}
31
// Keep this in sync with:
// https://github.com/openai/tiktoken/blob/eedc856364506a9d4651645a0290eb0ba81e6935/tiktoken/model.py#L7-L27
//
// Prefix table consulted by `get_tokenizer` only after the exact-name lookup
// fails. Entries are scanned top-to-bottom and the first matching prefix wins,
// so an entry that is itself a prefix of another must come later
// (e.g. "ft:gpt-4o" must precede "ft:gpt-4").
const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    // reasoning (o-series), e.g. o1-preview, o3-mini
    ("o1-", Tokenizer::O200kBase),
    ("o3-", Tokenizer::O200kBase),
    ("o4-", Tokenizer::O200kBase),
    // chat
    ("gpt-5-", Tokenizer::O200kBase),
    ("gpt-4.5-", Tokenizer::O200kBase),
    ("gpt-4.1-", Tokenizer::O200kBase),
    ("chatgpt-4o-", Tokenizer::O200kBase),
    ("gpt-4o-", Tokenizer::O200kBase), // e.g., gpt-4o-2024-05-13
    ("gpt-4-", Tokenizer::Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
    ("gpt-3.5-turbo-", Tokenizer::Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc.
    ("gpt-35-turbo-", Tokenizer::Cl100kBase), // Azure deployment name
    ("gpt-oss-", Tokenizer::O200kHarmony), // e.g., gpt-oss-20b, gpt-oss-120b
    // fine-tuned ("ft:" names carry org/date suffixes, hence prefix matching)
    ("ft:gpt-4o", Tokenizer::O200kBase),
    ("ft:gpt-4", Tokenizer::Cl100kBase),
    ("ft:gpt-3.5-turbo", Tokenizer::Cl100kBase),
    ("ft:davinci-002", Tokenizer::Cl100kBase),
    ("ft:babbage-002", Tokenizer::Cl100kBase),
];
55
// Keep this in sync with:
// https://github.com/openai/tiktoken/blob/eedc856364506a9d4651645a0290eb0ba81e6935/tiktoken/model.py#L29-L84
//
// Exact model-name table. It is folded into `MODEL_TO_TOKENIZER_MAP` for O(1)
// lookup, so — unlike the prefix table above — entry order here is irrelevant.
const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    // reasoning
    ("o1", Tokenizer::O200kBase),
    ("o3", Tokenizer::O200kBase),
    ("o4", Tokenizer::O200kBase),
    // chat
    ("gpt-5", Tokenizer::O200kBase),
    ("gpt-4.1", Tokenizer::O200kBase),
    ("chatgpt-4o-latest", Tokenizer::O200kBase),
    ("gpt-4o", Tokenizer::O200kBase),
    ("gpt-4", Tokenizer::Cl100kBase),
    ("gpt-3.5-turbo", Tokenizer::Cl100kBase),
    ("gpt-3.5", Tokenizer::Cl100kBase),      // Common shorthand
    ("gpt-35-turbo", Tokenizer::Cl100kBase), // Azure deployment name
    // base
    ("davinci-002", Tokenizer::Cl100kBase),
    ("babbage-002", Tokenizer::Cl100kBase),
    // embeddings
    ("text-embedding-ada-002", Tokenizer::Cl100kBase),
    ("text-embedding-3-small", Tokenizer::Cl100kBase),
    ("text-embedding-3-large", Tokenizer::Cl100kBase),
    // DEPRECATED MODELS
    // text (DEPRECATED)
    ("text-davinci-003", Tokenizer::P50kBase),
    ("text-davinci-002", Tokenizer::P50kBase),
    ("text-davinci-001", Tokenizer::R50kBase),
    ("text-curie-001", Tokenizer::R50kBase),
    ("text-babbage-001", Tokenizer::R50kBase),
    ("text-ada-001", Tokenizer::R50kBase),
    ("davinci", Tokenizer::R50kBase),
    ("curie", Tokenizer::R50kBase),
    ("babbage", Tokenizer::R50kBase),
    ("ada", Tokenizer::R50kBase),
    // code (DEPRECATED)
    ("code-davinci-002", Tokenizer::P50kBase),
    ("code-davinci-001", Tokenizer::P50kBase),
    ("code-cushman-002", Tokenizer::P50kBase),
    ("code-cushman-001", Tokenizer::P50kBase),
    ("davinci-codex", Tokenizer::P50kBase),
    ("cushman-codex", Tokenizer::P50kBase),
    // edit (DEPRECATED)
    ("text-davinci-edit-001", Tokenizer::P50kEdit),
    ("code-davinci-edit-001", Tokenizer::P50kEdit),
    // old embeddings (DEPRECATED)
    ("text-similarity-davinci-001", Tokenizer::R50kBase),
    ("text-similarity-curie-001", Tokenizer::R50kBase),
    ("text-similarity-babbage-001", Tokenizer::R50kBase),
    ("text-similarity-ada-001", Tokenizer::R50kBase),
    ("text-search-davinci-doc-001", Tokenizer::R50kBase),
    ("text-search-curie-doc-001", Tokenizer::R50kBase),
    ("text-search-babbage-doc-001", Tokenizer::R50kBase),
    ("text-search-ada-doc-001", Tokenizer::R50kBase),
    ("code-search-babbage-code-001", Tokenizer::R50kBase),
    ("code-search-ada-code-001", Tokenizer::R50kBase),
    // open source
    ("gpt2", Tokenizer::Gpt2),
    ("gpt-2", Tokenizer::Gpt2), // Maintains consistency with gpt-4
];
116
117lazy_static! {
118    static ref MODEL_TO_TOKENIZER_MAP: HashMap<&'static str, Tokenizer> = {
119        let mut map = HashMap::new();
120        MODEL_TO_TOKENIZER.iter().for_each(|&(model, tokenizer)| {
121            map.insert(model, tokenizer);
122        });
123        map
124    };
125}
126
127/// Returns the tokenizer type used by a model.
128///
129/// This function retrieves the corresponding tokenizer enum variant for the given model name. It first looks
130/// for an exact match in the `MODEL_TO_TOKENIZER` mapping. If it doesn't find a match, it checks for
131/// model name prefixes in the `MODEL_PREFIX_TO_TOKENIZER` mapping.
132///
133/// # Arguments
134///
135/// * `model_name` - A string slice representing the model name for which the tokenizer should be retrieved.
136///
137/// # Examples
138///
139/// ```
140/// use tiktoken_rs::tokenizer::{get_tokenizer, Tokenizer};
141/// let model = "gpt-4-0314";
142/// let tokenizer = get_tokenizer(model).unwrap();
143/// assert_eq!(tokenizer, Tokenizer::Cl100kBase);
144/// ```
145///
146/// # Returns
147///
148/// If a tokenizer is found for the given model name, the function returns an `Option` containing the tokenizer
149/// enum variant; otherwise, it returns `None`.
150pub fn get_tokenizer(model_name: &str) -> Option<Tokenizer> {
151    if let Some(tokenizer) = MODEL_TO_TOKENIZER_MAP.get(model_name) {
152        return Some(*tokenizer);
153    }
154    if let Some(tokenizer) = MODEL_PREFIX_TO_TOKENIZER
155        .iter()
156        .find(|(model_prefix, _)| model_name.starts_with(*model_prefix))
157    {
158        return Some(tokenizer.1);
159    }
160
161    None
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers exact-name hits, prefix-fallback hits, and the miss case.
    #[test]
    fn test_get_tokenizer() {
        assert_eq!(get_tokenizer("gpt-5"), Some(Tokenizer::O200kBase));
        assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
        assert_eq!(get_tokenizer("gpt-oss-120b"), Some(Tokenizer::O200kHarmony));
        assert_eq!(
            get_tokenizer("chatgpt-4o-latest"),
            Some(Tokenizer::O200kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-4o-2024-05-13"),
            Some(Tokenizer::O200kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-4-0125-preview"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(get_tokenizer("gpt-4-32k-0314"), Some(Tokenizer::Cl100kBase));
        assert_eq!(
            get_tokenizer("gpt-4-1106-preview"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-3.5-turbo-0125"),
            Some(Tokenizer::Cl100kBase),
        );
        assert_eq!(
            get_tokenizer("gpt-3.5-turbo-1106"),
            Some(Tokenizer::Cl100kBase),
        );
        assert_eq!(get_tokenizer("gpt-3.5-turbo"), Some(Tokenizer::Cl100kBase));
        assert_eq!(
            get_tokenizer("ft:gpt-3.5-turbo:XXXXXX:2023-11-11"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-3.5-turbo-0301"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(get_tokenizer("text-davinci-003"), Some(Tokenizer::P50kBase));
        assert_eq!(
            get_tokenizer("code-search-ada-code-001"),
            Some(Tokenizer::R50kBase)
        );
        assert_eq!(get_tokenizer("foo"), None);
    }

    /// Reasoning (o-series) models resolve exactly and via the "oN-" prefixes.
    #[test]
    fn test_get_tokenizer_reasoning_models() {
        assert_eq!(get_tokenizer("o1"), Some(Tokenizer::O200kBase));
        assert_eq!(get_tokenizer("o1-preview"), Some(Tokenizer::O200kBase));
        assert_eq!(get_tokenizer("o3-mini"), Some(Tokenizer::O200kBase));
        assert_eq!(get_tokenizer("o4-mini"), Some(Tokenizer::O200kBase));
    }

    /// Edit and open-source entries were previously untested variants.
    #[test]
    fn test_get_tokenizer_edit_and_open_source() {
        assert_eq!(
            get_tokenizer("text-davinci-edit-001"),
            Some(Tokenizer::P50kEdit)
        );
        assert_eq!(
            get_tokenizer("code-davinci-edit-001"),
            Some(Tokenizer::P50kEdit)
        );
        assert_eq!(get_tokenizer("gpt2"), Some(Tokenizer::Gpt2));
        assert_eq!(get_tokenizer("gpt-2"), Some(Tokenizer::Gpt2));
    }

    /// Embeddings and fine-tuned base models resolve to Cl100kBase.
    #[test]
    fn test_get_tokenizer_embeddings_and_fine_tunes() {
        assert_eq!(
            get_tokenizer("text-embedding-3-small"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("text-embedding-ada-002"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("ft:gpt-4o:org:2024-08-01"),
            Some(Tokenizer::O200kBase)
        );
        assert_eq!(
            get_tokenizer("ft:davinci-002:org"),
            Some(Tokenizer::Cl100kBase)
        );
    }
}