use std::collections::HashMap;

use lazy_static::lazy_static;

/// The tiktoken encoding used by a given OpenAI model.
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
pub enum Tokenizer {
    O200kHarmony,
    O200kBase,
    Cl100kBase,
    P50kBase,
    R50kBase,
    P50kEdit,
    Gpt2,
}

/// Model-name prefixes (dated releases, fine-tunes) matched with
/// `starts_with` after the exact-name lookup fails. The first matching
/// entry wins, so longer prefixes such as `gpt-4o-` must precede shorter
/// ones such as `gpt-4-`.
const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
35 ("o1-", Tokenizer::O200kBase),
36 ("o3-", Tokenizer::O200kBase),
37 ("o4-", Tokenizer::O200kBase),
38 ("gpt-5-", Tokenizer::O200kBase),
40 ("gpt-4.5-", Tokenizer::O200kBase),
41 ("gpt-4.1-", Tokenizer::O200kBase),
42 ("chatgpt-4o-", Tokenizer::O200kBase),
43 ("gpt-4o-", Tokenizer::O200kBase), ("gpt-4-", Tokenizer::Cl100kBase), ("gpt-3.5-turbo-", Tokenizer::Cl100kBase), ("gpt-35-turbo-", Tokenizer::Cl100kBase), ("gpt-oss-", Tokenizer::O200kHarmony),
48 ("ft:gpt-4o", Tokenizer::O200kBase),
50 ("ft:gpt-4", Tokenizer::Cl100kBase),
51 ("ft:gpt-3.5-turbo", Tokenizer::Cl100kBase),
52 ("ft:davinci-002", Tokenizer::Cl100kBase),
53 ("ft:babbage-002", Tokenizer::Cl100kBase),
54];
55
56const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
59 ("o1", Tokenizer::O200kBase),
61 ("o3", Tokenizer::O200kBase),
62 ("o4", Tokenizer::O200kBase),
63 ("gpt-5", Tokenizer::O200kBase),
65 ("gpt-4.1", Tokenizer::O200kBase),
66 ("chatgpt-4o-latest", Tokenizer::O200kBase),
67 ("gpt-4o", Tokenizer::O200kBase),
68 ("gpt-4", Tokenizer::Cl100kBase),
69 ("gpt-3.5-turbo", Tokenizer::Cl100kBase),
70 ("gpt-3.5", Tokenizer::Cl100kBase), ("gpt-35-turbo", Tokenizer::Cl100kBase), ("davinci-002", Tokenizer::Cl100kBase),
74 ("babbage-002", Tokenizer::Cl100kBase),
75 ("text-embedding-ada-002", Tokenizer::Cl100kBase),
77 ("text-embedding-3-small", Tokenizer::Cl100kBase),
78 ("text-embedding-3-large", Tokenizer::Cl100kBase),
79 ("text-davinci-003", Tokenizer::P50kBase),
82 ("text-davinci-002", Tokenizer::P50kBase),
83 ("text-davinci-001", Tokenizer::R50kBase),
84 ("text-curie-001", Tokenizer::R50kBase),
85 ("text-babbage-001", Tokenizer::R50kBase),
86 ("text-ada-001", Tokenizer::R50kBase),
87 ("davinci", Tokenizer::R50kBase),
88 ("curie", Tokenizer::R50kBase),
89 ("babbage", Tokenizer::R50kBase),
90 ("ada", Tokenizer::R50kBase),
91 ("code-davinci-002", Tokenizer::P50kBase),
93 ("code-davinci-001", Tokenizer::P50kBase),
94 ("code-cushman-002", Tokenizer::P50kBase),
95 ("code-cushman-001", Tokenizer::P50kBase),
96 ("davinci-codex", Tokenizer::P50kBase),
97 ("cushman-codex", Tokenizer::P50kBase),
98 ("text-davinci-edit-001", Tokenizer::P50kEdit),
100 ("code-davinci-edit-001", Tokenizer::P50kEdit),
101 ("text-similarity-davinci-001", Tokenizer::R50kBase),
103 ("text-similarity-curie-001", Tokenizer::R50kBase),
104 ("text-similarity-babbage-001", Tokenizer::R50kBase),
105 ("text-similarity-ada-001", Tokenizer::R50kBase),
106 ("text-search-davinci-doc-001", Tokenizer::R50kBase),
107 ("text-search-curie-doc-001", Tokenizer::R50kBase),
108 ("text-search-babbage-doc-001", Tokenizer::R50kBase),
109 ("text-search-ada-doc-001", Tokenizer::R50kBase),
110 ("code-search-babbage-code-001", Tokenizer::R50kBase),
111 ("code-search-ada-code-001", Tokenizer::R50kBase),
112 ("gpt2", Tokenizer::Gpt2),
114 ("gpt-2", Tokenizer::Gpt2), ];
116
117lazy_static! {
    /// Exact-name lookup map, built once from `MODEL_TO_TOKENIZER`.
    static ref MODEL_TO_TOKENIZER_MAP: HashMap<&'static str, Tokenizer> =
        MODEL_TO_TOKENIZER.iter().copied().collect();
}

/// Looks up the tokenizer for `model_name`: first by exact name in
/// `MODEL_TO_TOKENIZER`, then by the first matching prefix in
/// `MODEL_PREFIX_TO_TOKENIZER`. Returns `None` for unknown models.
pub fn get_tokenizer(model_name: &str) -> Option<Tokenizer> {
    if let Some(tokenizer) = MODEL_TO_TOKENIZER_MAP.get(model_name) {
        return Some(*tokenizer);
    }
    MODEL_PREFIX_TO_TOKENIZER
        .iter()
        .find(|(prefix, _)| model_name.starts_with(*prefix))
        .map(|&(_, tokenizer)| tokenizer)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_tokenizer() {
        assert_eq!(get_tokenizer("gpt-5"), Some(Tokenizer::O200kBase));
        assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
        assert_eq!(get_tokenizer("gpt-oss-120b"), Some(Tokenizer::O200kHarmony));
        assert_eq!(
            get_tokenizer("chatgpt-4o-latest"),
            Some(Tokenizer::O200kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-4o-2024-05-13"),
            Some(Tokenizer::O200kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-4-0125-preview"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(get_tokenizer("gpt-4-32k-0314"), Some(Tokenizer::Cl100kBase));
        assert_eq!(
            get_tokenizer("gpt-4-1106-preview"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-3.5-turbo-0125"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-3.5-turbo-1106"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(get_tokenizer("gpt-3.5-turbo"), Some(Tokenizer::Cl100kBase));
        assert_eq!(
            get_tokenizer("ft:gpt-3.5-turbo:XXXXXX:2023-11-11"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(
            get_tokenizer("gpt-3.5-turbo-0301"),
            Some(Tokenizer::Cl100kBase)
        );
        assert_eq!(get_tokenizer("text-davinci-003"), Some(Tokenizer::P50kBase));
        assert_eq!(
            get_tokenizer("code-search-ada-code-001"),
            Some(Tokenizer::R50kBase)
        );
        assert_eq!(get_tokenizer("foo"), None);
    }
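
    // A sketch of an extra check, not in the original tests: it exercises the
    // prefix fallback path. The model names below are hypothetical examples,
    // chosen only to hit specific prefix entries.
    #[test]
    fn test_prefix_fallback() {
        // No exact entry, so this should resolve via the "gpt-4o-" prefix.
        assert_eq!(
            get_tokenizer("gpt-4o-mini-2024-07-18"),
            Some(Tokenizer::O200kBase)
        );
        // Fine-tuned names resolve via the "ft:" prefixes; "ft:gpt-4o" is
        // listed before "ft:gpt-4", so it wins here.
        assert_eq!(
            get_tokenizer("ft:gpt-4o:my-org:custom:id"),
            Some(Tokenizer::O200kBase)
        );
        // "gpt-4-..." must not be captured by the longer "gpt-4o-" prefix.
        assert_eq!(get_tokenizer("gpt-4-0613"), Some(Tokenizer::Cl100kBase));
    }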
}