use crate::{
TokenType,
alloc::{
string::{
String,
ToString,
},
vec::Vec,
},
declare_carrot_special,
vocab::utility::{
ToTokenList,
format_reserved_carrot,
},
};
// Special-token name declarations: one `(IDENT, "name")` pair per token.
// The identifiers are used below as the `&str` half of each `(token, id)` table entry.
// NOTE(review): the unit tests in this file expect e.g. `ENDOFTEXT` to surface as
// `"<|endoftext|>"`, so the macro presumably wraps each name as `<|name|>` —
// confirm against the `declare_carrot_special!` definition.
declare_carrot_special!(
(STARTOFTEXT, "startoftext"),
(ENDOFTEXT, "endoftext"),
(ENDOFPROMPT, "endofprompt"),
(FIM_PREFIX, "fim_prefix"),
(FIM_MIDDLE, "fim_middle"),
(FIM_SUFFIX, "fim_suffix"),
(RETURN, "return"),
(CONSTRAIN, "constrain"),
(CHANNEL, "channel"),
(START, "start"),
(END, "end"),
(MESSAGE, "message"),
(CALL, "call"),
);
/// Special-token table for the `r50k_base` vocabulary, as `(token, id)` pairs.
///
/// Contains a single entry: `ENDOFTEXT` at id 50256.
pub const OA_R50K_SPECIAL_TOKENS: &[(&str, usize)] = &[(ENDOFTEXT, 50256)];
/// Returns the `r50k_base` special tokens as owned `(String, T)` pairs.
///
/// Converts [`OA_R50K_SPECIAL_TOKENS`] via `to_token_list`; with `T = usize`
/// the unit tests expect `[("<|endoftext|>", 50256)]`.
pub fn oa_r50k_base_special_tokens<T: TokenType>() -> Vec<(String, T)> {
OA_R50K_SPECIAL_TOKENS.to_token_list::<T>()
}
/// Special-token table for the `p50k_base` vocabulary, as `(token, id)` pairs.
///
/// Contains a single entry: `ENDOFTEXT` at id 50256 (same content as
/// `OA_R50K_SPECIAL_TOKENS`).
pub const OA_P50K_BASE_SPECIAL_TOKENS: &[(&str, usize)] = &[(ENDOFTEXT, 50256)];
/// Returns the `p50k_base` special tokens as owned `(String, T)` pairs.
///
/// Converts [`OA_P50K_BASE_SPECIAL_TOKENS`] via `to_token_list`; with
/// `T = usize` the unit tests expect `[("<|endoftext|>", 50256)]`.
pub fn oa_p50k_base_special_tokens<T: TokenType>() -> Vec<(String, T)> {
OA_P50K_BASE_SPECIAL_TOKENS.to_token_list::<T>()
}
/// Special-token table for the `p50k_edit` vocabulary, as `(token, id)` pairs.
///
/// Holds `ENDOFTEXT` (50256) plus the three fill-in-the-middle markers at
/// ids 50281..=50283.
pub const OA_P50K_EDIT_SPECIAL_TOKENS: &[(&str, usize)] = &[
(ENDOFTEXT, 50256),
(FIM_PREFIX, 50281),
(FIM_MIDDLE, 50282),
(FIM_SUFFIX, 50283),
];
/// Returns the `p50k_edit` special tokens as owned `(String, T)` pairs.
///
/// Converts [`OA_P50K_EDIT_SPECIAL_TOKENS`] via `to_token_list`; the unit
/// tests expect the entries in the same order as the table.
pub fn oa_p50k_edit_special_tokens<T: TokenType>() -> Vec<(String, T)> {
OA_P50K_EDIT_SPECIAL_TOKENS.to_token_list::<T>()
}
/// Special-token table for the `cl100k_base` vocabulary, as `(token, id)` pairs.
///
/// Holds `ENDOFTEXT` (100257), the three fill-in-the-middle markers
/// (100258..=100260) and `ENDOFPROMPT` (100276).
pub const OA_CL100K_BASE_SPECIAL_TOKENS: &[(&str, usize)] = &[
(ENDOFTEXT, 100257),
(FIM_PREFIX, 100258),
(FIM_MIDDLE, 100259),
(FIM_SUFFIX, 100260),
(ENDOFPROMPT, 100276),
];
/// Returns the `cl100k_base` special tokens as owned `(String, T)` pairs.
///
/// Converts [`OA_CL100K_BASE_SPECIAL_TOKENS`] via `to_token_list`; the unit
/// tests expect the entries in the same order as the table.
pub fn oa_cl100k_base_special_tokens<T: TokenType>() -> Vec<(String, T)> {
OA_CL100K_BASE_SPECIAL_TOKENS.to_token_list::<T>()
}
/// Special-token table for the `o200k_base` vocabulary, as `(token, id)` pairs.
///
/// Holds `ENDOFTEXT` (199999) and `ENDOFPROMPT` (200018).
pub const OA_O200K_BASE_SPECIAL_TOKENS: &[(&str, usize)] =
&[(ENDOFTEXT, 199999), (ENDOFPROMPT, 200018)];
/// Returns the `o200k_base` special tokens as owned `(String, T)` pairs.
///
/// Converts [`OA_O200K_BASE_SPECIAL_TOKENS`] via `to_token_list`; the unit
/// tests expect the entries in the same order as the table.
pub fn oa_o200k_base_special_tokens<T: TokenType>() -> Vec<(String, T)> {
OA_O200K_BASE_SPECIAL_TOKENS.to_token_list::<T>()
}
/// Named (non-reserved) special tokens of the `o200k_harmony` vocabulary, as
/// `(token, id)` pairs.
///
/// The entries are not sorted by id: `ENDOFPROMPT` (200018) is listed third.
/// The ids in `200000..=200012` that do not appear here (200000, 200001,
/// 200004, 200009, 200010, 200011) are emitted as reserved placeholders by
/// `oa_o200k_harmony_reserved_tokens`.
pub const OA_O200K_HARMONY_NAMED_SPECIAL_TOKENS: &[(&str, usize)] = &[
(STARTOFTEXT, 199998),
(ENDOFTEXT, 199999),
(ENDOFPROMPT, 200018),
(RETURN, 200002),
(CONSTRAIN, 200003),
(CHANNEL, 200005),
(START, 200006),
(END, 200007),
(MESSAGE, 200008),
(CALL, 200012),
];
/// Returns the named `o200k_harmony` special tokens as owned `(String, T)` pairs.
///
/// Converts [`OA_O200K_HARMONY_NAMED_SPECIAL_TOKENS`] via `to_token_list`;
/// the unit tests expect the entries in the same order as the table. Reserved
/// placeholder tokens are not included here.
pub fn oa_o200k_harmony_named_special_tokens<T: TokenType>() -> Vec<(String, T)> {
OA_O200K_HARMONY_NAMED_SPECIAL_TOKENS.to_token_list::<T>()
}
/// Returns the reserved placeholder tokens of the `o200k_harmony` vocabulary.
///
/// Each entry is `(format_reserved_carrot(id), id)`; the unit tests expect the
/// text form `"<|reserved_{id}|>"`. The result is ordered as:
///
/// 1. the six unnamed ids inside `200000..=200012`
///    (200000, 200001, 200004, 200009, 200010, 200011), then
/// 2. every id in `200013..201088` (upper bound exclusive).
///
/// Note that the second range includes 200018, which
/// `OA_O200K_HARMONY_NAMED_SPECIAL_TOKENS` also assigns to `ENDOFPROMPT`, so
/// that id is reachable under two different token strings.
///
/// # Panics
///
/// Panics if `T` cannot represent one of the reserved ids (i.e. when
/// `T::from_usize` yields no value for it).
pub fn oa_o200k_harmony_reserved_tokens<T: TokenType>() -> Vec<(String, T)> {
    // Ids below 200013 that have no named token in the harmony table.
    const RESERVED_GAP_IDS: [usize; 6] = [200000, 200001, 200004, 200009, 200010, 200011];

    // Build the final pairs directly: no intermediate `Vec<(String, usize)>`
    // and no second copy of every formatted string. Both halves of the chain
    // have an exact size, so `collect` allocates the result once.
    RESERVED_GAP_IDS
        .iter()
        .copied()
        .chain(200013..201088)
        .map(|id| {
            let token = T::from_usize(id)
                .expect("reserved o200k_harmony token id must fit in the token type");
            (format_reserved_carrot(id), token)
        })
        .collect()
}
/// Returns every `o200k_harmony` special token as owned `(String, T)` pairs.
///
/// The result is the output of `oa_o200k_harmony_named_special_tokens`
/// followed by the output of `oa_o200k_harmony_reserved_tokens`, each in its
/// own original order.
pub fn oa_o200k_harmony_special_tokens<T: TokenType>() -> Vec<(String, T)> {
    let named = oa_o200k_harmony_named_special_tokens::<T>();
    let reserved = oa_o200k_harmony_reserved_tokens::<T>();
    named.into_iter().chain(reserved).collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        alloc::{
            string::ToString,
            vec,
            vec::Vec,
        },
        vocab::utility::format_reserved_carrot,
    };

    /// `r50k_base` exposes only `<|endoftext|>`.
    #[test]
    fn test_oa_gpt2_r50k_specials() {
        assert_eq!(
            oa_r50k_base_special_tokens::<usize>(),
            vec![("<|endoftext|>".to_string(), 50256)]
        );
    }

    /// `p50k_base` exposes only `<|endoftext|>`.
    #[test]
    fn test_oa_gpt2_p50k_base_specials() {
        assert_eq!(
            oa_p50k_base_special_tokens::<usize>(),
            vec![("<|endoftext|>".to_string(), 50256)]
        );
    }

    /// `p50k_edit` adds the fill-in-the-middle markers to `<|endoftext|>`.
    #[test]
    fn test_oa_gpt2_p50k_edit_specials() {
        assert_eq!(
            oa_p50k_edit_special_tokens::<usize>(),
            vec![
                ("<|endoftext|>".to_string(), 50256),
                ("<|fim_prefix|>".to_string(), 50281),
                ("<|fim_middle|>".to_string(), 50282),
                ("<|fim_suffix|>".to_string(), 50283),
            ]
        );
    }

    /// `cl100k_base` special tokens, in table order.
    // Named after the function under test (`oa_cl100k_base_special_tokens`),
    // matching the naming of the other tests in this module.
    #[test]
    fn test_oa_gpt3_cl100k_base_specials() {
        assert_eq!(
            oa_cl100k_base_special_tokens::<usize>(),
            vec![
                ("<|endoftext|>".to_string(), 100257),
                ("<|fim_prefix|>".to_string(), 100258),
                ("<|fim_middle|>".to_string(), 100259),
                ("<|fim_suffix|>".to_string(), 100260),
                ("<|endofprompt|>".to_string(), 100276),
            ]
        );
    }

    /// `o200k_base` special tokens, in table order.
    #[test]
    fn test_oa_gpt5_o200k_base_specials() {
        assert_eq!(
            oa_o200k_base_special_tokens::<usize>(),
            vec![
                ("<|endoftext|>".to_string(), 199999),
                ("<|endofprompt|>".to_string(), 200018)
            ]
        );
    }

    /// Checks the reserved list, the named list, and their concatenation.
    #[test]
    fn test_oa_gpt5_o200k_harmony_specials() {
        // Reserved tokens: the six unnamed ids below 200013 first, then the
        // contiguous 200013..201088 range.
        let mut expected = vec![
            ("<|reserved_200000|>".to_string(), 200000),
            ("<|reserved_200001|>".to_string(), 200001),
            ("<|reserved_200004|>".to_string(), 200004),
            ("<|reserved_200009|>".to_string(), 200009),
            ("<|reserved_200010|>".to_string(), 200010),
            ("<|reserved_200011|>".to_string(), 200011),
        ];
        (200013..201088).for_each(|i| expected.push((format_reserved_carrot(i), i)));

        let reserved = oa_o200k_harmony_reserved_tokens();
        assert_eq!(&reserved, &expected);

        // Named tokens keep the order of the constant table.
        let named = oa_o200k_harmony_named_special_tokens();
        assert_eq!(
            &named,
            &vec![
                ("<|startoftext|>".to_string(), 199998),
                ("<|endoftext|>".to_string(), 199999),
                ("<|endofprompt|>".to_string(), 200018),
                ("<|return|>".to_string(), 200002),
                ("<|constrain|>".to_string(), 200003),
                ("<|channel|>".to_string(), 200005),
                ("<|start|>".to_string(), 200006),
                ("<|end|>".to_string(), 200007),
                ("<|message|>".to_string(), 200008),
                ("<|call|>".to_string(), 200012),
            ]
        );

        // The combined list is the named tokens followed by the reserved ones.
        let expected = named
            .iter()
            .chain(reserved.iter())
            .cloned()
            .collect::<Vec<_>>();
        assert_eq!(oa_o200k_harmony_special_tokens(), expected);
    }
}