/// Special-token literal registered by every encoding constructor in this file
/// (`r50k_base`, `p50k_base`, `p50k_edit`, `cl100k_base`).
pub const ENDOFTEXT: &str = "<|endoftext|>";
/// Special-token literal registered by `p50k_edit` and `cl100k_base`.
/// NOTE(review): "fim" presumably stands for fill-in-the-middle — confirm.
pub const FIM_PREFIX: &str = "<|fim_prefix|>";
/// Special-token literal registered by `p50k_edit` and `cl100k_base`.
pub const FIM_MIDDLE: &str = "<|fim_middle|>";
/// Special-token literal registered by `p50k_edit` and `cl100k_base`.
pub const FIM_SUFFIX: &str = "<|fim_suffix|>";
/// Special-token literal registered only by `cl100k_base`.
pub const ENDOFPROMPT: &str = "<|endofprompt|>";
use anyhow::Result;
use base64::{engine::general_purpose, Engine as _};
use rustc_hash::FxHashMap as HashMap;
use crate::vendor_tiktoken::CoreBPE;
pub fn r50k_base() -> Result<CoreBPE> {
let bpe_file = include_str!("../../assets/r50k_base.tiktoken");
let mut encoder = HashMap::default();
for line in bpe_file.lines() {
let mut parts = line.split(' ');
let token = &general_purpose::STANDARD.decode(parts.next().unwrap())?;
let rank: usize = parts.next().unwrap().parse().unwrap();
encoder.insert(token.clone(), rank);
}
let mut special_tokens = HashMap::default();
special_tokens.insert(String::from(ENDOFTEXT), 50256);
let bpe = CoreBPE::new(
encoder,
special_tokens,
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
)?;
Ok(bpe)
}
pub fn p50k_base() -> Result<CoreBPE> {
let bpe_file = include_str!("../../assets/p50k_base.tiktoken");
let mut encoder = HashMap::default();
for line in bpe_file.lines() {
let mut parts = line.split(' ');
let raw = parts.next().unwrap();
let token = &general_purpose::STANDARD.decode(raw)?;
let rank: usize = parts.next().unwrap().parse().unwrap();
encoder.insert(token.clone(), rank);
}
let mut special_tokens = HashMap::default();
special_tokens.insert(String::from(ENDOFTEXT), 50256);
let bpe = CoreBPE::new(
encoder,
special_tokens,
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
)?;
Ok(bpe)
}
pub fn p50k_edit() -> Result<CoreBPE> {
let bpe_file = include_str!("../../assets/p50k_base.tiktoken");
let mut encoder = HashMap::default();
for line in bpe_file.lines() {
let mut parts = line.split(' ');
let raw = parts.next().unwrap();
let token = &general_purpose::STANDARD.decode(raw)?;
let rank: usize = parts.next().unwrap().parse().unwrap();
encoder.insert(token.clone(), rank);
}
let mut special_tokens = HashMap::default();
special_tokens.insert(String::from(ENDOFTEXT), 50256);
special_tokens.insert(String::from(FIM_PREFIX), 50281);
special_tokens.insert(String::from(FIM_MIDDLE), 50282);
special_tokens.insert(String::from(FIM_SUFFIX), 50283);
let bpe = CoreBPE::new(
encoder,
special_tokens,
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
)?;
Ok(bpe)
}
pub fn cl100k_base() -> Result<CoreBPE> {
let cl100k_base = include_str!("../../assets/cl100k_base.tiktoken");
let mut encoder = HashMap::default();
for line in cl100k_base.lines() {
let mut parts = line.split(' ');
let raw = parts.next().unwrap();
let token = &general_purpose::STANDARD.decode(raw)?;
let rank: usize = parts.next().unwrap().parse().unwrap();
encoder.insert(token.clone(), rank);
}
let mut special_tokens = HashMap::default();
special_tokens.insert(String::from(ENDOFTEXT), 100257);
special_tokens.insert(String::from(FIM_PREFIX), 100258);
special_tokens.insert(String::from(FIM_MIDDLE), 100259);
special_tokens.insert(String::from(FIM_SUFFIX), 100260);
special_tokens.insert(String::from(ENDOFPROMPT), 100276);
let bpe = CoreBPE::new(
encoder,
special_tokens,
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
)?;
Ok(bpe)
}