use toktrie::recognizer::FunctionalRecognizer;
use toktrie::{SimpleVob, TokRxInfo, TokTrie, TokenId};
/// Token id supplied as `tok_eos` when `build_test_trie` builds the test trie.
///
/// NOTE(review): entry 25 of `vocab()` is `b"the"`, an ordinary word, while the
/// empty entry sits at index 0 — confirm that 25 is the intended EOS id.
pub const EOS_TOKEN: TokenId = 25;
/// Vocabulary size supplied as `vocab_size` to the test trie; equals the number
/// of entries returned by `vocab()` and bounds the scan in `allowed_set`.
pub const VOCAB_SIZE: u32 = 26;
/// Returns the fixed 26-entry test vocabulary as owned byte strings.
///
/// The order is significant to callers: entry 0 is the empty string, followed
/// by the `a…`, `b…`, `c…` and `d…` word families, then `"e"`, a single space,
/// `"th"` and finally `"the"` at index 25.
pub fn vocab() -> Vec<Vec<u8>> {
    // All entries are plain ASCII, so `as_bytes` yields exactly the bytes written here.
    const WORDS: [&str; 26] = [
        "", "a", "an", "ant", "app", "apple", "apply", "apps", // 0..=7
        "b", "ba", "bat", "ban", "band", // 8..=12
        "c", "ca", "cat", "car", "card", // 13..=17
        "d", "do", "dog", "dot", // 18..=21
        "e", " ", "th", "the", // 22..=25
    ];
    WORDS.iter().map(|word| word.as_bytes().to_vec()).collect()
}
/// Builds a `TokTrie` over the words from `vocab()`.
///
/// The accompanying `TokRxInfo` uses `VOCAB_SIZE` and `EOS_TOKEN`; every other
/// special-token slot (BOS, PAD, UNK, end-of-turn) is left as `None`.
pub fn build_test_trie() -> TokTrie {
    let tokens = vocab();
    TokTrie::from(
        &TokRxInfo {
            vocab_size: VOCAB_SIZE,
            tok_eos: EOS_TOKEN,
            // No other special tokens are configured for the test vocabulary.
            tok_bos: None,
            tok_pad: None,
            tok_unk: None,
            tok_end_of_turn: None,
        },
        &tokens,
    )
}
/// Lists, in ascending order, every token id in `0..VOCAB_SIZE` for which
/// `vob.is_allowed(id)` returns `true`.
///
/// Only ids below `VOCAB_SIZE` are examined, regardless of how many bits `vob` holds.
pub fn allowed_set(vob: &SimpleVob) -> Vec<TokenId> {
    (0..VOCAB_SIZE)
        .filter(|&tok| vob.is_allowed(tok))
        .collect()
}
/// Stateless recognizer that accepts any byte in `a..=z` and rejects everything else.
#[derive(Clone, Copy)]
pub struct AlphaOnly;

impl FunctionalRecognizer<()> for AlphaOnly {
    /// The recognizer carries no state, so the initial state is the unit value.
    fn initial(&self) {}

    /// Returns `Some(())` when `byte` is a lowercase ASCII letter, `None` otherwise.
    fn try_append(&self, _state: (), byte: u8) -> Option<()> {
        match byte {
            b'a'..=b'z' => Some(()),
            _ => None,
        }
    }

    /// Always returns the same description of the expected input.
    ///
    /// NOTE(review): this yields `Some` unconditionally; if the trait treats
    /// `Some` as "recognizer is in an error state", confirm that is intended.
    fn get_error(&self, _state: ()) -> Option<String> {
        let message = String::from("AlphaOnly: expected lowercase ASCII letter (a-z)");
        Some(message)
    }
}
/// Recognizer for exactly three bytes: `c`, then `a`, then one lowercase ASCII letter.
///
/// The `u8` state counts how many bytes have been accepted so far (0 through 3).
/// Once state 3 is reached no further byte is accepted.
#[derive(Clone, Copy)]
pub struct CaPrefix;

impl FunctionalRecognizer<u8> for CaPrefix {
    /// Starts with nothing consumed (state 0).
    fn initial(&self) -> u8 {
        0
    }

    /// Advances to the next state when `byte` is what the current state expects;
    /// returns `None` for any other byte or for any state other than 0, 1 or 2.
    fn try_append(&self, state: u8, byte: u8) -> Option<u8> {
        match (state, byte) {
            (0, b'c') => Some(1),
            (1, b'a') => Some(2),
            (2, b'a'..=b'z') => Some(3),
            _ => None,
        }
    }

    /// Describes what the given state expects next; a message is produced for
    /// every state, including state 3 and states outside 0..=3.
    ///
    /// NOTE(review): state 3 is reached after a full match yet still reports a
    /// message here — confirm callers expect `Some` in that case.
    fn get_error(&self, state: u8) -> Option<String> {
        let message = match state {
            0 => "CaPrefix: expected 'c'".to_string(),
            1 => "CaPrefix: expected 'a' after 'c'".to_string(),
            2 => "CaPrefix: expected lowercase letter after \"ca\"".to_string(),
            3 => "CaPrefix: pattern complete, no further bytes accepted".to_string(),
            _ => format!("CaPrefix: unexpected state {state}"),
        };
        Some(message)
    }
}