use std::collections::HashMap;
use std::path::PathBuf;
use codec_rs::{
encode_byte_level_chars, BPETokenizer, Detokenizer, ITokenizer, TokenizerMap,
};
/// Looks for the Qwen2 tokenizer map at the known local checkout locations.
///
/// The `/mnt/h/...` candidate is probed first, then the `H:\...` one; the
/// first path that exists on disk is returned. Yields `None` when neither
/// candidate is present.
fn find_qwen_map() -> Option<PathBuf> {
    const CANDIDATES: [&str; 2] = [
        "/mnt/h/dev/codec-maps/maps/qwen/qwen2.json",
        r"H:\dev\codec-maps\maps\qwen\qwen2.json",
    ];
    CANDIDATES
        .iter()
        .map(|location| PathBuf::from(*location))
        .find(|candidate| candidate.exists())
}
/// Looks for the OpenAI `p50k_base` tokenizer map at the known local checkout
/// locations.
///
/// Candidates are checked in declaration order (`/mnt/h/...` before
/// `H:\...`). The first one that exists is returned; `None` means the map is
/// not available on this machine.
fn find_p50k_map() -> Option<PathBuf> {
    const CANDIDATES: [&str; 2] = [
        "/mnt/h/dev/codec-maps/maps/openai/p50k_base.json",
        r"H:\dev\codec-maps\maps\openai\p50k_base.json",
    ];
    CANDIDATES.iter().find_map(|location| {
        let candidate = PathBuf::from(*location);
        if candidate.exists() {
            Some(candidate)
        } else {
            None
        }
    })
}
/// Locates a codec map file given its path relative to the `maps` directory.
///
/// `rel` is a relative path such as `"openai/o200k_base.json"`. It is joined
/// onto each known base directory in turn (`/mnt/h/...` first, then
/// `H:\...`), and the first resulting path that exists on disk is returned.
/// Returns `None` when the file is found under neither base.
///
/// The path is assembled with [`PathBuf::join`] rather than string
/// formatting, so the separator between base and `rel` is handled by the
/// standard library. Note that, per `join` semantics, an absolute `rel`
/// replaces the base instead of being appended to it; callers are expected
/// to pass relative paths.
fn find_codec_map(rel: &str) -> Option<PathBuf> {
    const BASES: [&str; 2] = ["/mnt/h/dev/codec-maps/maps", r"H:\dev\codec-maps\maps"];
    BASES
        .iter()
        .map(|base| PathBuf::from(*base).join(rel))
        .find(|candidate| candidate.exists())
}
/// Builds a tiny in-memory `TokenizerMap` using the `byte_level` encoder.
///
/// The vocabulary holds the single characters of "hello world!", the
/// byte-level rendering of a space, the intermediate merge products for
/// `hello` and `world`, and a space-prefixed `world` entry. The merge list
/// builds `hello` and `world` one character at a time and finally joins the
/// space with `world`.
fn make_byte_level_fixture() -> TokenizerMap {
    // Byte-level string form of the single byte 0x20 (ASCII space).
    let space = encode_byte_level_chars(&[0x20]);

    // Ids are assigned by position in this list, starting at 0, so the order
    // here defines the id of every token (e.g. `hello` is 12, space+`world` is 17).
    let tokens: Vec<String> = vec![
        "h".into(),
        "e".into(),
        "l".into(),
        "o".into(),
        "w".into(),
        "r".into(),
        "d".into(),
        space.clone(),
        "!".into(),
        "he".into(),
        "hel".into(),
        "hell".into(),
        "hello".into(),
        "wo".into(),
        "wor".into(),
        "worl".into(),
        "world".into(),
        format!("{space}world"),
    ];
    let vocab: HashMap<String, u32> = tokens.into_iter().zip(0u32..).collect();

    // Plain merges first, then the one rule that involves the byte-level space.
    let merges: Vec<String> = [
        "h e", "he l", "hel l", "hell o", "w o", "wo r", "wor l", "worl d",
    ]
    .iter()
    .map(|rule| rule.to_string())
    .chain(std::iter::once(format!("{space} world")))
    .collect();

    // Captured before `vocab` is moved into the struct below.
    let vocab_size = vocab.len() as i64;

    TokenizerMap {
        id: "test/byte_level".into(),
        version: "2".into(),
        vocab_size,
        vocab: Some(vocab),
        tokens: None,
        encoder: Some("byte_level".into()),
        merges: Some(merges),
        pre_tokenizer_pattern: Some(" ?[A-Za-z]+| ?[^A-Za-z\\s]+|\\s+".into()),
        pre_tokenizer_program: None,
        byte_fallback_start: None,
        byte_fallback_end: None,
        special_tokens: None,
        tool_calling: None,
        published_at: None,
    }
}
/// Encoding "hello world!" with the byte-level fixture must produce exactly
/// the ids 12, 17, 8 — the fixture's entries for `hello`, the space-prefixed
/// `world`, and `!`.
#[test]
fn encodes_hello_world_exactly() {
    let fixture = make_byte_level_fixture();
    let tokenizer = BPETokenizer::new(&fixture).expect("supports");
    let expected = vec![12, 17, 8];
    assert_eq!(ITokenizer::encode(&tokenizer, "hello world!"), expected);
}
/// Encodes "hello world!" with the byte-level fixture and checks that the
/// `Detokenizer` built from the same map renders the ids back to the
/// original text.
#[test]
fn round_trips_through_detokenizer() {
    let fixture = make_byte_level_fixture();
    let encoder = BPETokenizer::new(&fixture).expect("supports");
    let mut decoder = Detokenizer::new(&fixture);

    let original = "hello world!";
    let encoded = ITokenizer::encode(&encoder, original);
    // Rendering uses the default options for `render`'s second argument.
    let rendered = decoder.render(&encoded, Default::default());

    assert_eq!(rendered, original);
}
/// Checks which merge wins when two rules overlap on the same input.
///
/// The merge list puts `b c` ahead of `a b`. For the input "abc" the expected
/// encoding is `[0, 4]` (`a`, `bc`), not the `[3, 2]` (`ab`, `c`) that
/// applying the leftmost possible merge first would give.
#[test]
fn merges_greedily_by_priority_not_left_to_right() {
    // Ids follow list position: a=0, b=1, c=2, ab=3, bc=4, abc=5.
    let vocab: HashMap<String, u32> = ["a", "b", "c", "ab", "bc", "abc"]
        .iter()
        .zip(0u32..)
        .map(|(token, id)| (String::from(*token), id))
        .collect();
    let merges: Vec<String> = ["b c", "a b"].iter().map(|rule| rule.to_string()).collect();

    let map = TokenizerMap {
        id: "test/priority".into(),
        version: "2".into(),
        vocab_size: 6,
        vocab: Some(vocab),
        tokens: None,
        encoder: Some("byte_level".into()),
        merges: Some(merges),
        pre_tokenizer_pattern: Some("\\S+".into()),
        pre_tokenizer_program: None,
        byte_fallback_start: None,
        byte_fallback_end: None,
        special_tokens: None,
        tool_calling: None,
        published_at: None,
    };

    let tokenizer = BPETokenizer::new(&map).expect("supports");
    let encoded = ITokenizer::encode(&tokenizer, "abc");
    assert_eq!(encoded, vec![0, 4]);
}
/// Encodes chat-template and fill-in-the-middle prompts with the Qwen2 map
/// and compares the result against fixed id sequences.
///
/// The test returns early (after printing a note to stderr) when the map file
/// is not available locally, so it passes vacuously on machines without it.
#[test]
fn chat_template_and_fim_specials_emit_atomic_ids() {
    let path = match find_qwen_map() {
        Some(found) => found,
        None => {
            eprintln!("skipping — codec-maps/qwen/qwen2.json not present locally");
            return;
        }
    };
    let raw = std::fs::read(&path).expect("read map");
    let map = TokenizerMap::from_json(&raw).expect("parse map");
    let tokenizer = BPETokenizer::new(&map).expect("supports");

    // (input text, expected token ids)
    let cases: &[(&str, &[u32])] = &[
        (
            "<|im_start|>user\nWhat is 2+2?<|im_end|>",
            &[151644, 872, 198, 3838, 374, 220, 17, 10, 17, 30, 151645],
        ),
        (
            "<|fim_prefix|>def foo(x):<|fim_suffix|> return x<|fim_middle|>\n",
            &[151659, 750, 15229, 2075, 1648, 151661, 262, 470, 856, 151660, 198],
        ),
        (
            "<|im_start|>system\nYou are helpful.<|im_end|>\n<|im_start|>user\nHello<|im_end|>",
            &[
                151644, 8948, 198, 2610, 525, 10950, 13, 151645, 198, 151644, 872, 198, 9707,
                151645,
            ],
        ),
    ];

    for &(text, expected) in cases {
        let got = ITokenizer::encode(&tokenizer, text);
        assert_eq!(
            &got[..],
            expected,
            "mismatch on {text:?}: expected {expected:?}, got {got:?}"
        );
    }
}
/// Encodes a few sample strings with the `p50k_base` map and compares the
/// result against fixed id sequences.
///
/// Only the encode direction is asserted here. The test returns early (after
/// printing a note to stderr) when the map file is not available locally.
#[test]
fn p50k_base_round_trips_via_lead_space_program_ops() {
    let path = match find_p50k_map() {
        Some(found) => found,
        None => {
            eprintln!("skipping — codec-maps/openai/p50k_base.json not present locally");
            return;
        }
    };
    let raw = std::fs::read(&path).expect("read map");
    let map = TokenizerMap::from_json(&raw).expect("parse map");
    let tokenizer = BPETokenizer::new(&map).expect("supports");

    // (input text, expected token ids)
    let cases: &[(&str, &[u32])] = &[
        ("Hello, world!", &[15496, 11, 995, 0]),
        (
            "1 2 12 123 1234 12345",
            &[16, 362, 1105, 17031, 1105, 2682, 17031, 2231],
        ),
        (" spaces", &[50257, 9029]),
    ];

    for &(text, expected) in cases {
        let got = ITokenizer::encode(&tokenizer, text);
        assert_eq!(
            &got[..],
            expected,
            "mismatch on {text:?}: expected {expected:?}, got {got:?}"
        );
    }
}
/// Encodes mixed-case identifiers, a contraction and a digit run with the
/// `o200k_base` map and compares the result against fixed id sequences.
///
/// The test returns early (after printing a note to stderr) when the map file
/// is not available locally.
#[test]
fn o200k_base_case_aware_splits_via_letters_cased() {
    let path = match find_codec_map("openai/o200k_base.json") {
        Some(found) => found,
        None => {
            eprintln!("skipping — codec-maps/openai/o200k_base.json not present locally");
            return;
        }
    };
    let raw = std::fs::read(&path).expect("read map");
    let map = TokenizerMap::from_json(&raw).expect("parse map");
    let tokenizer = BPETokenizer::new(&map).expect("supports");

    // (input text, expected token ids)
    let cases: &[(&str, &[u32])] = &[
        ("MyCamelCase", &[5444, 137910, 6187]),
        ("iPhone", &[72, 7081]),
        ("isn't", &[276, 3023]),
        ("1234567", &[7633, 19354, 22]),
        ("XMLHttpRequest", &[13836, 4682, 2303]),
    ];

    for &(text, expected) in cases {
        let got = ITokenizer::encode(&tokenizer, text);
        assert_eq!(
            &got[..],
            expected,
            "o200k mismatch on {text:?}: expected {expected:?}, got {got:?}"
        );
    }
}
/// Encodes mixed-case identifiers and a digit run with the Mistral NeMo map
/// and compares the result against fixed id sequences.
///
/// The test returns early (after printing a note to stderr) when the map file
/// is not available locally.
#[test]
fn mistral_nemo_case_aware_splits_via_letters_cased() {
    let path = match find_codec_map("mistralai/mistral-nemo.json") {
        Some(found) => found,
        None => {
            eprintln!("skipping — codec-maps/mistralai/mistral-nemo.json not present locally");
            return;
        }
    };
    let raw = std::fs::read(&path).expect("read map");
    let map = TokenizerMap::from_json(&raw).expect("parse map");
    let tokenizer = BPETokenizer::new(&map).expect("supports");

    // (input text, expected token ids)
    let cases: &[(&str, &[u32])] = &[
        ("MyCamelCase", &[6720, 38487, 1299, 11139]),
        ("iPhone", &[1105, 16742]),
        ("1234567", &[1049, 1050, 1051, 1052, 1053, 1054, 1055]),
    ];

    for &(text, expected) in cases {
        let got = ITokenizer::encode(&tokenizer, text);
        assert_eq!(
            &got[..],
            expected,
            "nemo mismatch on {text:?}: expected {expected:?}, got {got:?}"
        );
    }
}