#[cfg(test)]
mod tokenizer_vocab_contract {
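    //! Cross-contract consistency checks between `contracts/tokenizer-vocab-v1.yaml`,
    //! `contracts/special-tokens-registry-v1.yaml`, and the model-family configs.
    //! The contracts are parsed with the small line-oriented helpers below rather than a
    //! YAML crate, so the tests only understand the narrow layout those files use
    //! (two-space family keys, `key: value` fields, `#` comments).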
    use std::collections::HashMap;
    use std::path::Path;
    use crate::format::model_family::ModelFamilyConfig;
    use crate::format::model_family_loader::load_family_yaml;
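    /// Per-family token metadata parsed from `tokenizer-vocab-v1.yaml`.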
    #[derive(Debug)]
    struct TvFamily {
        tokenizer_type: String,
        vocab_size: u64,
        bos_id: u32,
        eos_id: u32,
        pad_id: u32,
        im_start_id: u32,
        im_end_id: u32,
    }
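    /// Per-family token metadata parsed from `special-tokens-registry-v1.yaml`.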
    #[derive(Debug)]
    struct StFamily {
        vocab_size: u64,
        bos_id: u32,
        eos_id: u32,
        pad_id: u32,
        im_start_id: u32,
        im_end_id: u32,
    }
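    /// Accumulator for the token fields of whichever family block is currently being scanned.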
    #[derive(Default)]
    struct TokenAcc {
        vocab_size: u64,
        bos_id: u32,
        eos_id: u32,
        pad_id: u32,
        im_start_id: u32,
        im_end_id: u32,
    }
    impl TokenAcc {
        fn reset(&mut self) {
            *self = Self::default();
        }

        /// Parses any recognised `key: value` token field on the current line,
        /// returning `true` if the line was consumed.
        fn try_parse_field(&mut self, trimmed: &str) -> bool {
            let pairs: &[(&str, fn(&mut Self, u32))] = &[
                ("vocab_size:", |s, v| s.vocab_size = u64::from(v)),
                ("bos_id:", |s, v| s.bos_id = v),
                ("eos_id:", |s, v| s.eos_id = v),
                ("pad_id:", |s, v| s.pad_id = v),
                ("im_start_id:", |s, v| s.im_start_id = v),
                ("im_end_id:", |s, v| s.im_end_id = v),
            ];
            for &(prefix, setter) in pairs {
                if let Some(rest) = trimmed.strip_prefix(prefix) {
                    let val: u32 = field_val(rest).parse().unwrap_or(0);
                    setter(self, val);
                    return true;
                }
            }
            false
        }

        /// Parses only the `vocab_size:` field, returning `true` if the line matched.
        fn try_parse_vocab(&mut self, trimmed: &str) -> bool {
            if let Some(rest) = trimmed.strip_prefix("vocab_size:") {
                self.vocab_size = field_val(rest).parse().unwrap_or(0);
                return true;
            }
            false
        }
    }
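    /// Reads a contract file addressed relative to the workspace root (two directories above
    /// this crate's `CARGO_MANIFEST_DIR`), with a clear panic message if it is missing.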
    fn read_file(name: &str) -> String {
        let path = Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("../..")
            .join(name);
        assert!(path.exists(), "{name} must exist at workspace root");
        std::fs::read_to_string(&path).unwrap_or_else(|e| panic!("Failed to read {name}: {e}"))
    }
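    /// Extracts a YAML scalar value, dropping any trailing `#` comment and surrounding whitespace.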
    fn field_val(line: &str) -> &str {
        line.split('#').next().unwrap_or("").trim()
    }
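    /// A non-empty, unindented, non-comment line marks the end of the current top-level section.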
    fn is_section_end(line: &str) -> bool {
        let trimmed = line.trim();
        !trimmed.is_empty()
            && !line.starts_with(' ')
            && !line.starts_with('\t')
            && !trimmed.starts_with('#')
    }
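    /// Family entries are keys indented by exactly two spaces, ending in `:`, with no inline value.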
    fn is_family_name(line: &str, trimmed: &str) -> bool {
        let indent = line.len() - line.trim_start().len();
        indent == 2 && trimmed.ends_with(':') && !trimmed.contains(' ')
    }
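    /// Collects the names listed under the top-level `tokenizer_types:` section of
    /// `tokenizer-vocab-v1.yaml` (each a two-space-indented `name:` key).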
    fn parse_tokenizer_types(content: &str) -> Vec<String> {
        let mut types = Vec::new();
        let mut in_section = false;
        for line in content.lines() {
            let trimmed = line.trim();
            if trimmed == "tokenizer_types:" {
                in_section = true;
                continue;
            }
            if in_section && is_section_end(line) {
                break;
            }
            if !in_section {
                continue;
            }
            if is_family_name(line, trimmed) {
                types.push(trimmed.trim_end_matches(':').to_string());
            }
        }
        types
    }
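    /// Parses the `families:` section of `tokenizer-vocab-v1.yaml`. The indent checks below
    /// assume a layout along these lines (illustrative sketch, not a full YAML grammar):
    ///
    /// ```yaml
    /// families:
    ///   <family-name>:
    ///     tokenizer_type: <type>
    ///     vocab_size: <n>
    ///     special_tokens:
    ///       bos_id: <n>
    ///       eos_id: <n>
    /// ```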
    fn parse_tv_families(content: &str) -> HashMap<String, TvFamily> {
        let mut families = HashMap::new();
        let mut in_section = false;
        let mut in_special_tokens = false;
        let mut current_name = String::new();
        let mut current_type = String::new();
        let mut acc = TokenAcc::default();
        for line in content.lines() {
            let trimmed = line.trim();
            if trimmed == "families:" {
                in_section = true;
                continue;
            }
            if in_section && is_section_end(line) {
                break;
            }
            if !in_section {
                continue;
            }
            let indent = line.len() - line.trim_start().len();
            if is_family_name(line, trimmed) {
                // A new family key: flush the previous family before accumulating the next one.
                save_tv_family(&mut families, &current_name, &current_type, &acc);
                current_name = trimmed.trim_end_matches(':').to_string();
                current_type.clear();
                acc.reset();
                in_special_tokens = false;
                continue;
            }
            if indent == 4 && trimmed == "special_tokens:" {
                in_special_tokens = true;
                continue;
            }
            if indent == 4 && !trimmed.starts_with('#') {
                // Any other four-space key closes an open special_tokens block.
                in_special_tokens = parse_tv_line(trimmed, &mut current_type, &mut acc);
            }
            if indent == 6 && in_special_tokens {
                acc.try_parse_field(trimmed);
            }
        }
        // Flush the last family in the section.
        save_tv_family(&mut families, &current_name, &current_type, &acc);
        families
    }
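    /// Handles a four-space-indented key inside a family block, recording `tokenizer_type`
    /// and `vocab_size`. It always returns `false`, which the caller uses to close any open
    /// `special_tokens:` block.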
    fn parse_tv_line(trimmed: &str, current_type: &mut String, acc: &mut TokenAcc) -> bool {
        if let Some(rest) = trimmed.strip_prefix("tokenizer_type:") {
            *current_type = field_val(rest).to_string();
        } else {
            acc.try_parse_vocab(trimmed);
        }
        false
    }
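    /// Stores the accumulated family, skipping unnamed blocks and blocks with no vocab_size.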
    fn save_tv_family(
        families: &mut HashMap<String, TvFamily>,
        name: &str,
        tok_type: &str,
        acc: &TokenAcc,
    ) {
        if !name.is_empty() && acc.vocab_size > 0 {
            families.insert(
                name.to_string(),
                TvFamily {
                    tokenizer_type: tok_type.to_string(),
                    vocab_size: acc.vocab_size,
                    bos_id: acc.bos_id,
                    eos_id: acc.eos_id,
                    pad_id: acc.pad_id,
                    im_start_id: acc.im_start_id,
                    im_end_id: acc.im_end_id,
                },
            );
        }
    }
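    /// Parses the `families:` section of `special-tokens-registry-v1.yaml`. The registry is
    /// flatter than the tokenizer-vocab contract: any recognised token field indented by four
    /// or more spaces under a family is recorded directly.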
    fn parse_st_families(content: &str) -> HashMap<String, StFamily> {
        let mut families = HashMap::new();
        let mut in_section = false;
        let mut current_name = String::new();
        let mut acc = TokenAcc::default();
        for line in content.lines() {
            let trimmed = line.trim();
            if trimmed == "families:" {
                in_section = true;
                continue;
            }
            if in_section && is_section_end(line) {
                break;
            }
            if !in_section {
                continue;
            }
            let indent = line.len() - line.trim_start().len();
            if is_family_name(line, trimmed) {
                // A new family key: flush the previous family before accumulating the next one.
                save_st_family(&mut families, &current_name, &acc);
                current_name = trimmed.trim_end_matches(':').to_string();
                acc.reset();
                continue;
            }
            if indent >= 4 {
                acc.try_parse_field(trimmed);
            }
        }
        // Flush the last family in the section.
        save_st_family(&mut families, &current_name, &acc);
        families
    }
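    /// Stores the accumulated registry family, skipping unnamed blocks and blocks with no vocab_size.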
    fn save_st_family(families: &mut HashMap<String, StFamily>, name: &str, acc: &TokenAcc) {
        if !name.is_empty() && acc.vocab_size > 0 {
            families.insert(
                name.to_string(),
                StFamily {
                    vocab_size: acc.vocab_size,
                    bos_id: acc.bos_id,
                    eos_id: acc.eos_id,
                    pad_id: acc.pad_id,
                    im_start_id: acc.im_start_id,
                    im_end_id: acc.im_end_id,
                },
            );
        }
    }
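    /// Loads every model-family config from `contracts/model-families`, keyed by file stem.
    /// Underscore-prefixed files and files containing a top-level `contract_id:` line are skipped.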
    fn load_model_family_configs() -> Vec<(String, ModelFamilyConfig)> {
        let families_dir =
            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../contracts/model-families");
        let mut families = Vec::new();
        let entries = std::fs::read_dir(&families_dir).expect("read model-families dir");
        for entry in entries {
            let entry = entry.expect("dir entry");
            let path = entry.path();
            let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            let ext_ok = path
                .extension()
                .is_some_and(|ext| ext.eq_ignore_ascii_case("yaml"));
            if !ext_ok || file_name.starts_with('_') {
                continue;
            }
            let contents = match std::fs::read_to_string(&path) {
                Ok(s) => s,
                Err(_) => continue,
            };
            if contents.lines().any(|l| l.starts_with("contract_id:")) {
                continue;
            }
            if let Ok(config) = load_family_yaml(&path) {
                let name = file_name
                    .trim_end_matches(".yaml")
                    .trim_end_matches(".yml")
                    .to_string();
                families.push((name, config));
            }
        }
        families
    }
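    /// FALSIFY-TV-001: special-token IDs in tokenizer-vocab must match the registry.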
    #[test]
    fn falsify_tv_001_tokens_match_registry() {
        let tv_content = read_file("contracts/tokenizer-vocab-v1.yaml");
        let st_content = read_file("contracts/special-tokens-registry-v1.yaml");
        let tv_families = parse_tv_families(&tv_content);
        let st_families = parse_st_families(&st_content);
        let mut violations = Vec::new();
        for (name, tv) in &tv_families {
            let Some(st) = st_families.get(name) else {
                violations.push(format!(
                    "{name}: in tokenizer-vocab but not in special-tokens-registry"
                ));
                continue;
            };
            let checks: &[(&str, u32, u32)] = &[
                ("bos_id", tv.bos_id, st.bos_id),
                ("eos_id", tv.eos_id, st.eos_id),
                ("pad_id", tv.pad_id, st.pad_id),
                ("im_start_id", tv.im_start_id, st.im_start_id),
                ("im_end_id", tv.im_end_id, st.im_end_id),
            ];
            for &(field, tv_val, st_val) in checks {
                if tv_val != st_val {
                    violations.push(format!(
                        "{name}.{field}: tokenizer-vocab={tv_val}, special-tokens-registry={st_val}"
                    ));
                }
            }
        }
        assert!(
            violations.is_empty(),
            "FALSIFY-TV-001: Token ID divergence between contracts:\n{}",
            violations.join("\n")
        );
    }
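    /// FALSIFY-TV-002: vocab_size in tokenizer-vocab must match the registry.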
    #[test]
    fn falsify_tv_002_vocab_size_matches_registry() {
        let tv_content = read_file("contracts/tokenizer-vocab-v1.yaml");
        let st_content = read_file("contracts/special-tokens-registry-v1.yaml");
        let tv_families = parse_tv_families(&tv_content);
        let st_families = parse_st_families(&st_content);
        let mut violations = Vec::new();
        for (name, tv) in &tv_families {
            if let Some(st) = st_families.get(name) {
                if tv.vocab_size != st.vocab_size {
                    violations.push(format!(
                        "{name}: tokenizer-vocab vocab_size={}, special-tokens-registry vocab_size={}",
                        tv.vocab_size, st.vocab_size
                    ));
                }
            }
        }
        assert!(
            violations.is_empty(),
            "FALSIFY-TV-002: Vocab size divergence between contracts:\n{}",
            violations.join("\n")
        );
    }
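    /// FALSIFY-TV-003: a family's vocab_size must appear among the size variants of the
    /// model-family config of the same name, when such a config exists.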
    #[test]
    fn falsify_tv_003_vocab_matches_model_families() {
        let tv_content = read_file("contracts/tokenizer-vocab-v1.yaml");
        let tv_families = parse_tv_families(&tv_content);
        let mf_configs = load_model_family_configs();
        let mut violations = Vec::new();
        for (tv_name, tv) in &tv_families {
            let matching_mf = mf_configs.iter().find(|(name, _)| name == tv_name);
            if let Some((_, mf_config)) = matching_mf {
                let has_match = mf_config
                    .size_variants
                    .values()
                    .any(|size| size.vocab_size as u64 == tv.vocab_size);
                if !has_match && !mf_config.size_variants.is_empty() {
                    let observed: Vec<String> = mf_config
                        .size_variants
                        .values()
                        .map(|s| s.vocab_size.to_string())
                        .collect();
                    violations.push(format!(
                        "{tv_name}: tokenizer-vocab vocab_size={}, model-family has [{}]",
                        tv.vocab_size,
                        observed.join(", ")
                    ));
                }
            }
        }
        assert!(
            violations.is_empty(),
            "FALSIFY-TV-003: Vocab size mismatch with model families:\n{}",
            violations.join("\n")
        );
    }
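    /// FALSIFY-TV-004: every family's tokenizer_type must be declared in the contract's
    /// own `tokenizer_types:` section.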
    #[test]
    fn falsify_tv_004_tokenizer_types_valid() {
        let tv_content = read_file("contracts/tokenizer-vocab-v1.yaml");
        let known_types = parse_tokenizer_types(&tv_content);
        let tv_families = parse_tv_families(&tv_content);
        let mut violations = Vec::new();
        for (name, tv) in &tv_families {
            if !known_types.contains(&tv.tokenizer_type) {
                violations.push(format!(
                    "{name}: tokenizer_type '{}' not in tokenizer_types section (known: {:?})",
                    tv.tokenizer_type, known_types
                ));
            }
        }
        assert!(
            violations.is_empty(),
            "FALSIFY-TV-004: Invalid tokenizer type references:\n{}",
            violations.join("\n")
        );
    }
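    /// FALSIFY-TV-005: every non-zero special-token ID must lie below its family's vocab_size.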
    #[test]
    fn falsify_tv_005_token_ids_within_bounds() {
        let tv_content = read_file("contracts/tokenizer-vocab-v1.yaml");
        let tv_families = parse_tv_families(&tv_content);
        let mut violations = Vec::new();
        for (name, tv) in &tv_families {
            let checks: &[(&str, u32)] = &[
                ("bos_id", tv.bos_id),
                ("eos_id", tv.eos_id),
                ("pad_id", tv.pad_id),
                ("im_start_id", tv.im_start_id),
                ("im_end_id", tv.im_end_id),
            ];
            for &(field, id) in checks {
                if id > 0 && u64::from(id) >= tv.vocab_size {
                    violations.push(format!(
                        "{name}.{field}: {id} >= vocab_size {}",
                        tv.vocab_size
                    ));
                }
            }
        }
        assert!(
            violations.is_empty(),
            "FALSIFY-TV-005: Token IDs out of vocab bounds:\n{}",
            violations.join("\n")
        );
    }
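    /// FALSIFY-TV-006: both contracts must describe the same number of families.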
    #[test]
    fn falsify_tv_006_family_count_parity() {
        let tv_content = read_file("contracts/tokenizer-vocab-v1.yaml");
        let st_content = read_file("contracts/special-tokens-registry-v1.yaml");
        let tv_families = parse_tv_families(&tv_content);
        let st_families = parse_st_families(&st_content);
        assert_eq!(
            tv_families.len(),
            st_families.len(),
            "FALSIFY-TV-006: tokenizer-vocab has {} families, special-tokens-registry has {}. \
             Sync contracts/tokenizer-vocab-v1.yaml with contracts/special-tokens-registry-v1.yaml",
            tv_families.len(),
            st_families.len()
        );
    }
}