#![allow(clippy::unwrap_used)]
#![allow(clippy::arithmetic_side_effects)]
#![allow(clippy::indexing_slicing)]
#![allow(clippy::as_conversions)]
#![allow(unused_results)]
use std::env;
use std::fmt::Write;
use std::fs;
use std::io::Read;
use std::path::Path;
/// Build-script entry point: writes `generated_vocab.rs` into `OUT_DIR`.
///
/// The file content comes from `tools/vocab_model.bin` when that file exists
/// and `generate_from_model` accepts it. In every other case a `cargo:warning`
/// is printed and the output of `generate_fallback` is written instead.
///
/// # Panics
/// Panics if `OUT_DIR` is not set (or not valid Unicode) or if the generated
/// file cannot be written.
fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();
    let model_path = Path::new("tools/vocab_model.bin");
    let dest_path = Path::new(&out_dir).join("generated_vocab.rs");

    let generated = if model_path.exists() {
        // A model that is present but unreadable/invalid must not break the
        // build; it degrades to the fallback vocabulary with a warning.
        generate_from_model(model_path).unwrap_or_else(|e| {
            println!("cargo:warning=Failed to read vocab_model.bin: {e} — using fail-closed fallback (all inputs flagged)");
            generate_fallback()
        })
    } else {
        println!("cargo:warning=No vocab_model.bin found — using fail-closed vocabulary (all inputs flagged)");
        generate_fallback()
    };

    fs::write(&dest_path, generated).unwrap();

    // Ask cargo to re-run this script when the model or the script changes.
    println!("cargo:rerun-if-changed=tools/vocab_model.bin");
    println!("cargo:rerun-if-changed=build.rs");
}
/// Parses the binary vocabulary model at `path` and renders it as Rust source.
///
/// File layout as read here (every field little-endian):
///
/// | bytes            | type  | meaning          |
/// |------------------|-------|------------------|
/// | `0..4`           | `u32` | number of entries |
/// | `4..8`           | `f32` | threshold        |
/// | `8..12`          | `f32` | intercept        |
/// | 16 per entry     | `u64`, `f32`, `f32` | hash, idf, coef |
///
/// The returned source defines `VOCAB_SIZE`, `THRESHOLD`, `INTERCEPT`, the
/// `VOCAB` array (one `(hash, idf, coef)` tuple per entry, floats printed with
/// five decimal places) and a compile-time length assertion.
///
/// # Errors
/// Returns a human-readable message when:
/// - the file cannot be opened or read;
/// - the file is shorter than the 12-byte header;
/// - the declared entry count implies a size that does not fit in `usize`;
/// - the file length does not equal `12 + 16 * entries`;
/// - any `idf`/`coef` is NaN or infinite;
/// - hashes are not strictly increasing (this also rejects duplicates);
/// - the threshold or intercept is NaN or infinite.
fn generate_from_model(path: &Path) -> Result<String, String> {
    const HEADER_LEN: usize = 12;
    const ENTRY_LEN: usize = 16;

    /// Copies `N` bytes starting at `at` into a fixed-size array.
    fn le_bytes<const N: usize>(bytes: &[u8], at: usize) -> [u8; N] {
        let mut out = [0u8; N];
        out.copy_from_slice(&bytes[at..at + N]);
        out
    }

    let mut f = fs::File::open(path).map_err(|e| e.to_string())?;
    let mut buf = Vec::new();
    f.read_to_end(&mut buf).map_err(|e| e.to_string())?;
    if buf.len() < HEADER_LEN {
        return Err(format!("Model file too small: {} bytes", buf.len()));
    }

    // Lossless on every host with at least 32-bit pointers.
    let vocab_size = u32::from_le_bytes(le_bytes(&buf, 0)) as usize;
    let threshold = f32::from_le_bytes(le_bytes(&buf, 4));
    let intercept = f32::from_le_bytes(le_bytes(&buf, 8));

    // The count comes straight from the file, so the size computation must not
    // be allowed to overflow: on a 32-bit host a wrapped result could match the
    // real file length and send the loop below out of bounds.
    let expected_len = vocab_size
        .checked_mul(ENTRY_LEN)
        .and_then(|body_len| body_len.checked_add(HEADER_LEN))
        .ok_or_else(|| {
            format!("Model header declares {vocab_size} entries, which overflows the addressable size")
        })?;
    if buf.len() != expected_len {
        return Err(format!(
            "Model file size mismatch: expected {} bytes for {} entries, got {}",
            expected_len,
            vocab_size,
            buf.len()
        ));
    }

    let mut entries = Vec::with_capacity(vocab_size);
    // `None` until the first record has been accepted.
    let mut last_hash: Option<u64> = None;
    // The length check above guarantees the body is an exact multiple of
    // ENTRY_LEN, so `chunks_exact` yields exactly `vocab_size` records.
    for (i, record) in buf[HEADER_LEN..].chunks_exact(ENTRY_LEN).enumerate() {
        let hash = u64::from_le_bytes(le_bytes(record, 0));
        let idf = f32::from_le_bytes(le_bytes(record, 8));
        let coef = f32::from_le_bytes(le_bytes(record, 12));
        if !idf.is_finite() || !coef.is_finite() {
            return Err(format!(
                "Non-finite value at entry {i}: idf={idf}, coef={coef}"
            ));
        }
        match last_hash {
            Some(prev) if hash <= prev => {
                return Err(format!(
                    "Vocabulary not strictly sorted at entry {i}: hash {hash:016x} <= prev {prev:016x}"
                ));
            }
            _ => {}
        }
        last_hash = Some(hash);
        entries.push((hash, idf, coef));
    }

    // NaN/inf would not print as valid float literals in the generated file.
    if !threshold.is_finite() || !intercept.is_finite() {
        return Err(format!(
            "Non-finite threshold={threshold} or intercept={intercept}"
        ));
    }

    let mut rs = String::new();
    let header = format!(
        "// Generated by build.rs from tools/vocab_model.bin\n\
         // {vocab_size} entries, threshold={threshold}, intercept={intercept}\n\
         #[allow(dead_code)]\n\
         const VOCAB_SIZE: usize = {vocab_size};\n\
         const THRESHOLD: f32 = {threshold}f32;\n\
         const INTERCEPT: f32 = {intercept}f32;\n\
         #[allow(clippy::excessive_precision, clippy::large_const_arrays, clippy::approx_constant)]\n\
         #[rustfmt::skip]\n\
         const VOCAB: [(u64, f32, f32); {vocab_size}] = [\n",
    );
    rs.push_str(&header);
    for (hash, idf, coef) in &entries {
        // Writing into a `String` cannot fail, so the result is discarded.
        let _ = writeln!(rs, " ({hash:#018x}u64, {idf:.5}f32, {coef:.5}f32),");
    }
    rs.push_str("];\n");
    rs.push_str("const _VOCAB_SIZE_CHECK: () = assert!(VOCAB.len() == VOCAB_SIZE, \"vocab size mismatch\");\n");
    Ok(rs)
}
/// Returns the source emitted when no usable model is available.
///
/// It defines an empty `VOCAB` (`VOCAB_SIZE` 0) with `THRESHOLD` 0.5 and
/// `INTERCEPT` 1.0, plus the same compile-time length assertion the
/// model-backed output carries. `main`'s warnings describe this as the
/// fail-closed vocabulary; how the constants produce that outcome depends on
/// the consuming code, which is not part of this script.
fn generate_fallback() -> String {
    // One generated source line per literal; each carries its own newline.
    String::from(concat!(
        "#[allow(dead_code)]\n",
        "const VOCAB_SIZE: usize = 0;\n",
        "const THRESHOLD: f32 = 0.5f32;\n",
        "const INTERCEPT: f32 = 1.0f32;\n",
        "const VOCAB: [(u64, f32, f32); 0] = [];\n",
        "const _VOCAB_SIZE_CHECK: () = assert!(VOCAB.len() == VOCAB_SIZE, \"vocab size mismatch\");\n",
    ))
}