llmosafe 0.7.4

Safety-critical cognitive safety library for AI agents. 4-tier architecture (Resource Body, Kernel, Working Memory, Sifter) with formal verification primitives, detection layer, and integration primitives.
Documentation
// Build-time vocabulary embedding. Reads the serialized TF-IDF model
// (tools/vocab_model.bin) and generates a compile-time const VOCAB array.
// This is code generation tooling, not runtime library code — DO-178C
// runtime safety rules do not apply. Unwrap/panic here aborts the build
// with a clear message, which is correct build-script behavior.
#![allow(clippy::unwrap_used)]
#![allow(clippy::arithmetic_side_effects)]
#![allow(clippy::indexing_slicing)]
#![allow(clippy::as_conversions)]
#![allow(unused_results)]
use std::env;
use std::fmt::Write;
use std::fs;
use std::io::Read;
use std::path::Path;

/// Build-script entry point: writes `generated_vocab.rs` into Cargo's `OUT_DIR`.
///
/// The vocabulary source comes from `tools/vocab_model.bin` when that file
/// exists and parses cleanly. Otherwise a `cargo:warning` is printed and the
/// fallback vocabulary is written instead, so the build still succeeds.
///
/// # Panics
///
/// Panics (aborting the build) if `OUT_DIR` is not set or the generated file
/// cannot be written.
fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();
    let model = Path::new("tools/vocab_model.bin");

    // Missing model first, then the parse-failure path; both degrade to the
    // fallback vocabulary rather than failing the build.
    let source = if !model.exists() {
        println!("cargo:warning=No vocab_model.bin found — using fail-closed vocabulary (all inputs flagged)");
        generate_fallback()
    } else {
        generate_from_model(model).unwrap_or_else(|e| {
            println!("cargo:warning=Failed to read vocab_model.bin: {e} — using fail-closed fallback (all inputs flagged)");
            generate_fallback()
        })
    };

    let target = Path::new(&out_dir).join("generated_vocab.rs");
    fs::write(&target, source).unwrap();

    // Re-run only when the model or this script changes.
    println!("cargo:rerun-if-changed=tools/vocab_model.bin");
    println!("cargo:rerun-if-changed=build.rs");
}

/// Parses the serialized vocabulary model at `path` and renders it as Rust source.
///
/// The file layout, all little-endian, is:
///
/// * a 12-byte header: `u32` entry count, `f32` threshold, `f32` intercept;
/// * one 16-byte record per entry: `u64` hash, `f32` idf, `f32` coefficient.
///
/// The returned source defines `VOCAB_SIZE`, `THRESHOLD`, `INTERCEPT`, the
/// `VOCAB` array and a compile-time length assertion. Per-entry floats are
/// written with five decimal places.
///
/// # Errors
///
/// Returns a human-readable message when the file cannot be read, is shorter
/// than the header, does not have exactly the length implied by its entry
/// count (or that length is not representable), contains a non-finite float,
/// or its hashes are not strictly increasing.
fn generate_from_model(path: &Path) -> Result<String, String> {
    const HEADER_LEN: usize = 12;
    const RECORD_LEN: usize = 16;

    let mut f = fs::File::open(path).map_err(|e| e.to_string())?;
    let mut buf = Vec::new();
    f.read_to_end(&mut buf).map_err(|e| e.to_string())?;

    if buf.len() < HEADER_LEN {
        return Err(format!("Model file too small: {} bytes", buf.len()));
    }
    let (header, records) = buf.split_at(HEADER_LEN);

    let declared = u32::from_le_bytes([header[0], header[1], header[2], header[3]]);
    let threshold = le_f32_at(header, 4);
    let intercept = le_f32_at(header, 8);

    // The count comes straight from the file, so the size computation must not
    // be allowed to wrap or panic: an unrepresentable size is just a bad model.
    let (vocab_size, expected_len) = usize::try_from(declared)
        .ok()
        .and_then(|n| {
            let total = n.checked_mul(RECORD_LEN)?.checked_add(HEADER_LEN)?;
            Some((n, total))
        })
        .ok_or_else(|| {
            format!("Model file entry count {declared} is too large for this host")
        })?;

    if buf.len() != expected_len {
        return Err(format!(
            "Model file size mismatch: expected {} bytes for {} entries, got {}",
            expected_len,
            vocab_size,
            buf.len()
        ));
    }

    // The length check above guarantees `records` splits into exactly
    // `vocab_size` full records.
    let mut rows = String::new();
    let mut prev: Option<u64> = None;
    for (i, record) in records.chunks_exact(RECORD_LEN).enumerate() {
        let hash = le_u64_at(record, 0);
        let idf = le_f32_at(record, 8);
        let coef = le_f32_at(record, 12);

        if !idf.is_finite() || !coef.is_finite() {
            return Err(format!(
                "Non-finite value at entry {i}: idf={idf}, coef={coef}"
            ));
        }
        // Hashes must be strictly increasing; the first entry has no predecessor.
        if let Some(prev) = prev {
            if hash <= prev {
                return Err(format!(
                    "Vocabulary not strictly sorted at entry {i}: hash {hash:016x} <= prev {prev:016x}"
                ));
            }
        }
        prev = Some(hash);

        // Writing into a `String` cannot fail.
        let _ = writeln!(rows, "    ({hash:#018x}u64, {idf:.5}f32, {coef:.5}f32),");
    }

    if !threshold.is_finite() || !intercept.is_finite() {
        return Err(format!(
            "Non-finite threshold={threshold} or intercept={intercept}"
        ));
    }

    let mut rs = format!(
        "// Generated by build.rs from tools/vocab_model.bin\n\
         // {vocab_size} entries, threshold={threshold}, intercept={intercept}\n\
         #[allow(dead_code)]\n\
         const VOCAB_SIZE: usize = {vocab_size};\n\
         const THRESHOLD: f32 = {threshold}f32;\n\
         const INTERCEPT: f32 = {intercept}f32;\n\
         #[allow(clippy::excessive_precision, clippy::large_const_arrays, clippy::approx_constant)]\n\
         #[rustfmt::skip]\n\
         const VOCAB: [(u64, f32, f32); {vocab_size}] = [\n",
    );
    rs.push_str(&rows);
    rs.push_str("];\n");
    rs.push_str("const _VOCAB_SIZE_CHECK: () = assert!(VOCAB.len() == VOCAB_SIZE, \"vocab size mismatch\");\n");

    Ok(rs)
}

/// Reads a little-endian `f32` from the four bytes of `bytes` starting at `at`.
fn le_f32_at(bytes: &[u8], at: usize) -> f32 {
    let mut raw = [0u8; 4];
    raw.copy_from_slice(&bytes[at..at + 4]);
    f32::from_le_bytes(raw)
}

/// Reads a little-endian `u64` from the eight bytes of `bytes` starting at `at`.
fn le_u64_at(bytes: &[u8], at: usize) -> u64 {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(&bytes[at..at + 8]);
    u64::from_le_bytes(raw)
}

/// Returns the Rust source emitted when no usable model file is available.
///
/// It declares the same constants as the model-derived source, with an empty
/// `VOCAB`, `THRESHOLD` of 0.5 and `INTERCEPT` of 1.0. The build script's
/// warnings describe this configuration as fail-closed.
fn generate_fallback() -> String {
    const LINES: [&str; 6] = [
        "#[allow(dead_code)]",
        "const VOCAB_SIZE: usize = 0;",
        "const THRESHOLD: f32 = 0.5f32;",
        "const INTERCEPT: f32 = 1.0f32;",
        "const VOCAB: [(u64, f32, f32); 0] = [];",
        "const _VOCAB_SIZE_CHECK: () = assert!(VOCAB.len() == VOCAB_SIZE, \"vocab size mismatch\");",
    ];

    // Every line, including the last, is newline-terminated.
    let mut source = String::new();
    for line in LINES.iter() {
        source.push_str(line);
        source.push('\n');
    }
    source
}