keyhog-core 0.5.39

keyhog-core: shared data model and detector specifications for the KeyHog secret scanner
Documentation
use std::env;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

/// Build-script entry point: stamps build provenance (git SHA and detector
/// digest) and generates `embedded_detectors.rs` in `OUT_DIR` from the
/// detector TOML files.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let crate_root = PathBuf::from(env::var("CARGO_MANIFEST_DIR").map_err(|error| {
        io::Error::other(format!("CARGO_MANIFEST_DIR is not set: {error}"))
    })?);
    let generated_file = PathBuf::from(
        env::var("OUT_DIR")
            .map_err(|error| io::Error::other(format!("OUT_DIR is not set: {error}")))?,
    )
    .join("embedded_detectors.rs");

    // Provenance, part one: tie "what is benched" to a commit. Comparing the
    // v32 run (F1=0.8896) against HEAD (F1=0.801) could not be verified
    // because both binaries reported `v0.5.37` and the result JSONs had an
    // empty version, so the regression could not be bisected. With the git
    // SHA stamped into the binary, every scan traces back to one commit and
    // the bench can fail-closed when it is scoring a stale build.
    stamp_git_hash(&crate_root);

    // Look next to the crate manifest first, then two directories up.
    let crate_local = crate_root.join("detectors");
    let two_levels_up = crate_root
        .parent()
        .and_then(|parent| parent.parent())
        .map(|root| root.join("detectors"))
        .unwrap_or_default();
    let search_order = [crate_local, two_levels_up];

    let detectors_dir = match search_order.iter().find(|dir| dir.is_dir()) {
        Some(dir) => dir,
        None => {
            println!(
                "cargo:warning=detectors/ directory not found, embedded detectors will be empty"
            );
            // The digest is emitted for the empty set as well, so that
            // `env!("KEYHOG_DETECTOR_DIGEST")` always exists for consumers
            // and the bench can fail-closed on a binary with zero baked-in
            // detectors rather than hit a missing-env compile error.
            println!(
                "cargo:rustc-env=KEYHOG_DETECTOR_DIGEST={}",
                detector_set_digest(&[])
            );
            write_embedded_detectors(&generated_file, &[])?;
            return Ok(());
        }
    };

    let detectors = read_detector_entries(detectors_dir)?;
    if detectors.is_empty() {
        let problem = io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "detectors directory '{}' contains no .toml files. Fix: add detector TOML files or remove the empty directory",
                detectors_dir.display()
            ),
        );
        return Err(problem.into());
    }

    // Provenance, part two: stamp the digest of exactly the detector set
    // that is about to be embedded. The CLI exposes it so the benchmark can
    // check that the running binary's detectors match the on-disk
    // `detectors/` tree; a digest computed during this very build is the
    // authoritative record of what got compiled in. The hash is a
    // self-contained FNV-1a (no hashing crate as a build-dependency): it
    // identifies the set, it is not a security primitive.
    println!(
        "cargo:rustc-env=KEYHOG_DETECTOR_DIGEST={}",
        detector_set_digest(&detectors)
    );

    write_embedded_detectors(&generated_file, &detectors)?;

    // Watch the directory itself so adding or removing a file re-runs us.
    println!("cargo:rerun-if-changed={}", detectors_dir.display());
    // Cargo's `rerun-if-changed=<dir>` is not relied on for in-place edits:
    // a directory's own mtime is usually NOT bumped when a file inside it is
    // modified. Without a watcher per file, editing an existing detector
    // TOML could leave a stale `embedded_detectors.rs` baked into the binary
    // until `cargo clean`, so every .toml gets its own `rerun-if-changed`.
    for toml_file in detector_toml_paths(detectors_dir)? {
        println!("cargo:rerun-if-changed={}", toml_file.display());
    }

    let embedded_bytes: usize = detectors.iter().map(|(_, body)| body.len()).sum();
    println!(
        "cargo:warning=Embedded {} detectors ({} bytes)",
        detectors.len(),
        embedded_bytes
    );
    Ok(())
}

/// Lists every entry directly inside `detectors_dir` whose extension is
/// `toml`, sorted by path.
///
/// # Errors
///
/// Returns the first I/O error hit while opening or iterating the directory.
fn detector_toml_paths(detectors_dir: &Path) -> io::Result<Vec<PathBuf>> {
    let mut toml_files = fs::read_dir(detectors_dir)?
        .map(|item| item.map(|dir_entry| dir_entry.path()))
        // Let errors through the filter so `collect` can surface the first one.
        .filter(|candidate| match candidate {
            Ok(path) => path.extension().is_some_and(|ext| ext == "toml"),
            Err(_) => true,
        })
        .collect::<io::Result<Vec<PathBuf>>>()?;
    toml_files.sort();
    Ok(toml_files)
}

/// Reads every `*.toml` file directly inside `detectors_dir` and returns
/// `(file name, file content)` pairs sorted by file name.
///
/// # Errors
///
/// Fails when the directory cannot be opened or enumerated, when a matching
/// file cannot be read as UTF-8 text, or when a file name is not valid UTF-8.
/// Each I/O error keeps its original `ErrorKind` and gains a message naming
/// the path involved.
fn read_detector_entries(detectors_dir: &Path) -> io::Result<Vec<(String, String)>> {
    let listing = match fs::read_dir(detectors_dir) {
        Ok(listing) => listing,
        Err(error) => {
            return Err(io::Error::new(
                error.kind(),
                format!(
                    "failed to read detectors directory '{}': {}. Fix: check directory permissions",
                    detectors_dir.display(),
                    error
                ),
            ));
        }
    };

    let mut detectors = Vec::new();
    for item in listing {
        let dir_entry = match item {
            Ok(dir_entry) => dir_entry,
            Err(error) => {
                return Err(io::Error::new(
                    error.kind(),
                    format!(
                        "failed to enumerate detectors in '{}': {}. Fix: check directory permissions",
                        detectors_dir.display(),
                        error
                    ),
                ));
            }
        };

        let toml_path = dir_entry.path();
        let is_toml = toml_path.extension().is_some_and(|ext| ext == "toml");
        if !is_toml {
            continue;
        }

        let name = file_name(&toml_path)?;
        let body = match fs::read_to_string(&toml_path) {
            Ok(body) => body,
            Err(error) => {
                return Err(io::Error::new(
                    error.kind(),
                    format!(
                        "failed to read detector '{}': {}. Fix: check file permissions and TOML encoding",
                        toml_path.display(),
                        error
                    ),
                ));
            }
        };
        detectors.push((name, body));
    }

    // Directory iteration order is unspecified; sort by name so the output
    // is the same from one build to the next.
    detectors.sort_by(|left, right| left.0.cmp(&right.0));
    Ok(detectors)
}

/// Generates the Rust source for the `EMBEDDED_DETECTORS` table and writes it
/// to `output_path`, replacing any previous content.
///
/// The file holds one `pub const EMBEDDED_DETECTORS: &[(&str, &str)]` slice
/// with a `(name, content)` tuple per entry, in the order given. An empty
/// `entries` slice produces an empty table.
///
/// `output_path` is taken as `&Path` rather than `&PathBuf` so any path-like
/// borrow is accepted; existing `&PathBuf` arguments still coerce.
///
/// # Errors
///
/// Returns the write failure with its original `ErrorKind` and a message that
/// names the output path.
fn write_embedded_detectors(output_path: &Path, entries: &[(String, String)]) -> io::Result<()> {
    let mut code = String::from("pub const EMBEDDED_DETECTORS: &[(&str, &str)] = &[\n");
    for (name, content) in entries {
        // `{:?}` on a string emits a quoted, escaped literal, so arbitrary
        // TOML text (quotes, backslashes, newlines) is embedded safely.
        code.push_str(&format!("    ({name:?}, {content:?}),\n"));
    }
    code.push_str("];\n");
    fs::write(output_path, code).map_err(|error| {
        io::Error::new(
            error.kind(),
            format!(
                "failed to write generated detector table '{}': {}. Fix: verify OUT_DIR is writable",
                output_path.display(),
                error
            ),
        )
    })
}

/// Resolve the current git commit and emit it as `cargo:rustc-env=GIT_HASH`,
/// registering `rerun-if-changed` on the files that hold the SHA so a new
/// commit re-stamps the binary.
///
/// Failure is non-fatal: a `cargo package` / crates.io build has no `.git`
/// tree, and a worktree may be checked out without `git` on PATH. In those
/// cases we stamp the sentinel `unknown` rather than abort the build - the
/// detector digest and version still ship, and the CLI prints `unknown` for
/// the SHA. `CARGO_MANIFEST_DIR` is `crates/core`, so the workspace `.git`
/// normally lives two directories up; the search continues through the
/// remaining ancestors, starting from the same directory that is handed to
/// `git -C`.
///
/// Watchers are only registered when a `.git` entry is actually found. Cargo
/// treats a watched path that does not exist as permanently changed, so
/// watching `.git/HEAD` in a tree without `.git` would re-run this script
/// (and rebuild the crate) on every single build.
fn stamp_git_hash(manifest_dir: &Path) {
    let workspace_root = manifest_dir
        .parent()
        .and_then(|p| p.parent())
        .unwrap_or(manifest_dir);

    // Nearest `.git` at or above the workspace root, if any.
    let git_dir = workspace_root
        .ancestors()
        .map(|dir| dir.join(".git"))
        .find(|candidate| candidate.exists());

    if let Some(git_dir) = git_dir {
        // Re-run when HEAD moves (checkout / new commit on the same branch).
        // HEAD is usually `ref: refs/heads/<branch>`, so the branch ref file
        // holds the SHA that actually changes per commit - watch both. These
        // two are emitted even if the file is absent (packed ref, or `.git`
        // being a worktree pointer file): re-running too often is the safe
        // direction when the layout is one we cannot watch precisely.
        println!(
            "cargo:rerun-if-changed={}",
            git_dir.join("HEAD").display()
        );
        if let Some(ref_path) = head_ref_path(&git_dir) {
            println!("cargo:rerun-if-changed={}", ref_path.display());
        }

        // `.git/packed-refs` covers a tree whose loose ref has been packed
        // away. It is watched only when present: while it is absent the SHA
        // lives in the loose ref watched above, and packing that ref later
        // removes the loose file, which itself triggers a re-run.
        let packed_refs = git_dir.join("packed-refs");
        if packed_refs.exists() {
            println!("cargo:rerun-if-changed={}", packed_refs.display());
        }
    }

    let hash = git_hash(workspace_root).unwrap_or_else(|| "unknown".to_string());
    println!("cargo:rustc-env=GIT_HASH={hash}");
}

/// Resolves the symbolic ref stored in `<git_dir>/HEAD` to a path under
/// `git_dir` (for example `.git/refs/heads/main`).
///
/// Returns `None` when `HEAD` cannot be read as text or when its trimmed
/// content does not start with `ref:` (a detached HEAD holds a bare SHA).
fn head_ref_path(git_dir: &Path) -> Option<PathBuf> {
    let Ok(head_contents) = fs::read_to_string(git_dir.join("HEAD")) else {
        return None;
    };
    head_contents
        .trim()
        .strip_prefix("ref:")
        .map(|target| git_dir.join(target.trim()))
}

/// Runs `git -C <workspace_root> rev-parse HEAD` and returns its trimmed
/// standard output.
///
/// Returns `None` when the process cannot be spawned, exits unsuccessfully,
/// prints non-UTF-8 output, or prints nothing but whitespace.
fn git_hash(workspace_root: &Path) -> Option<String> {
    let rev_parse = std::process::Command::new("git")
        .arg("-C")
        .arg(workspace_root)
        .arg("rev-parse")
        .arg("HEAD")
        .output()
        .ok()
        .filter(|finished| finished.status.success())?;

    let stdout = String::from_utf8(rev_parse.stdout).ok()?;
    let sha = stdout.trim();
    (!sha.is_empty()).then(|| sha.to_string())
}

/// Fingerprint of the embedded detector set, formatted as
/// `<entry count>-<16 lowercase hex digits>`.
///
/// The hex part is a 64-bit FNV-1a hash over each entry's name and content in
/// the order given, with a NUL byte after every name and every content. It is
/// written out by hand so the build script needs no hashing crate. The value
/// answers "which detectors got compiled in"; it is not a tamper seal.
fn detector_set_digest(entries: &[(String, String)]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    let digest = entries
        .iter()
        .flat_map(|(name, content)| {
            // The NUL terminators keep ("ab","c") and ("a","bc") from
            // producing the same byte stream.
            name.bytes()
                .chain(std::iter::once(0u8))
                .chain(content.bytes())
                .chain(std::iter::once(0u8))
        })
        .fold(FNV_OFFSET_BASIS, |state, byte| {
            (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

    format!("{}-{digest:016x}", entries.len())
}

/// Returns the final component of `path` as an owned `String`.
///
/// # Errors
///
/// Yields `ErrorKind::InvalidData` when the path has no file-name component
/// or that component is not valid UTF-8.
fn file_name(path: &Path) -> io::Result<String> {
    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(utf8_name) => Ok(utf8_name.to_owned()),
        None => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "detector path '{}' does not have a valid UTF-8 file name. Fix: rename the detector file",
                path.display()
            ),
        )),
    }
}