nornir 0.4.21

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
Documentation
// build.rs — compile `proto/nornir.proto` into prost+tonic bindings, and
// (for the `embed-tract` / `embed-ort` features) fetch the selected embedding
// model's weights and tokenizer (default model: jina-v2-base-code).
//
// We use `protox` (pure-Rust .proto parser) instead of relying on a
// system / vendored `protoc` binary. That keeps the build dep-tree
// 100% Rust, matching the README's "no shellout, no C" rule.
//
// Codegen only runs when the `server` (server side), `mcp` (client side), or
// `viz` (thin client) features are enabled — pure CLI builds shouldn't pay the
// codegen cost. The build stamp is emitted on every build regardless.

use std::io::Read;
use std::path::{Path, PathBuf};

// The embedding-model registry, shared verbatim with the crate
// (`src/vector/embed_registry.rs`). `include!` pastes that file's source into
// this module, which keeps `build.rs` and the runtime in lock-step: the model
// fetched here is exactly the one the embedder loads.
// The file is `std`-only by design (no sibling-module `use`, no extern crates).
#[allow(dead_code)] // build.rs only uses `selected`; the rest is the crate's API.
mod embed_registry {
    // `include!` resolves this path relative to the file containing the macro
    // call, i.e. relative to this build script.
    include!("src/vector/embed_registry.rs");
}

/// Stamp a distinct build identifier into the binary so the viz can answer
/// "is this a new client?".
///
/// Exports two compile-time environment variables via `cargo:rustc-env`:
///
/// - `NORNIR_BUILD_SHA` — the git short SHA of `HEAD`, with `-dirty` appended
///   when `git status --porcelain` reports anything. Falls back to `unknown`
///   when git is unavailable, fails, or prints nothing (e.g. off a git checkout).
/// - `NORNIR_BUILD_EPOCH` — seconds since the Unix epoch at build time (`0` if
///   the system clock reads earlier than the epoch).
///
/// Emitted UNCONDITIONALLY (every build, all features), before any
/// feature-gated early return.
fn emit_build_stamp() {
    use std::process::Command;

    // Run `git <args>` and return its trimmed stdout, but only when git could
    // be spawned, exited successfully, and actually printed something.
    let git = |args: &[&str]| -> Option<String> {
        Command::new("git")
            .args(args)
            .output()
            .ok()
            .filter(|o| o.status.success())
            .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
            .filter(|s| !s.is_empty())
    };

    // Re-run when HEAD or the index moves. Ask git where those files really
    // live (the crate may sit in a sub-directory of the repository, or in a
    // worktree where `.git` is a file), falling back to the conventional
    // `.git/<name>`. Only watch paths that exist: Cargo treats a missing
    // `rerun-if-changed` path as changed, which would re-run this script — and
    // re-stamp the epoch, forcing a recompile — on every single build.
    for name in ["HEAD", "index"] {
        let path = git(&["rev-parse", "--git-path", name])
            .unwrap_or_else(|| format!(".git/{name}"));
        if Path::new(&path).exists() {
            println!("cargo:rerun-if-changed={path}");
        }
    }

    let sha = git(&["rev-parse", "--short", "HEAD"]).unwrap_or_else(|| "unknown".to_string());
    // Any porcelain output (modified, staged, or untracked entries) marks the
    // tree dirty; a failed `git status` counts as clean.
    let dirty = git(&["status", "--porcelain"]).is_some();
    let sha = if dirty { format!("{sha}-dirty") } else { sha };
    println!("cargo:rustc-env=NORNIR_BUILD_SHA={sha}");

    let epoch = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);
    println!("cargo:rustc-env=NORNIR_BUILD_EPOCH={epoch}");
}

/// Build-script entry point.
///
/// Always emits the build stamp and the `rerun-if-changed` triggers, fetches
/// the embedding model when an `embed-*` feature is on, and generates the
/// prost+tonic bindings only when a feature that consumes them is enabled.
fn main() {
    // Cargo exposes every enabled feature as `CARGO_FEATURE_<NAME>`.
    let feature = |name: &str| std::env::var_os(format!("CARGO_FEATURE_{name}")).is_some();

    emit_build_stamp();

    // Re-run if the proto or this script changes.
    for watched in ["proto/nornir.proto", "build.rs"] {
        println!("cargo:rerun-if-changed={watched}");
    }

    if feature("EMBED_TRACT") || feature("EMBED_ORT") {
        fetch_model();
    }

    let server = feature("SERVER");
    let mcp = feature("MCP");
    // `viz` builds the `urdr-threads` thin client (Viz.Timeline) — client only.
    let viz = feature("VIZ");

    // Nothing consumes the bindings: skip codegen entirely.
    if !server && !mcp && !viz {
        return;
    }

    let descriptors = protox::compile(["proto/nornir.proto"], ["proto"])
        .expect("protox: compile nornir.proto");

    // Client bindings are wanted by MCP (tonic conversion), the CLI's
    // bench-submit client, and the viz thin client.
    let client = mcp || server || viz;
    tonic_build::configure()
        .build_server(server)
        .build_client(client)
        .compile_fds(descriptors)
        .expect("tonic-build: codegen");
}

/// Ensure the **selected** embedding model (onnx weights + `tokenizer.json`) is
/// present in its local cache directory, then hand the directory and the
/// SHA-256 of both files to the crate through `rustc-env`.
///
/// The model comes from the shared [`embed_registry`] (`embed_registry::selected`),
/// so the build script and the runtime embedder pick it the same way. Files
/// already in the cache are reused as-is; missing ones are downloaded from
/// Hugging Face. With `NORNIR_SKIP_MODEL_FETCH` set nothing is downloaded, and a
/// missing file is a hard error.
///
/// # Panics
/// Panics if the registry rejects the selection, the cache directory cannot be
/// created, a file is missing while fetching is disabled, or a download or
/// hash step fails.
fn fetch_model() {
    // Re-run if the model selection (or its cache location) changes.
    for var in ["NORNIR_EMBED_MODEL", "NORNIR_MODEL_CACHE"] {
        println!("cargo:rerun-if-env-changed={var}");
    }

    let spec = match embed_registry::selected() {
        Ok(spec) => spec,
        Err(e) => panic!("{e}"),
    };

    let cache_dir = model_cache_dir(spec.cache_subdir);
    std::fs::create_dir_all(&cache_dir).expect("create model cache dir");

    let fetch_disabled = std::env::var_os("NORNIR_SKIP_MODEL_FETCH").is_some();

    // (path inside the Hugging Face repo, file name inside the cache)
    let wanted = [
        (spec.onnx_path, "model.onnx"),
        (spec.tokenizer_path, "tokenizer.json"),
    ];
    for (remote, local) in wanted {
        let dest = cache_dir.join(local);
        if !dest.exists() {
            if fetch_disabled {
                panic!(
                    "NORNIR_SKIP_MODEL_FETCH set but {} is missing",
                    dest.display()
                );
            }
            let url = format!(
                "https://huggingface.co/{}/resolve/main/{remote}?download=true",
                spec.hf_repo
            );
            eprintln!(
                "nornir: fetching {remote} for embed feature (model `{}`) …",
                spec.id
            );
            download(&url, &dest);
        }
    }

    // Hash what is actually on disk, whether freshly fetched or reused.
    let weights_sha = sha256_file(&cache_dir.join("model.onnx"));
    let tokenizer_sha = sha256_file(&cache_dir.join("tokenizer.json"));

    println!("cargo:rustc-env=NORNIR_MODEL_DIR={}", cache_dir.display());
    println!("cargo:rustc-env=NORNIR_MODEL_WEIGHTS_SHA={weights_sha}");
    println!("cargo:rustc-env=NORNIR_MODEL_TOKENIZER_SHA={tokenizer_sha}");
}

/// Resolve the directory that holds the selected model's files.
///
/// Resolution order:
/// 1. `$NORNIR_MODEL_CACHE`, used as the model directory itself (back-compat:
///    the operator points it straight at one model's files; `subdir` is ignored).
/// 2. `$HOME/.cache/nornir/models/<subdir>`.
/// 3. `$OUT_DIR/.cache/nornir/models/<subdir>` when `HOME` is not usable.
///
/// A variable that is set but empty (`VAR=`) is treated as unset. Otherwise an
/// empty `NORNIR_MODEL_CACHE` or `HOME` would yield an empty or relative path,
/// and the model files would land in the build's current working directory.
///
/// # Panics
/// Panics if neither `NORNIR_MODEL_CACHE` nor `HOME` is usable and `OUT_DIR`
/// is not set.
fn model_cache_dir(subdir: &str) -> PathBuf {
    // Read an environment variable, mapping "set but empty" to `None`.
    let non_empty = |key: &str| std::env::var_os(key).filter(|v| !v.is_empty());

    if let Some(dir) = non_empty("NORNIR_MODEL_CACHE") {
        return PathBuf::from(dir);
    }
    let base = non_empty("HOME")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(std::env::var_os("OUT_DIR").expect("OUT_DIR")));
    base.join(".cache/nornir/models").join(subdir)
}

/// Stream the body of `url` into `dest`.
///
/// The bytes are first written to a sibling file whose extension is replaced by
/// `part`, and that file is renamed onto `dest` only after the whole body has
/// been read, so `dest` does not appear until the transfer has finished.
///
/// # Panics
/// Panics, naming the URL or path involved, if the request, the file creation,
/// a read, a write, or the final rename fails.
fn download(url: &str, dest: &Path) {
    use std::io::Write;

    let mut body = match ureq::get(url).call() {
        Ok(response) => response.into_reader(),
        Err(e) => panic!("download {url}: {e}"),
    };

    let partial = dest.with_extension("part");
    let mut sink = match std::fs::File::create(&partial) {
        Ok(file) => file,
        Err(e) => panic!("create {}: {e}", partial.display()),
    };

    // Copy in 64 KiB chunks until the reader reports end-of-stream.
    let mut chunk = [0u8; 1 << 16];
    loop {
        match body.read(&mut chunk) {
            Ok(0) => break,
            Ok(len) => {
                if let Err(e) = sink.write_all(&chunk[..len]) {
                    panic!("write {}: {e}", partial.display());
                }
            }
            Err(e) => panic!("read {url}: {e}"),
        }
    }

    // Close the handle before moving the file into place.
    drop(sink);
    if let Err(e) = std::fs::rename(&partial, dest) {
        panic!("rename {} -> {}: {e}", partial.display(), dest.display());
    }
}

/// Compute the SHA-256 of the file at `path` and return it as a lowercase hex
/// string (two characters per digest byte).
///
/// The file is read in 64 KiB chunks, so it is never held in memory whole.
///
/// # Panics
/// Panics if the file cannot be opened or a read fails.
fn sha256_file(path: &Path) -> String {
    use sha2::{Digest, Sha256};

    let mut file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(e) => panic!("open {}: {e}", path.display()),
    };

    let mut hasher = Sha256::new();
    let mut chunk = [0u8; 1 << 16];
    loop {
        match file.read(&mut chunk).expect("read for hashing") {
            0 => break,
            len => hasher.update(&chunk[..len]),
        }
    }
    let digest = hasher.finalize();

    // Each byte becomes two lowercase hex digits, high nibble first.
    let hex = |nibble: u8| char::from_digit(u32::from(nibble), 16).expect("nibble is < 16");
    digest
        .iter()
        .flat_map(|&byte| [hex(byte >> 4), hex(byte & 0x0f)])
        .collect()
}