nornir 0.5.1

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
//! Vector (semantic) search index — a hand-written **exact-flat** ANN over
//! `f32` vectors, keyed by stable `u64` ids.
//!
//! Design priorities (per `plan.md`): **maximum precision** and speed, 100%
//! Rust, no C/FFI, self-contained airgapped binary.
//!
//! - **Exact, not approximate.** Every query scores against every stored
//!   vector, so recall is 100% — no quantization loss, no graph-traversal
//!   miss. The embedding model already spends storage on precision
//!   (`jina-v2-base-code`, 768-dim); the index does not throw that away.
//! - **Cosine similarity.** Vectors are L2-normalized on insert and the query
//!   is normalized per search, so the score is a plain dot product (cosine).
//!   Higher score = closer.
//! - **SIMD, runtime-detected.** The per-vector dot product dispatches once
//!   per search to the best kernel the *running* CPU supports: AVX-512F →
//!   AVX2+FMA → scalar. The binary builds and runs everywhere; it just goes
//!   faster where the silicon allows (e.g. AVX-512 on Zen 4).
//! - **int8 quantization + VNNI (G2).** Because stored vectors are
//!   L2-normalized (components in `[-1,1]`), they quantize losslessly-enough to
//!   `i8` (scale 127). The int8 cosine matches the f32 cosine to ~1e-2, runs on
//!   a single AVX-512 **VNNI** `vpdpbusd` (64 int8 MACs/instr vs 16 f32 FMA
//!   lanes), and quarters the bytes/row the (memory-bound) scoring loop streams.
//!   See [`score_i8_batch`] / [`VectorIndex::search_i8`]; both fall back to a
//!   scalar int8 kernel where VNNI is absent. [`bench_kernels`] times every
//!   path and is exercised by `nornir vector bench`.
//! - **Multicore.** For large corpora the scoring loop is split across cores
//!   via scoped threads (no `Arc`, no new dependency), each computing a local
//!   top-k that is merged into the global top-k.
//!
//! Ids map back to warehouse rows (chunk id → `{repo, git_sha, model, file,
//! span, excerpt}`), so the index stays a pure derived artifact — the same
//! shape as the Tantivy full-text index, and snapshot/restore-able the same
//! way. The embedding model that produces the `f32` vectors (tract-onnx or ONNX
//! Runtime, features `embed-tract` / `embed-ort`) is a separate layer; this
//! module is model-agnostic and only cares about dimensionality.
//!
//! Cargo feature: `vector`.

pub mod chunk;
pub mod store;

// The interchangeable embedding-model registry (plan item G3). Pure-std, no
// deps — also `include!`d by `build.rs` so the model it fetches matches the one
// the embedder loads. Available whenever `vector` is on (the CLI reports the
// selected model), not just under an embed backend.
pub mod embed_registry;

// Shared embedder support — compiled when either backend is enabled.
#[cfg(any(feature = "embed-tract", feature = "embed-ort"))]
pub mod embed_support;

// Embedder backends (the `store::Embedder` trait is the interface). Both run
// the same jina code ONNX model; pick by Cargo feature.
#[cfg(feature = "embed-tract")]
pub mod embed; // tract-onnx, CPU, pure Rust
#[cfg(feature = "embed-ort")]
pub mod embed_ort; // ort / ONNX Runtime, GPU (CUDA/ROCm) or CPU
#[cfg(feature = "embed-ort")]
pub mod cuda; // runtime CUDA-lib + onnxruntime discovery so the ort NVIDIA EP "just works"
#[cfg(feature = "embed-ort-rocm")]
pub mod rocm; // runtime ROCm-lib discovery + probe for the ort AMD EP (G1)

// Runtime backend selector (#9). Active only when BOTH the CPU (tract) and GPU
// (ort) embedders are compiled — the default `cargo install nornir` build — so
// the single binary picks CUDA/ROCm/CPU at runtime. With only one backend
// compiled, `load_embedder` selects it directly (below).
#[cfg(all(feature = "embed-tract", feature = "embed-ort"))]
pub mod select;

/// Construct the default embedder and return it boxed behind the
/// `store::Embedder` trait, so callers never name a concrete backend.
///
/// The backend is fixed by the Cargo features compiled in:
///
/// - `embed-tract` only — the tract embedder (`embed::JinaEmbedder`) is loaded.
/// - `embed-ort` only — the ort embedder (`embed_ort::OrtEmbedder`) is loaded.
/// - both (the default build) — the choice is deferred to **runtime** (#9):
///   the [`select`] module probes the box (NVIDIA → AMD → CPU) and loads the
///   matching backend.
///
/// Both backends produce vectors with the same `model_profile`, so they
/// interoperate in the warehouse.
///
/// # Errors
///
/// Returns the error reported by whichever backend loader was invoked.
#[cfg(any(feature = "embed-tract", feature = "embed-ort"))]
#[allow(clippy::needless_return)] // returns disambiguate the cfg branches
pub fn load_embedder() -> anyhow::Result<Box<dyn store::Embedder>> {
    // Exactly one of the three cfg blocks below survives compilation; their
    // conditions are mutually exclusive, so source order is irrelevant.

    // tract is the sole backend.
    #[cfg(all(feature = "embed-tract", not(feature = "embed-ort")))]
    {
        let cpu_backend: Box<dyn store::Embedder> = Box::new(embed::JinaEmbedder::load()?);
        return Ok(cpu_backend);
    }
    // ort is the sole backend.
    #[cfg(all(feature = "embed-ort", not(feature = "embed-tract")))]
    {
        let ort_backend: Box<dyn store::Embedder> = Box::new(embed_ort::OrtEmbedder::load()?);
        return Ok(ort_backend);
    }
    // Both backends present (default build): let the selector decide at runtime.
    #[cfg(all(feature = "embed-tract", feature = "embed-ort"))]
    {
        return select::load();
    }
}

/// Human-readable name of the backend [`load_embedder`] selects.
///
/// - Default (both-backend) build: this **probes the running machine** via the
///   [`select`] module and reports the runtime choice (CUDA / ROCm / CPU).
/// - Single-backend build: returns a fixed label naming the compiled backend.
///
/// Every cfg branch uses an explicit `return`, mirroring [`load_embedder`], so
/// the function reads the same whichever feature set is active; the
/// `needless_return` lint is silenced for that reason.
#[cfg(any(feature = "embed-tract", feature = "embed-ort"))]
#[allow(clippy::needless_return)] // returns disambiguate the cfg branches
pub fn embedder_backend() -> &'static str {
    // Default build: both compiled → report the runtime selection.
    #[cfg(all(feature = "embed-tract", feature = "embed-ort"))]
    {
        return select::chosen_backend().label();
    }
    // Only ort compiled.
    #[cfg(all(feature = "embed-ort", not(feature = "embed-tract")))]
    {
        return "ort (ONNX Runtime, CUDA→CPU)";
    }
    // Only tract compiled.
    #[cfg(all(feature = "embed-tract", not(feature = "embed-ort")))]
    {
        return "tract-onnx (CPU, pure Rust)";
    }
}

/// Registry `id` of the embedding model currently in effect — chosen via
/// `$NORNIR_EMBED_MODEL`, or the registry default when that is unset.
///
/// Intended for diagnostics: it is available whenever the `vector` feature is
/// on, because the model is selectable independently of the backend. If
/// [`embed_registry::selected`] reports an error, the placeholder
/// `"<invalid>"` is returned instead of an id.
pub fn selected_model_id() -> &'static str {
    match embed_registry::selected() {
        Ok(model) => model.id,
        // The error detail is deliberately dropped here; only a marker is returned.
        Err(_) => "<invalid>",
    }
}

/// One-line description of the active model for CLI / diagnostic output,
/// formatted as `"<model-name> (<dim>-dim)"`.
///
/// Describes the registry default (jina-v2-base-code, 768-dim) unless
/// `$NORNIR_EMBED_MODEL` selects another model. When
/// [`embed_registry::selected`] fails, its error text is returned verbatim in
/// place of the description.
pub fn selected_model_desc() -> String {
    embed_registry::selected()
        .map(|model| format!("{} ({}-dim)", model.model_name, model.dim))
        // On failure the error string itself is what gets shown to the user.
        .unwrap_or_else(|reason| reason)
}

// ----- vector-ANN engine ------------------------------------------------------
//
// The pure compute engine — the exact-flat `VectorIndex`, the SIMD/int8 cosine
// kernels, and the `bench_kernels` sweep — now lives in `znippy_zoomies::vann`
// (extracted verbatim; nornir already depended on znippy-zoomies for gatling).
// It is re-exported here so every existing `nornir::vector::…` path still
// resolves unchanged (`VectorIndex`, `quantize_i8`, `score_i8_batch`, …). The
// embedder glue above (chunk/store/embed*/registry) stays in nornir.
pub use znippy_zoomies::vann::{
    active_simd, bench_kernels, normalized, quantize_i8, score_i8_batch, top_k,
    vnni_available, BenchReport, KernelTiming, VectorIndex, I8_SCAN_THRESHOLD,
};