nornir 0.4.34 - Docs.rs

//! Vector (semantic) search index — a hand-written **exact-flat** ANN over
//! `f32` vectors, keyed by stable `u64` ids.
//!
//! Design priorities (per `plan.md`): **maximum precision** and speed, 100%
//! Rust, no C/FFI, self-contained airgapped binary.
//!
//! - **Exact, not approximate.** Every query scores against every stored
//!   vector, so recall is 100% — no quantization loss, no graph-traversal
//!   miss. The embedding model already spends storage on precision
//!   (`jina-v2-base-code`, 768-dim); the index does not throw that away.
//! - **Cosine similarity.** Vectors are L2-normalized on insert and the query
//!   is normalized per search, so the score is a plain dot product (cosine).
//!   Higher score = closer.
//! - **SIMD, runtime-detected.** The per-vector dot product dispatches once
//!   per search to the best kernel the *running* CPU supports: AVX-512F →
//!   AVX2+FMA → scalar. The binary builds and runs everywhere; it just goes
//!   faster where the silicon allows (e.g. AVX-512 on Zen 4).
//! - **int8 quantization + VNNI (G2).** Because stored vectors are
//!   L2-normalized (components in `[-1,1]`), they quantize losslessly-enough to
//!   `i8` (scale 127). The int8 cosine matches the f32 cosine to ~1e-2, runs on
//!   a single AVX-512 **VNNI** `vpdpbusd` (64 int8 MACs/instr vs 16 f32 FMA
//!   lanes), and quarters the bytes/row the (memory-bound) scoring loop streams.
//!   See [`score_i8_batch`] / [`VectorIndex::search_i8`]; both fall back to a
//!   scalar int8 kernel where VNNI is absent. [`bench_kernels`] times every
//!   path and is exercised by `nornir vector bench`.
//! - **Multicore.** For large corpora the scoring loop is split across cores
//!   via scoped threads (no `Arc`, no new dependency), each computing a local
//!   top-k that is merged into the global top-k.
//!
//! Ids map back to warehouse rows (chunk id → `{repo, git_sha, model, file,
//! span, excerpt}`), so the index stays a pure derived artifact — the same
//! shape as the Tantivy full-text index, and snapshot/restore-able the same
//! way. The embedding model that produces the `f32` vectors (tract-onnx or
//! ONNX Runtime, features `embed-tract` / `embed-ort`) is a separate layer;
//! this module is model-agnostic and only cares about dimensionality.
//!
//! Cargo feature: `vector`.

pub mod chunk;
pub mod store;

// The interchangeable embedding-model registry (plan item G3). Pure-std, no
// deps — also `include!`d by `build.rs` so the model it fetches matches the one
// the embedder loads. Available whenever `vector` is on (the CLI reports the
// selected model), not just under an embed backend.
pub mod embed_registry;

// Shared embedder support — compiled when either backend is enabled.
#[cfg(any(feature = "embed-tract", feature = "embed-ort"))]
pub mod embed_support;

// Embedder backends (the `store::Embedder` trait is the interface). Both run
// the same jina code ONNX model; pick by Cargo feature.
#[cfg(feature = "embed-tract")]
pub mod embed; // tract-onnx, CPU, pure Rust
#[cfg(feature = "embed-ort")]
pub mod embed_ort; // ort / ONNX Runtime, GPU (CUDA/ROCm) or CPU
#[cfg(feature = "embed-ort")]
pub mod cuda; // runtime CUDA-lib discovery so the ort NVIDIA EP "just works"
#[cfg(feature = "embed-ort-rocm")]
pub mod rocm; // runtime ROCm-lib discovery + probe for the ort AMD EP (G1)

/// Construct the default embedder and hand it back as a boxed
/// [`store::Embedder`] trait object.
///
/// Backend choice is made at compile time: with `embed-ort` enabled the
/// **ort** backend is used (even if `embed-tract` is enabled too); with only
/// `embed-tract` enabled the **tract** backend is used.
///
/// # Errors
/// Propagates whatever error the chosen backend's `load()` returns.
#[cfg(any(feature = "embed-tract", feature = "embed-ort"))]
pub fn load_embedder() -> anyhow::Result<Box<dyn store::Embedder>> {
    // Exactly one of these two bindings survives cfg expansion.
    #[cfg(feature = "embed-ort")]
    let embedder = embed_ort::OrtEmbedder::load()?;
    #[cfg(all(feature = "embed-tract", not(feature = "embed-ort")))]
    let embedder = embed::JinaEmbedder::load()?;
    Ok(Box::new(embedder))
}

/// Display name of the backend that [`load_embedder`] picks for this build.
///
/// `embed-ort` wins when it is enabled; otherwise (the enclosing `cfg`
/// guarantees `embed-tract` is on) the tract backend is reported.
#[cfg(any(feature = "embed-tract", feature = "embed-ort"))]
pub fn embedder_backend() -> &'static str {
    if cfg!(feature = "embed-ort") {
        "ort (ONNX Runtime, CUDA→CPU)"
    } else {
        "tract-onnx (CPU, pure Rust)"
    }
}

/// `id` of the embedding model returned by [`embed_registry::selected`], or
/// the placeholder `"<invalid>"` when the registry reports an error.
/// Available whenever `vector` is on, for diagnostics — the model is
/// selectable independently of the backend.
pub fn selected_model_id() -> &'static str {
    match embed_registry::selected() {
        Ok(model) => model.id,
        Err(_) => "<invalid>",
    }
}

/// Human-readable `"<model-name> (<dim>-dim)"` of the active model, for CLI /
/// diagnostics. When [`embed_registry::selected`] fails, the registry's error
/// text is returned in place of the description.
pub fn selected_model_desc() -> String {
    let model = match embed_registry::selected() {
        Ok(model) => model,
        Err(message) => return message,
    };
    format!("{} ({}-dim)", model.model_name, model.dim)
}

use std::collections::HashMap;
use std::path::Path;

use anyhow::{bail, ensure, Context, Result};

/// Minimum rows a single thread should own before we bother spawning more.
/// Below `2 * MIN_ROWS_PER_THREAD` the search runs single-threaded (spawning
/// is pure overhead for tiny corpora). At or above that, `thread_count` uses
/// at most `n / MIN_ROWS_PER_THREAD` workers, further capped by the
/// parallelism the platform reports.
const MIN_ROWS_PER_THREAD: usize = 1024;

/// On-disk format magic + version (`NVF` = nornir vector flat, gen 1).
/// [`VectorIndex::write`] emits these four bytes first; [`VectorIndex::load`]
/// rejects any file whose first four bytes differ.
const MAGIC: &[u8; 4] = b"NVF1";

/// An exact (brute-force) nearest-neighbour index over `f32` vectors of a
/// fixed dimensionality, keyed by stable `u64` ids.
///
/// The three collections are kept in lock-step: row `i` of `data` belongs to
/// `ids[i]`, and `pos[ids[i]] == i`.
///
/// `Debug` and `Clone` are derived so the index can be used with
/// `Result::unwrap_err` / `{:?}` and duplicated for snapshots.
#[derive(Debug, Clone)]
pub struct VectorIndex {
    /// Number of `f32` components per stored vector (never zero).
    dim: usize,
    /// External ids, parallel to the rows of [`Self::data`].
    ids: Vec<u64>,
    /// Row-major, L2-normalized vectors: `ids.len() * dim` floats.
    data: Vec<f32>,
    /// `id → row index`, for O(1) `contains` / `remove`.
    pos: HashMap<u64, usize>,
}

impl VectorIndex {
    /// Create an empty index over `dim`-dimensional vectors. `dim` must be
    /// non-zero.
    pub fn new(dim: usize) -> Result<Self> {
        ensure!(dim != 0, "vector dim must be non-zero");
        Ok(Self {
            dim,
            ids: Vec::new(),
            data: Vec::new(),
            pos: HashMap::new(),
        })
    }

    /// Vector dimensionality this index was built for.
    ///
    /// Rows passed to [`Self::add`] and queries passed to [`Self::search`]
    /// must have exactly this many components.
    pub fn dim(&self) -> usize {
        self.dim
    }

    /// Number of vectors currently stored (one per id in the index).
    pub fn len(&self) -> usize {
        self.ids.len()
    }

    /// True when no vectors are stored, i.e. [`Self::len`] would return 0.
    pub fn is_empty(&self) -> bool {
        self.ids.is_empty()
    }

    /// True when `id` is present in the index. Answered from the
    /// `id → row index` hash map, without touching the vector data.
    pub fn contains(&self, id: u64) -> bool {
        self.pos.contains_key(&id)
    }

    /// Add `ids.len()` vectors. `vectors` is the row-major flattened matrix —
    /// exactly `ids.len() * dim` floats. Each vector is L2-normalized before
    /// storage. Ids must be unique both within this call and against vectors
    /// already present.
    pub fn add(&mut self, vectors: &[f32], ids: &[u64]) -> Result<()> {
        ensure!(
            vectors.len() == ids.len() * self.dim,
            "vectors len {} != ids len {} * dim {}",
            vectors.len(),
            ids.len(),
            self.dim
        );
        // Validate ids up front so a partial add is impossible.
        let mut seen = std::collections::HashSet::with_capacity(ids.len());
        for &id in ids {
            ensure!(
                !self.pos.contains_key(&id) && seen.insert(id),
                "duplicate id {id}"
            );
        }
        self.ids.reserve(ids.len());
        self.data.reserve(vectors.len());
        self.pos.reserve(ids.len());
        for (i, &id) in ids.iter().enumerate() {
            let row = &vectors[i * self.dim..(i + 1) * self.dim];
            let row_idx = self.ids.len();
            push_normalized(&mut self.data, row);
            self.ids.push(id);
            self.pos.insert(id, row_idx);
        }
        Ok(())
    }

    /// Delete the vector stored under `id`, reporting whether it existed.
    ///
    /// Runs in O(1): the last row is moved into the vacated slot, so row
    /// order is not preserved across removals.
    pub fn remove(&mut self, id: u64) -> bool {
        let idx = match self.pos.remove(&id) {
            Some(idx) => idx,
            None => return false,
        };
        let dim = self.dim;
        // After this, `self.ids.len()` is the index the last row used to have.
        self.ids.swap_remove(idx);
        let tail = self.ids.len();
        if let Some(&moved_id) = self.ids.get(idx) {
            // A different row now lives at `idx`: bring its floats along and
            // repoint its id → index entry.
            self.data
                .copy_within(tail * dim..(tail + 1) * dim, idx * dim);
            self.pos.insert(moved_id, idx);
        }
        self.data.truncate(tail * dim);
        true
    }

    /// Return the `k` stored ids closest to `query` (one `dim`-length
    /// vector) as `(id, score)` pairs, best match first. `score` is the
    /// cosine similarity in `[-1, 1]`; higher = closer. Every stored vector
    /// is scored, so the result is exact. Fewer than `k` pairs come back
    /// when the index holds fewer than `k` vectors.
    ///
    /// Large corpora are split into contiguous row ranges scored on scoped
    /// threads; each range contributes its local top-`k`, and the union is
    /// reduced to the global top-`k`.
    ///
    /// # Panics
    /// If `query.len() != dim` (a programmer error, surfaced loudly).
    pub fn search(&self, query: &[f32], k: usize) -> Vec<(u64, f32)> {
        assert_eq!(
            query.len(),
            self.dim,
            "query dim {} != index dim {}",
            query.len(),
            self.dim
        );
        let total = self.ids.len();
        let keep = k.min(total);
        if keep == 0 {
            return Vec::new();
        }
        let qn = normalized(query);
        let kernel = select_dot_kernel();

        let workers = thread_count(total);
        if workers <= 1 {
            let mut hits = self.score_range(0, total, &qn, kernel, keep);
            top_k(&mut hits, keep);
            return hits;
        }

        let span = total.div_ceil(workers);
        let qn = qn.as_slice();
        let mut hits: Vec<(u64, f32)> = std::thread::scope(|scope| {
            // Spawn every worker first (the collect forces that), then join
            // them in spawn order.
            let handles: Vec<_> = (0..total)
                .step_by(span)
                .map(|start| {
                    let end = (start + span).min(total);
                    scope.spawn(move || self.score_range(start, end, qn, kernel, keep))
                })
                .collect();
            handles
                .into_iter()
                .flat_map(|h| h.join().expect("scoring thread panicked"))
                .collect()
        });

        top_k(&mut hits, keep);
        hits
    }

    /// Top-`k` like [`Self::search`], but scored through the **int8-quantized
    /// path** (G2): the stored matrix is quantized to `i8` via
    /// [`Self::quantized`], then [`score_i8_batch`] computes the int8 cosine
    /// (VNNI where available, scalar otherwise). Scores agree with
    /// [`Self::search`] to roughly the quantization tolerance (~1e-2).
    ///
    /// The whole matrix is re-quantized on every call; for a corpus queried
    /// repeatedly, keep the output of [`Self::quantized`] around and call
    /// [`score_i8_batch`] directly.
    ///
    /// # Panics
    /// If `query.len() != dim`.
    pub fn search_i8(&self, query: &[f32], k: usize) -> Vec<(u64, f32)> {
        assert_eq!(query.len(), self.dim, "query dim {} != index dim {}", query.len(), self.dim);
        let keep = k.min(self.ids.len());
        if keep == 0 {
            return Vec::new();
        }
        let (rows, sums) = self.quantized();
        let mut hits: Vec<(u64, f32)> = score_i8_batch(query, &rows, self.dim, &sums)
            .into_iter()
            .zip(&self.ids)
            .map(|(score, &id)| (id, score))
            .collect();
        top_k(&mut hits, keep);
        hits
    }

    /// Produce an `i8` copy of the stored matrix as `(rows, row_sums)`, the
    /// two inputs [`score_i8_batch`] expects. `rows` is row-major `n × dim`;
    /// `row_sums[i]` is the integer sum of row `i`'s quantized components
    /// (the VNNI bias-correction term).
    pub fn quantized(&self) -> (Vec<i8>, Vec<i32>) {
        let mut rows = Vec::with_capacity(self.data.len());
        // `data` holds exactly `ids.len()` rows of `dim` floats each.
        let sums = self
            .data
            .chunks_exact(self.dim)
            .map(|row| quantize_i8(row, &mut rows))
            .collect();
        (rows, sums)
    }

    /// Compute the dot product of the normalized query `qn` with every row in
    /// `[start, end)` and return that range's best `m` `(id, score)` pairs,
    /// sorted by descending score.
    fn score_range(&self, start: usize, end: usize, qn: &[f32], kernel: DotFn, m: usize) -> Vec<(u64, f32)> {
        let dim = self.dim;
        let rows = self.data[start * dim..end * dim].chunks_exact(dim);
        let mut local: Vec<(u64, f32)> = self.ids[start..end]
            .iter()
            .zip(rows)
            .map(|(&id, row)| {
                // SAFETY: `kernel` was chosen by `select_dot_kernel` to match
                // a CPU feature confirmed present at runtime (or the scalar
                // fallback, which is always sound). `qn` and `row` are both
                // `self.dim` long.
                let score = unsafe { kernel(qn, row) };
                (id, score)
            })
            .collect();
        top_k(&mut local, m);
        local
    }

    /// Serialize the index to `path` (a small dependency-free binary format).
    ///
    /// Layout, all little-endian: 4-byte magic, `dim` as `u32`, row count as
    /// `u64`, then every id as `u64`, then the normalized matrix as `f32`s.
    ///
    /// # Errors
    /// Fails when `dim` does not fit the format's `u32` field (instead of
    /// silently truncating it and producing an unreadable file), or when the
    /// file cannot be written.
    pub fn write(&self, path: impl AsRef<Path>) -> Result<()> {
        let path = path.as_ref();
        let dim = u32::try_from(self.dim).with_context(|| {
            format!("vector dim {} does not fit the on-disk u32 field", self.dim)
        })?;
        let n = self.ids.len();
        let mut buf = Vec::with_capacity(16 + n * 8 + self.data.len() * 4);
        buf.extend_from_slice(MAGIC);
        buf.extend_from_slice(&dim.to_le_bytes());
        buf.extend_from_slice(&(n as u64).to_le_bytes());
        for &id in &self.ids {
            buf.extend_from_slice(&id.to_le_bytes());
        }
        for &f in &self.data {
            buf.extend_from_slice(&f.to_le_bytes());
        }
        std::fs::write(path, &buf).with_context(|| format!("write vector index {}", path.display()))
    }

    /// Load an index previously written by [`Self::write`].
    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
        let path = path.as_ref();
        let buf =
            std::fs::read(path).with_context(|| format!("read vector index {}", path.display()))?;
        ensure!(buf.len() >= 16, "vector index too short");
        ensure!(&buf[0..4] == MAGIC, "bad vector index magic");
        let dim = u32::from_le_bytes(buf[4..8].try_into().unwrap()) as usize;
        let n = u64::from_le_bytes(buf[8..16].try_into().unwrap()) as usize;
        ensure!(dim != 0, "vector index has zero dim");
        let want = 16 + n * 8 + n * dim * 4;
        if buf.len() != want {
            bail!(
                "vector index length {} != expected {want} (dim {dim}, n {n})",
                buf.len()
            );
        }
        let mut off = 16;
        let mut ids = Vec::with_capacity(n);
        let mut pos = HashMap::with_capacity(n);
        for row_idx in 0..n {
            let id = u64::from_le_bytes(buf[off..off + 8].try_into().unwrap());
            off += 8;
            ensure!(pos.insert(id, row_idx).is_none(), "duplicate id {id} in file");
            ids.push(id);
        }
        let mut data = Vec::with_capacity(n * dim);
        for _ in 0..n * dim {
            data.push(f32::from_le_bytes(buf[off..off + 4].try_into().unwrap()));
            off += 4;
        }
        Ok(Self { dim, ids, data, pos })
    }
}

/// Label of the f32 dot-product kernel the running CPU qualifies for:
/// `"avx512f"`, else `"avx2+fma"`, else `"scalar"`. On non-x86_64 targets the
/// answer is always `"scalar"`. Diagnostics / tests only.
pub fn active_simd() -> &'static str {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> &'static str {
        if std::is_x86_feature_detected!("avx512f") {
            "avx512f"
        } else if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
            "avx2+fma"
        } else {
            "scalar"
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> &'static str {
        "scalar"
    }
    detect()
}

// ----- dot-product kernels ---------------------------------------------------

/// A dot-product kernel. `unsafe` because the SIMD variants require their
/// target feature to be present; callers must only select a variant via
/// [`select_dot_kernel`]. Both slices must be the same length.
///
/// Stored as a plain function pointer so one runtime feature check can pick
/// the kernel once and the scoring loops can call it per row.
type DotFn = unsafe fn(&[f32], &[f32]) -> f32;

/// Pick the fastest f32 dot-product kernel the running CPU supports:
/// AVX-512F first, then AVX2+FMA, otherwise the portable scalar loop. On
/// non-x86_64 targets the scalar kernel is always returned.
fn select_dot_kernel() -> DotFn {
    #[cfg(target_arch = "x86_64")]
    fn pick() -> DotFn {
        if std::is_x86_feature_detected!("avx512f") {
            dot_avx512
        } else if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
            dot_avx2
        } else {
            dot_scalar
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    fn pick() -> DotFn {
        dot_scalar
    }
    pick()
}

/// Portable scalar fallback. `unsafe` only to share [`DotFn`]; always sound.
///
/// Multiplies the slices element-wise and sums the products. If the slices
/// differ in length, only the common prefix is used (the `zip` stops at the
/// shorter one).
///
/// # Safety
/// No requirements; this function performs no unsafe operations.
unsafe fn dot_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| x * y).sum()
}

/// AVX-512F dot product: 16 `f32` lanes per fused multiply-add, with a scalar
/// loop for the final `len % 16` elements.
///
/// Only the common prefix of `a` and `b` is read, so mismatched lengths
/// behave like the scalar kernel's `zip` instead of reading out of bounds
/// through the raw-pointer loads.
///
/// # Safety
/// The running CPU must support `avx512f` (select this kernel only through
/// `select_dot_kernel`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn dot_avx512(a: &[f32], b: &[f32]) -> f32 {
    use std::arch::x86_64::*;
    // Clamp to the shorter slice: the unaligned loads below are unchecked.
    let n = a.len().min(b.len());
    let mut acc = _mm512_setzero_ps();
    let mut i = 0;
    while i + 16 <= n {
        let va = _mm512_loadu_ps(a.as_ptr().add(i));
        let vb = _mm512_loadu_ps(b.as_ptr().add(i));
        acc = _mm512_fmadd_ps(va, vb, acc);
        i += 16;
    }
    // Horizontal sum of the 16 lanes, then the scalar tail.
    let mut s = _mm512_reduce_add_ps(acc);
    while i < n {
        s += a[i] * b[i];
        i += 1;
    }
    s
}

/// AVX2+FMA dot product: 8 `f32` lanes per fused multiply-add, with a scalar
/// loop for the final `len % 8` elements.
///
/// Only the common prefix of `a` and `b` is read, so mismatched lengths
/// behave like the scalar kernel's `zip` instead of reading out of bounds
/// through the raw-pointer loads.
///
/// # Safety
/// The running CPU must support `avx2` and `fma` (select this kernel only
/// through `select_dot_kernel`).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn dot_avx2(a: &[f32], b: &[f32]) -> f32 {
    use std::arch::x86_64::*;
    // Clamp to the shorter slice: the unaligned loads below are unchecked.
    let n = a.len().min(b.len());
    let mut acc = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= n {
        let va = _mm256_loadu_ps(a.as_ptr().add(i));
        let vb = _mm256_loadu_ps(b.as_ptr().add(i));
        acc = _mm256_fmadd_ps(va, vb, acc);
        i += 8;
    }
    // Horizontal sum of the 8 lanes.
    let mut tmp = [0f32; 8];
    _mm256_storeu_ps(tmp.as_mut_ptr(), acc);
    let mut s = tmp.iter().sum::<f32>();
    // Scalar tail for the elements that did not fill a full vector.
    while i < n {
        s += a[i] * b[i];
        i += 1;
    }
    s
}

// ----- int8 quantized dot (G2: AVX-512 VNNI) ---------------------------------
//
// The stored vectors are already L2-normalized, so every component lies in
// `[-1, 1]`. We quantize to `i8` by scaling by `Q = 127` and rounding; the dot
// of two quantized vectors, divided by `Q*Q`, recovers the cosine to ~1e-2.
// That quarters the memory traffic (4 B f32 → 1 B i8) — the scoring loop is
// memory-bound on a big corpus, so fewer bytes/row ≈ proportionally faster —
// and lets a single AVX-512 VNNI `vpdpbusd` fold 64 lanes of
// multiply-accumulate into int32 per instruction (vs 16 f32 FMA lanes).
//
// VNNI's `vpdpbusd` is **unsigned × signed**. We keep the row signed (`i8`) and
// bias the query into unsigned: with `qu = q + 128 ∈ [0,255]`,
//   Σ q·r = Σ (qu-128)·r = Σ qu·r − 128·Σ r.
// `Σ qu·r` is the `vpdpbusd` accumulation; `Σ r` (the row's int sum) is
// precomputed once at quantization time, so the correction is a single scalar
// fixup per row.

/// `Q` — the int8 quantization scale (max |component| of a unit vector is 1).
/// Components are multiplied by `Q` when quantizing, and integer dot products
/// are divided by `Q * Q` to get back to a cosine.
const Q: f32 = 127.0;

/// Append the `i8` quantization of `v` to `out` and return the integer sum of
/// the appended components (`Σ r`, the VNNI unsigned-bias correction term).
///
/// Each component is scaled by `Q`, rounded to nearest, and clamped to
/// `[-127, 127]` so that adding the `+128` query bias can never leave the
/// `u8` range.
fn quantize_i8(v: &[f32], out: &mut Vec<i8>) -> i32 {
    let first = out.len();
    out.extend(
        v.iter()
            .map(|&x| (x * Q).round().clamp(-127.0, 127.0) as i32 as i8),
    );
    out[first..].iter().map(|&q| i32::from(q)).sum()
}

/// Portable int8 dot product, scaled back to an **f32 cosine** by dividing by
/// `Q*Q`. Accumulates in `i32`. Always sound; serves as the reference for the
/// SIMD int8 path.
fn dot_i8_scalar(q: &[i8], r: &[i8]) -> f32 {
    let acc: i32 = q
        .iter()
        .zip(r)
        .map(|(&a, &b)| i32::from(a) * i32::from(b))
        .sum();
    acc as f32 / (Q * Q)
}

/// AVX-512 VNNI int8 dot via `vpdpbusd`. `q_biased` is the query pre-biased to
/// `u8` (`q + 128`); `row_sum` is `Σ row` (the i8 components). Returns the f32
/// cosine. 64 int8 MACs per instruction.
///
/// # Safety
/// - The running CPU must support `avx512f`, `avx512vnni` and `avx512bw`
///   (see `vnni_available`).
/// - `q_biased` must be at least `row.len()` bytes long: the vector loop
///   reads it through a raw pointer at every offset it reads `row`, with no
///   bounds check.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vnni,avx512bw")]
unsafe fn dot_i8_vnni(q_biased: &[u8], row: &[i8], row_sum: i32) -> f32 {
    use std::arch::x86_64::*;
    // The row length drives both loops.
    let n = row.len();
    let mut acc = _mm512_setzero_si512();
    let mut i = 0;
    while i + 64 <= n {
        let vu = _mm512_loadu_si512(q_biased.as_ptr().add(i) as *const _);
        let vi = _mm512_loadu_si512(row.as_ptr().add(i) as *const _);
        // vpdpbusd: acc += Σ (u8 lane × i8 lane) widened to i32, 4 per dword.
        acc = _mm512_dpbusd_epi32(acc, vu, vi);
        i += 64;
    }
    // Collapse the 16 i32 lanes, then finish the < 64 leftover components
    // with bounds-checked scalar arithmetic.
    let mut biased = _mm512_reduce_add_epi32(acc);
    while i < n {
        biased += (q_biased[i] as i32) * (row[i] as i32);
        i += 1;
    }
    // Undo the +128 query bias: Σ q·r = Σ qu·r − 128·Σ r.
    ((biased - 128 * row_sum) as f32) / (Q * Q)
}

/// Whether the running CPU supports everything the int8 VNNI kernel needs
/// (`avx512f`, `avx512vnni` and `avx512bw`). Always `false` on non-x86_64
/// targets.
pub fn vnni_available() -> bool {
    #[cfg(target_arch = "x86_64")]
    fn detect() -> bool {
        std::is_x86_feature_detected!("avx512f")
            && std::is_x86_feature_detected!("avx512vnni")
            && std::is_x86_feature_detected!("avx512bw")
    }
    #[cfg(not(target_arch = "x86_64"))]
    fn detect() -> bool {
        false
    }
    detect()
}

/// Score `query` against `rows` (a row-major `n × dim` **i8**-quantized matrix)
/// using the best int8 kernel the CPU supports (VNNI → scalar), returning each
/// row's cosine. `row_sums[i] = Σ rows[i]` (from [`quantize_i8`]); `n` is taken
/// from `row_sums.len()`. The query is normalized, quantized and biased once,
/// then reused across all rows.
///
/// # Panics
/// If `query.len() != dim`. This is a hard check, not a debug assertion: the
/// VNNI kernel reads the query through raw pointers for the full row length,
/// so a short query would otherwise be read out of bounds. Also panics if
/// `rows` is shorter than `n * dim`.
pub fn score_i8_batch(query: &[f32], rows: &[i8], dim: usize, row_sums: &[i32]) -> Vec<f32> {
    assert_eq!(
        query.len(),
        dim,
        "query dim {} != row dim {}",
        query.len(),
        dim
    );
    let n = row_sums.len();
    debug_assert_eq!(rows.len(), n * dim);
    // Quantize the query once; pre-bias to u8 for VNNI.
    let mut q_i8 = Vec::with_capacity(dim);
    let qn = normalized(query);
    quantize_i8(&qn, &mut q_i8);

    if vnni_available() {
        // Quantized components lie in [-127, 127], so `+128` fits in `u8`.
        let q_biased: Vec<u8> = q_i8.iter().map(|&x| (x as i16 + 128) as u8).collect();
        (0..n)
            .map(|i| {
                let row = &rows[i * dim..(i + 1) * dim];
                // SAFETY: vnni_available() confirmed the target features
                // above, and `q_biased` is exactly `dim` long (asserted at
                // entry), matching `row.len()`.
                unsafe { dot_i8_vnni(&q_biased, row, row_sums[i]) }
            })
            .collect()
    } else {
        (0..n)
            .map(|i| {
                let row = &rows[i * dim..(i + 1) * dim];
                dot_i8_scalar(&q_i8, row)
            })
            .collect()
    }
}

// ----- bench (G2) ------------------------------------------------------------

/// Timing and accuracy figures for a single kernel within a
/// [`bench_kernels`] run.
#[derive(Debug, Clone)]
pub struct KernelTiming {
    /// Kernel name: `"scalar"`, `"simd (avx512f|avx2+fma)"`, `"int8 (vnni|scalar)"`.
    pub name: String,
    /// Wall time to score the whole corpus once, in microseconds.
    pub micros: u128,
    /// Millions of dot-products per second.
    pub mdps: f64,
    /// Max absolute cosine error vs the scalar f32 reference (0 for scalar).
    pub max_err: f32,
}

/// Outcome of one [`bench_kernels`] sweep over an `n × dim` synthetic corpus.
#[derive(Debug, Clone)]
pub struct BenchReport {
    /// Number of corpus rows scored.
    pub n: usize,
    /// Components per vector.
    pub dim: usize,
    /// Name of the f32 SIMD kernel that was benchmarked.
    pub simd_kernel: &'static str,
    /// One entry per benchmarked kernel.
    pub timings: Vec<KernelTiming>,
}

impl BenchReport {
    /// Speedup of the SIMD f32 kernel over the scalar baseline.
    pub fn simd_speedup(&self) -> f64 {
        self.speedup_over_scalar("simd")
    }

    /// Speedup of the int8 kernel over the scalar baseline.
    pub fn int8_speedup(&self) -> f64 {
        self.speedup_over_scalar("int8")
    }

    /// Ratio of the `"scalar"` entry's time to that of the first entry whose
    /// name starts with `prefix`. Falls back to `1.0` when either entry is
    /// missing or the candidate's time is zero.
    fn speedup_over_scalar(&self, prefix: &str) -> f64 {
        let baseline = self.timings.iter().find(|t| t.name == "scalar");
        let candidate = self.timings.iter().find(|t| t.name.starts_with(prefix));
        if let (Some(base), Some(cand)) = (baseline, candidate) {
            if cand.micros > 0 {
                return base.micros as f64 / cand.micros as f64;
            }
        }
        1.0
    }
}

/// Benchmark the CPU cosine kernels (G2) on a deterministic synthetic corpus of
/// `n` `dim`-dim vectors: scalar f32, the runtime-selected SIMD f32 kernel, and
/// the int8 (VNNI/scalar) kernel. Every kernel scores the *same* single query
/// against *all* rows; the int8 errors are measured against the scalar f32
/// reference so the bench doubles as a correctness check. `iters` averages out
/// noise. Pure-Rust, no external bench dep — drives `nornir vector bench`.
///
/// `iters` is raised to at least 1. The returned report lists the timings in
/// the order scalar, SIMD, int8; each `micros` is the total elapsed time
/// divided by `iters` (integer division).
pub fn bench_kernels(n: usize, dim: usize, iters: usize) -> BenchReport {
    use std::time::Instant;
    let iters = iters.max(1);

    // Deterministic corpus + query, normalized like real stored vectors.
    // Component `i` of a vector is `sin((i + 1) * seed)`.
    let mk = |seed: f32| -> Vec<f32> {
        let v: Vec<f32> = (0..dim).map(|i| ((i as f32 + 1.0) * seed).sin()).collect();
        normalized(&v)
    };
    let mut data = Vec::with_capacity(n * dim);
    for r in 0..n {
        data.extend(mk(0.001 + r as f32 * 0.0003));
    }
    let query = mk(0.737);
    let qn = normalized(&query);

    // Scalar reference scores (also the correctness oracle).
    let mut reference = vec![0f32; n];
    for (r, slot) in reference.iter_mut().enumerate() {
        let row = &data[r * dim..(r + 1) * dim];
        // SAFETY: dot_scalar is always sound.
        *slot = unsafe { dot_scalar(&qn, row) };
    }

    // Rows per microsecond == millions of dot-products per second; a zero
    // duration reports 0 instead of dividing by zero.
    let dps = |micros: u128| -> f64 {
        if micros == 0 { 0.0 } else { (n as f64) / (micros as f64) }
    };

    let mut timings = Vec::new();

    // 1. Scalar.
    let t = Instant::now();
    for _ in 0..iters {
        for r in 0..n {
            let row = &data[r * dim..(r + 1) * dim];
            // black_box keeps the optimizer from discarding the unused result.
            std::hint::black_box(unsafe { dot_scalar(&qn, row) });
        }
    }
    let micros = t.elapsed().as_micros() / iters as u128;
    timings.push(KernelTiming { name: "scalar".into(), micros, mdps: dps(micros), max_err: 0.0 });

    // 2. SIMD f32 (runtime-selected).
    let kernel = select_dot_kernel();
    let simd_kernel = active_simd();
    let t = Instant::now();
    let mut simd_err = 0f32;
    for it in 0..iters {
        for r in 0..n {
            let row = &data[r * dim..(r + 1) * dim];
            // SAFETY: kernel matches a confirmed CPU feature (or scalar).
            let s = unsafe { kernel(&qn, row) };
            std::hint::black_box(s);
            // The error vs the reference is collected on the first pass only
            // (this bookkeeping sits inside the timed region).
            if it == 0 {
                simd_err = simd_err.max((s - reference[r]).abs());
            }
        }
    }
    let micros = t.elapsed().as_micros() / iters as u128;
    timings.push(KernelTiming {
        name: format!("simd ({simd_kernel})"),
        micros,
        mdps: dps(micros),
        max_err: simd_err,
    });

    // 3. int8 (VNNI / scalar). Quantize once, score per iter.
    let mut rows = Vec::with_capacity(n * dim);
    let mut sums = Vec::with_capacity(n);
    for r in 0..n {
        sums.push(quantize_i8(&data[r * dim..(r + 1) * dim], &mut rows));
    }
    let i8_kernel = if vnni_available() { "vnni" } else { "scalar" };
    let t = Instant::now();
    let mut i8_scores = Vec::new();
    // Each pass includes score_i8_batch's per-call query quantization and
    // its output allocation.
    for _ in 0..iters {
        i8_scores = score_i8_batch(&query, &rows, dim, &sums);
        std::hint::black_box(&i8_scores);
    }
    let micros = t.elapsed().as_micros() / iters as u128;
    // Max absolute deviation of the last pass's scores from the f32 reference.
    let i8_err = i8_scores
        .iter()
        .zip(&reference)
        .map(|(a, b)| (a - b).abs())
        .fold(0f32, f32::max);
    timings.push(KernelTiming {
        name: format!("int8 ({i8_kernel})"),
        micros,
        mdps: dps(micros),
        max_err: i8_err,
    });

    BenchReport { n, dim, simd_kernel, timings }
}

// ----- helpers ---------------------------------------------------------------

/// Number of worker threads to use for scoring `n` rows. 1 for small corpora.
fn thread_count(n: usize) -> usize {
    if n < 2 * MIN_ROWS_PER_THREAD {
        return 1;
    }
    let hw = std::thread::available_parallelism()
        .map(|x| x.get())
        .unwrap_or(1);
    hw.min(n / MIN_ROWS_PER_THREAD).max(1)
}

/// Return a copy of `v` scaled to unit L2 length. When the norm is not
/// strictly positive (e.g. a zero vector), the copy is returned unscaled.
fn normalized(v: &[f32]) -> Vec<f32> {
    let norm = v.iter().map(|&x| x * x).sum::<f32>().sqrt();
    let mut out = v.to_vec();
    if norm > 0.0 {
        for x in &mut out {
            *x /= norm;
        }
    }
    out
}

/// Append `row` to `data`, scaled to unit L2 length. A row whose norm is not
/// strictly positive (e.g. all zeros) is appended as-is.
fn push_normalized(data: &mut Vec<f32>, row: &[f32]) {
    let start = data.len();
    data.extend_from_slice(row);
    let norm = row.iter().map(|&x| x * x).sum::<f32>().sqrt();
    if norm > 0.0 {
        // Scale only the freshly appended tail.
        for x in &mut data[start..] {
            *x /= norm;
        }
    }
}

/// Reduce `v` to its top-`m` by descending score, sorted. `f32::total_cmp`
/// gives a deterministic order even for NaN.
///
/// `m == 0` empties `v` (previously this underflowed `m - 1` and panicked on
/// a non-empty input). When `v` already has `m` or fewer entries it is only
/// sorted.
fn top_k(v: &mut Vec<(u64, f32)>, m: usize) {
    if m == 0 {
        v.clear();
        return;
    }
    if v.len() > m {
        // Partition so the best `m` occupy the front, then drop the rest.
        v.select_nth_unstable_by(m - 1, |a, b| b.1.total_cmp(&a.1));
        v.truncate(m);
    }
    v.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
}

// Unit tests for the vector index: construction and input validation,
// add / remove / search, on-disk round-trip, agreement of the SIMD and int8
// kernels with the scalar f32 path, the multithreaded scoring path, and the
// kernel bench report.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `dim`-long vector that is 1.0 at index `axis` and 0.0
    /// everywhere else. Panics (index out of bounds) if `axis >= dim`.
    fn unit(dim: usize, axis: usize) -> Vec<f32> {
        let mut v = vec![0.0f32; dim];
        v[axis] = 1.0;
        v
    }

    /// `VectorIndex::new(0)` must fail, and the error text must mention
    /// "non-zero".
    #[test]
    fn rejects_zero_dim() {
        match VectorIndex::new(0) {
            Ok(_) => panic!("dim 0 should be rejected"),
            Err(e) => assert!(e.to_string().contains("non-zero"), "{e}"),
        }
    }

    /// Adds three mutually orthogonal unit vectors and checks that a query
    /// leaning mostly toward axis 0 returns ids 10 then 20, scores descending.
    #[test]
    fn add_and_search_nearest() {
        let mut idx = VectorIndex::new(8).unwrap();
        idx.add(&unit(8, 0), &[10]).unwrap();
        idx.add(&unit(8, 1), &[20]).unwrap();
        idx.add(&unit(8, 2), &[30]).unwrap();
        assert_eq!(idx.len(), 3);
        assert!(!idx.is_empty());

        // The query has a small axis-1 component and none along axis 2, so
        // id 20 outranks id 30 for the runner-up slot.
        let mut q = unit(8, 0);
        q[1] = 0.1; // mostly axis-0
        let hits = idx.search(&q, 2);
        assert_eq!(hits.len(), 2);
        assert_eq!(hits[0].0, 10, "nearest is the axis-0 vector");
        assert_eq!(hits[1].0, 20, "runner-up is the axis-1 vector");
        assert!(hits[0].1 > hits[1].1, "scores sorted descending");
    }

    /// A flat buffer whose length does not match `ids.len()` rows of `dim`
    /// floats must be rejected with an error mentioning "!= ids len".
    #[test]
    fn add_rejects_wrong_buffer_len() {
        let mut idx = VectorIndex::new(8).unwrap();
        // 4 floats supplied for one id in an 8-dim index.
        let err = idx.add(&[1.0, 2.0, 3.0, 4.0], &[1]).unwrap_err();
        assert!(err.to_string().contains("!= ids len"), "{err}");
    }

    /// Re-adding an id that is already stored, or repeating an id within a
    /// single `add` call, must fail with an error naming the duplicate id.
    #[test]
    fn add_rejects_duplicate_id() {
        let mut idx = VectorIndex::new(8).unwrap();
        idx.add(&unit(8, 0), &[7]).unwrap();
        let err = idx.add(&unit(8, 1), &[7]).unwrap_err();
        assert!(err.to_string().contains("duplicate id 7"), "{err}");
        // and a duplicate within the same call
        let mut two = unit(8, 0);
        two.extend(unit(8, 1));
        let err = idx.add(&two, &[9, 9]).unwrap_err();
        assert!(err.to_string().contains("duplicate id 9"), "{err}");
    }

    /// `remove` returns true only the first time an id is removed,
    /// `contains` and `len` reflect the removal, and the remaining ids still
    /// map to the right vectors.
    #[test]
    fn remove_and_contains() {
        let mut idx = VectorIndex::new(8).unwrap();
        idx.add(&unit(8, 0), &[10]).unwrap();
        idx.add(&unit(8, 1), &[20]).unwrap();
        idx.add(&unit(8, 2), &[30]).unwrap();
        assert!(idx.contains(20));
        assert!(idx.remove(20));
        assert!(!idx.contains(20));
        assert!(!idx.remove(20), "second remove is a no-op");
        assert_eq!(idx.len(), 2);
        // Surviving ids still searchable and correctly mapped.
        let hits = idx.search(&unit(8, 2), 1);
        assert_eq!(hits[0].0, 30);
    }

    /// Writes an index to a file in a temporary directory, loads it back, and
    /// checks that the row count, dimension and nearest-neighbour result
    /// survive the round trip.
    #[test]
    fn write_then_load_roundtrips() {
        let mut idx = VectorIndex::new(8).unwrap();
        idx.add(&unit(8, 0), &[10]).unwrap();
        idx.add(&unit(8, 1), &[20]).unwrap();
        idx.add(&unit(8, 2), &[30]).unwrap();
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("basis.nvf");
        idx.write(&path).unwrap();

        let loaded = VectorIndex::load(&path).unwrap();
        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded.dim(), 8);
        let hits = loaded.search(&unit(8, 2), 1);
        assert_eq!(hits[0].0, 30, "nearest to axis-2 query is id 30");
    }

    /// `load` must return an error, not a bogus index, for a file that does
    /// not start with a valid header.
    #[test]
    fn load_rejects_corrupt_header() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("bad.nvf");
        // 16 bytes beginning with "NOPE" — NOTE(review): presumably chosen so
        // the magic check fails; the expected magic is defined elsewhere.
        std::fs::write(&path, b"NOPExxxxxxxxxxxx").unwrap();
        assert!(VectorIndex::load(&path).is_err());
    }

    /// Exercises search at the production dimension (768 = jina dim): a query
    /// identical to one stored row must return that row first with a
    /// self-cosine of ~1.0.
    ///
    /// NOTE(review): this test does not compute an independent scalar
    /// reference; the direct kernel-vs-scalar comparison lives in
    /// `simd_kernel_matches_scalar_dot`.
    #[test]
    fn high_dim_search_matches_reference() {
        let dim = 768;
        let mut idx = VectorIndex::new(dim).unwrap();
        // Three distinct directions built deterministically.
        let mk = |seed: f32| -> Vec<f32> { (0..dim).map(|i| (i as f32 * seed).sin()).collect() };
        let a = mk(0.013);
        let b = mk(0.027);
        let c = mk(0.041);
        idx.add(&a, &[1]).unwrap();
        idx.add(&b, &[2]).unwrap();
        idx.add(&c, &[3]).unwrap();

        // Query == b's direction → b must win.
        let hits = idx.search(&b, 1);
        assert_eq!(hits[0].0, 2);
        // Self-cosine of a normalized vector is ~1.0.
        assert!((hits[0].1 - 1.0).abs() < 1e-3, "score {}", hits[0].1);
    }

    /// Triggers the multicore path (n well above the spawn threshold) and
    /// checks that a uniquely-aligned vector is still found exactly.
    ///
    /// NOTE(review): `thread_count` is capped by
    /// `std::thread::available_parallelism()`, so the `> 1` assertion below
    /// fails on a host that reports a single hardware thread.
    #[test]
    fn parallel_path_finds_exact_match() {
        let dim = 32;
        // 4× the per-thread minimum, i.e. 4096 rows if MIN_ROWS_PER_THREAD is
        // 1024 (the constant is defined outside this module).
        let n = 4 * MIN_ROWS_PER_THREAD; // 4096 → multithreaded
        let mut idx = VectorIndex::new(dim).unwrap();
        let target_id = 1234u64;
        // Most vectors point along axis 1; the target points along axis 0.
        let mut flat = Vec::with_capacity(n * dim);
        let mut ids = Vec::with_capacity(n);
        for j in 0..n as u64 {
            let axis = if j == target_id { 0 } else { 1 };
            flat.extend(unit(dim, axis));
            ids.push(j);
        }
        idx.add(&flat, &ids).unwrap();
        assert!(thread_count(idx.len()) > 1, "test should hit the parallel path");

        // Every other row is orthogonal to the query, so the target is the
        // only row with a non-zero score.
        let hits = idx.search(&unit(dim, 0), 1);
        assert_eq!(hits[0].0, target_id, "the lone axis-0 vector wins");
    }

    /// `active_simd()` must report one of the three kernel names this module
    /// knows about.
    #[test]
    fn active_simd_is_known() {
        let s = active_simd();
        assert!(
            matches!(s, "avx512f" | "avx2+fma" | "scalar"),
            "unexpected kernel {s}"
        );
    }

    // ----- G2: SIMD/int8 correctness + bench --------------------------------

    /// LAW (inject-assert): the runtime SIMD f32 kernel must produce the SAME
    /// dot product as the scalar reference, within f32 rounding tolerance, on
    /// real injected high-dim vectors — not merely "didn't crash".
    #[test]
    fn simd_kernel_matches_scalar_dot() {
        // jina dim. 768 is a multiple of both 8 and 16, so with 8- or 16-lane
        // f32 kernels every element falls in a full-width chunk.
        // NOTE(review): no remainder tail is exercised at this dim — consider
        // also testing a dim that is not a multiple of the lane width.
        let dim = 768; // jina dim — exercises the AVX-512/AVX2 tail handling
        let a: Vec<f32> = (0..dim).map(|i| ((i as f32 + 1.0) * 0.013).sin()).collect();
        let b: Vec<f32> = (0..dim).map(|i| ((i as f32 + 1.0) * 0.027).cos()).collect();
        let an = normalized(&a);
        let bn = normalized(&b);
        // SAFETY (NOTE(review)): `dot_scalar`'s contract is declared outside
        // this module; presumably it requires equal-length slices, which
        // holds here (both are `dim` long) — confirm.
        let scalar = unsafe { dot_scalar(&an, &bn) };
        let kernel = select_dot_kernel();
        // SAFETY (NOTE(review)): presumably `select_dot_kernel` only returns
        // a kernel whose CPU features were detected at runtime — confirm.
        let simd = unsafe { kernel(&an, &bn) };
        assert!(
            (scalar - simd).abs() < 1e-5,
            "SIMD {} dot {simd} != scalar {scalar}",
            active_simd()
        );
    }

    /// LAW (inject-assert): the int8 (VNNI or scalar) cosine must match the
    /// scalar f32 cosine within the quantization tolerance, and rank the same
    /// nearest vector. Injects a corpus + a query and asserts both score
    /// agreement and ranking agreement against the f32 path.
    #[test]
    fn int8_matches_f32_within_tolerance() {
        let dim = 768;
        let n = 200;
        let mut idx = VectorIndex::new(dim).unwrap();
        // Deterministic pseudo-embedding: one sine wave per seed.
        let mk = |seed: f32| -> Vec<f32> {
            (0..dim).map(|i| ((i as f32 + 1.0) * seed).sin()).collect()
        };
        let mut flat = Vec::with_capacity(n * dim);
        let mut ids = Vec::with_capacity(n);
        for r in 0..n as u64 {
            flat.extend(mk(0.005 + r as f32 * 0.0007));
            ids.push(r);
        }
        idx.add(&flat, &ids).unwrap();

        // Same seed expression as row 42 above, so the query equals that row.
        let query = mk(0.005 + 42.0 * 0.0007); // == row 42's direction
        let f32_hits = idx.search(&query, 5);
        let i8_hits = idx.search_i8(&query, 5);

        // Same top-1 (row 42 wins under both kernels).
        assert_eq!(f32_hits[0].0, 42, "f32 top-1 should be the matching row");
        assert_eq!(i8_hits[0].0, f32_hits[0].0, "int8 top-1 disagrees with f32");

        // Scores agree within the quantization tolerance for the shared ids.
        // Ids that appear in only one of the two top-5 lists are skipped.
        use std::collections::HashMap;
        let f32_map: HashMap<u64, f32> = f32_hits.iter().copied().collect();
        for (id, s8) in &i8_hits {
            if let Some(s32) = f32_map.get(id) {
                // int8 quantization of 768 components accumulates ≲4e-2 of
                // absolute cosine error on high-similarity pairs; the ranking
                // (asserted above) is what matters, the score is approximate.
                assert!(
                    (s8 - s32).abs() < 4e-2,
                    "int8 cosine {s8} vs f32 {s32} for id {id} exceeds tolerance"
                );
            }
        }
    }

    /// LAW (inject-assert): the bench must run every kernel, return real timing
    /// numbers (not zero on a non-trivial corpus), and its int8 path must stay
    /// within tolerance — proving the bench is also a correctness gate. We
    /// assert a concrete bench number exists per the G2 requirement.
    #[test]
    fn bench_kernels_reports_real_numbers() {
        // 2000 rows × 768 dims, 3 timed iterations per kernel.
        let rep = bench_kernels(2000, 768, 3);
        assert_eq!(rep.timings.len(), 3, "expected scalar+simd+int8 timings");
        assert!(rep.timings.iter().any(|t| t.name == "scalar"));
        assert!(rep.timings.iter().any(|t| t.name.starts_with("simd")));
        assert!(rep.timings.iter().any(|t| t.name.starts_with("int8")));
        // Real wall-clock numbers on a 2000×768 corpus.
        for t in &rep.timings {
            assert!(t.micros > 0, "kernel {} reported 0µs", t.name);
            assert!(t.mdps > 0.0, "kernel {} reported 0 Mdps", t.name);
        }
        // The SIMD/int8 paths agree with the scalar reference.
        let simd = rep.timings.iter().find(|t| t.name.starts_with("simd")).unwrap();
        assert!(simd.max_err < 1e-4, "simd error {} too high", simd.max_err);
        let i8 = rep.timings.iter().find(|t| t.name.starts_with("int8")).unwrap();
        assert!(i8.max_err < 4e-2, "int8 error {} too high", i8.max_err);
        // The speedup is computable and plausible. The bound is 0.5×, not 1×
        // — NOTE(review): presumably deliberately loose to tolerate timing
        // noise on a 3-iteration run.
        assert!(rep.simd_speedup() > 0.5, "implausible simd speedup {}", rep.simd_speedup());
    }
}