lucisearch 0.8.0

//! Vector search: HNSW graph construction, search, and kNN queries.
//!
//! See [[hierarchical-navigable-small-world]] and [[architecture-overview|milestone-4]].

use crate::core::LuciError;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::{vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32};

pub mod global;
pub mod hnsw;
pub mod quantize;
pub mod query;

#[cfg(test)]
mod distance_tests;

/// Metric used to compare two vectors. Every variant yields a distance,
/// so a smaller value always means a closer match.
///
/// The enum is `#[repr(u8)]` with explicit discriminants because that
/// byte is the on-disk encoding used by HNSW and quantized-vector
/// segment blobs. When adding a variant, assign it the next unused
/// discriminant and extend [`Self::from_byte`] in the same change. See
/// [[code-must-not-lie]].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum DistanceMetric {
    /// `1 - cosine_similarity`; smaller is more similar.
    Cosine = 0,
    /// Negated dot product; smaller is more similar (intended for
    /// pre-normalized vectors).
    DotProduct = 1,
    /// Euclidean (L2) distance; smaller is more similar.
    L2 = 2,
}

impl DistanceMetric {
    /// Decode a metric byte that was written with `metric as u8`.
    ///
    /// # Panics
    ///
    /// Panics on any byte that is not a known discriminant. Such a byte
    /// means the segment is corrupted or was written by a newer Luci
    /// version with a metric this build does not know. Falling back to a
    /// default (e.g., L2) instead would silently produce wrong recall
    /// and scoring, so the failure is made loud on purpose.
    pub fn from_byte(byte: u8) -> Self {
        // Take the accepted bytes from the variants themselves so the
        // decoder always agrees with the pinned discriminants.
        const COSINE_BYTE: u8 = DistanceMetric::Cosine as u8;
        const DOT_PRODUCT_BYTE: u8 = DistanceMetric::DotProduct as u8;
        const L2_BYTE: u8 = DistanceMetric::L2 as u8;

        match byte {
            COSINE_BYTE => Self::Cosine,
            DOT_PRODUCT_BYTE => Self::DotProduct,
            L2_BYTE => Self::L2,
            other => panic!(
                "unknown distance metric byte {other}: segment is corrupted \
                 or was written by a newer version of Luci"
            ),
        }
    }
}

/// Compute distance between two vectors.
pub fn distance(a: &[f32], b: &[f32], metric: DistanceMetric) -> f32 {
    debug_assert_eq!(a.len(), b.len());
    match metric {
        DistanceMetric::Cosine => cosine_distance_normalized(a, b),
        DistanceMetric::DotProduct => -dot_product(a, b),
        DistanceMetric::L2 => l2_distance(a, b),
    }
}

/// f32 dot product.
///
/// Callers are expected to pass equal-length slices; this is asserted in
/// debug builds. If the lengths ever differ in a release build, only the
/// first `min(a.len(), b.len())` elements are used on every architecture,
/// so a mismatch can never read out of bounds.
///
/// On aarch64 dispatches to an explicit NEON kernel with 4 parallel
/// FMA accumulators × 4 lanes (16-element block) + 4-element middle
/// + scalar tail. Rust's strict float-associativity forbids the
/// auto-vectorizer from emitting the parallel-accumulator pattern
/// that hnswlib's C scalar relies on; only explicit intrinsics close
/// the gap. See [[optimization-vector-distance-kernel-trait]] §"Phase 1.x"
/// and [[vector-bench-glove100-global-hnsw]] §"SIMD kernel gap".
///
/// Other architectures use the iterator chain, which the LLVM
/// auto-vectorizer handles adequately on x86_64 with AVX2.
fn dot_product(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len());
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON is part of the AArch64 baseline ISA; always available.
        // The kernel bounds every read by the shorter slice, so there is no
        // length precondition to uphold here.
        unsafe { dot_product_neon(a, b) }
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
    }
}

/// NEON dot-product kernel over the first `min(a.len(), b.len())` elements.
///
/// Processes 16 elements per iteration into four independent accumulators,
/// then 4 elements at a time into the first accumulator, then finishes the
/// remaining 0-3 elements with scalar arithmetic.
///
/// # Safety
///
/// The executing CPU must support NEON (always true on AArch64). The slices
/// themselves carry no extra precondition.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn dot_product_neon(a: &[f32], b: &[f32]) -> f32 {
    // Bound the loops by the SHORTER slice. Using `a.len()` alone would read
    // past the end of `b` whenever `b` is shorter, and the caller's length
    // check is compiled out in release builds.
    let n = a.len().min(b.len());
    let a_ptr = a.as_ptr();
    let b_ptr = b.as_ptr();

    // SAFETY: caller guarantees NEON via target_feature; pointer arithmetic
    // stays in bounds of BOTH slices because `n` is the shorter length and
    // every load is gated by `i + N <= n`.
    unsafe {
        let mut acc0 = vdupq_n_f32(0.0);
        let mut acc1 = vdupq_n_f32(0.0);
        let mut acc2 = vdupq_n_f32(0.0);
        let mut acc3 = vdupq_n_f32(0.0);

        let mut i = 0;
        // Main loop: four independent FMA chains hide the FMA latency.
        while i + 16 <= n {
            let a0 = vld1q_f32(a_ptr.add(i));
            let a1 = vld1q_f32(a_ptr.add(i + 4));
            let a2 = vld1q_f32(a_ptr.add(i + 8));
            let a3 = vld1q_f32(a_ptr.add(i + 12));
            let b0 = vld1q_f32(b_ptr.add(i));
            let b1 = vld1q_f32(b_ptr.add(i + 4));
            let b2 = vld1q_f32(b_ptr.add(i + 8));
            let b3 = vld1q_f32(b_ptr.add(i + 12));
            acc0 = vfmaq_f32(acc0, a0, b0);
            acc1 = vfmaq_f32(acc1, a1, b1);
            acc2 = vfmaq_f32(acc2, a2, b2);
            acc3 = vfmaq_f32(acc3, a3, b3);
            i += 16;
        }
        // Middle: leftover full 4-lane vectors.
        while i + 4 <= n {
            let av = vld1q_f32(a_ptr.add(i));
            let bv = vld1q_f32(b_ptr.add(i));
            acc0 = vfmaq_f32(acc0, av, bv);
            i += 4;
        }
        // Reduce the four accumulators pairwise, then across lanes.
        let acc = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
        let mut sum = vaddvq_f32(acc);
        // Scalar tail: the final 0-3 elements.
        while i < n {
            sum += *a_ptr.add(i) * *b_ptr.add(i);
            i += 1;
        }
        sum
    }
}

/// Cosine distance for vectors that are already unit length.
///
/// Both inputs must be normalized beforehand; this is a builder invariant
/// for [`DistanceMetric::Cosine`] (see [[optimize-cosine-norm-precompute]]).
/// No norm is computed here, so inputs that are not unit length give a
/// value that is not a true cosine distance. The L2 / DotProduct kernels
/// are unaffected by this invariant.
fn cosine_distance_normalized(a: &[f32], b: &[f32]) -> f32 {
    // For unit vectors the dot product IS the cosine similarity (in [-1, 1]),
    // so the distance is simply its complement.
    let similarity = dot_product(a, b);
    1.0 - similarity
}

/// Scale `v` in place so that it has unit length.
///
/// The squared norm is accumulated in f32. When it is already within 1e-4
/// of 1.0 the vector is left untouched, which makes the call idempotent
/// and absorbs f32 rounding from vectors normalized in f64 elsewhere and
/// downcast.
///
/// # Errors
///
/// Returns [`LuciError::InvalidQuery`] and leaves `v` unmodified when the
/// squared norm is zero (including subnormal components whose squares
/// underflow to zero), NaN, or infinite. The builder propagates the error
/// so a cosine index never silently embeds a degenerate vector.
///
/// See [[optimize-cosine-norm-precompute]] and [[code-must-not-lie]].
pub fn normalize_in_place(v: &mut [f32]) -> Result<(), LuciError> {
    let squared_norm: f32 = v.iter().map(|&x| x * x).sum();

    // A direction only exists for a finite, non-zero norm.
    let has_direction = squared_norm.is_finite() && squared_norm != 0.0;
    if !has_direction {
        return Err(LuciError::InvalidQuery(
            "zero-length / non-finite vector not supported with cosine \
             metric — use metric: dot_product to bypass normalization"
                .into(),
        ));
    }

    // Skip the multiply for inputs that are already (numerically) unit
    // length so repeated calls cannot accumulate drift.
    let already_unit = (squared_norm - 1.0).abs() < 1e-4;
    if !already_unit {
        let scale = 1.0 / squared_norm.sqrt();
        v.iter_mut().for_each(|x| *x *= scale);
    }
    Ok(())
}

/// f32 L2 (Euclidean) distance.
///
/// Callers are expected to pass equal-length slices; this is asserted in
/// debug builds. If the lengths ever differ in a release build, only the
/// first `min(a.len(), b.len())` elements are used on every architecture,
/// so a mismatch can never read out of bounds.
///
/// Same shape and rationale as [`dot_product`] — explicit NEON on
/// aarch64 with 4 parallel FMA accumulators on `(a-b)²`, iterator
/// chain elsewhere. See [[optimization-vector-distance-kernel-trait]]
/// §"Phase 1.x".
fn l2_distance(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len());
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON is part of the AArch64 baseline ISA; always available.
        // The kernel bounds every read by the shorter slice, so there is no
        // length precondition to uphold here.
        unsafe { l2_distance_neon(a, b) }
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        a.iter()
            .zip(b.iter())
            .map(|(x, y)| (x - y) * (x - y))
            .sum::<f32>()
            .sqrt()
    }
}

/// NEON L2 kernel over the first `min(a.len(), b.len())` elements.
///
/// Accumulates `(a - b)²` 16 elements per iteration into four independent
/// accumulators, then 4 elements at a time into the first accumulator,
/// finishes the remaining 0-3 elements in scalar code, and returns the
/// square root of the total.
///
/// # Safety
///
/// The executing CPU must support NEON (always true on AArch64). The slices
/// themselves carry no extra precondition.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn l2_distance_neon(a: &[f32], b: &[f32]) -> f32 {
    // Bound the loops by the SHORTER slice. Using `a.len()` alone would read
    // past the end of `b` whenever `b` is shorter, and the caller's length
    // check is compiled out in release builds.
    let n = a.len().min(b.len());
    let a_ptr = a.as_ptr();
    let b_ptr = b.as_ptr();

    // SAFETY: caller guarantees NEON via target_feature; pointer arithmetic
    // stays in bounds of BOTH slices because `n` is the shorter length and
    // every load is gated by `i + N <= n`.
    unsafe {
        let mut acc0 = vdupq_n_f32(0.0);
        let mut acc1 = vdupq_n_f32(0.0);
        let mut acc2 = vdupq_n_f32(0.0);
        let mut acc3 = vdupq_n_f32(0.0);

        let mut i = 0;
        // Main loop: four independent FMA chains hide the FMA latency.
        while i + 16 <= n {
            let a0 = vld1q_f32(a_ptr.add(i));
            let a1 = vld1q_f32(a_ptr.add(i + 4));
            let a2 = vld1q_f32(a_ptr.add(i + 8));
            let a3 = vld1q_f32(a_ptr.add(i + 12));
            let b0 = vld1q_f32(b_ptr.add(i));
            let b1 = vld1q_f32(b_ptr.add(i + 4));
            let b2 = vld1q_f32(b_ptr.add(i + 8));
            let b3 = vld1q_f32(b_ptr.add(i + 12));
            let d0 = vsubq_f32(a0, b0);
            let d1 = vsubq_f32(a1, b1);
            let d2 = vsubq_f32(a2, b2);
            let d3 = vsubq_f32(a3, b3);
            acc0 = vfmaq_f32(acc0, d0, d0);
            acc1 = vfmaq_f32(acc1, d1, d1);
            acc2 = vfmaq_f32(acc2, d2, d2);
            acc3 = vfmaq_f32(acc3, d3, d3);
            i += 16;
        }
        // Middle: leftover full 4-lane vectors.
        while i + 4 <= n {
            let av = vld1q_f32(a_ptr.add(i));
            let bv = vld1q_f32(b_ptr.add(i));
            let d = vsubq_f32(av, bv);
            acc0 = vfmaq_f32(acc0, d, d);
            i += 4;
        }
        // Reduce the four accumulators pairwise, then across lanes.
        let acc = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
        let mut sum = vaddvq_f32(acc);
        // Scalar tail: the final 0-3 elements.
        while i < n {
            let d = *a_ptr.add(i) - *b_ptr.add(i);
            sum += d * d;
            i += 1;
        }
        sum.sqrt()
    }
}

/// Map a raw distance onto a relevance score (higher = better match).
///
/// The formulas mirror Lucene's `VectorSimilarityFunction`:
/// - Cosine: `max((1 + cos_sim) / 2, 0)` with `cos_sim = 1 - distance`,
///   i.e. `max((2 - distance) / 2, 0)`
/// - DotProduct: `max((1 + dot) / 2, 0)` with `dot = -distance`,
///   i.e. `max((1 - distance) / 2, 0)`
/// - L2: `1 / (1 + distance²)` — never exceeds 1, so it is not clamped
///
/// Like Lucene's `VectorUtil.normalizeToUnitInterval()`, the result is
/// floored at 0 but NOT capped at 1: DotProduct scores above 1 occur when
/// the normalization contract is violated (unnormalized vectors).
///
/// See [[feature-knn-query-type#2b]].
pub fn distance_to_score(raw_distance: f32, metric: DistanceMetric) -> f32 {
    // Cosine and DotProduct share the `max(numerator / 2, 0)` shape and
    // differ only in the numerator; L2 has its own formula and returns early.
    let numerator = match metric {
        DistanceMetric::L2 => {
            let squared = raw_distance * raw_distance;
            return 1.0 / (1.0 + squared);
        }
        // 1 + cos_sim, with cos_sim = 1 - distance.
        DistanceMetric::Cosine => 2.0 - raw_distance,
        // 1 + dot, with dot = -distance.
        DistanceMetric::DotProduct => 1.0 - raw_distance,
    };
    (numerator / 2.0).max(0.0)
}

// Unit tests for this module: the `distance` kernels for each metric,
// `distance_to_score`, `DistanceMetric::from_byte`, and
// `normalize_in_place`.
#[cfg(test)]
mod tests {
    use super::*;

    // --- distance tests ---

    #[test]
    fn cosine_identical() {
        // The cosine kernel requires unit-length inputs. Normalize before
        // comparing to honor the builder invariant.
        let mut v = vec![1.0, 2.0, 3.0];
        normalize_in_place(&mut v).unwrap();
        let d = distance(&v, &v, DistanceMetric::Cosine);
        assert!(
            d.abs() < 1e-5,
            "identical vectors should have cosine distance ~0, got {d}"
        );
    }

    #[test]
    fn cosine_orthogonal() {
        // Inputs are already unit length, so no normalization is needed.
        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];
        let d = distance(&a, &b, DistanceMetric::Cosine);
        assert!(
            (d - 1.0).abs() < 1e-5,
            "orthogonal vectors should have cosine distance ~1, got {d}"
        );
    }

    #[test]
    fn cosine_opposite() {
        // Antiparallel unit vectors: dot = -1, so distance = 1 - (-1) = 2.
        let a = vec![1.0, 0.0];
        let b = vec![-1.0, 0.0];
        let d = distance(&a, &b, DistanceMetric::Cosine);
        assert!(
            (d - 2.0).abs() < 1e-5,
            "opposite vectors should have cosine distance ~2, got {d}"
        );
    }

    #[test]
    fn dot_product_metric() {
        let a = vec![1.0, 2.0];
        let b = vec![3.0, 4.0];
        let d = distance(&a, &b, DistanceMetric::DotProduct);
        // dot = 1*3 + 2*4 = 11, negated = -11
        assert_eq!(d, -11.0);
    }

    #[test]
    fn l2_distance_metric() {
        // 3-4-5 right triangle measured from the origin.
        let a = vec![0.0, 0.0];
        let b = vec![3.0, 4.0];
        let d = distance(&a, &b, DistanceMetric::L2);
        assert!((d - 5.0).abs() < 1e-5, "L2 distance should be 5.0, got {d}");
    }

    #[test]
    fn l2_identical() {
        // A vector is at L2 distance 0 from itself.
        let v = vec![1.0, 2.0, 3.0];
        let d = distance(&v, &v, DistanceMetric::L2);
        assert!(d.abs() < 1e-5);
    }

    // (Pre-fix `zero_vector_cosine` test removed: the kernel now requires
    // unit-length inputs, so zero/raw vectors no longer flow into
    // `distance(..., Cosine)`. Zero-vector rejection is now tested via
    // `normalize_in_place_zero_errors` and the builder/bulk tests in
    // hnsw.rs.)

    #[test]
    fn unit_vectors() {
        // Orthogonal unit basis vectors: cosine distance 1, L2 distance √2.
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        let d_cos = distance(&a, &b, DistanceMetric::Cosine);
        let d_l2 = distance(&a, &b, DistanceMetric::L2);
        assert!((d_cos - 1.0).abs() < 1e-5);
        assert!((d_l2 - std::f32::consts::SQRT_2).abs() < 1e-5);
    }

    // --- distance_to_score tests ---

    #[test]
    fn cosine_score_identical() {
        // cos_sim = 1 → distance = 0 → score = (2-0)/2 = 1.0
        let s = distance_to_score(0.0, DistanceMetric::Cosine);
        assert!(
            (s - 1.0).abs() < 1e-5,
            "identical vectors: score={s}, expected 1.0"
        );
    }

    #[test]
    fn cosine_score_orthogonal() {
        // cos_sim = 0 → distance = 1 → score = (2-1)/2 = 0.5
        let s = distance_to_score(1.0, DistanceMetric::Cosine);
        assert!(
            (s - 0.5).abs() < 1e-5,
            "orthogonal vectors: score={s}, expected 0.5"
        );
    }

    #[test]
    fn cosine_score_opposite() {
        // cos_sim = -1 → distance = 2 → score = (2-2)/2 = 0.0
        let s = distance_to_score(2.0, DistanceMetric::Cosine);
        assert!(s.abs() < 1e-5, "opposite vectors: score={s}, expected 0.0");
    }

    #[test]
    fn l2_score_identical() {
        // distance = 0 → score = 1/(1+0) = 1.0
        let s = distance_to_score(0.0, DistanceMetric::L2);
        assert!((s - 1.0).abs() < 1e-5, "identical: score={s}, expected 1.0");
    }

    #[test]
    fn l2_score_unit_distance() {
        // distance = 1 → score = 1/(1+1) = 0.5
        let s = distance_to_score(1.0, DistanceMetric::L2);
        assert!(
            (s - 0.5).abs() < 1e-5,
            "unit distance: score={s}, expected 0.5"
        );
    }

    #[test]
    fn l2_score_far() {
        // distance = 2 → score = 1/(1+4) = 0.2
        let s = distance_to_score(2.0, DistanceMetric::L2);
        assert!((s - 0.2).abs() < 1e-5, "far: score={s}, expected 0.2");
    }

    #[test]
    fn dot_product_score_high_similarity() {
        // dot = 1.0 → distance = -1.0 → score = (1-(-1))/2 = 1.0
        let s = distance_to_score(-1.0, DistanceMetric::DotProduct);
        assert!((s - 1.0).abs() < 1e-5, "high sim: score={s}, expected 1.0");
    }

    #[test]
    fn dot_product_score_zero() {
        // dot = 0 → distance = 0 → score = (1-0)/2 = 0.5
        let s = distance_to_score(0.0, DistanceMetric::DotProduct);
        assert!((s - 0.5).abs() < 1e-5, "zero dot: score={s}, expected 0.5");
    }

    #[test]
    fn dot_product_score_negative() {
        // dot = -1 → distance = 1 → score = (1-1)/2 = 0.0
        let s = distance_to_score(1.0, DistanceMetric::DotProduct);
        assert!(s.abs() < 1e-5, "negative dot: score={s}, expected 0.0");
    }

    #[test]
    fn all_scores_non_negative() {
        // Verify all metrics produce non-negative scores (Lucene floors at 0).
        // Scores > 1 are allowed for DotProduct with unnormalized vectors.
        for dist in [0.0, 0.5, 1.0, 2.0, 5.0, 10.0] {
            for metric in [
                DistanceMetric::Cosine,
                DistanceMetric::L2,
                DistanceMetric::DotProduct,
            ] {
                let s = distance_to_score(dist, metric);
                assert!(
                    s >= 0.0,
                    "score should be non-negative: metric={metric:?}, dist={dist}, score={s}"
                );
            }
        }
    }

    #[test]
    fn l2_scores_bounded_unit() {
        // L2 scores are always in (0, 1] by formula
        for dist in [0.0, 0.1, 1.0, 10.0, 100.0] {
            let s = distance_to_score(dist, DistanceMetric::L2);
            assert!(
                s > 0.0 && s <= 1.0,
                "L2 score out of (0,1]: dist={dist}, score={s}"
            );
        }
    }

    #[test]
    fn dot_product_unnormalized_can_exceed_one() {
        // distance = -dot, so distance = -2 means dot = 2
        // score = (1 + 2) / 2 = 1.5 — valid for unnormalized vectors
        let s = distance_to_score(-2.0, DistanceMetric::DotProduct);
        assert!(
            s > 1.0,
            "unnormalized dot product should produce score > 1: {s}"
        );
    }

    // --- DistanceMetric::from_byte tests ---

    #[test]
    fn from_byte_round_trips_known_metrics() {
        // Encoding with `as u8` and decoding with `from_byte` must be inverse
        // operations for every variant.
        for metric in [
            DistanceMetric::Cosine,
            DistanceMetric::DotProduct,
            DistanceMetric::L2,
        ] {
            let byte = metric as u8;
            assert_eq!(DistanceMetric::from_byte(byte), metric);
        }
    }

    #[test]
    fn from_byte_discriminants_are_pinned() {
        // The on-disk encoding depends on these exact discriminants.
        // Changing them silently is a forward-compat break.
        assert_eq!(DistanceMetric::Cosine as u8, 0);
        assert_eq!(DistanceMetric::DotProduct as u8, 1);
        assert_eq!(DistanceMetric::L2 as u8, 2);
    }

    #[test]
    #[should_panic(expected = "unknown distance metric byte 3")]
    fn from_byte_panics_on_unknown_metric() {
        // Forward-version mismatch: a newer Luci wrote a metric we don't
        // know. Must NOT silently fall back to L2 (or anything else).
        let _ = DistanceMetric::from_byte(3);
    }

    #[test]
    #[should_panic(expected = "unknown distance metric byte 255")]
    fn from_byte_panics_on_garbage() {
        // Segment corruption or a wild pointer landing on a metric byte.
        let _ = DistanceMetric::from_byte(255);
    }

    // --- normalize_in_place tests ---

    #[test]
    fn normalize_in_place_unit_length() {
        // 3-4-5 triangle: the norm is 5, so the unit vector is (0.6, 0.8).
        let mut v = vec![3.0_f32, 4.0];
        normalize_in_place(&mut v).unwrap();
        let norm = (v[0] * v[0] + v[1] * v[1]).sqrt();
        assert!((norm - 1.0).abs() < 1e-6, "norm after normalize: {norm}");
        assert!((v[0] - 0.6).abs() < 1e-6 && (v[1] - 0.8).abs() < 1e-6);
    }

    #[test]
    fn normalize_in_place_idempotent_on_unit_input() {
        let mut v = vec![0.6_f32, 0.8];
        let before = v.clone();
        normalize_in_place(&mut v).unwrap();
        // Idempotent short-circuit: input already unit length, no drift.
        // Exact equality is intentional: the values must not be touched.
        for (a, b) in v.iter().zip(before.iter()) {
            assert_eq!(a, b);
        }
    }

    #[test]
    fn normalize_in_place_zero_errors() {
        let mut v = vec![0.0_f32, 0.0, 0.0];
        let err = normalize_in_place(&mut v).unwrap_err();
        let msg = format!("{err}");
        assert!(
            msg.contains("zero-length / non-finite vector"),
            "unexpected message: {msg}",
        );
    }

    #[test]
    fn normalize_in_place_subnormal_errors() {
        // f32::MIN_POSITIVE * 1e-2 lives in the subnormal range. Squaring
        // it underflows to 0 in f32, so the zero-norm guard fires.
        let mut v = vec![f32::MIN_POSITIVE * 1e-2; 3];
        let err = normalize_in_place(&mut v).unwrap_err();
        assert!(format!("{err}").contains("zero-length / non-finite vector"));
    }

    #[test]
    fn normalize_in_place_overflow_errors() {
        // 1e20² = 1e40, well past f32 max (~3.4e38). norm_sq → +Inf.
        let mut v = vec![1e20_f32; 3];
        let err = normalize_in_place(&mut v).unwrap_err();
        assert!(format!("{err}").contains("zero-length / non-finite vector"));
    }

    #[test]
    fn normalize_in_place_nan_errors() {
        // A single NaN component makes the squared norm NaN (non-finite).
        let mut v = vec![1.0_f32, f32::NAN, 2.0];
        let err = normalize_in_place(&mut v).unwrap_err();
        assert!(format!("{err}").contains("zero-length / non-finite vector"));
    }

    // --- normalize + cosine end-to-end tests ---

    #[test]
    fn cosine_score_unchanged_after_normalize() {
        // Pre-fix cosine kernel computed `1 - dot/(norm_a*norm_b)` in f32.
        // Post-fix computes `1 - dot(a/norm_a, b/norm_b)` in f32 after
        // explicit normalization. Both must produce the same score within
        // floating-point tolerance.
        let cases: &[(Vec<f32>, Vec<f32>)] = &[
            (vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]),
            (vec![1.0, 0.0, 0.0, 0.0], vec![0.0, 1.0, 0.0, 0.0]),
            (vec![0.1; 100], vec![0.2; 100]),
        ];
        for (a_raw, b_raw) in cases {
            // Pre-fix formula using f64 accumulators as oracle.
            let dot64: f64 = a_raw
                .iter()
                .zip(b_raw.iter())
                .map(|(x, y)| (*x as f64) * (*y as f64))
                .sum();
            let na64: f64 = a_raw
                .iter()
                .map(|x| (*x as f64).powi(2))
                .sum::<f64>()
                .sqrt();
            let nb64: f64 = b_raw
                .iter()
                .map(|x| (*x as f64).powi(2))
                .sum::<f64>()
                .sqrt();
            let oracle_dist = 1.0 - dot64 / (na64 * nb64);
            let oracle_score = ((2.0 - oracle_dist) / 2.0).max(0.0);

            // Production path: normalize in f32, then distance → score.
            let mut a = a_raw.clone();
            let mut b = b_raw.clone();
            normalize_in_place(&mut a).unwrap();
            normalize_in_place(&mut b).unwrap();
            let d = distance(&a, &b, DistanceMetric::Cosine);
            let s = distance_to_score(d, DistanceMetric::Cosine);
            assert!(
                ((s as f64) - oracle_score).abs() < 1e-3,
                "score drift > 1e-3: post={s}, oracle={oracle_score}",
            );
        }
    }

    #[test]
    fn cosine_distance_orthogonal_after_normalize() {
        // Axis-aligned vectors of different magnitudes stay orthogonal once
        // normalized, so the cosine distance is 1.
        let mut a = vec![3.0, 0.0];
        let mut b = vec![0.0, 7.0];
        normalize_in_place(&mut a).unwrap();
        normalize_in_place(&mut b).unwrap();
        let d = distance(&a, &b, DistanceMetric::Cosine);
        assert!((d - 1.0).abs() < 1e-6, "orthogonal cosine distance: {d}");
    }

    #[test]
    fn cosine_distance_identical_after_normalize() {
        // Two copies of the same raw vector normalize to the same unit
        // vector, so the cosine distance is ~0.
        let mut a = vec![1.0, 2.0, 3.0];
        let mut b = a.clone();
        normalize_in_place(&mut a).unwrap();
        normalize_in_place(&mut b).unwrap();
        let d = distance(&a, &b, DistanceMetric::Cosine);
        assert!(d.abs() < 1e-6, "identical cosine distance: {d}");
    }
}