// vicinity 0.7.2
//
// Approximate nearest-neighbor search
// Documentation
#![allow(clippy::unwrap_used, clippy::expect_used)]
#![cfg(feature = "hnsw")]
//! Regression tests for known bugs in ANN libraries.
//!
//! These tests are grounded in real issues from hnswlib, faiss, usearch, lance:
//! - https://github.com/nmslib/hnswlib/issues
//! - https://github.com/facebookresearch/faiss/issues
//! - https://github.com/unum-cloud/USearch/issues
//! - https://github.com/lancedb/lance/issues
//!
//! Key issues covered:
//! - hnswlib #592: Query not normalized for cosine distance
//! - hnswlib #608: Issues after deleting vectors
//! - hnswlib #635, #606: M vs Mcurmax in neighbor selection
//! - faiss #4295: Integer overflow on large datasets
//! - lance #5208: Wrong layer iteration direction
//! - usearch #439: Performance regression (recall too low)

#![cfg(feature = "hnsw")]
#![allow(clippy::float_cmp)]

#[path = "common/mod.rs"]
mod common;
use common::*;

/// Test inspired by hnswlib #592: Vector not normalized for cosine distance.
///
/// Cosine distance ranks neighbors by angle alone, so scaling the query
/// vector (here to magnitude 2) must not change which stored vector wins.
#[test]
fn cosine_distance_handles_unnormalized_query() {
    // Reference direction along the x-axis (documentation only; the
    // assertion below compares the other two vectors against the query).
    let _a = [1.0f32, 0.0, 0.0];
    let perpendicular = vec![0.0f32, 1.0, 0.0]; // 90 degrees from the query
    let diagonal = vec![0.707f32, 0.707, 0.0]; // 45 degrees from the query

    // Deliberately unnormalized: same direction as `_a`, magnitude 2.
    let query = vec![2.0f32, 0.0, 0.0];

    let dist_to_perp = vicinity::distance::cosine_distance(&query, &perpendicular);
    let dist_to_diag = vicinity::distance::cosine_distance(&query, &diagonal);

    // The 45-degree vector must rank ahead of the perpendicular one
    // regardless of the query's magnitude.
    assert!(
        dist_to_diag < dist_to_perp,
        "c ({:.4}) should be closer to query than b ({:.4})",
        dist_to_diag,
        dist_to_perp
    );
}

/// Test inspired by hnswlib #635: M vs Mcurmax bug in neighbor selection.
///
/// Layer 0 should use M_max connections, not M.
#[test]
#[cfg(feature = "hnsw")]
fn layer0_uses_correct_connectivity() {
    use vicinity::hnsw::HNSWIndex;

    let dim = 32;
    let m = 8; // connectivity for upper layers
    let m_max = 16; // connectivity for layer 0

    // Build a small index from deterministic sine-derived vectors.
    let mut index = HNSWIndex::new(dim, m, m_max).unwrap();
    (0..100u32).for_each(|i| {
        let raw: Vec<f32> = (0..dim)
            .map(|d| ((i as usize * dim + d) as f32 * 0.1).sin())
            .collect();
        index.add(i, normalize(&raw)).unwrap();
    });
    index.build().unwrap();

    // Sanity check that layer-0 connectivity is at least M (up to M_max);
    // the full invariant is covered by property tests elsewhere.
    let probe = normalize(&vec![1.0; dim]);
    let hits = index.search(&probe, 20, 100).unwrap();
    assert!(hits.len() >= m, "Should return at least M results");
}

/// Test inspired by lance #5208: Layer iteration direction must be top-to-bottom.
///
/// HNSW search must start from top layer (coarsest) and descend to layer 0.
/// Getting this wrong causes severe recall degradation.
#[test]
#[cfg(feature = "hnsw")]
fn layer_traversal_is_top_to_bottom() {
    use vicinity::hnsw::HNSWIndex;

    let dim = 64; // higher dim keeps the vectors well separated
    let n = 500; // small enough to stay reliable

    // The dataset must be duplicate-free: if two identical vectors existed,
    // the "self must appear in results" assertion below would be invalid
    // because the twin could legitimately be returned instead.
    let dataset: Vec<Vec<f32>> = (0..n).map(|i| normalize(&random_vec(dim, i))).collect();

    let mut index = HNSWIndex::new(dim, 16, 32).unwrap();
    for (id, v) in dataset.iter().enumerate() {
        index.add_slice(id as u32, v).unwrap();
    }
    index.build().unwrap();

    // Query with a vector that is stored in the index (id 250) and set
    // ef = n so the search can be exhaustive if needed. If layers were
    // traversed bottom-to-top, recall would collapse and the exact match
    // could be missed.
    let query = &dataset[250];
    let hits = index.search(query, 50, n).unwrap();

    let contains_self = hits.iter().any(|(id, _)| *id == 250);
    assert!(
        contains_self,
        "Should find the query vector itself - layer traversal may be wrong"
    );
}

/// Test inspired by paper Algorithm 2: Stopping condition must be correct.
///
/// HNSW stops when: best unexplored candidate distance > worst result distance.
/// Getting this wrong (e.g., stopping too early) causes recall loss.
#[test]
#[cfg(feature = "hnsw")]
fn stopping_condition_allows_sufficient_exploration() {
    use std::collections::HashSet;
    use vicinity::hnsw::HNSWIndex;

    let dim = 64;
    let n = 500;
    let k = 10;

    // Build the index from deterministic, normalized vectors.
    let dataset: Vec<Vec<f32>> = (0..n).map(|i| normalize(&random_vec(dim, i))).collect();
    let mut index = HNSWIndex::new(dim, 16, 32).unwrap();
    for (id, v) in dataset.iter().enumerate() {
        index.add(id as u32, v.clone()).unwrap();
    }
    index.build().unwrap();

    // Exact ground truth for a single query from the middle of the dataset.
    let query = &dataset[250];
    let truth: HashSet<u32> = brute_force_knn(query, &dataset, k).into_iter().collect();

    // Recall of a search at the given ef, measured against `truth`.
    let recall_at = |ef| -> f32 {
        let found: HashSet<u32> = index
            .search(query, k, ef)
            .unwrap()
            .iter()
            .map(|(id, _)| *id)
            .collect();
        truth.intersection(&found).count() as f32 / k as f32
    };

    // If the stopping condition terminates exploration too early, raising
    // ef would fail to improve (or even reduce) recall.
    let recall_ef50 = recall_at(50);
    let recall_ef200 = recall_at(200);

    assert!(
        recall_ef200 >= recall_ef50 * 0.95, // Allow 5% tolerance for randomness
        "Higher ef should not decrease recall: ef50={:.2}%, ef200={:.2}%",
        recall_ef50 * 100.0,
        recall_ef200 * 100.0
    );
}

/// Test inspired by usearch #439: Performance regression claims.
///
/// Not a correctness test, but documents that we measure recall to catch
/// configuration issues that could cause performance problems.
#[test]
#[cfg(feature = "hnsw")]
fn recall_is_measurable() {
    use std::collections::HashSet;
    use vicinity::hnsw::HNSWIndex;

    let dim = 64;
    let n = 500;
    let k = 10;

    // Deterministic normalized dataset.
    let dataset: Vec<Vec<f32>> = (0..n).map(|i| normalize(&random_vec(dim, i))).collect();

    let mut index = HNSWIndex::new(dim, 16, 32).unwrap();
    for (id, v) in dataset.iter().enumerate() {
        index.add(id as u32, v.clone()).unwrap();
    }
    index.build().unwrap();

    // Average recall over a handful of queries spread across the dataset
    // (stride 37 picks well-separated ids without an RNG).
    let n_queries = 10;
    let total_recall: f32 = (0..n_queries)
        .map(|q| {
            let query = &dataset[(q * 37) % n];

            let truth: HashSet<u32> =
                brute_force_knn(query, &dataset, k).into_iter().collect();

            let found: HashSet<u32> = index
                .search(query, k, 100)
                .unwrap()
                .iter()
                .map(|(id, _)| *id)
                .collect();

            truth.intersection(&found).count() as f32 / k as f32
        })
        .sum();

    let avg_recall = total_recall / n_queries as f32;

    // With ef=100 on clean data, recall should be high.
    assert!(
        avg_recall > 0.7,
        "Average recall ({:.2}%) is too low - possible configuration issue",
        avg_recall * 100.0
    );
}

// --- Helpers ---

/// Deterministic pseudo-random unit vector derived from `seed`.
///
/// Components come from a sine over a linear index, so the same
/// `(dim, seed)` pair always reproduces the same vector — no RNG state.
/// The result is normalized before being returned.
fn random_vec(dim: usize, seed: usize) -> Vec<f32> {
    let mut raw = Vec::with_capacity(dim);
    for i in 0..dim {
        raw.push(((seed * 31 + i * 17) as f32 * 0.001).sin());
    }
    normalize(&raw)
}