lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Vector search profiling: HNSW build time, kNN latency, recall vs ef.

use super::harness::*;
use luci::index::Index;
use luci::search::expression::parse_search;
use rayon::ThreadPoolBuilder;
use serde_json::json;

use std::collections::HashSet;
use std::time::Instant;

#[test]
fn hnsw_build_time() {
    println!("\n=== Vector: HNSW Build Time ===\n");

    for &(count, dims) in &[
        (1_000, 64),
        (5_000, 64),
        (10_000, 64),
        (1_000, 128),
        (1_000, 256),
    ] {
        let start = Instant::now();
        let (path, _index) =
            build_vector_corpus(&format!("hnsw_build_{count}_{dims}"), count, dims);
        let elapsed = start.elapsed();

        println!(
            "{count:>6} vectors x {dims}d: {:.0}ms ({:.0} vecs/s)",
            elapsed.as_millis(),
            count as f64 / elapsed.as_secs_f64()
        );

        cleanup(&path);
    }
}

#[test]
fn knn_latency() {
    println!("\n=== Vector: kNN Query Latency (5K vectors, 64d) ===\n");

    let dims = 64;
    let (_path, index) = build_vector_corpus("knn_latency", 5_000, dims);

    // Query vector
    let query_vec: Vec<f32> = (0..dims)
        .map(|i| (i as f32 / dims as f32) * 2.0 - 1.0)
        .collect();

    println!("{:<20} {:>12}  {:>12}", "config", "p50", "p99");
    println!("{}", "-".repeat(50));

    for &(k, ef) in &[(10, 50), (10, 100), (10, 200), (50, 100), (100, 200)] {
        let expr = parse_search(
            json!({
                "knn": {
                    "field": "embedding",
                    "query_vector": query_vec,
                    "k": k,
                    "num_candidates": ef
                }
            }),
            k,
        )
        .unwrap();

        let mut times = Vec::new();
        for _ in 0..5 {
            let _ = index.search(&expr);
        } // warmup
        for _ in 0..20 {
            let start = Instant::now();
            let _ = index.search(&expr);
            times.push(start.elapsed());
        }
        times.sort();
        let p50 = times[times.len() / 2];
        let p99 = times[times.len() * 99 / 100];
        println!(
            "k={k:<3} ef={ef:<4}       p50={:>6.1}us  p99={:>6.1}us",
            p50.as_micros() as f64,
            p99.as_micros() as f64
        );

        // kNN on 5K vectors should be under 25ms p99
        assert!(
            p99.as_millis() < 25,
            "REGRESSION: kNN k={k} ef={ef} p99={}ms exceeds 25ms",
            p99.as_millis()
        );
    }

    cleanup(&_path);
}

#[test]
fn knn_recall_vs_ef() {
    println!("\n=== Vector: Recall@10 vs ef_search (5K vectors, 64d) ===\n");

    let dims = 64;
    let n = 5_000;
    let (path, mut index) = build_vector_corpus("knn_recall", n, dims);

    // Generate multiple query vectors and average recall
    let mut rng: u64 = 777;
    let num_queries = 10;

    println!("{:<10} {:>10}", "ef", "recall@10");
    println!("{}", "-".repeat(25));

    for &ef in &[10, 20, 50, 100, 200] {
        let mut total_recall = 0.0;

        for _ in 0..num_queries {
            let mut query_vec = Vec::with_capacity(dims);
            for _ in 0..dims {
                rng ^= rng << 13;
                rng ^= rng >> 7;
                rng ^= rng << 17;
                query_vec.push((rng as f32 / u64::MAX as f32) * 2.0 - 1.0);
            }

            let expr = parse_search(
                json!({
                    "knn": {
                        "field": "embedding",
                        "query_vector": query_vec,
                        "k": 10,
                        "num_candidates": ef
                    }
                }),
                10,
            )
            .unwrap();
            let results = index.search(&expr).unwrap();
            let hnsw_ids: HashSet<u32> = results.iter().map(|h| h.doc_id().as_u32()).collect();

            // Brute force for ground truth
            let brute = brute_force_knn(&mut index, &query_vec, 10);
            let brute_ids: HashSet<u32> = brute.into_iter().collect();

            let recall = hnsw_ids.intersection(&brute_ids).count() as f64 / 10.0;
            total_recall += recall;
        }

        let avg_recall = total_recall / num_queries as f64;
        println!("ef={ef:<6}   recall={avg_recall:.3}");
    }

    cleanup(&path);
}

/// Force-merge over two equally-sized vector segments should be faster
/// than from-scratch rebuild now that [[optimize-hnsw-merge-stitching]]
/// reuses the largest segment's HNSW graph. With two equal segments
/// (2× the docs of one), roughly half the docs come from the seed and
/// skip re-insertion. Empirically the ratio of merge-of-two-segments
/// to single-segment-build lands around 1.88× post-fix (vs 2.34×
/// pre-fix when the merge rebuilt every doc). The headroom above the
/// observed value still catches a full regression to the rebuild path
/// while absorbing measurement noise; the absolute number can't reach
/// 1.5× on this corpus because the merge still re-tokenizes, rebuilds
/// columnar, and rewrites the doc store for all 5,000 docs — only the
/// HNSW work is amortized via seed reuse.
#[test]
fn merge_reuses_seed_graph() {
    use luci::index::Index;
    use std::path::PathBuf;

    println!("\n=== Vector: Merge Reuses Seed HNSW Graph ===\n");

    let dims = 64;
    let n_per_seg = 2_500;

    // Baseline: build a single 2.5k segment to time one segment's worth
    // of HNSW build cost. Captures the cost-per-insert under the
    // current kernel + heuristic.
    let baseline_start = Instant::now();
    let (baseline_path, _) = build_vector_corpus("merge_reuse_baseline", n_per_seg, dims);
    let baseline_elapsed = baseline_start.elapsed();
    cleanup(&baseline_path);

    // Test: build two 2.5k segments (no commit-between-bulks tricks
    // needed — Index::bulk can be called twice to land docs into the
    // same index, and the writer flushes a segment per commit).
    let path: PathBuf = build_vector_corpus_two_segments("merge_reuse_test", n_per_seg, dims);
    let merge_index = Index::open(&path).unwrap();

    let merge_start = Instant::now();
    merge_index.force_merge(1).unwrap();
    let merge_elapsed = merge_start.elapsed();

    let ratio = merge_elapsed.as_secs_f64() / baseline_elapsed.as_secs_f64();
    println!(
        "single-segment build: {:.0}ms  |  merge of two segments: {:.0}ms  |  ratio {ratio:.2}×",
        baseline_elapsed.as_millis(),
        merge_elapsed.as_millis(),
    );

    // Pre-fix: merge rebuilt every doc → ratio ≈ 2.34×.
    // Post-fix: seed reuse skips half the HNSW work → ratio ≈ 1.88×.
    // 2.0× catches a regression to the full-rebuild path with headroom
    // for measurement noise; getting much tighter would require also
    // optimizing the per-doc text/columnar/doc-store work, which is
    // out of scope for the HNSW stitching design doc.
    assert!(
        ratio <= 2.0,
        "REGRESSION: merge took {ratio:.2}× single-segment-build (target ≤ 2.0×); \
         seed reuse may have regressed back to full rebuild"
    );

    cleanup(&path);
}

/// Build a vector corpus split across two segments. Each `bulk()` call
/// flushes its own segment, so two separate calls produce two segments
/// of `n_per_seg` docs each.
fn build_vector_corpus_two_segments(
    name: &str,
    n_per_seg: usize,
    dims: usize,
) -> std::path::PathBuf {
    use luci::index::Index;

    let path = profile_dir(name);
    let index = Index::create_with_mapping(&path, vector_schema(dims)).unwrap();

    for seed in [42u64, 9999u64] {
        let mut rng = seed;
        let docs: Vec<serde_json::Value> = (0..n_per_seg)
            .map(|i| {
                let mut vec = Vec::with_capacity(dims);
                for _ in 0..dims {
                    rng ^= rng << 13;
                    rng ^= rng >> 7;
                    rng ^= rng << 17;
                    vec.push((rng as f64 / u64::MAX as f64) * 2.0 - 1.0);
                }
                json!({
                    "title": format!("vector doc {i}"),
                    "tag": "x",
                    "embedding": vec,
                })
            })
            .collect();
        index.bulk(docs).unwrap();
    }
    drop(index);
    path
}

fn brute_force_knn(index: &mut Index, query: &[f32], k: usize) -> Vec<u32> {
    // Use the search with very high ef for near-brute-force
    let expr = parse_search(
        json!({
            "knn": {
                "field": "embedding",
                "query_vector": query,
                "k": k,
                "num_candidates": 5000
            }
        }),
        k,
    )
    .unwrap();
    let results = index.search(&expr).unwrap();
    results.iter().map(|h| h.doc_id().as_u32()).collect()
}

/// Regression test for [[fix-hnsw-neighbor-heuristic]]: ensures the
/// neighbor-selection heuristic and M_max0=2*M at layer 0 keep recall
/// above thresholds that would have failed under the simple Algorithm 3
/// selection. Pre-fix recall on glove-100 at ef=128 was ~0.69 (vs 0.83+
/// cluster); on this small synthetic 5k corpus the gap is smaller but
/// detectable. Thresholds are deliberately loose to absorb RNG noise
/// across rebuilds while still catching a structural regression.
#[test]
fn knn_recall_regression() {
    println!("\n=== Vector: Recall Regression Guard (5K vectors, 64d) ===\n");

    let dims = 64;
    let n = 5_000;
    // Build deterministically: a 1-thread pool makes connect_pending take the
    // byte-identical sequential path, so the calibrated floors below can't
    // flake on parallel-linkage non-determinism (which swings ef=16 recall
    // ~0.10 at this corpus size). The recall *equivalence* of the parallel
    // build is established separately by the glove-100 serial-vs-parallel A/B
    // and `parallel_build_recall_matches_sequential`. See
    // [[optimization-concurrent-hnsw-insert]] §Determinism.
    let pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap();
    let (path, mut index) = pool.install(|| build_vector_corpus("knn_recall_regression", n, dims));

    let mut rng: u64 = 31415;
    let num_queries = 20;

    let mut recalls: std::collections::HashMap<usize, Vec<f64>> = std::collections::HashMap::new();

    for &ef in &[16, 64, 128] {
        let mut samples = Vec::with_capacity(num_queries);
        for _ in 0..num_queries {
            let mut query_vec = Vec::with_capacity(dims);
            for _ in 0..dims {
                rng ^= rng << 13;
                rng ^= rng >> 7;
                rng ^= rng << 17;
                query_vec.push((rng as f32 / u64::MAX as f32) * 2.0 - 1.0);
            }

            let expr = parse_search(
                json!({
                    "knn": {
                        "field": "embedding",
                        "query_vector": query_vec,
                        "k": 10,
                        "num_candidates": ef
                    }
                }),
                10,
            )
            .unwrap();
            let hnsw_ids: HashSet<u32> = index
                .search(&expr)
                .unwrap()
                .iter()
                .map(|h| h.doc_id().as_u32())
                .collect();
            let brute_ids: HashSet<u32> = brute_force_knn(&mut index, &query_vec, 10)
                .into_iter()
                .collect();
            samples.push(hnsw_ids.intersection(&brute_ids).count() as f64 / 10.0);
        }
        let mean = samples.iter().sum::<f64>() / samples.len() as f64;
        recalls.insert(ef, samples);
        println!("ef={ef:<4}  mean_recall@10 = {mean:.3}");
    }

    let r16 = recalls[&16].iter().sum::<f64>() / num_queries as f64;
    let r64 = recalls[&64].iter().sum::<f64>() / num_queries as f64;
    let r128 = recalls[&128].iter().sum::<f64>() / num_queries as f64;

    // Thresholds set ~0.05 below empirically observed post-fix values
    // on this synthetic corpus so RNG drift doesn't flake the test, but
    // the pre-fix regime (Algorithm 3, M_max=M at every layer) would
    // fail the ef=64 and ef=128 thresholds. ef=16 is included as a
    // smoke check that the heuristic doesn't over-prune at low budgets.
    assert!(
        r16 >= 0.55,
        "REGRESSION: recall@10 at ef=16 was {r16:.3} (threshold 0.55)"
    );
    assert!(
        r64 >= 0.80,
        "REGRESSION: recall@10 at ef=64 was {r64:.3} (threshold 0.80)"
    );
    assert!(
        r128 >= 0.90,
        "REGRESSION: recall@10 at ef=128 was {r128:.3} (threshold 0.90)"
    );

    cleanup(&path);
}