lucisearch 0.8.1

Embeddable, in-process search engine — the SQLite/DuckDB of search
Documentation
//! Exit criteria integration tests for Milestone 4: Vector Search.

use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::search::expression::parse_search;
use serde_json::json;
use std::collections::HashSet;

/// Returns the scratch directory path for the test called `name`.
///
/// The path lives under the system temp directory and embeds the current
/// process id. Anything already present at that path is removed first; the
/// removal is best effort and its error (for example, when nothing exists
/// there yet) is ignored. The directory itself is not created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut scratch = std::env::temp_dir();
    scratch.push(format!("luci_m4_integration_{pid}_{name}"));
    // Start from a clean slate; a missing directory is not an error for us.
    std::fs::remove_dir_all(&scratch).ok();
    scratch
}

/// Recursively deletes the directory at `path`.
///
/// Removal is best effort: any error (including the directory not existing)
/// is discarded so that teardown never fails a test.
fn cleanup(path: &std::path::Path) {
    std::fs::remove_dir_all(path).ok();
}

/// Exit Criterion 1: kNN recall on random vectors.
///
/// Indexes 500 pseudo-random 16-dimensional vectors, queries with the first
/// vector, and compares the returned doc ids with a brute-force Euclidean
/// top-10 computed in the test. Passes when recall@10 is at least 0.7.
///
/// NOTE(review): the comparison treats the insertion position of a vector as
/// its doc id, and uses Euclidean distance as ground truth — confirm both
/// match the index's id assignment and similarity metric.
#[test]
fn knn_recall() {
    const N: usize = 500;
    const DIMS: usize = 16;

    let path = test_dir("recall");
    let mapping = Mapping::builder()
        .field("embedding", FieldType::dense_vector(16))
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    // Deterministic "random" components in [-1, 1] from a fixed-seed xorshift,
    // so the test is reproducible without an RNG dependency.
    let mut state: u64 = 99999;
    let mut next_component = move || {
        state ^= state << 13;
        state ^= state >> 7;
        state ^= state << 17;
        (state as f64 / u64::MAX as f64) * 2.0 - 1.0
    };
    let vectors: Vec<Vec<f64>> = (0..N)
        .map(|_| (0..DIMS).map(|_| next_component()).collect())
        .collect();

    // `json!` serializes interpolated values by reference, so the vectors stay
    // available for the brute-force pass below.
    let docs: Vec<_> = vectors
        .iter()
        .map(|v| json!({"embedding": v}))
        .collect();
    index.bulk(docs).unwrap();

    // Search via kNN, using the first indexed vector (narrowed to f32) as the query.
    let query_vec: Vec<f32> = vectors[0].iter().map(|&x| x as f32).collect();
    let request = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": query_vec,
            "k": 10,
            "num_candidates": 50
        }}
    });
    let expr = parse_search(request, 10).unwrap();
    let results = index.search(&expr).unwrap();
    assert!(
        results.total_hits().value >= 5,
        "should find at least 5 results"
    );

    // Brute-force top-10 for recall comparison: Euclidean distance from the
    // query to every stored vector, paired with the vector's position.
    let mut ranked: Vec<(usize, f64)> = Vec::with_capacity(vectors.len());
    for (position, stored) in vectors.iter().enumerate() {
        let mut squared = 0.0_f64;
        for (&a, &b) in stored.iter().zip(query_vec.iter()) {
            squared += (a - f64::from(b)).powi(2);
        }
        ranked.push((position, squared.sqrt()));
    }
    ranked.sort_by(|lhs, rhs| lhs.1.partial_cmp(&rhs.1).unwrap());

    let expected: HashSet<u32> = ranked[..10]
        .iter()
        .map(|&(position, _)| position as u32)
        .collect();
    let returned: HashSet<u32> = results
        .iter()
        .map(|hit| hit.doc_id().as_u32())
        .collect();

    let overlap = expected.intersection(&returned).count();
    let recall = overlap as f64 / 10.0;
    assert!(recall >= 0.7, "recall@10 = {recall}, expected >= 0.7");

    cleanup(&path);
}

/// Exit Criterion 2: End-to-end with text + vectors.
///
/// Builds one index with a text field, a keyword field and a 4-dimensional
/// vector field, then checks that a `match` query and a `knn` query both
/// return the expected documents from it.
#[test]
fn text_plus_vectors() {
    let path = test_dir("text_vec");
    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    // Four documents, each with its embedding on a different unit axis.
    let docs = vec![
        json!({"title": "search engine", "tag": "tech", "embedding": [1.0, 0.0, 0.0, 0.0]}),
        json!({"title": "database internals", "tag": "tech", "embedding": [0.0, 1.0, 0.0, 0.0]}),
        json!({"title": "cute cat", "tag": "animal", "embedding": [0.0, 0.0, 1.0, 0.0]}),
        json!({"title": "happy dog", "tag": "animal", "embedding": [0.0, 0.0, 0.0, 1.0]}),
    ];
    index.bulk(docs).unwrap();

    // Text search works alongside vectors: exactly one title contains "search".
    let text_request = json!({"match": {"title": "search"}});
    let text_expr = parse_search(text_request, 10).unwrap();
    let text_results = index.search(&text_expr).unwrap();
    assert_eq!(text_results.total_hits().value, 1);

    // kNN search with a query vector that leans heavily towards the first axis.
    let knn_request = json!({
        "knn": {
            "field": "embedding",
            "query_vector": [0.9, 0.1, 0.0, 0.0],
            "k": 2,
            "num_candidates": 10
        }
    });
    let knn_expr = parse_search(knn_request, 10).unwrap();
    let knn_results = index.search(&knn_expr).unwrap();
    assert!(knn_results.total_hits().value >= 1);

    // Closest should be doc 0 (search engine).
    let best = knn_results.hit(0).unwrap();
    assert_eq!(best.doc_id().as_u32(), 0);

    cleanup(&path);
}

/// Exit Criterion 3: kNN with keyword filter.
///
/// Indexes five 2-dimensional vectors tagged `tech` or `animal`, runs a kNN
/// query for the point (1, 0), and checks that all five documents are counted
/// and that the three nearest hits are the `tech` documents.
///
/// NOTE(review): despite the name, the query sent below carries no filter
/// clause — the `tag` field is indexed but never queried. A filtered kNN
/// assertion should be added once the filter syntax is confirmed.
#[test]
fn knn_with_filter() {
    let path = test_dir("knn_filter");
    let schema = Mapping::builder()
        .field("tag", FieldType::Keyword)
        .field("embedding", FieldType::dense_vector(2))
        .build();
    let index = Index::create_with_mapping(&path, schema).unwrap();

    // Tech docs cluster around (1,0), animal docs around (0,1)
    index
        .bulk(vec![
            json!({"tag": "tech", "embedding": [1.0, 0.1]}),
            json!({"tag": "tech", "embedding": [0.9, 0.2]}),
            json!({"tag": "animal", "embedding": [0.1, 1.0]}),
            json!({"tag": "animal", "embedding": [0.2, 0.9]}),
            json!({"tag": "tech", "embedding": [0.8, 0.3]}),
        ])
        .unwrap();

    // kNN without filter: closest to (1,0) should be tech docs
    let expr = parse_search(
        json!({
            "knn": {
                "field": "embedding",
                "query_vector": [1.0, 0.0],
                "k": 5,
                "num_candidates": 10
            }
        }),
        5,
    )
    .unwrap();
    let results = index.search(&expr).unwrap();
    assert_eq!(results.total_hits().value, 5);

    // Top results should be tech docs (closest to (1,0)), not animal docs
    // Doc 0: tech (1.0, 0.1), Doc 1: tech (0.9, 0.2), Doc 4: tech (0.8, 0.3)
    // Doc 2: animal (0.1, 1.0), Doc 3: animal (0.2, 0.9)
    let top3_ids: Vec<u32> = results
        .iter()
        .take(3)
        .map(|h| h.doc_id().as_u32())
        .collect();

    // Without this check the loop below would pass vacuously if the hit
    // iterator yielded fewer than three hits.
    assert_eq!(
        top3_ids.len(),
        3,
        "expected 3 hits to inspect, got {}",
        top3_ids.len()
    );
    for id in &top3_ids {
        assert!(
            matches!(*id, 0 | 1 | 4),
            "top 3 nearest to (1,0) should be tech docs (0,1,4), got doc {id}"
        );
    }

    cleanup(&path);
}