lucisearch 0.8.1

Embeddable, in-process search engine — the SQLite/DuckDB of search
Documentation
//! Exit criteria integration tests for Milestone 5: Hybrid Search & RRF.

use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use serde_json::json;

/// Parses `query` into a search expression (bounded by `size`) and runs it on `index`.
///
/// Panics if either parsing or execution reports an error, which is the
/// desired behaviour inside a test.
fn search(
    index: &mut Index,
    query: serde_json::Value,
    size: usize,
) -> luci::search::results::SearchResults {
    let parsed = luci::search::expression::parse_search(query, size);
    let expression = parsed.unwrap();
    let outcome = index.search(&expression);
    outcome.unwrap()
}

/// Returns a per-process, per-test directory path under the system temp dir.
///
/// The path embeds the current process id and `name`. Anything already
/// present at that path is removed first (best effort); the directory itself
/// is not created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut location = std::env::temp_dir();
    location.push(format!("luci_m5_integration_{pid}_{name}"));
    // A failed removal (typically: nothing to remove) is deliberately ignored.
    let _ = std::fs::remove_dir_all(&location);
    location
}

/// Recursively deletes `path`, ignoring any error (for example when the
/// directory does not exist).
fn cleanup(path: &std::path::Path) {
    // Best effort only: a failed removal must not fail the calling test.
    std::fs::remove_dir_all(path).ok();
}

/// Creates an index in a fresh test directory derived from `name` and loads
/// the five-document fixture shared by the tests in this file.
///
/// The mapping declares a text `title`, a keyword `tag`, and an `embedding`
/// field built with `FieldType::dense_vector(4)`. Returns the directory path
/// alongside the index handle.
///
/// Panics if creating the index or the bulk call fails.
fn build_hybrid_index(name: &str) -> (std::path::PathBuf, Index) {
    let path = test_dir(name);

    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    // Fixture documents, passed to `bulk` in this order.
    let documents = vec![
        json!({"title": "search engine design", "tag": "tech", "embedding": [0.9, 0.1, 0.0, 0.0]}),
        json!({"title": "search algorithms", "tag": "tech", "embedding": [0.1, 0.9, 0.0, 0.0]}),
        json!({"title": "cute cats", "tag": "animal", "embedding": [0.8, 0.2, 0.0, 0.0]}),
        json!({"title": "search optimization", "tag": "tech", "embedding": [0.0, 0.0, 0.9, 0.1]}),
        json!({"title": "happy dog", "tag": "animal", "embedding": [0.0, 0.0, 0.0, 1.0]}),
    ];
    index.bulk(documents).unwrap();

    (path, index)
}

/// Fuses a text `match` source and a `knn` source with the `rrf` method and
/// checks that doc id 0 is ranked first while doc ids 1 and 2 also appear.
#[test]
fn hybrid_search_rrf() {
    let (path, mut index) = build_hybrid_index("hybrid");

    let text_source = json!({"match": {"title": "search engine"}});
    let vector_source = json!({"knn": {
        "field": "embedding",
        "query_vector": [1.0, 0.0, 0.0, 0.0],
        "k": 5,
        "num_candidates": 10
    }});
    let request = json!({
        "query": {"fusion": {
            "sources": [text_source, vector_source],
            "method": "rrf"
        }},
        "size": 5
    });

    let results = search(&mut index, request, 5);

    assert!(!results.is_empty());
    let top_hit = results.hit(0).unwrap();
    assert_eq!(top_hit.doc_id().as_u32(), 0);

    // The remaining expected ids only need to be present; their rank is not pinned.
    let returned_ids: Vec<u32> = results.iter().map(|hit| hit.doc_id().as_u32()).collect();
    for expected in [1, 2] {
        assert!(returned_ids.contains(&expected));
    }

    cleanup(&path);
}

/// Runs a plain `match` query on `title` and checks that at least three hits
/// are reported in the total.
#[test]
fn text_only_structured() {
    let (path, mut index) = build_hybrid_index("text_only");

    let request = json!({
        "query": {"match": {"title": "search"}},
        "size": 10
    });
    let results = search(&mut index, request, 10);

    let total = results.total_hits().value;
    assert!(total >= 3);

    cleanup(&path);
}

/// Runs a standalone `knn` query with `k = 3` and checks the result count and
/// the doc ids of the two best-ranked hits.
#[test]
fn knn_only_structured() {
    let (path, mut index) = build_hybrid_index("knn_only");

    let request = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 3,
            "num_candidates": 10
        }},
        "size": 3
    });
    let results = search(&mut index, request, 3);

    assert_eq!(results.len(), 3);

    // (rank, expected doc id) for the two nearest neighbours.
    for (rank, expected_id) in [(0, 0), (1, 2)] {
        assert_eq!(results.hit(rank).unwrap().doc_id().as_u32(), expected_id);
    }

    cleanup(&path);
}

/// Runs an `rrf` fusion query and checks two properties of the returned
/// scores: they never increase from one rank to the next, and every score is
/// strictly positive.
#[test]
fn rrf_rank_based_scores() {
    let (path, mut index) = build_hybrid_index("rrf_scores");

    let text_source = json!({"match": {"title": "search"}});
    let vector_source = json!({"knn": {
        "field": "embedding",
        "query_vector": [1.0, 0.0, 0.0, 0.0],
        "k": 5,
        "num_candidates": 10
    }});
    let request = json!({
        "query": {"fusion": {
            "sources": [text_source, vector_source],
            "method": "rrf"
        }},
        "size": 5
    });

    let results = search(&mut index, request, 5);

    // Walk adjacent rank pairs; the zip is empty for zero or one hit.
    let hit_count = results.len();
    for (current, following) in (0..hit_count).zip(1..hit_count) {
        let higher = results.hit(current).unwrap().score();
        let lower = results.hit(following).unwrap().score();
        assert!(higher >= lower);
    }

    assert!(results.iter().all(|hit| hit.score() > 0.0));

    cleanup(&path);
}