lucisearch 0.8.1

//! Tests for QueryExpression::Ranking(Fusion) — explicit RRF and score fusion.
//!
//! See [[feature-rrf-retrievers]].

use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::search::expression::parse_search;
use serde_json::json;

/// Builds the scratch-directory path for the test called `name`.
///
/// The path lives under the system temp directory and embeds the current
/// process id together with `name`. Any directory already present at that
/// path is removed (best effort); the directory itself is NOT created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut dir = std::env::temp_dir();
    dir.push(format!("luci_fusion_{pid}_{name}"));
    // Removal errors (typically "not found") are deliberately ignored.
    std::fs::remove_dir_all(&dir).ok();
    dir
}

/// Recursively deletes the directory at `path`, ignoring any error
/// (for example when the directory does not exist).
fn cleanup(path: &std::path::Path) {
    drop(std::fs::remove_dir_all(path));
}

/// Creates the shared five-document fixture index for the test called `name`.
///
/// The mapping has a text `title`, a keyword `tag` and a 4-dimensional
/// `embedding`. Returns the index directory (to hand to `cleanup`) together
/// with the index. Panics if index creation or the bulk insert fails.
fn build_index(name: &str) -> (std::path::PathBuf, Index) {
    let dir = test_dir(name);

    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&dir, mapping).unwrap();

    // Three "tech" docs whose titles contain "search" and two "animal" docs.
    // The tests in this file expect the first document to come back as doc id 0.
    let docs = vec![
        json!({"title": "search engine design", "tag": "tech", "embedding": [0.9, 0.1, 0.0, 0.0]}),
        json!({"title": "search algorithms", "tag": "tech", "embedding": [0.1, 0.9, 0.0, 0.0]}),
        json!({"title": "cute cats", "tag": "animal", "embedding": [0.8, 0.2, 0.0, 0.0]}),
        json!({"title": "search optimization", "tag": "tech", "embedding": [0.0, 0.0, 0.9, 0.1]}),
        json!({"title": "happy dog", "tag": "animal", "embedding": [0.0, 0.0, 0.0, 1.0]}),
    ];
    index.bulk(docs).unwrap();

    (dir, index)
}

/// Parses `query` with the given default `size` and runs it against `index`.
///
/// Panics if either parsing or the search itself fails.
fn search(
    index: &Index,
    query: serde_json::Value,
    size: usize,
) -> luci::search::results::SearchResults {
    index
        .search(&parse_search(query, size).unwrap())
        .unwrap()
}

// --- RRF fusion ---

/// Fuses a text `match` source with a kNN source via RRF and checks that the
/// document favoured by both sources wins with a small, rank-based score.
#[test]
fn rrf_two_sources() {
    let (dir, index) = build_index("rrf_two");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search engine"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());

    // Doc 0 ("search engine design") matches the text query and also has the
    // embedding closest to the query vector.
    let winner = fused.hit(0).unwrap().doc_id().as_u32();
    assert_eq!(winner, 0);

    // RRF scores are derived from ranks, so they stay well below 1.0.
    let top_score = fused.hit(0).unwrap().score();
    assert!(
        top_score > 0.0 && top_score < 1.0,
        "RRF scores are small: {top_score}"
    );

    cleanup(&dir);
}

/// RRF over three sources (text match, keyword term, kNN) with an explicit
/// `rank_constant` of 60: the document present in every source ranks first.
#[test]
fn rrf_three_sources() {
    let (dir, index) = build_index("rrf_three");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"term": {"tag": "tech"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf",
            "rank_constant": 60
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());

    // Doc 0 is returned by all three sources, so it should lead the fused list.
    let winner = fused.hit(0).unwrap().doc_id().as_u32();
    assert_eq!(winner, 0);

    cleanup(&dir);
}

/// A small `rank_constant` amplifies rank differences and yields larger RRF
/// scores than the default.
#[test]
fn rrf_custom_rank_constant() {
    let (dir, index) = build_index("rrf_k");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search engine"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf",
            "rank_constant": 1
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());

    // With rank_constant = 1, a first-place rank alone contributes
    // 1 / (1 + 1) = 0.5, so the best fused score must clear 0.3.
    let top_score = fused.hit(0).unwrap().score();
    assert!(
        top_score > 0.3,
        "small rank_constant → higher scores: {top_score}"
    );

    cleanup(&dir);
}

/// `rank_window_size` caps how many candidates each source feeds into RRF.
#[test]
fn rrf_rank_window_size() {
    let (dir, index) = build_index("rrf_window");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf",
            "rank_window_size": 2
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    // Two sources, at most two candidates each: the fused list can hold no
    // more than four distinct documents (fewer when the sources overlap).
    let unique_hits = fused.len();
    assert!(unique_hits <= 4, "window_size=2: at most 4 unique docs");

    cleanup(&dir);
}

// --- Score-based fusion ---

/// Score-sum fusion of a text match and a kNN source: the document matched by
/// both wins, and its score is larger than a rank-based RRF score would be.
#[test]
fn sum_fusion() {
    let (dir, index) = build_index("sum");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search engine"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "sum"
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());

    // Doc 0 is matched by both sources and should therefore rank first.
    let winner = fused.hit(0).unwrap().doc_id().as_u32();
    assert_eq!(winner, 0);

    // "sum" adds the sources' own scores rather than reciprocal ranks, so the
    // top score is expected to exceed the tiny values RRF produces.
    let top_score = fused.hit(0).unwrap().score();
    assert!(
        top_score > 0.5,
        "sum fusion should produce higher scores: {top_score}"
    );

    cleanup(&dir);
}

/// Score-sum fusion with per-source `weights` (0.3 text, 0.7 vector).
///
/// This test only verifies that a weighted query returns hits ordered by
/// non-increasing score; it does not compare against the unweighted ranking.
#[test]
fn weighted_sum_fusion() {
    let (dir, index) = build_index("weighted_sum");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search engine"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "sum",
            "weights": [0.3, 0.7]
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());

    // Gather the scores in rank order, then compare each adjacent pair.
    let scores: Vec<_> = (0..fused.len())
        .map(|rank| fused.hit(rank).unwrap().score())
        .collect();
    for pair in scores.windows(2) {
        assert!(pair[0] >= pair[1], "scores should be descending");
    }

    cleanup(&dir);
}

// --- Parsing ---

/// A fusion query without an explicit `method` must parse successfully
/// (the method is expected to default to RRF).
#[test]
fn fusion_parse_default_method() {
    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "a"}},
                {"match": {"title": "b"}}
            ]
        }}
    });

    // Parsing alone is the assertion: `unwrap` panics on a parse error.
    let _parsed = parse_search(request, 10).unwrap();
}

/// A fusion query with a single source is rejected at parse time.
#[test]
fn fusion_requires_at_least_two_sources() {
    let single_source = json!({
        "query": {"fusion": {
            "sources": [{"match": {"title": "a"}}]
        }}
    });

    let outcome = parse_search(single_source, 10);
    assert!(outcome.is_err());
}

/// A fusion query naming an unrecognised `method` is rejected at parse time.
#[test]
fn fusion_unknown_method_rejected() {
    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "a"}},
                {"match": {"title": "b"}}
            ],
            "method": "unknown_method"
        }}
    });

    let outcome = parse_search(request, 10);
    assert!(outcome.is_err());
}

/// RRF-fused hits must come back ordered by non-increasing score.
#[test]
fn fusion_scores_descending() {
    let (dir, index) = build_index("scores_desc");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    // Gather the scores in rank order, then compare each adjacent pair.
    let scores: Vec<_> = (0..fused.len())
        .map(|rank| fused.hit(rank).unwrap().score())
        .collect();
    for pair in scores.windows(2) {
        let (a, b) = (pair[0], pair[1]);
        assert!(a >= b, "scores should be descending: {a} < {b}");
    }

    cleanup(&dir);
}

// --- Explain and highlight work with fusion ---

/// `explain()` on a fused hit succeeds and yields an explanation with a
/// positive value whose description mentions "fusion".
#[test]
fn fusion_explain_returns_leaf() {
    let (dir, index) = build_index("explain");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    let top = fused.hit(0).unwrap();
    let maybe_explanation = top.explain().expect("explain should not error");
    assert!(maybe_explanation.is_some());

    let explanation = maybe_explanation.unwrap();
    assert!(explanation.value > 0.0);
    assert!(explanation.description.contains("fusion"));

    cleanup(&dir);
}

/// Highlighting the `title` field works on the top hit of a fusion query
/// whose sources include a text `match`.
#[test]
fn fusion_highlight_extracts_terms() {
    let (dir, index) = build_index("highlight");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search engine"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    let top = fused.hit(0).unwrap();
    // The highlight terms are expected to come from the match source
    // ("search", "engine"); only the presence of a highlight is asserted.
    let title_highlight = top.highlight("title");
    assert!(
        title_highlight.is_some(),
        "highlight should work on fusion results"
    );

    cleanup(&dir);
}

// --- Aggregations with fusion ---

/// A `terms` aggregation requested next to a fusion query is present in the
/// search results under its own name.
#[test]
fn fusion_with_aggregations() {
    let (dir, index) = build_index("aggs");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "aggs": {
            "tags": {"terms": {"field": "tag"}}
        },
        "size": 5
    });
    // Same parse-then-search sequence as the `search` helper performs.
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());
    assert!(
        fused.aggregations().contains_key("tags"),
        "fusion results should include aggregations"
    );

    cleanup(&dir);
}

// --- from pagination with fusion ---

/// `from` pagination on a fusion query skips the leading fused hits.
///
/// The same fusion query is run twice: once unpaginated and once with
/// `from: 2`. The first hit of the paginated run must be the third hit of the
/// unpaginated run.
#[test]
fn fusion_from_pagination() {
    let (path, index) = build_index("from_page");

    // Build the fusion clause once so both requests are guaranteed identical
    // apart from the `from` offset.
    let fusion = json!({"fusion": {
        "sources": [
            {"match": {"title": "search"}},
            {"knn": {
                "field": "embedding",
                "query_vector": [1.0, 0.0, 0.0, 0.0],
                "k": 5
            }}
        ],
        "method": "rrf"
    }});

    let all = search(
        &index,
        json!({
            "query": fusion.clone(),
            "size": 10
        }),
        10,
    );

    let page2 = search(
        &index,
        json!({
            "query": fusion,
            "from": 2,
            "size": 10
        }),
        10,
    );

    // Hard precondition instead of an `if`: three fixture titles contain
    // "search", so the fused list must have more than two hits. Without this
    // the test would pass without checking anything if results ever shrank.
    assert!(
        all.len() > 2,
        "fixture should yield more than 2 fused hits, got {}",
        all.len()
    );

    // from=2 should skip the first 2 results
    assert_eq!(
        page2.hit(0).unwrap().doc_id().as_u32(),
        all.hit(2).unwrap().doc_id().as_u32(),
        "from=2 should start at the 3rd result"
    );

    cleanup(&path);
}

// --- Weight validation ---

/// Supplying one weight for two sources is rejected at parse time.
#[test]
fn fusion_weights_length_mismatch_rejected() {
    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "a"}},
                {"match": {"title": "b"}}
            ],
            "method": "sum",
            "weights": [0.5]
        }}
    });

    let outcome = parse_search(request, 10);
    assert!(
        outcome.is_err(),
        "mismatched weights length should be rejected"
    );
}

// --- Sort on fusion output ---

/// An explicit `sort` on a fusion query orders the fused hits by the sort
/// field and exposes the sort values on each hit.
///
/// Uses its own three-document index with a float `price` field
/// (30.0, 10.0, 20.0) instead of the shared fixture.
#[test]
fn fusion_with_sort() {
    let dir = test_dir("sort");
    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("price", FieldType::Float)
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&dir, mapping).unwrap();

    let docs = vec![
        json!({"title": "search engine", "price": 30.0, "embedding": [0.9, 0.1, 0.0, 0.0]}),
        json!({"title": "search tools", "price": 10.0, "embedding": [0.1, 0.9, 0.0, 0.0]}),
        json!({"title": "search guide", "price": 20.0, "embedding": [0.5, 0.5, 0.0, 0.0]}),
    ];
    index.bulk(docs).unwrap();

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 3
                }}
            ],
            "method": "rrf"
        }},
        "sort": [{"price": "asc"}],
        "size": 10
    });
    let expr = parse_search(request, 10).unwrap();
    let sorted = index.search(&expr).unwrap();

    assert!(sorted.len() >= 2);

    // Every sorted hit should carry its sort values; check the first one.
    let first_sort_values = sorted.hit(0).unwrap().sort_values();
    assert!(
        first_sort_values.is_some(),
        "sort values should be present"
    );

    // The first sort value of each hit is its price.
    let mut prices: Vec<f64> = Vec::with_capacity(sorted.len());
    for rank in 0..sorted.len() {
        let hit = sorted.hit(rank).unwrap();
        let sort_values = hit.sort_values().unwrap();
        prices.push(sort_values[0].to_json().as_f64().unwrap());
    }

    // Adjacent prices must never decrease.
    for pair in prices.windows(2) {
        assert!(
            pair[0] <= pair[1],
            "price should be ascending: {} > {}",
            pair[0],
            pair[1]
        );
    }

    cleanup(&dir);
}

// --- Collapse on fusion output ---

/// `collapse` on a fusion query keeps at most one hit per distinct `tag`,
/// and every surviving hit reports its collapse key.
#[test]
fn fusion_with_collapse() {
    let (dir, index) = build_index("collapse");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "collapse": {"field": "tag"},
        "size": 10
    });
    // Same parse-then-search sequence as the `search` helper performs.
    let collapsed = search(&index, request, 10);

    // The fixture has only two tags ("tech", "animal"), so collapsing by tag
    // can leave at most two hits.
    let hit_count = collapsed.len();
    assert!(
        hit_count <= 2,
        "collapse by tag should produce at most 2 results, got {}",
        hit_count
    );

    // Check key presence for every hit before reading any key.
    assert!(
        collapsed.iter().all(|hit| hit.collapse_key().is_some()),
        "collapsed hits should have collapse_key"
    );

    // No two hits may share a collapse key.
    let mut distinct_keys = std::collections::HashSet::<String>::new();
    for hit in collapsed.iter() {
        distinct_keys.insert(hit.collapse_key().unwrap().to_string());
    }
    assert_eq!(
        distinct_keys.len(),
        collapsed.len(),
        "collapse keys should be unique"
    );

    cleanup(&dir);
}

// --- Nested fusion ---

/// A fusion query may itself be a source of another fusion query: an inner
/// RRF over two text matches is fused with a kNN source by an outer RRF.
#[test]
fn nested_fusion() {
    let (dir, index) = build_index("nested");

    let request = json!({
        "query": {"fusion": {
            "sources": [
                {"fusion": {
                    "sources": [
                        {"match": {"title": "search"}},
                        {"match": {"title": "engine"}}
                    ],
                    "method": "rrf"
                }},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ],
            "method": "rrf"
        }},
        "size": 5
    });
    let fused = search(&index, request, 5);

    assert!(!fused.is_empty());

    // Doc 0 matches both inner text sources and has the closest embedding,
    // so it should still come out on top.
    let winner = fused.hit(0).unwrap().doc_id().as_u32();
    assert_eq!(winner, 0);

    cleanup(&dir);
}