// lucisearch 0.8.1

//! Tests for kNN as a ScoringExpression variant (query-level kNN).
//!
//! See [[feature-knn-query-type]].

use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::search::expression::parse_search;
use serde_json::json;

/// Returns a scratch directory path for the test called `name`.
///
/// The path lives under the system temp dir and embeds the current process
/// id. Any directory already present at that path is removed (best effort),
/// so the caller starts from a clean slate. The directory itself is not
/// created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut dir = std::env::temp_dir();
    dir.push(format!("luci_knn_query_{pid}_{name}"));
    // Ignore the result: the directory usually does not exist yet.
    let _ = std::fs::remove_dir_all(&dir);
    dir
}

/// Recursively deletes the scratch directory at `path`.
///
/// Best effort: a failure (for example, the directory is already gone) is
/// deliberately ignored so teardown never fails a test.
fn cleanup(path: &std::path::Path) {
    let _outcome = std::fs::remove_dir_all(path);
}

/// Creates a fresh index in a scratch directory derived from `name` and loads
/// the shared five-document fixture into it.
///
/// The mapping declares a `title` text field, a `tag` keyword field and a
/// 4-dimensional `embedding` dense vector. Returns the directory (so the
/// caller can hand it to `cleanup`) together with the populated index.
/// Panics if index creation or the bulk load fails.
fn build_index(name: &str) -> (std::path::PathBuf, Index) {
    // Three "search" titles tagged "tech" and two animal docs.
    let fixture = vec![
        json!({"title": "search engine design", "tag": "tech", "embedding": [0.9, 0.1, 0.0, 0.0]}),
        json!({"title": "search algorithms", "tag": "tech", "embedding": [0.1, 0.9, 0.0, 0.0]}),
        json!({"title": "cute cats", "tag": "animal", "embedding": [0.0, 0.0, 0.9, 0.1]}),
        json!({"title": "search optimization", "tag": "tech", "embedding": [0.0, 0.0, 0.1, 0.9]}),
        json!({"title": "happy dog", "tag": "animal", "embedding": [0.0, 0.0, 0.0, 0.1]}),
    ];

    let path = test_dir(name);
    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .field("embedding", FieldType::dense_vector(4))
        .build();

    let index = Index::create_with_mapping(&path, mapping).unwrap();
    index.bulk(fixture).unwrap();

    (path, index)
}

/// Parses the JSON search body `query` with the given `size` and runs it
/// against `index`.
///
/// Panics if either parsing or execution fails; use `parse_search` and
/// `Index::search` directly when a test needs to inspect the error.
fn search(
    index: &Index,
    query: serde_json::Value,
    size: usize,
) -> luci::search::results::SearchResults {
    let parsed = parse_search(query, size).unwrap();
    let outcome = index.search(&parsed);
    outcome.unwrap()
}

// --- 1. Standalone kNN query ---

#[test]
fn knn_query_standalone() {
    // A bare `knn` query with k=3 against the shared five-document fixture.
    let (path, index) = build_index("standalone");
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 3
        }}
    });

    let results = search(&index, body, 10);

    // k=3 yields three hits even though `size` is 10.
    assert_eq!(results.len(), 3);

    // Doc 0 (0.9, 0.1, 0, 0) is closest to (1, 0, 0, 0), so it ranks first.
    let top = results.hit(0).unwrap();
    assert_eq!(top.doc_id().as_u32(), 0);

    cleanup(&path);
}

// --- 2. Threshold filters low-similarity results ---

#[test]
fn knn_query_with_threshold() {
    let (path, index) = build_index("threshold");

    // Baseline: no threshold, k=5 returns every document in the fixture.
    let unfiltered_body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 5
        }}
    });
    let all = search(&index, unfiltered_body, 10);
    assert_eq!(all.len(), 5);

    // Same query plus a similarity threshold: only close results survive.
    let thresholded_body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 5,
            "threshold": 0.6
        }}
    });
    let filtered = search(&index, thresholded_body, 10);

    // Far docs should be dropped, so the filtered set is strictly smaller...
    assert!(
        filtered.len() < all.len(),
        "threshold should reduce results"
    );
    // ...but doc 0 (0.9, 0.1, 0, 0) is very close and should still pass.
    assert!(!filtered.is_empty(), "closest doc should pass threshold");

    cleanup(&path);
}

// --- 3. Threshold too high excludes all ---

#[test]
fn knn_query_threshold_excludes_all() {
    let (path, index) = build_index("threshold_all");
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 5,
            "threshold": 0.999
        }}
    });

    let results = search(&index, body, 10);

    // The fixture has no exact match for the query vector, so a 0.999
    // threshold should leave nothing behind.
    assert_eq!(results.len(), 0);

    cleanup(&path);
}

// --- 4. kNN in bool.should (hybrid search) ---

#[test]
fn knn_query_in_bool_should() {
    // Hybrid search: a text clause and a kNN clause side by side in `should`.
    let (path, index) = build_index("bool_should");
    let body = json!({
        "query": {"bool": {"should": [
            {"match": {"title": "search engine"}},
            {"knn": {
                "field": "embedding",
                "query_vector": [1.0, 0.0, 0.0, 0.0],
                "k": 3
            }}
        ]}}
    });

    let results = search(&index, body, 10);
    assert!(!results.is_empty());

    // Doc 0 satisfies both clauses: its title is "search engine design" and
    // its vector is the closest to the query, so it should be ranked first.
    let best = results.hit(0).unwrap();
    assert_eq!(best.doc_id().as_u32(), 0);

    cleanup(&path);
}

// --- 5. kNN in bool.must (conjunction) ---

#[test]
fn knn_query_in_bool_must() {
    // Conjunction of a text clause and a kNN clause: a hit must satisfy both.
    let (path, index) = build_index("bool_must");

    let results = search(
        &index,
        json!({
            "query": {"bool": {"must": [
                {"match": {"title": "search"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 5
                }}
            ]}}
        }),
        10,
    );

    // Only docs matching BOTH text "search" AND in kNN top-5.
    // "search" matches docs 0, 1, 3. All are in kNN top-5.
    //
    // Guard against a vacuous pass: the per-hit loop below asserts nothing
    // when the conjunction wrongly comes back empty.
    assert!(
        !results.is_empty(),
        "must conjunction should return the 'search' docs that are also kNN hits"
    );
    for hit in results.iter() {
        let source = hit.source().unwrap();
        let title = source["title"].as_str().unwrap();
        assert!(
            title.contains("search"),
            "must conjunction: doc should match 'search', got '{title}'"
        );
    }

    cleanup(&path);
}

// --- 6. kNN in bool.filter (vector as filter) ---

#[test]
fn knn_query_in_bool_filter() {
    // The kNN clause sits in `filter`, restricting which text matches survive.
    let (path, index) = build_index("bool_filter");

    let results = search(
        &index,
        json!({
            "query": {"bool": {
                "must": [{"match": {"title": "search"}}],
                "filter": [{"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 0.0, 0.0],
                    "k": 2
                }}]
            }}
        }),
        10,
    );

    // "search" matches docs 0, 1, 3
    // kNN top-2 closest to (1,0,0,0): doc 0 (0.9,0.1,0,0), doc 1 (0.1,0.9,0,0)
    // Intersection: docs 0 and 1
    //
    // Guard against a vacuous pass: an empty result set satisfies both the
    // upper bound and the per-hit loop below. Doc 0 matches the text clause
    // and is the nearest vector, so at least one hit is expected.
    assert!(
        !results.is_empty(),
        "filter should keep at least doc 0 (text match and nearest vector)"
    );
    assert!(results.len() <= 2, "filter should restrict to kNN top-2");
    for hit in results.iter() {
        let id = hit.doc_id().as_u32();
        assert!(id == 0 || id == 1, "expected doc 0 or 1, got {id}");
    }

    cleanup(&path);
}

// --- 7. num_candidates defaults to 1.5 * k ---

#[test]
fn knn_query_num_candidates_default() {
    // Parse-only check: a knn clause without `num_candidates` must be
    // accepted, i.e. the default path does not error. No index is involved.
    let body = json!({
        "query": {"knn": {
            "field": "f",
            "query_vector": [1.0],
            "k": 10
        }}
    });

    let _parsed = parse_search(body, 10).unwrap();
}

// --- 8. Scores are descending ---

#[test]
fn knn_query_scores_descending() {
    let (path, index) = build_index("scores_desc");
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 5
        }}
    });

    let results = search(&index, body, 10);

    // Compare each hit with its successor; `saturating_sub` keeps the range
    // empty when there are zero hits.
    let last = results.len().saturating_sub(1);
    for i in 0..last {
        let (a, b) = (
            results.hit(i).unwrap().score(),
            results.hit(i + 1).unwrap().score(),
        );
        assert!(
            a >= b,
            "scores should be descending: hit[{i}]={a} < hit[{}]={b}",
            i + 1
        );
    }

    cleanup(&path);
}

// --- 9. Scores non-negative ---

#[test]
fn knn_query_score_range() {
    let (path, index) = build_index("score_range");
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 5
        }}
    });

    let results = search(&index, body, 10);

    // Only the lower bound is checked. With the cosine metric scores are in
    // [0, 1], but other metrics may exceed 1 for unnormalized vectors
    // (matching Lucene behavior), so no upper bound is asserted.
    for s in results.iter().map(|hit| hit.score()) {
        assert!(s >= 0.0, "score should be non-negative, got {s}");
    }

    cleanup(&path);
}

// --- 10. Explain works ---

#[test]
fn knn_query_explain() {
    let (path, index) = build_index("explain");
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 3
        }}
    });

    let results = search(&index, body, 10);
    let top = results.hit(0).unwrap();

    // `explain` has two failure layers: the call itself may error, and a
    // successful call may still yield no explanation. Both must succeed.
    let expl = top
        .explain()
        .expect("explain should not error")
        .expect("kNN query should produce an explanation");
    assert!(expl.value > 0.0, "explanation score should be > 0");

    cleanup(&path);
}

// --- 12. Non-numeric query_vector is rejected ---

#[test]
fn knn_query_invalid_vector() {
    // Parse-only check: a `query_vector` containing a non-numeric element
    // must be rejected.
    //
    // A valid `k` is supplied so the malformed vector is the only defect in
    // the request; without it, a parser that treats `k` as required would
    // reject the query for the wrong reason and this test would still pass.
    let result = parse_search(
        json!({
            "query": {"knn": {
                "field": "embedding",
                "query_vector": [1, "bad", 3],
                "k": 3
            }}
        }),
        10,
    );

    assert!(
        result.is_err(),
        "non-numeric vector elements should be rejected"
    );
}

// --- 13. k=0 is rejected ---

#[test]
fn knn_query_zero_k() {
    // Parse-only check: everything about the clause is well-formed except
    // `k`, which is zero.
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 0
        }}
    });

    let outcome = parse_search(body, 10);

    assert!(outcome.is_err(), "k=0 should be rejected");
}

// --- 14. Bool must correctness (validates doc_id sort fix) ---

#[test]
fn knn_query_bool_must_correctness() {
    use std::collections::HashSet;

    let path = test_dir("bool_must_correct");
    let mapping = Mapping::builder()
        .field("tag", FieldType::Keyword)
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    // 20 docs: the first 5 are tagged "target", the remaining 15 "other".
    // Vectors are spread across 4D space by stepping an angle per document.
    let docs: Vec<_> = (0..20)
        .map(|i| {
            let tag = if i < 5 { "target" } else { "other" };
            let angle = (i as f32) * 0.3;
            let half = angle * 0.5;
            let v = [angle.cos(), angle.sin(), half.cos(), half.sin()];
            json!({"tag": tag, "embedding": v})
        })
        .collect();
    index.bulk(docs).unwrap();

    // kNN alone: top-10 should include some "target" and some "other" docs.
    let knn_only = search(
        &index,
        json!({
            "query": {"knn": {
                "field": "embedding",
                "query_vector": [1.0, 0.0, 1.0, 0.0],
                "k": 10
            }}
        }),
        10,
    );

    // Term alone: only the "target" docs.
    let term_only = search(
        &index,
        json!({
            "query": {"term": {"tag": "target"}}
        }),
        10,
    );

    // Conjunction of the two clauses above.
    let conjunction = search(
        &index,
        json!({
            "query": {"bool": {"must": [
                {"term": {"tag": "target"}},
                {"knn": {
                    "field": "embedding",
                    "query_vector": [1.0, 0.0, 1.0, 0.0],
                    "k": 10
                }}
            ]}}
        }),
        10,
    );

    // The conjunction should not be empty: some target docs should be in the
    // kNN top-10.
    assert!(
        !conjunction.is_empty(),
        "conjunction should find at least one doc matching both conditions"
    );

    let knn_ids: HashSet<u32> = knn_only
        .iter()
        .map(|hit| hit.doc_id().as_u32())
        .collect();
    let term_ids: HashSet<u32> = term_only
        .iter()
        .map(|hit| hit.doc_id().as_u32())
        .collect();

    // Every conjunction hit must appear in BOTH single-clause result sets.
    for id in conjunction.iter().map(|hit| hit.doc_id().as_u32()) {
        assert!(
            knn_ids.contains(&id),
            "conjunction doc {id} not in kNN results"
        );
        assert!(
            term_ids.contains(&id),
            "conjunction doc {id} not in term results"
        );
    }

    cleanup(&path);
}

// --- 11. Dimension mismatch at bind time ---
// After [[fix-silent-scorer-errors]], bind errors propagate instead of
// being silently swallowed. A kNN query_vector with the wrong number of
// dimensions fails loudly at search time rather than returning empty.

#[test]
fn knn_query_dimension_mismatch() {
    // The fixture's `embedding` field has 4 dimensions; the query vector
    // below has only 2. Parsing succeeds, the search itself must fail.
    let (path, index) = build_index("dim_mismatch");
    let body = json!({
        "query": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 2.0],
            "k": 5
        }}
    });
    let expr = parse_search(body, 10).unwrap();

    let err = index
        .search(&expr)
        .err()
        .expect("dimension mismatch must error, not return empty");

    // The message should mention both the offending dimension count and the
    // field it was checked against.
    let msg = err.to_string();
    assert!(
        msg.contains("2 dimensions") && msg.contains("embedding"),
        "error message should name the dim mismatch: {msg}"
    );

    cleanup(&path);
}

// --- kNN field validation (regression for the silent-empty E4 bug:
//     knn on a non-dense_vector or unknown field must error, not
//     return zero hits — [[code-must-not-lie]]) ---

#[test]
fn knn_query_non_vector_field_errors() {
    // `title` is mapped as a Text field, so a knn query against it cannot
    // run. The engine must report that instead of silently returning zero
    // hits.
    let (path, index) = build_index("non_vector_field");
    let body = json!({
        "query": {"knn": {
            "field": "title",
            "query_vector": [1.0, 2.0, 3.0, 4.0],
            "k": 5
        }}
    });
    let expr = parse_search(body, 10).unwrap();

    let msg = match index.search(&expr) {
        Err(e) => e.to_string(),
        Ok(_) => panic!("knn on a non-dense_vector field must error, not return empty"),
    };
    assert!(
        msg.contains("title") && msg.contains("dense_vector"),
        "error should name the field and that it is not a dense_vector: {msg}"
    );

    cleanup(&path);
}

#[test]
fn knn_query_unknown_field_errors() {
    // `nope` is absent from the fixture mapping; the search must error
    // rather than come back silently empty.
    let (path, index) = build_index("unknown_field");
    let body = json!({
        "query": {"knn": {
            "field": "nope",
            "query_vector": [1.0, 2.0, 3.0, 4.0],
            "k": 5
        }}
    });
    let expr = parse_search(body, 10).unwrap();

    let err = index
        .search(&expr)
        .err()
        .expect("knn on an unknown field must error, not return empty");

    let msg = err.to_string();
    assert!(
        msg.contains("nope") && msg.contains("unknown"),
        "error should name the unknown field: {msg}"
    );

    cleanup(&path);
}

#[test]
fn knn_dims_zero_builder_rejected() {
    // The programmatic builder does not guard dims == 0 (the JSON parser
    // does). `Mapping::validate()` — invoked by `create_with_mapping` —
    // must reject it, else a knn against the field silently empties
    // (GlobalHnsw skips dims == 0). See [[feature-knn-query-type]] §4.
    //
    // The field name is deliberately distinctive: a one-letter name such as
    // "v" would make the `contains` check below match almost any message
    // (e.g. one that merely says "dense_vector" or "invalid").
    let path = test_dir("dims_zero");
    let schema = Mapping::builder()
        .field("zero_vec", FieldType::dense_vector(0))
        .build();
    let err = match Index::create_with_mapping(&path, schema) {
        Ok(_) => panic!("dense_vector(0) must be rejected at mapping validation"),
        Err(e) => e,
    };
    let msg = err.to_string();
    assert!(
        msg.contains("zero_vec") && (msg.contains("dims") || msg.contains("dimension")),
        "error should name the field and dims: {msg}"
    );

    cleanup(&path);
}

// --- §4b: bind-time errors must propagate through the three call paths
//     that historically swallowed them — `filter` aggs, `filters` aggs,
//     and `inner_hits`. The filter query is now bound once against the
//     real searcher at `AggregationExpression::bind` time, so a bad-field
//     knn surfaces as `Err` instead of a silently empty bucket, and a
//     valid one actually runs. See [[feature-knn-query-type]] §4b and
//     [[code-must-not-lie]]. ---

#[test]
fn knn_bad_field_in_agg_filter_errors() {
    // The knn inside the `filter` aggregation names a field that is not in
    // the mapping. It binds at `AggregationExpression::bind` time, and the
    // resulting error must reach the search caller instead of collapsing to
    // an empty `filter` bucket.
    let (path, index) = build_index("agg_filter_bad_field");
    let body = json!({
        "query": {"match_all": {}},
        "aggs": {"f": {"filter": {"knn": {
            "field": "nope",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 2
        }}}}
    });
    let expr = parse_search(body, 10).unwrap();

    let err = index
        .search(&expr)
        .err()
        .expect("bad-field knn in a filter agg must error, not return empty buckets");

    let msg = err.to_string();
    assert!(
        msg.contains("nope") && msg.contains("unknown"),
        "error should name the unknown field: {msg}"
    );

    cleanup(&path);
}

#[test]
fn knn_valid_field_in_agg_filter_works() {
    // Positive companion to the bad-field case: binding against the *real*
    // searcher lets a valid knn filter actually run.
    //
    // The corpus is four well-separated vectors. With query_vector
    // [1,0,0,0], k=2 and num_candidates=10 (≥ corpus size), the top-2 is
    // unambiguously docs {0,1}: they have a wide cosine margin over docs
    // {2,3}. The filter bucket must therefore count exactly 2.
    let path = test_dir("agg_filter_valid");
    let mapping = Mapping::builder()
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    let corpus = vec![
        json!({"embedding": [1.0, 0.0, 0.0, 0.0]}),
        json!({"embedding": [0.9, 0.1, 0.0, 0.0]}),
        json!({"embedding": [0.0, 1.0, 0.0, 0.0]}),
        json!({"embedding": [0.0, 0.0, 1.0, 0.0]}),
    ];
    index.bulk(corpus).unwrap();

    let body = json!({
        "query": {"match_all": {}},
        "aggs": {"f": {"filter": {"knn": {
            "field": "embedding",
            "query_vector": [1.0, 0.0, 0.0, 0.0],
            "k": 2,
            "num_candidates": 10
        }}}}
    });
    let expr = parse_search(body, 10).unwrap();
    let results = index.search(&expr).unwrap();

    let agg = results.aggregations()["f"].to_json();
    let counted = agg["buckets"][0]["doc_count"].as_u64().unwrap();
    assert_eq!(
        counted,
        2,
        "filter-agg knn top-2 should count exactly docs {{0,1}}: {agg}"
    );

    cleanup(&path);
}

#[test]
fn knn_bad_field_in_filters_agg_errors() {
    // The multi-filter `filters` factory has its own per-filter bind loop,
    // making it a *separate* place where a bind error could be swallowed.
    // Filter "a" is a bad-field knn and "b" is a harmless `match_all`; the
    // bad one must still surface as `Err`. This parallel arm is where an
    // incomplete fix would hide.
    let (path, index) = build_index("filters_agg_bad_field");
    let body = json!({
        "query": {"match_all": {}},
        "aggs": {"f": {"filters": {"filters": {
            "a": {"knn": {"field": "nope", "query_vector": [1.0, 0.0, 0.0, 0.0], "k": 2}},
            "b": {"match_all": {}}
        }}}}
    });
    let expr = parse_search(body, 10).unwrap();

    let msg = match index.search(&expr) {
        Err(e) => e.to_string(),
        Ok(_) => panic!("bad-field knn in a filters agg must error, not return empty buckets"),
    };
    assert!(
        msg.contains("nope") && msg.contains("unknown"),
        "error should name the unknown field: {msg}"
    );

    cleanup(&path);
}

#[test]
fn filters_agg_sub_aggs_refused() {
    // Honest refusal: `filters` combined with sub-aggregations is not yet
    // implemented. A historical `..` match silently dropped `sub_aggs`; the
    // bind now returns an explicit "not yet supported" `Err` instead of
    // pretending to honour them. The sub-agg parses fine on its own — only
    // the `filters` + `sub_aggs` *combination* is refused. Implementation is
    // deferred to phase-7. See [[code-must-not-lie]].
    let (path, index) = build_index("filters_agg_sub_aggs");
    let body = json!({
        "query": {"match_all": {}},
        "aggs": {"f": {
            "filters": {"filters": {"a": {"match_all": {}}}},
            "aggs": {"by_tag": {"terms": {"field": "tag"}}}
        }}
    });
    // Parsing is expected to succeed; the refusal happens at search time.
    let expr = parse_search(body, 10).unwrap();

    let err = index
        .search(&expr)
        .err()
        .expect("filters agg with sub_aggs must be refused, not silently dropped");

    let msg = err.to_string();
    assert!(
        msg.contains("filters") && msg.contains("not yet supported"),
        "error should explain filters sub-aggs are unsupported: {msg}"
    );

    cleanup(&path);
}

#[test]
fn knn_bad_field_in_nested_inner_hits_errors() {
    // The `inner_hits` re-bind (`collect_inner_hit_specs`) was the third
    // place bind errors were swallowed. A `nested` query wrapping a
    // bad-field knn must surface as `Err`; the user must never silently
    // receive empty inner_hits.
    //
    // With the §4 main-path strictness, the main nested bind catches the
    // field error first. This test therefore pins the end-to-end contract
    // that the combination errors; the inner_hits `?`-propagation is
    // additionally locked in by type-checking.
    // See [[feature-knn-query-type]] §4b.
    let (path, index) = build_index("nested_inner_hits_bad_field");
    let body = json!({
        "query": {"nested": {
            "path": "items",
            "query": {"knn": {
                "field": "nope",
                "query_vector": [1.0, 0.0, 0.0, 0.0],
                "k": 2
            }},
            "inner_hits": {"name": "matched"}
        }}
    });
    let expr = parse_search(body, 10).unwrap();

    let msg = match index.search(&expr) {
        Err(e) => e.to_string(),
        Ok(_) => panic!("bad-field knn under nested inner_hits must error, not silently empty"),
    };
    assert!(
        msg.contains("nope") && msg.contains("unknown"),
        "error should name the unknown field: {msg}"
    );

    cleanup(&path);
}

// --- kNN recall survives a segment merge (regression for the
//     merge-persist silent-drop) ---

#[test]
fn knn_recall_survives_segment_merge() {
    // Regression for the merge-persist silent drop.
    //
    // `execute_merge` rewrites the global HNSW resolver in memory, but
    // `commit()` persists the vector index *before* `maybe_merge`. Before
    // the fix, the on-disk resolver kept pointing merged docs at the removed
    // source segments. The reader loads that stale copy, so every kNN hit
    // resolving to a merged-away segment was silently dropped — and after
    // `force_merge(1)` collapses every segment, that meant *zero* hits.
    // This is the unit-scale reproduction of the 600k recall collapse.
    // See [[vector-recall-investigation-audit]] H6.
    let path = test_dir("merge_recall");
    let mapping = Mapping::builder()
        .field("embedding", FieldType::dense_vector(4))
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();
    index.set_memory_budget(1); // one segment per document

    let n: usize = 16;
    let mut docs = Vec::with_capacity(n);
    for i in 0..n {
        docs.push(json!({"embedding": [i as f32 + 1.0, 1.0, 0.0, 0.0]}));
    }
    index.bulk(docs).unwrap();
    index.force_merge(1).unwrap(); // collapse all segments -> rewrites resolver

    // Ask for all n documents back through the global graph. A stale
    // persisted resolver would drop the merged docs.
    let body = json!({"query": {"knn": {
        "field": "embedding",
        "query_vector": [1.0, 1.0, 0.0, 0.0],
        "k": n,
        "num_candidates": 100
    }}});
    let results = search(&index, body, n);

    assert_eq!(
        results.len(),
        n,
        "merge left dangling resolver entries: kNN returned {} of {n} hits",
        results.len()
    );

    cleanup(&path);
}