lucisearch 0.8.1

Embeddable, in-process search engine — the SQLite/DuckDB of search
Documentation
//! Exit criteria integration tests for Milestone 3: Aggregations.

use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use serde_json::json;

/// Parses the JSON search request `query` and runs it against `index`.
///
/// `size` is handed to `parse_search` unchanged. Both a parse failure and a
/// failed search panic through `unwrap`: in these tests either one should
/// abort the test on the spot rather than be handled.
fn search(
    index: &mut Index,
    query: serde_json::Value,
    size: usize,
) -> luci::search::results::SearchResults {
    use luci::search::expression::parse_search;

    let parsed = parse_search(query, size).unwrap();
    let outcome = index.search(&parsed);
    outcome.unwrap()
}

/// Returns a scratch directory path for the test called `name`.
///
/// The path lives under the system temp directory and embeds the current
/// process id, so different test processes do not share a directory. Any
/// directory already present at that path is removed first; the directory
/// itself is NOT created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut dir = std::env::temp_dir();
    dir.push(format!("luci_m3_integration_{pid}_{name}"));

    // Best effort: the directory usually does not exist yet, and a failed
    // removal is deliberately ignored.
    let _ = std::fs::remove_dir_all(&dir);
    dir
}

/// Recursively deletes the scratch directory at `path`.
///
/// This is best effort: an error from the removal (for example because the
/// directory is already gone) is discarded instead of failing the test.
fn cleanup(path: &std::path::Path) {
    std::fs::remove_dir_all(path).ok();
}

fn build_test_index(name: &str) -> (std::path::PathBuf, Index) {
    let path = test_dir(name);
    let schema = Mapping::builder()
        .field("title", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .field("price", FieldType::Float)
        .field("quantity", FieldType::Integer)
        .build();
    let index = Index::create_with_mapping(&path, schema).unwrap();

    let docs = vec![
        json!({"title": "Widget A", "tag": "electronics", "price": 29.99, "quantity": 100}),
        json!({"title": "Widget B", "tag": "electronics", "price": 49.99, "quantity": 50}),
        json!({"title": "Gadget C", "tag": "electronics", "price": 99.99, "quantity": 25}),
        json!({"title": "Book D", "tag": "books", "price": 14.99, "quantity": 200}),
        json!({"title": "Book E", "tag": "books", "price": 24.99, "quantity": 150}),
        json!({"title": "Toy F", "tag": "toys", "price": 9.99, "quantity": 500}),
        json!({"title": "Toy G", "tag": "toys", "price": 19.99, "quantity": 300}),
        json!({"title": "Toy H", "tag": "toys", "price": 4.99, "quantity": 1000}),
    ];

    index.bulk(docs).unwrap();

    (path, index)
}

/// Exit Criterion 1: Terms aggregation returns correct bucket counts.
///
/// Runs a `match_all` query with a `terms` aggregation on `tag` and checks
/// the total hit count plus the per-tag `doc_count` of every bucket.
#[test]
fn terms_aggregation() {
    /// Returns the `doc_count` of the first bucket whose `key` equals
    /// `key`, or 0 when no bucket carries that key.
    fn doc_count_for(buckets: &[serde_json::Value], key: &str) -> u64 {
        for bucket in buckets {
            if bucket["key"].as_str() == Some(key) {
                return bucket["doc_count"].as_u64().unwrap();
            }
        }
        0
    }

    let (path, mut index) = build_test_index("terms");

    let request = json!({
        "query": {"match_all": {}},
        "aggs": {
            "by_tag": {"terms": {"field": "tag"}}
        }
    });
    let results = search(&mut index, request, 10);

    assert_eq!(results.total_hits().value, 8);

    let by_tag = results.aggregations()["by_tag"].to_json();
    let buckets = by_tag["buckets"].as_array().unwrap();

    assert_eq!(doc_count_for(buckets, "electronics"), 3);
    assert_eq!(doc_count_for(buckets, "books"), 2);
    assert_eq!(doc_count_for(buckets, "toys"), 3);

    cleanup(&path);
}

/// Exit Criterion 2: Metric aggregations produce correct results.
///
/// Requests `avg`, `sum`, `min`, `max` and `value_count` over `price` in a
/// single search and compares each against the value derived from the
/// fixture prices, using a 0.1 tolerance for the floating-point metrics.
#[test]
fn metric_aggregations() {
    // Prices of the eight fixture documents, in insertion order.
    const PRICES: [f64; 8] = [29.99, 49.99, 99.99, 14.99, 24.99, 9.99, 19.99, 4.99];

    let (path, mut index) = build_test_index("metrics");

    let request = json!({
        "query": {"match_all": {}},
        "aggs": {
            "avg_price": {"avg": {"field": "price"}},
            "total_price": {"sum": {"field": "price"}},
            "min_price": {"min": {"field": "price"}},
            "max_price": {"max": {"field": "price"}},
            "count": {"value_count": {"field": "price"}}
        }
    });
    let results = search(&mut index, request, 0);

    // Each of these aggregations is read from the "value" key of its JSON
    // form; a missing or non-numeric value panics.
    let metric = |agg_name: &str| -> f64 {
        results.aggregations()[agg_name].to_json()["value"]
            .as_f64()
            .unwrap()
    };

    let avg = metric("avg_price");
    let sum = metric("total_price");
    let min = metric("min_price");
    let max = metric("max_price");
    let count = metric("count");

    let expected_sum: f64 = PRICES.iter().sum();
    assert!(
        (sum - expected_sum).abs() < 0.1,
        "sum: {sum} expected: {expected_sum}"
    );
    assert!((avg - expected_sum / 8.0).abs() < 0.1);
    assert!((min - 4.99).abs() < 0.1);
    assert!((max - 99.99).abs() < 0.1);
    assert_eq!(count, 8.0);

    cleanup(&path);
}

/// Exit Criterion 3: Stats aggregation.
///
/// Runs a single `stats` aggregation over `price` and checks every
/// statistic it reports (`count`, `min`, `max`, `sum`, and the average,
/// which is read from the `value` key) against the value derived from the
/// fixture prices.
///
/// Each statistic is pinned with a two-sided 0.1 tolerance, the same one
/// `metric_aggregations` applies to the same data. One-sided bounds such as
/// `min < 5.0` would also accept grossly wrong results (a negative minimum,
/// a doubled sum), so they are not used here.
#[test]
fn stats_aggregation() {
    // Prices of the eight fixture documents, in insertion order.
    const PRICES: [f64; 8] = [29.99, 49.99, 99.99, 14.99, 24.99, 9.99, 19.99, 4.99];
    const TOLERANCE: f64 = 0.1;

    let (path, mut index) = build_test_index("stats");

    let results = search(
        &mut index,
        json!({
            "query": {"match_all": {}},
            "aggs": {
                "price_stats": {"stats": {"field": "price"}}
            }
        }),
        0,
    );

    let stats = results.aggregations()["price_stats"].to_json();

    // Reads one numeric statistic, failing with the whole payload in the
    // message when the key is absent or not a number.
    let stat = |key: &str| -> f64 {
        stats[key]
            .as_f64()
            .unwrap_or_else(|| panic!("stats field `{key}` missing or non-numeric: {stats}"))
    };

    let expected_sum: f64 = PRICES.iter().sum();
    let expected_avg = expected_sum / 8.0;

    let count = stat("count");
    let min = stat("min");
    let max = stat("max");
    let sum = stat("sum");
    let avg = stat("value"); // the average is exposed under "value"

    assert_eq!(count, 8.0, "count: {count}");
    assert!((min - 4.99).abs() < TOLERANCE, "min: {min}");
    assert!((max - 99.99).abs() < TOLERANCE, "max: {max}");
    assert!(
        (sum - expected_sum).abs() < TOLERANCE,
        "sum: {sum} expected: {expected_sum}"
    );
    assert!(
        (avg - expected_avg).abs() < TOLERANCE,
        "avg: {avg} expected: {expected_avg}"
    );

    cleanup(&path);
}

/// Exit Criterion 4: Aggregations with a filtered query.
///
/// Restricts the search to `tag == "electronics"` with a `term` query and
/// checks that the `avg` aggregation only covers the three matching
/// documents (prices 29.99, 49.99 and 99.99, averaging 59.99).
#[test]
fn aggs_with_query_filter() {
    let (path, mut index) = build_test_index("filtered_aggs");

    let request = json!({
        "query": {"term": {"tag": "electronics"}},
        "aggs": {
            "avg_price": {"avg": {"field": "price"}}
        }
    });
    let results = search(&mut index, request, 10);

    assert_eq!(results.total_hits().value, 3);

    let avg_json = results.aggregations()["avg_price"].to_json();
    let avg = avg_json["value"].as_f64().unwrap();
    let deviation = (avg - 59.99).abs();
    assert!(deviation < 1.0, "avg: {avg}");

    cleanup(&path);
}

/// Exit Criterion 5: Sub-aggregation nesting (terms → avg).
///
/// Nests an `avg` on `price` under a `terms` aggregation on `tag` and
/// checks the per-bucket average for the `electronics` and `books`
/// buckets.
#[test]
fn sub_aggregation_nesting() {
    /// Returns the first bucket whose `key` equals `key`, if any.
    fn bucket_for<'a>(
        buckets: &'a [serde_json::Value],
        key: &str,
    ) -> Option<&'a serde_json::Value> {
        buckets
            .iter()
            .find(|bucket| bucket["key"].as_str() == Some(key))
    }

    let (path, mut index) = build_test_index("nesting");

    let request = json!({
        "query": {"match_all": {}},
        "aggs": {
            "by_tag": {
                "terms": {"field": "tag"},
                "aggs": {
                    "avg_price": {"avg": {"field": "price"}}
                }
            }
        }
    });
    let results = search(&mut index, request, 0);

    let by_tag = results.aggregations()["by_tag"].to_json();
    let buckets = by_tag["buckets"].as_array().unwrap();

    // electronics: 3 docs, (29.99 + 49.99 + 99.99) / 3 = 59.99
    let electronics = bucket_for(buckets, "electronics").expect("should have electronics bucket");
    assert_eq!(electronics["doc_count"], 3);

    let avg = electronics["avg_price"]["value"].as_f64().unwrap();
    assert!((avg - 59.99).abs() < 1.0, "electronics avg_price={avg}");

    // books: (14.99 + 24.99) / 2 = 19.99
    let books = bucket_for(buckets, "books").expect("should have books bucket");
    let books_avg = books["avg_price"]["value"].as_f64().unwrap();
    assert!(
        (books_avg - 19.99).abs() < 1.0,
        "books avg_price={books_avg}"
    );

    cleanup(&path);
}

/// Exit Criterion 6: Histogram aggregation.
///
/// Buckets `price` with an interval of 25.0 and checks that at least one
/// bucket is returned and that bucket keys are in non-decreasing order.
#[test]
fn histogram_aggregation() {
    let (path, mut index) = build_test_index("histogram");

    let request = json!({
        "query": {"match_all": {}},
        "aggs": {
            "price_hist": {"histogram": {"field": "price", "interval": 25.0}}
        }
    });
    let results = search(&mut index, request, 0);

    let hist = results.aggregations()["price_hist"].to_json();
    let buckets = hist["buckets"].as_array().unwrap();
    assert!(!buckets.is_empty(), "histogram should have buckets");

    // Compare every adjacent pair of buckets; a single bucket yields no
    // pairs and is trivially ordered.
    for pair in buckets.windows(2) {
        let lower = pair[0]["key"].as_f64().unwrap();
        let upper = pair[1]["key"].as_f64().unwrap();
        assert!(lower <= upper, "histogram buckets should be ordered");
    }

    cleanup(&path);
}

/// Five bucket aggregations (`range`, `date_range`, `histogram`,
/// `date_histogram`, `geohash_grid`) parse `sub_aggs` but do not implement
/// them yet. They must fail loudly with an explicit "not yet supported"
/// error instead of silently discarding the sub-aggregation. This mirrors
/// the `filters` honest-refusal and closes the same [[code-must-not-lie]]
/// family — see `bucket-agg-sub-aggs-silent-drop`. Each aggregation is
/// valid on its own; only the agg + `sub_aggs` *combination* is refused.
#[test]
fn bucket_aggs_refuse_unsupported_sub_aggs() {
    let (path, index) = build_test_index("subaggs_refused");

    // (agg type name, bare agg body without any sub-aggregation)
    let cases = [
        (
            "range",
            json!({"range": {"field": "price", "ranges": [{"to": 50.0}, {"from": 50.0}]}}),
        ),
        (
            "date_range",
            json!({"date_range": {"field": "price", "ranges": [{"to": 50.0}]}}),
        ),
        (
            "histogram",
            json!({"histogram": {"field": "price", "interval": 25.0}}),
        ),
        (
            "date_histogram",
            json!({"date_histogram": {"field": "price", "calendar_interval": "day"}}),
        ),
        (
            "geohash_grid",
            json!({"geohash_grid": {"field": "price", "precision": 5, "size": 10}}),
        ),
    ];

    for (name, mut agg_body) in cases {
        // Add a well-formed sub-aggregation next to the agg-type key, giving
        // `{"b": {"<type>": {...}, "aggs": {...}}}`. The request is expected
        // to parse; the refusal asserted below happens when it is executed.
        agg_body["aggs"] = json!({"inner": {"value_count": {"field": "tag"}}});

        let request = json!({"query": {"match_all": {}}, "aggs": {"b": agg_body}});
        let expr = luci::search::expression::parse_search(request, 0).unwrap();

        let err = index.search(&expr).err().unwrap_or_else(|| {
            panic!("{name} agg with sub_aggs must be refused, not silently dropped")
        });

        let msg = err.to_string();
        let names_agg = msg.contains(name);
        let says_unsupported = msg.contains("not yet supported");
        assert!(
            names_agg && says_unsupported,
            "{name}: error should name the agg and 'not yet supported': {msg}"
        );
    }

    cleanup(&path);
}

/// The refusal must also surface when the offending bucket agg is itself a
/// *sub-aggregation* (here a `range` with its own `aggs`, nested under a
/// `terms` agg). That goes through the `bind_sub_aggs` propagation path,
/// which is separate from the top-level `execute_query` path.
#[test]
fn nested_refused_bucket_agg_sub_aggs_errors() {
    let (path, index) = build_test_index("nested_refused");

    let request = json!({
        "query": {"match_all": {}},
        "aggs": {"by_tag": {
            "terms": {"field": "tag"},
            "aggs": {"r": {
                "range": {"field": "price", "ranges": [{"to": 50.0}]},
                "aggs": {"inner": {"value_count": {"field": "tag"}}}
            }}
        }}
    });
    // Parsing is expected to succeed; the error must come from the search.
    let expr = luci::search::expression::parse_search(request, 0).unwrap();

    let err = index.search(&expr).err().unwrap_or_else(|| {
        panic!("range sub-agg with its own sub_aggs must error via bind_sub_aggs")
    });

    let msg = err.to_string();
    let names_agg = msg.contains("range");
    let says_unsupported = msg.contains("not yet supported");
    assert!(
        names_agg && says_unsupported,
        "nested refusal should name the agg: {err}"
    );

    cleanup(&path);
}

/// Metric/leaf aggregations cannot carry sub-aggregations. Elasticsearch
/// rejects the combination outright: it is invalid input, not a deferred
/// feature. Luci must therefore reject it at parse time instead of
/// silently dropping the `sub_aggs`, which the metric arms of
/// `parse_agg_expr` would otherwise do. The guard is central
/// (`agg_type_accepts_sub_aggs`), so this probes a spread of leaf families
/// (simple metric, multi-value stats, cardinality, percentiles, top_hits)
/// to show it covers the whole family and not just `avg`. See
/// [[code-must-not-lie]].
#[test]
fn metric_agg_sub_aggs_rejected_at_parse() {
    // (agg type name, bare leaf agg body without any sub-aggregation)
    let leaf_bodies = [
        ("avg", json!({"avg": {"field": "price"}})),
        ("sum", json!({"sum": {"field": "price"}})),
        ("value_count", json!({"value_count": {"field": "tag"}})),
        ("stats", json!({"stats": {"field": "price"}})),
        ("cardinality", json!({"cardinality": {"field": "tag"}})),
        ("percentiles", json!({"percentiles": {"field": "price"}})),
        ("top_hits", json!({"top_hits": {"size": 1}})),
    ];

    for (type_name, mut body) in leaf_bodies {
        // Add a well-formed sub-aggregation next to the type key. It is valid
        // on its own; the metric + sub_aggs combination is what the parser
        // has to reject before anything gets dropped.
        body["aggs"] = json!({"inner": {"value_count": {"field": "tag"}}});

        let request = json!({"query": {"match_all": {}}, "aggs": {"a": body}});
        let err = luci::search::expression::parse_search(request, 0)
            .err()
            .unwrap_or_else(|| {
                panic!("{type_name} agg with sub_aggs must be rejected at parse, not dropped")
            });

        let msg = err.to_string();
        let names_type = msg.contains(type_name);
        let says_cannot = msg.contains("cannot have sub-aggregations");
        assert!(
            names_type && says_cannot,
            "{type_name}: error should name the type and ES 'cannot have' semantics: {msg}"
        );
    }
}