// lucisearch 0.8.0
//
// Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
// Documentation
//! SearchExpression: the top-level search request DSL.
//!
//! A composable description of a search request: what to match (query),
//! what to compute (aggregations), how to order (sort), and how to
//! paginate (size, from, search_after, collapse).
//!
//! Consumers build expressions directly (Rust) or parse them from JSON
//! at the edge (Python SDK, CLI).
//!
//! See [[architecture-scoring-materialization-separation]].

use crate::core::LuciError;

use crate::agg::AggregationExpression;
use crate::query::ast::{QueryExpression, ScoringExpression};
use crate::query::parser::{opt_f64, opt_str, opt_u64, parse_query, parse_query_expression};
use crate::search::{SortField, SortValue, TrackTotalHits};

/// Recognized top-level keys in a structured search request.
///
/// Used by the strict-validation pass in [`SearchExpression::from_json`]
/// to reject typos (``siez``, ``quary``) and by [`is_bare_query`] to
/// distinguish a bare query body (e.g. `{"match": {...}}`) from a
/// structured search with a single search-level key.
///
/// ``highlight`` and ``explain`` are deliberately absent: highlighting
/// is retrieved lazily per-hit via ``Hit.highlight(field)`` and explain
/// is always produced for scoring queries. Accepting either as a
/// top-level key would silently drop the user's input — see CLAUDE.md
/// "Never silently drop user input" rule.
///
/// ``aggs`` and ``aggregations`` are both listed because the parser
/// accepts either spelling for the aggregation section.
///
/// NOTE(review): ``_source`` and ``fields`` pass validation here, but
/// [`SearchExpression::from_json`] in this file never reads them.
/// Presumably they are consumed by a caller that still holds the raw
/// JSON — confirm, otherwise they fall under the same "silently
/// dropped" rule described above.
const SEARCH_LEVEL_KEYS: &[&str] = &[
    "query",
    "aggs",
    "aggregations",
    "size",
    "from",
    "sort",
    "search_after",
    "collapse",
    "track_total_hits",
    "rescore",
    "_source",
    "fields",
];

/// Decide whether `json` is a bare query body rather than a structured
/// search request.
///
/// A bare query is an object with exactly one key, and that key is not
/// one of [`SEARCH_LEVEL_KEYS`] (``size``, ``sort``, …). This lets
/// ``idx.search({"match": {...}})`` skip strict top-level validation and
/// go straight to the query parser.
///
/// Non-objects, empty objects and multi-key objects all yield `false`.
fn is_bare_query(json: &serde_json::Value) -> bool {
    match json.as_object() {
        // With exactly one entry, "every key is unknown at search level"
        // is the same as "the only key is unknown at search level".
        Some(map) if map.len() == 1 => map
            .keys()
            .all(|name| !SEARCH_LEVEL_KEYS.contains(&name.as_str())),
        _ => false,
    }
}

/// Check that `val` is a JSON object whose keys all appear in
/// `expected`, and hand back the underlying map so callers can chain
/// reads (`obj.get("foo")`).
///
/// `ctx` is the prefix used in error messages (e.g. `"collapse"`).
///
/// # Errors
///
/// Returns `LuciError::InvalidQuery` when `val` is not an object, or on
/// the first key (in map iteration order) that is not in `expected`.
/// The wording mirrors serde's "unknown field `X`, expected one of ..."
/// so users get the same guidance wherever the check fires.
pub(crate) fn validate_obj_keys<'a>(
    val: &'a serde_json::Value,
    expected: &[&str],
    ctx: &str,
) -> crate::core::Result<&'a serde_json::Map<String, serde_json::Value>> {
    let Some(map) = val.as_object() else {
        return Err(crate::core::LuciError::InvalidQuery(format!(
            "{ctx}: must be an object"
        )));
    };

    let offending = map
        .keys()
        .find(|name| !expected.contains(&name.as_str()));

    if let Some(unknown) = offending {
        // Only build the allow-list text on the error path.
        let allowed: Vec<String> = expected.iter().map(|name| format!("`{name}`")).collect();
        return Err(crate::core::LuciError::InvalidQuery(format!(
            "{ctx}: unknown field `{unknown}`, expected one of {}",
            allowed.join(", ")
        )));
    }

    Ok(map)
}

/// Top-level search-body key validation against [`SEARCH_LEVEL_KEYS`].
///
/// Performs the same check as [`validate_obj_keys`], but works on an
/// already-extracted map and uses the fixed `invalid search request`
/// message prefix.
///
/// # Errors
///
/// Returns `LuciError::InvalidQuery` naming the first key (in map
/// iteration order) that is not a recognized search-level key, followed
/// by the full list of accepted keys.
fn validate_search_keys(
    obj: &serde_json::Map<String, serde_json::Value>,
) -> crate::core::Result<()> {
    let offending = obj
        .keys()
        .find(|name| !SEARCH_LEVEL_KEYS.contains(&name.as_str()));

    match offending {
        None => Ok(()),
        Some(unknown) => {
            let allowed: Vec<String> = SEARCH_LEVEL_KEYS
                .iter()
                .map(|name| format!("`{name}`"))
                .collect();
            Err(crate::core::LuciError::InvalidQuery(format!(
                "invalid search request: unknown field `{unknown}`, expected one of {}",
                allowed.join(", ")
            )))
        }
    }
}

/// A complete search request.
///
/// The engine's native search input. `Index.search()` takes this.
/// JSON parsing at the edge (`SearchExpression::from_json`) produces this.
///
/// The `query` field accepts any `QueryExpression` — scoring queries
/// (match, term, knn, bool) and ranking expressions (fusion/RRF).
///
/// All fields are `pub(crate)`; code outside the crate populates them
/// through the builder methods (`query`, `size`, `sort`, …) or through
/// `from_json`.
pub struct SearchExpression {
    /// The query — scoring or ranking expression. None means match_all.
    pub(crate) query: Option<QueryExpression>,
    /// Named aggregation definitions, as `(name, definition)` pairs in
    /// the order they were added.
    pub(crate) aggs: Vec<(String, AggregationExpression)>,
    /// Maximum hits to return. `new()` sets this to 10.
    pub(crate) size: usize,
    /// Pagination offset. `new()` sets this to 0.
    pub(crate) from: usize,
    /// Sort specification. None = sort by score.
    pub(crate) sort: Option<Vec<SortField>>,
    /// Field collapse (deduplication by field value). Holds the name of
    /// the field to collapse on.
    pub(crate) collapse: Option<String>,
    /// Keyset pagination cursor.
    pub(crate) search_after: Option<Vec<SortValue>>,
    /// Total hits tracking mode. `new()` sets this to
    /// `TrackTotalHits::Exact`.
    pub(crate) track_total_hits: TrackTotalHits,
    /// Rescore specification (optional second-pass re-ranking).
    pub(crate) rescore: Option<RescoreSpec>,
}

/// Rescore specification for second-pass re-ranking.
///
/// Note that `query` is `pub(crate)` while the remaining fields are
/// `pub`, so code outside this crate can read and adjust the tuning
/// fields but cannot build a `RescoreSpec` with a struct literal.
pub struct RescoreSpec {
    /// The compiled rescore query. The JSON parser fills this from
    /// `rescore.query.rescore_query`.
    pub(crate) query: Box<dyn crate::query::Query>,
    /// Rescore window size; the JSON parser defaults it to 10.
    /// Presumably the number of top first-pass hits that get rescored —
    /// confirm against the search executor.
    pub window_size: usize,
    /// Weight for the original query score; the JSON parser defaults it
    /// to 1.0.
    pub query_weight: f32,
    /// Weight for the rescore query score; the JSON parser defaults it
    /// to 1.0.
    pub rescore_query_weight: f32,
    /// How the two scores are combined; the JSON parser defaults it to
    /// `RescoreScoreMode::Total`.
    pub score_mode: crate::search::RescoreScoreMode,
}

impl SearchExpression {
    /// Build an expression with the defaults: no query (match_all), no
    /// aggregations, `size` 10, `from` 0, score ordering, exact total-hit
    /// tracking, and no collapse, cursor or rescore.
    pub fn new() -> Self {
        Self {
            query: None,
            aggs: Vec::new(),
            sort: None,
            collapse: None,
            search_after: None,
            rescore: None,
            size: 10,
            from: 0,
            track_total_hits: TrackTotalHits::Exact,
        }
    }

    /// Replace the query expression (scoring or ranking).
    pub fn query(self, query: QueryExpression) -> Self {
        Self {
            query: Some(query),
            ..self
        }
    }

    /// Replace the query with a scoring query. Convenience wrapper that
    /// lifts `query` into `QueryExpression::Scoring`.
    pub fn scoring_query(self, query: ScoringExpression) -> Self {
        self.query(QueryExpression::Scoring(query))
    }

    /// Append a named aggregation. Earlier aggregations are kept.
    pub fn agg(mut self, name: impl Into<String>, agg: AggregationExpression) -> Self {
        let entry = (name.into(), agg);
        self.aggs.push(entry);
        self
    }

    /// Replace the maximum number of hits to return.
    pub fn size(self, size: usize) -> Self {
        Self { size, ..self }
    }

    /// Replace the pagination offset.
    pub fn from(self, from: usize) -> Self {
        Self { from, ..self }
    }

    /// Replace the sort specification.
    pub fn sort(self, sort: Vec<SortField>) -> Self {
        Self {
            sort: Some(sort),
            ..self
        }
    }

    /// Replace the field to collapse on.
    pub fn collapse(self, field: impl Into<String>) -> Self {
        Self {
            collapse: Some(field.into()),
            ..self
        }
    }

    /// Replace the search_after cursor.
    pub fn search_after(self, cursor: Vec<SortValue>) -> Self {
        Self {
            search_after: Some(cursor),
            ..self
        }
    }

    /// Replace the total-hits tracking mode.
    pub fn track_total_hits(self, mode: TrackTotalHits) -> Self {
        Self {
            track_total_hits: mode,
            ..self
        }
    }

    /// Replace the rescore specification.
    pub fn rescore(self, rescore: RescoreSpec) -> Self {
        Self {
            rescore: Some(rescore),
            ..self
        }
    }
}

impl Default for SearchExpression {
    fn default() -> Self {
        Self::new()
    }
}

/// Parse an ES-compatible JSON search request into a SearchExpression.
///
/// This is the edge parser — converts the JSON wire format into the
/// engine's native expression type. Called by Python SDK and CLI.
///
/// Accepts both bare queries (`{"match": {...}}`) and structured
/// requests (`{"query": {...}, "aggs": {...}, "size": 10}`).
///
/// `default_size` is the hit limit applied when the request carries no
/// explicit `size`.
///
/// This is a free-function alias for [`SearchExpression::from_json`];
/// see that method for the full set of accepted keys and errors.
pub fn parse_search(
    json: serde_json::Value,
    default_size: usize,
) -> Result<SearchExpression, crate::core::LuciError> {
    SearchExpression::from_json(json, default_size)
}

impl SearchExpression {
    /// Parse an ES-compatible JSON search request.
    ///
    /// Accepts both bare queries (`{"match": {...}}`) and structured
    /// requests (`{"query": {...}, "aggs": {...}, "size": 10}`).
    pub fn from_json(
        json: serde_json::Value,
        default_size: usize,
    ) -> Result<SearchExpression, crate::core::LuciError> {
        let mut expr = SearchExpression::new();

        // Bare query fast path: a single-key object whose key is a
        // query type ("match", "term", …) — not a search-level key.
        // Non-object inputs also take this path so the existing
        // error message from `parse_query_expression` fires.
        if !json.is_object() || is_bare_query(&json) {
            expr.query = Some(parse_query_expression(&json)?);
            expr.size = default_size;
            return Ok(expr);
        }

        // Structured request: reject unknown top-level keys up-front.
        // The borrow of `obj` is scoped so the subsequent `json.get`
        // calls below still work against the same owned value.
        let json_obj = json.as_object().expect("is_object checked above");
        validate_search_keys(json_obj)?;

        if let Some(q) = json.get("query") {
            expr.query = Some(parse_query_expression(q)?);
        }

        if let Some(aggs_json) = json.get("aggs").or_else(|| json.get("aggregations")) {
            expr.aggs = crate::agg::parser::parse_aggs(aggs_json)?;
        }

        expr.size = opt_u64(json_obj, "size", "search")?
            .map(|v| v as usize)
            .unwrap_or(default_size);
        expr.from = opt_u64(json_obj, "from", "search")?
            .map(|v| v as usize)
            .unwrap_or(0);

        expr.sort = crate::index::parse_sort(json.get("sort"))?;
        expr.search_after = crate::index::parse_search_after(json.get("search_after"))?;

        if let Some(collapse_val) = json.get("collapse") {
            // Only `field` is honoured today; ES also has `inner_hits`
            // and `max_concurrent_group_searches`. Adding those to the
            // allow-list is a follow-up once the engine supports them.
            let obj = validate_obj_keys(collapse_val, &["field"], "collapse")?;
            expr.collapse = opt_str(obj, "field", "collapse")?.map(String::from);
        }

        expr.track_total_hits = match json.get("track_total_hits") {
            Some(serde_json::Value::Bool(true)) | None => TrackTotalHits::Exact,
            Some(serde_json::Value::Bool(false)) => TrackTotalHits::Disabled,
            Some(serde_json::Value::Number(n)) => {
                TrackTotalHits::UpTo(n.as_u64().ok_or_else(|| {
                    LuciError::InvalidQuery(
                        "track_total_hits: integer count must be a non-negative integer".into(),
                    )
                })?)
            }
            Some(other) => {
                return Err(LuciError::InvalidQuery(format!(
                    "track_total_hits: must be a boolean or integer, got {other}"
                )));
            }
        };

        if let Some(rescore_val) = json.get("rescore") {
            let rescore_obj = validate_obj_keys(rescore_val, &["window_size", "query"], "rescore")?;
            let window_size = opt_u64(rescore_obj, "window_size", "rescore")?
                .map(|v| v as usize)
                .unwrap_or(10);
            let inner_query = rescore_obj.get("query");
            let inner_obj = match inner_query {
                Some(v) => Some(validate_obj_keys(
                    v,
                    &[
                        "rescore_query",
                        "query_weight",
                        "rescore_query_weight",
                        "score_mode",
                    ],
                    "rescore.query",
                )?),
                None => None,
            };
            if let Some(rq) = inner_obj.and_then(|o| o.get("rescore_query")) {
                let rescore_query: Box<dyn crate::query::Query> = Box::new(parse_query(rq)?);
                let inner = inner_obj.expect("inner_obj checked above");
                let query_weight = opt_f64(inner, "query_weight", "rescore.query")?
                    .map(|v| v as f32)
                    .unwrap_or(1.0);
                let rescore_query_weight = opt_f64(inner, "rescore_query_weight", "rescore.query")?
                    .map(|v| v as f32)
                    .unwrap_or(1.0);
                let score_mode = match opt_str(inner, "score_mode", "rescore.query")? {
                    Some("multiply") => crate::search::RescoreScoreMode::Multiply,
                    Some("avg") => crate::search::RescoreScoreMode::Avg,
                    Some("max") => crate::search::RescoreScoreMode::Max,
                    Some("min") => crate::search::RescoreScoreMode::Min,
                    Some("total") | None => crate::search::RescoreScoreMode::Total,
                    Some(other) => {
                        return Err(crate::core::LuciError::InvalidQuery(format!(
                            "rescore.query.score_mode: unknown value '{other}', expected \
                             one of `total`, `multiply`, `avg`, `max`, `min`"
                        )));
                    }
                };
                expr.rescore = Some(RescoreSpec {
                    query: rescore_query,
                    window_size,
                    query_weight,
                    rescore_query_weight,
                    score_mode,
                });
            }
        }

        Ok(expr)
    }
}