Skip to main content

luci/search/
expression.rs

1//! SearchExpression: the top-level search request DSL.
2//!
3//! A composable description of a search request: what to match (query),
4//! what to compute (aggregations), how to order (sort), and how to
5//! paginate (size, from, search_after, collapse).
6//!
7//! Consumers build expressions directly (Rust) or parse them from JSON
8//! at the edge (Python SDK, CLI).
9//!
10//! See [[architecture-scoring-materialization-separation]].
11
12use crate::core::LuciError;
13
14use crate::agg::AggregationExpression;
15use crate::query::ast::{QueryExpression, ScoringExpression};
16use crate::query::parser::{opt_f64, opt_str, opt_u64, parse_query, parse_query_expression};
17use crate::search::{SortField, SortValue, TrackTotalHits};
18
/// Allow-list of top-level keys accepted in a structured search request.
///
/// Two consumers rely on this list: the strict-validation pass in
/// [`SearchExpression::from_json`], which rejects misspelled keys
/// (``siez``, ``quary``), and [`is_bare_query`], which uses it to tell a
/// bare query body (e.g. `{"match": {...}}`) apart from a structured
/// search that happens to carry a single search-level key.
///
/// The entries appear in the "expected one of ..." error message in the
/// order they are declared here, so reordering changes user-visible text.
///
/// ``highlight`` and ``explain`` are intentionally not listed:
/// highlighting is fetched lazily per hit through
/// ``Hit.highlight(field)`` and explain is always produced for scoring
/// queries. Accepting either key here would silently discard what the
/// user sent — see the "Never silently drop user input" rule in
/// CLAUDE.md.
const SEARCH_LEVEL_KEYS: &[&str] = &[
    // What to match and what to compute.
    "query",
    "aggs",
    "aggregations",
    // Pagination and ordering.
    "size",
    "from",
    "sort",
    "search_after",
    "collapse",
    // Result shaping.
    "track_total_hits",
    "rescore",
    "_source",
    "fields",
];
45
46/// A bare query body is a single-key object whose key is *not* a
47/// top-level search-level key (``size``, ``sort``, etc.). Routes
48/// callers like ``idx.search({"match": {...}})`` to the fast path
49/// without going through the strict-validation step.
50fn is_bare_query(json: &serde_json::Value) -> bool {
51    let Some(obj) = json.as_object() else {
52        return false;
53    };
54    if obj.len() != 1 {
55        return false;
56    }
57    let key = obj.keys().next().expect("checked len == 1");
58    !SEARCH_LEVEL_KEYS.contains(&key.as_str())
59}
60
61/// Reject object keys outside the expected allow-list. Returns the
62/// underlying map for chained reads (`obj.get("foo")`).
63///
64/// Error format mirrors serde's "unknown field `X`, expected one of
65/// ..." so users get consistent guidance regardless of where the
66/// check fires.
67pub(crate) fn validate_obj_keys<'a>(
68    val: &'a serde_json::Value,
69    expected: &[&str],
70    ctx: &str,
71) -> crate::core::Result<&'a serde_json::Map<String, serde_json::Value>> {
72    let obj = val
73        .as_object()
74        .ok_or_else(|| crate::core::LuciError::InvalidQuery(format!("{ctx}: must be an object")))?;
75    for key in obj.keys() {
76        if !expected.contains(&key.as_str()) {
77            let expected_list = expected
78                .iter()
79                .map(|k| format!("`{k}`"))
80                .collect::<Vec<_>>()
81                .join(", ");
82            return Err(crate::core::LuciError::InvalidQuery(format!(
83                "{ctx}: unknown field `{key}`, expected one of {expected_list}"
84            )));
85        }
86    }
87    Ok(obj)
88}
89
90/// Top-level search-body key validation. Delegates to
91/// [`validate_obj_keys`] with the curated top-level allow-list.
92fn validate_search_keys(
93    obj: &serde_json::Map<String, serde_json::Value>,
94) -> crate::core::Result<()> {
95    for key in obj.keys() {
96        if !SEARCH_LEVEL_KEYS.contains(&key.as_str()) {
97            let expected = SEARCH_LEVEL_KEYS
98                .iter()
99                .map(|k| format!("`{k}`"))
100                .collect::<Vec<_>>()
101                .join(", ");
102            return Err(crate::core::LuciError::InvalidQuery(format!(
103                "invalid search request: unknown field `{key}`, expected one of {expected}"
104            )));
105        }
106    }
107    Ok(())
108}
109
110/// A complete search request.
111///
112/// The engine's native search input. `Index.search()` takes this.
113/// JSON parsing at the edge (`SearchExpression::from_json`) produces this.
114///
115/// The `query` field accepts any `QueryExpression` — scoring queries
116/// (match, term, knn, bool) and ranking expressions (fusion/RRF).
117pub struct SearchExpression {
118    /// The query — scoring or ranking expression. None means match_all.
119    pub(crate) query: Option<QueryExpression>,
120    /// Named aggregation definitions.
121    pub(crate) aggs: Vec<(String, AggregationExpression)>,
122    /// Maximum hits to return.
123    pub(crate) size: usize,
124    /// Pagination offset.
125    pub(crate) from: usize,
126    /// Sort specification. None = sort by score.
127    pub(crate) sort: Option<Vec<SortField>>,
128    /// Field collapse (deduplication by field value).
129    pub(crate) collapse: Option<String>,
130    /// Keyset pagination cursor.
131    pub(crate) search_after: Option<Vec<SortValue>>,
132    /// Total hits tracking mode.
133    pub(crate) track_total_hits: TrackTotalHits,
134    /// Rescore specification (optional second-pass re-ranking).
135    pub(crate) rescore: Option<RescoreSpec>,
136}
137
138/// Rescore specification for second-pass re-ranking.
139pub struct RescoreSpec {
140    pub(crate) query: Box<dyn crate::query::Query>,
141    pub window_size: usize,
142    pub query_weight: f32,
143    pub rescore_query_weight: f32,
144    pub score_mode: crate::search::RescoreScoreMode,
145}
146
147impl SearchExpression {
148    /// Create a new expression with defaults (match_all, size=10).
149    pub fn new() -> Self {
150        Self {
151            query: None,
152            aggs: Vec::new(),
153            size: 10,
154            from: 0,
155            sort: None,
156            collapse: None,
157            search_after: None,
158            track_total_hits: TrackTotalHits::Exact,
159            rescore: None,
160        }
161    }
162
163    /// Set the query expression (scoring or ranking).
164    pub fn query(mut self, query: QueryExpression) -> Self {
165        self.query = Some(query);
166        self
167    }
168
169    /// Set a scoring query (convenience — wraps in `QueryExpression::Scoring`).
170    pub fn scoring_query(mut self, query: ScoringExpression) -> Self {
171        self.query = Some(QueryExpression::Scoring(query));
172        self
173    }
174
175    /// Add a named aggregation.
176    pub fn agg(mut self, name: impl Into<String>, agg: AggregationExpression) -> Self {
177        self.aggs.push((name.into(), agg));
178        self
179    }
180
181    /// Set the maximum number of hits to return.
182    pub fn size(mut self, size: usize) -> Self {
183        self.size = size;
184        self
185    }
186
187    /// Set the pagination offset.
188    pub fn from(mut self, from: usize) -> Self {
189        self.from = from;
190        self
191    }
192
193    /// Set the sort specification.
194    pub fn sort(mut self, sort: Vec<SortField>) -> Self {
195        self.sort = Some(sort);
196        self
197    }
198
199    /// Set the collapse field.
200    pub fn collapse(mut self, field: impl Into<String>) -> Self {
201        self.collapse = Some(field.into());
202        self
203    }
204
205    /// Set the search_after cursor.
206    pub fn search_after(mut self, cursor: Vec<SortValue>) -> Self {
207        self.search_after = Some(cursor);
208        self
209    }
210
211    /// Set total hits tracking mode.
212    pub fn track_total_hits(mut self, mode: TrackTotalHits) -> Self {
213        self.track_total_hits = mode;
214        self
215    }
216
217    /// Set a rescore specification.
218    pub fn rescore(mut self, rescore: RescoreSpec) -> Self {
219        self.rescore = Some(rescore);
220        self
221    }
222}
223
224impl Default for SearchExpression {
225    fn default() -> Self {
226        Self::new()
227    }
228}
229
230/// Parse an ES-compatible JSON search request into a SearchExpression.
231///
232/// This is the edge parser — converts the JSON wire format into the
233/// engine's native expression type. Called by Python SDK and CLI.
234///
235/// Accepts both bare queries (`{"match": {...}}`) and structured
236/// requests (`{"query": {...}, "aggs": {...}, "size": 10}`).
237pub fn parse_search(
238    json: serde_json::Value,
239    default_size: usize,
240) -> Result<SearchExpression, crate::core::LuciError> {
241    SearchExpression::from_json(json, default_size)
242}
243
244impl SearchExpression {
245    /// Parse an ES-compatible JSON search request.
246    ///
247    /// Accepts both bare queries (`{"match": {...}}`) and structured
248    /// requests (`{"query": {...}, "aggs": {...}, "size": 10}`).
249    pub fn from_json(
250        json: serde_json::Value,
251        default_size: usize,
252    ) -> Result<SearchExpression, crate::core::LuciError> {
253        let mut expr = SearchExpression::new();
254
255        // Bare query fast path: a single-key object whose key is a
256        // query type ("match", "term", …) — not a search-level key.
257        // Non-object inputs also take this path so the existing
258        // error message from `parse_query_expression` fires.
259        if !json.is_object() || is_bare_query(&json) {
260            expr.query = Some(parse_query_expression(&json)?);
261            expr.size = default_size;
262            return Ok(expr);
263        }
264
265        // Structured request: reject unknown top-level keys up-front.
266        // The borrow of `obj` is scoped so the subsequent `json.get`
267        // calls below still work against the same owned value.
268        let json_obj = json.as_object().expect("is_object checked above");
269        validate_search_keys(json_obj)?;
270
271        if let Some(q) = json.get("query") {
272            expr.query = Some(parse_query_expression(q)?);
273        }
274
275        if let Some(aggs_json) = json.get("aggs").or_else(|| json.get("aggregations")) {
276            expr.aggs = crate::agg::parser::parse_aggs(aggs_json)?;
277        }
278
279        expr.size = opt_u64(json_obj, "size", "search")?
280            .map(|v| v as usize)
281            .unwrap_or(default_size);
282        expr.from = opt_u64(json_obj, "from", "search")?
283            .map(|v| v as usize)
284            .unwrap_or(0);
285
286        expr.sort = crate::index::parse_sort(json.get("sort"))?;
287        expr.search_after = crate::index::parse_search_after(json.get("search_after"))?;
288
289        if let Some(collapse_val) = json.get("collapse") {
290            // Only `field` is honoured today; ES also has `inner_hits`
291            // and `max_concurrent_group_searches`. Adding those to the
292            // allow-list is a follow-up once the engine supports them.
293            let obj = validate_obj_keys(collapse_val, &["field"], "collapse")?;
294            expr.collapse = opt_str(obj, "field", "collapse")?.map(String::from);
295        }
296
297        expr.track_total_hits = match json.get("track_total_hits") {
298            Some(serde_json::Value::Bool(true)) | None => TrackTotalHits::Exact,
299            Some(serde_json::Value::Bool(false)) => TrackTotalHits::Disabled,
300            Some(serde_json::Value::Number(n)) => {
301                TrackTotalHits::UpTo(n.as_u64().ok_or_else(|| {
302                    LuciError::InvalidQuery(
303                        "track_total_hits: integer count must be a non-negative integer".into(),
304                    )
305                })?)
306            }
307            Some(other) => {
308                return Err(LuciError::InvalidQuery(format!(
309                    "track_total_hits: must be a boolean or integer, got {other}"
310                )));
311            }
312        };
313
314        if let Some(rescore_val) = json.get("rescore") {
315            let rescore_obj = validate_obj_keys(rescore_val, &["window_size", "query"], "rescore")?;
316            let window_size = opt_u64(rescore_obj, "window_size", "rescore")?
317                .map(|v| v as usize)
318                .unwrap_or(10);
319            let inner_query = rescore_obj.get("query");
320            let inner_obj = match inner_query {
321                Some(v) => Some(validate_obj_keys(
322                    v,
323                    &[
324                        "rescore_query",
325                        "query_weight",
326                        "rescore_query_weight",
327                        "score_mode",
328                    ],
329                    "rescore.query",
330                )?),
331                None => None,
332            };
333            if let Some(rq) = inner_obj.and_then(|o| o.get("rescore_query")) {
334                let rescore_query: Box<dyn crate::query::Query> = Box::new(parse_query(rq)?);
335                let inner = inner_obj.expect("inner_obj checked above");
336                let query_weight = opt_f64(inner, "query_weight", "rescore.query")?
337                    .map(|v| v as f32)
338                    .unwrap_or(1.0);
339                let rescore_query_weight = opt_f64(inner, "rescore_query_weight", "rescore.query")?
340                    .map(|v| v as f32)
341                    .unwrap_or(1.0);
342                let score_mode = match opt_str(inner, "score_mode", "rescore.query")? {
343                    Some("multiply") => crate::search::RescoreScoreMode::Multiply,
344                    Some("avg") => crate::search::RescoreScoreMode::Avg,
345                    Some("max") => crate::search::RescoreScoreMode::Max,
346                    Some("min") => crate::search::RescoreScoreMode::Min,
347                    Some("total") | None => crate::search::RescoreScoreMode::Total,
348                    Some(other) => {
349                        return Err(crate::core::LuciError::InvalidQuery(format!(
350                            "rescore.query.score_mode: unknown value '{other}', expected \
351                             one of `total`, `multiply`, `avg`, `max`, `min`"
352                        )));
353                    }
354                };
355                expr.rescore = Some(RescoreSpec {
356                    query: rescore_query,
357                    window_size,
358                    query_weight,
359                    rescore_query_weight,
360                    score_mode,
361                });
362            }
363        }
364
365        Ok(expr)
366    }
367}