Skip to main content

luci/search/
highlight.rs

1//! Search highlighting: re-analyze field text and wrap matched terms in tags.
2//!
3//! Runs in the result construction phase (post-scoring, per-hit) following
4//! the same materialization pattern as explain and fields retrieval.
5//!
6//! See [[feature-search-highlight]].
7
8use std::collections::{HashMap, HashSet};
9
10use crate::analysis::{Analyzer, AnalyzerRegistry};
11
12use crate::query::ast::{ScoringExpression, SpanExpression};
13
14// --- Configuration types ---
15
16/// Parsed highlight request from the search JSON.
17///
18/// See [[feature-search-highlight]].
19#[derive(Clone, Debug)]
20pub struct HighlightConfig {
21    pub fields: Vec<HighlightFieldConfig>,
22    pub require_field_match: bool,
23    pub order: HighlightOrder,
24}
25
26/// Per-field highlight settings.
27///
28/// `fragment_size` / `number_of_fragments` cap the number of spans
29/// returned for very long fields. `0` for both means "emit every
30/// match".
31#[derive(Clone, Debug)]
32pub struct HighlightFieldConfig {
33    pub field: String,
34    pub fragment_size: usize,
35    pub number_of_fragments: usize,
36}
37
38#[derive(Clone, Copy, Debug, PartialEq)]
39pub enum HighlightOrder {
40    /// Return fragments in their order of appearance in the field text.
41    None,
42    /// Return fragments ordered by relevance score (best match first).
43    Score,
44}
45
46// --- Public output type ---
47
48/// A single match span within a field's text.
49///
50/// The library returns match positions, not pre-rendered markup.
51/// Consumers decide how to present matches (HTML tags, React nodes,
52/// terminal colour codes, JSON, ...). See
53/// [[feature-search-highlight]] and
54/// [[architecture-scoring-materialization-separation]].
55#[derive(Clone, Debug, PartialEq, Eq)]
56pub struct Highlight {
57    /// The substring of the field text that matched.
58    pub text: String,
59    /// Byte offset in the original field text where the match starts.
60    pub start: usize,
61    /// Byte offset where the match ends (exclusive).
62    pub end: usize,
63}
64
65// --- Internal types ---
66
67#[derive(Clone, Debug)]
68struct MatchedToken {
69    offset_from: usize,
70    offset_to: usize,
71}
72
73// --- Constants ---
74
75const BOUNDARY_CHARS: &[char] = &['.', ',', '!', '?', ' ', '\t', '\n'];
76const BOUNDARY_MAX_SCAN: usize = 20;
77
78// --- Term extraction from query expression ---
79
80/// Extract query terms from the expression tree, keyed by field name.
81///
82/// Walks the expression recursively. For text-analyzed queries (Match,
83/// MatchPhrase), the query text is re-analyzed to get the actual
84/// indexed terms.
85///
86/// See [[feature-search-highlight]].
87pub(crate) fn extract_query_terms(
88    ast: &ScoringExpression,
89    searcher: &crate::search::searcher::Searcher,
90) -> HashMap<String, HashSet<String>> {
91    let mut terms: HashMap<String, HashSet<String>> = HashMap::new();
92    collect_terms(ast, searcher, &mut terms);
93    terms
94}
95
96fn collect_terms(
97    ast: &ScoringExpression,
98    searcher: &crate::search::searcher::Searcher,
99    terms: &mut HashMap<String, HashSet<String>>,
100) {
101    let analyzers = searcher.analyzers();
102    match ast {
103        ScoringExpression::Term { field, value } => {
104            terms
105                .entry(field.clone())
106                .or_default()
107                .insert(value.clone());
108        }
109        ScoringExpression::Terms { field, values } => {
110            let set = terms.entry(field.clone()).or_default();
111            for v in values {
112                set.insert(v.clone());
113            }
114        }
115        ScoringExpression::Match {
116            field,
117            query,
118            analyzer,
119        } => {
120            let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
121            let a = analyzers.get(analyzer_name);
122            let tokens = a.analyze(query);
123            let set = terms.entry(field.clone()).or_default();
124            for token in tokens {
125                set.insert(token.text);
126            }
127        }
128        ScoringExpression::MatchPhrase {
129            field,
130            query,
131            analyzer,
132        } => {
133            let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
134            let a = analyzers.get(analyzer_name);
135            let tokens = a.analyze(query);
136            let set = terms.entry(field.clone()).or_default();
137            for token in tokens {
138                set.insert(token.text);
139            }
140        }
141        ScoringExpression::MatchBoolPrefix {
142            field,
143            query,
144            analyzer,
145        } => {
146            let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
147            let a = analyzers.get(analyzer_name);
148            let tokens = a.analyze(query);
149            let set = terms.entry(field.clone()).or_default();
150            for token in tokens {
151                set.insert(token.text);
152            }
153        }
154        ScoringExpression::Prefix { field, value } => {
155            terms
156                .entry(field.clone())
157                .or_default()
158                .insert(value.clone());
159        }
160        ScoringExpression::Fuzzy { field, value, .. } => {
161            terms
162                .entry(field.clone())
163                .or_default()
164                .insert(value.clone());
165        }
166        ScoringExpression::Wildcard { field, value } => {
167            terms
168                .entry(field.clone())
169                .or_default()
170                .insert(value.clone());
171        }
172        ScoringExpression::MultiMatch {
173            fields,
174            query,
175            analyzer,
176            ..
177        } => {
178            // For multi_match, use the first field's analyzer for all fields
179            let analyzer_name = if let Some(f) = fields.first() {
180                searcher.resolve_search_analyzer(f, analyzer.as_deref())
181            } else {
182                "standard"
183            };
184            let a = analyzers.get(analyzer_name);
185            let tokens = a.analyze(query);
186            for f in fields {
187                let set = terms.entry(f.clone()).or_default();
188                for token in &tokens {
189                    set.insert(token.text.clone());
190                }
191            }
192        }
193        // All span queries delegate to the SpanExpression walker —
194        // single arm, one concept.
195        ScoringExpression::Span(span_ast) => {
196            collect_span_terms(span_ast, searcher, terms);
197        }
198        // Recursive cases — descend into sub-queries
199        ScoringExpression::Bool {
200            must,
201            should,
202            must_not: _,
203            filter,
204            ..
205        } => {
206            for q in must.iter().chain(should).chain(filter) {
207                collect_terms(q, searcher, terms);
208            }
209            // must_not terms are intentionally excluded — they are
210            // exclusion criteria, not highlight targets.
211        }
212        ScoringExpression::DisMax { queries, .. } => {
213            for q in queries {
214                collect_terms(q, searcher, terms);
215            }
216        }
217        ScoringExpression::ConstantScore { query, .. }
218        | ScoringExpression::Boost { query, .. }
219        | ScoringExpression::Nested { query, .. } => {
220            collect_terms(query, searcher, terms);
221        }
222        ScoringExpression::ScriptScore { query, .. }
223        | ScoringExpression::FunctionScore { query, .. } => {
224            collect_terms(query, searcher, terms);
225        }
226        ScoringExpression::Boosting { positive, .. } => {
227            collect_terms(positive, searcher, terms);
228        }
229        // Leaf queries with no terms to extract
230        ScoringExpression::Exists { .. }
231        | ScoringExpression::Range { .. }
232        | ScoringExpression::GeoDistance { .. }
233        | ScoringExpression::GeoBoundingBox { .. }
234        | ScoringExpression::GeoShape { .. }
235        | ScoringExpression::Regexp { .. }
236        | ScoringExpression::Knn { .. }
237        | ScoringExpression::MatchAll
238        | ScoringExpression::MatchNone => {}
239    }
240}
241
242/// Walk a span AST and collect terms for highlighting. Mirrors
243/// [`collect_terms`] but operates on the span-typed subset.
244fn collect_span_terms(
245    ast: &SpanExpression,
246    searcher: &crate::search::searcher::Searcher,
247    terms: &mut HashMap<String, HashSet<String>>,
248) {
249    let _ = searcher;
250    match ast {
251        SpanExpression::SpanTerm { field, value } => {
252            terms
253                .entry(field.clone())
254                .or_default()
255                .insert(value.clone());
256        }
257        SpanExpression::SpanNear {
258            field,
259            terms: near_terms,
260            ..
261        } => {
262            let set = terms.entry(field.clone()).or_default();
263            for v in near_terms {
264                set.insert(v.clone());
265            }
266        }
267        SpanExpression::SpanNot { include, .. } => {
268            // Exclude side is negation, not highlight-worthy.
269            collect_span_terms(include, searcher, terms);
270        }
271        SpanExpression::SpanFirst { query, .. } => {
272            collect_span_terms(query, searcher, terms);
273        }
274    }
275}
276
277// --- Token matching ---
278
279/// Re-analyze field text and find matching token positions.
280fn find_matching_tokens(
281    text: &str,
282    query_terms: &HashSet<String>,
283    analyzer: &Analyzer,
284) -> Vec<MatchedToken> {
285    let tokens = analyzer.analyze(text);
286    tokens
287        .into_iter()
288        .filter(|token| query_terms.contains(&token.text))
289        .map(|token| MatchedToken {
290            offset_from: token.offset_from,
291            offset_to: token.offset_to,
292        })
293        .collect()
294}
295
296// --- Fragment selection ---
297
298/// Select the best fragments from the field text.
299///
300/// Uses a greedy approach: score each candidate window by summing
301/// the weights of matched terms it contains, then greedily pick
302/// the top non-overlapping fragments.
303fn select_fragments(
304    text: &str,
305    matches: &[MatchedToken],
306    fragment_size: usize,
307    number_of_fragments: usize,
308    order: HighlightOrder,
309) -> Vec<(usize, usize, f32)> {
310    if matches.is_empty() {
311        return Vec::new();
312    }
313    if number_of_fragments == 0 {
314        return vec![(0, text.len(), 1.0)];
315    }
316
317    // Build candidate windows centered on each match
318    let mut candidates: Vec<(usize, usize, f32)> = Vec::new();
319
320    for m in matches {
321        let center = (m.offset_from + m.offset_to) / 2;
322        let half = fragment_size / 2;
323        let raw_start = center.saturating_sub(half);
324        let raw_end = (raw_start + fragment_size).min(text.len());
325
326        let start = snap_to_boundary(text, raw_start, true);
327        let end = snap_to_boundary(text, raw_end, false);
328
329        // Score: count matching tokens in this window
330        let score: f32 = matches
331            .iter()
332            .filter(|t| t.offset_from >= start && t.offset_to <= end)
333            .count() as f32;
334
335        candidates.push((start, end, score));
336    }
337
338    // Sort by score descending, then by position
339    candidates.sort_by(|a, b| {
340        b.2.partial_cmp(&a.2)
341            .unwrap_or(std::cmp::Ordering::Equal)
342            .then_with(|| a.0.cmp(&b.0))
343    });
344
345    // Greedily select non-overlapping fragments
346    let mut selected: Vec<(usize, usize, f32)> = Vec::new();
347    for (start, end, score) in candidates {
348        if selected.len() >= number_of_fragments {
349            break;
350        }
351        let overlaps = selected.iter().any(|(s, e, _)| start < *e && end > *s);
352        if !overlaps {
353            selected.push((start, end, score));
354        }
355    }
356
357    // Order: by score or by position
358    match order {
359        HighlightOrder::Score => {
360            selected.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
361        }
362        HighlightOrder::None => {
363            selected.sort_by_key(|f| f.0);
364        }
365    }
366
367    selected
368}
369
370/// Snap a byte offset to a nearby boundary character.
371fn snap_to_boundary(text: &str, offset: usize, snap_left: bool) -> usize {
372    if offset == 0 || offset >= text.len() {
373        return offset;
374    }
375    let scan_range = if snap_left {
376        offset.saturating_sub(BOUNDARY_MAX_SCAN)..offset
377    } else {
378        offset..(offset + BOUNDARY_MAX_SCAN).min(text.len())
379    };
380
381    if snap_left {
382        for i in (scan_range.start..scan_range.end).rev() {
383            if text.is_char_boundary(i) {
384                if let Some(c) = text[i..].chars().next() {
385                    if BOUNDARY_CHARS.contains(&c) {
386                        return i + c.len_utf8();
387                    }
388                }
389            }
390        }
391    } else {
392        for i in scan_range {
393            if text.is_char_boundary(i) {
394                if let Some(c) = text[i..].chars().next() {
395                    if BOUNDARY_CHARS.contains(&c) {
396                        return i;
397                    }
398                }
399            }
400        }
401    }
402
403    offset
404}
405
406// --- Top-level highlight function ---
407
408/// Generate match spans for a single search hit.
409///
410/// Reads field text from `_source`, re-analyses with the field's
411/// index-time analyser, matches terms, and emits the resulting
412/// positions as [`Highlight`] spans. No rendering/tagging happens
413/// here — consumers decide presentation.
414///
415/// The optional fragment-selection step (when
416/// `number_of_fragments > 0`) caps output to spans within the top-N
417/// fragment windows by score; otherwise all matches are returned in
418/// positional order.
419///
420/// See [[feature-search-highlight]].
421pub fn highlight_hit(
422    source: &serde_json::Value,
423    config: &HighlightConfig,
424    query_terms: &HashMap<String, HashSet<String>>,
425    analyzers: &AnalyzerRegistry,
426    mapping: Option<&crate::mapping::Mapping>,
427) -> Option<HashMap<String, Vec<Highlight>>> {
428    let obj = source.as_object()?;
429    let mut out: HashMap<String, Vec<Highlight>> = HashMap::new();
430
431    for field_config in &config.fields {
432        let field_name = &field_config.field;
433
434        let all_terms: HashSet<String>;
435        let effective_terms = if config.require_field_match {
436            match query_terms.get(field_name) {
437                Some(t) => t,
438                None => continue,
439            }
440        } else {
441            all_terms = query_terms.values().flatten().cloned().collect();
442            &all_terms
443        };
444
445        if effective_terms.is_empty() {
446            continue;
447        }
448
449        let text = match obj.get(field_name) {
450            Some(serde_json::Value::String(s)) if !s.is_empty() => s.as_str(),
451            _ => continue,
452        };
453
454        // For highlighting, tokenise with the field's index-time analyser.
455        let field_analyzer_name = mapping
456            .and_then(|m| m.field_id(field_name))
457            .and_then(|fid| mapping.unwrap().field(fid).analyzer.as_deref())
458            .unwrap_or("standard");
459        let analyzer = analyzers.get(field_analyzer_name);
460
461        let matches = find_matching_tokens(text, effective_terms, analyzer);
462        if matches.is_empty() {
463            continue;
464        }
465
466        let spans = spans_for_field(text, &matches, field_config, config.order);
467        if !spans.is_empty() {
468            out.insert(field_name.clone(), spans);
469        }
470    }
471
472    if out.is_empty() { None } else { Some(out) }
473}
474
475/// Convert raw matches into `Highlight` spans, optionally capped by the
476/// field config's fragment-selection knobs.
477///
478/// When `number_of_fragments == 0`, returns every match in positional
479/// order. Otherwise runs the fragment selector and retains only the
480/// matches that fall inside the selected windows.
481fn spans_for_field(
482    text: &str,
483    matches: &[MatchedToken],
484    field_config: &HighlightFieldConfig,
485    order: HighlightOrder,
486) -> Vec<Highlight> {
487    if field_config.number_of_fragments == 0 {
488        let mut spans: Vec<Highlight> = matches.iter().map(|m| span_from_match(text, m)).collect();
489        spans.sort_by_key(|s| s.start);
490        return spans;
491    }
492
493    let fragments = select_fragments(
494        text,
495        matches,
496        field_config.fragment_size,
497        field_config.number_of_fragments,
498        order,
499    );
500
501    let mut spans: Vec<Highlight> = Vec::new();
502    for (fstart, fend, _score) in &fragments {
503        for m in matches {
504            if m.offset_from >= *fstart && m.offset_to <= *fend {
505                spans.push(span_from_match(text, m));
506            }
507        }
508    }
509    spans.sort_by_key(|s| s.start);
510    spans.dedup_by_key(|s| (s.start, s.end));
511    spans
512}
513
514fn span_from_match(text: &str, m: &MatchedToken) -> Highlight {
515    Highlight {
516        text: text[m.offset_from..m.offset_to].to_string(),
517        start: m.offset_from,
518        end: m.offset_to,
519    }
520}
521
522// --- Request parsing ---
523
524/// Parse a highlight config from the ES-shape search-request JSON.
525///
526/// Ignores ``pre_tags`` / ``post_tags`` / ``no_match_size`` / ``encoder``
527/// keys: those are rendering concerns and no longer affect output. Kept
528/// here for ES copy-paste compatibility — see
529/// [[fix-strict-search-parsing]].
530pub fn parse_highlight_config(json: &serde_json::Value) -> HighlightConfig {
531    let require_field_match = json
532        .get("require_field_match")
533        .and_then(|v| v.as_bool())
534        .unwrap_or(true);
535
536    let order = match json.get("order").and_then(|v| v.as_str()) {
537        Some("score") => HighlightOrder::Score,
538        _ => HighlightOrder::None,
539    };
540
541    let fields = json
542        .get("fields")
543        .and_then(|v| v.as_object())
544        .map(|obj| {
545            obj.iter()
546                .map(|(name, field_json)| {
547                    let fragment_size = field_json
548                        .get("fragment_size")
549                        .and_then(|v| v.as_u64())
550                        .map(|v| v as usize)
551                        .unwrap_or(100);
552                    let number_of_fragments = field_json
553                        .get("number_of_fragments")
554                        .and_then(|v| v.as_u64())
555                        .map(|v| v as usize)
556                        .unwrap_or(5);
557                    HighlightFieldConfig {
558                        field: name.clone(),
559                        fragment_size,
560                        number_of_fragments,
561                    }
562                })
563                .collect()
564        })
565        .unwrap_or_default();
566
567    HighlightConfig {
568        fields,
569        require_field_match,
570        order,
571    }
572}