lucisearch 0.8.0

//! Search highlighting: re-analyze field text and report the byte spans of matched terms.
//!
//! Runs in the result construction phase (post-scoring, per-hit) following
//! the same materialization pattern as explain and fields retrieval.
//!
//! See [[feature-search-highlight]].

use std::collections::{HashMap, HashSet};

use crate::analysis::{Analyzer, AnalyzerRegistry};

use crate::query::ast::{ScoringExpression, SpanExpression};

// --- Configuration types ---

/// Parsed highlight request from the search JSON.
///
/// Produced by [`parse_highlight_config`] and consumed by
/// [`highlight_hit`].
///
/// See [[feature-search-highlight]].
#[derive(Clone, Debug)]
pub struct HighlightConfig {
    /// Fields to highlight, each with its own fragment settings.
    pub fields: Vec<HighlightFieldConfig>,
    /// When `true`, a field is highlighted only with the terms the query
    /// targeted at that same field (fields with no such terms are
    /// skipped). When `false`, the terms of every queried field are
    /// pooled and applied to each highlighted field.
    pub require_field_match: bool,
    /// Ordering handed to the fragment selector.
    pub order: HighlightOrder,
}

/// Per-field highlight settings.
///
/// `number_of_fragments` caps how many fragment windows are selected for
/// the field; only matches that fall inside a selected window are
/// returned. `number_of_fragments == 0` disables fragment selection
/// altogether: every match is emitted and `fragment_size` is not
/// consulted.
#[derive(Clone, Debug)]
pub struct HighlightFieldConfig {
    /// Field name; looked up as a top-level key of the hit's `_source`
    /// object.
    pub field: String,
    /// Width, in bytes, of the window laid around each match before its
    /// edges are snapped to nearby boundary characters.
    pub fragment_size: usize,
    /// Maximum number of fragment windows to keep. `0` means "emit every
    /// match".
    pub number_of_fragments: usize,
}

/// Ordering applied to the fragment windows chosen by the fragment
/// selector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum HighlightOrder {
    /// Return fragments in their order of appearance in the field text.
    None,
    /// Return fragments ordered by relevance score (best match first).
    /// A fragment's score is the number of matched tokens it contains.
    Score,
}

// --- Public output type ---

/// A single match span within a field's text.
///
/// The library returns match positions, not pre-rendered markup.
/// Consumers decide how to present matches (HTML tags, React nodes,
/// terminal colour codes, JSON, ...). See
/// [[feature-search-highlight]] and
/// [[architecture-scoring-materialization-separation]].
///
/// Invariant for spans built by this module: `text` equals the field
/// text sliced at `start..end`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Highlight {
    /// The substring of the field text that matched.
    pub text: String,
    /// Byte offset in the original field text where the match starts.
    pub start: usize,
    /// Byte offset where the match ends (exclusive).
    pub end: usize,
}

// --- Internal types ---

/// Position of one analyzed token whose text is among the query terms.
///
/// Both offsets are copied from the analyzer's token and are used as
/// byte indices into the field text.
#[derive(Clone, Debug)]
struct MatchedToken {
    /// Byte offset where the token starts.
    offset_from: usize,
    /// Byte offset where the token ends (exclusive).
    offset_to: usize,
}

// --- Constants ---

/// Characters treated as natural break points when snapping a fragment
/// edge (sentence punctuation and whitespace).
const BOUNDARY_CHARS: &[char] = &['.', ',', '!', '?', ' ', '\t', '\n'];
/// Maximum number of bytes scanned, in one direction, when looking for a
/// boundary character near a fragment edge.
const BOUNDARY_MAX_SCAN: usize = 20;

// --- Term extraction from query expression ---

/// Collect the terms a query can match, grouped by the field they target.
///
/// Entry point over the recursive [`collect_terms`] walker: it allocates
/// the result map, lets the walker fill it, and hands it back. Queries
/// that carry free text (match, match_phrase, match_bool_prefix,
/// multi_match) are run through the resolved search analyzer so the map
/// holds analyzed terms; term-level queries contribute their value
/// verbatim.
///
/// See [[feature-search-highlight]].
pub(crate) fn extract_query_terms(
    ast: &ScoringExpression,
    searcher: &crate::search::searcher::Searcher,
) -> HashMap<String, HashSet<String>> {
    let mut by_field = HashMap::new();
    collect_terms(ast, searcher, &mut by_field);
    by_field
}

/// Recursive worker behind [`extract_query_terms`].
///
/// Visits `ast` and records, per field, every term that should be
/// highlighted:
///
/// * term-level leaves (term, prefix, fuzzy, wildcard, terms) contribute
///   their value(s) verbatim;
/// * analyzed leaves (match, match_phrase, match_bool_prefix) contribute
///   the tokens produced by the field's resolved search analyzer;
/// * multi_match analyzes once, with the first field's analyzer, and
///   records the tokens under every listed field;
/// * compound queries recurse into their children, skipping negative
///   clauses (`must_not`, the non-positive side of boosting);
/// * leaves without extractable terms contribute nothing.
fn collect_terms(
    ast: &ScoringExpression,
    searcher: &crate::search::searcher::Searcher,
    terms: &mut HashMap<String, HashSet<String>>,
) {
    let analyzers = searcher.analyzers();
    match ast {
        // Term-level leaves: the value is stored exactly as written.
        ScoringExpression::Term { field, value, .. }
        | ScoringExpression::Prefix { field, value, .. }
        | ScoringExpression::Fuzzy { field, value, .. }
        | ScoringExpression::Wildcard { field, value, .. } => {
            terms
                .entry(field.clone())
                .or_default()
                .insert(value.clone());
        }
        ScoringExpression::Terms { field, values } => {
            terms
                .entry(field.clone())
                .or_default()
                .extend(values.iter().cloned());
        }
        // Analyzed leaves: tokenize the query text with the search analyzer.
        ScoringExpression::Match {
            field,
            query,
            analyzer,
        }
        | ScoringExpression::MatchPhrase {
            field,
            query,
            analyzer,
        }
        | ScoringExpression::MatchBoolPrefix {
            field,
            query,
            analyzer,
        } => {
            let name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
            let tokens = analyzers.get(name).analyze(query);
            terms
                .entry(field.clone())
                .or_default()
                .extend(tokens.into_iter().map(|token| token.text));
        }
        ScoringExpression::MultiMatch {
            fields,
            query,
            analyzer,
            ..
        } => {
            // For multi_match, use the first field's analyzer for all fields
            let name = match fields.first() {
                Some(first) => searcher.resolve_search_analyzer(first, analyzer.as_deref()),
                None => "standard",
            };
            let tokens = analyzers.get(name).analyze(query);
            for target in fields {
                terms
                    .entry(target.clone())
                    .or_default()
                    .extend(tokens.iter().map(|token| token.text.clone()));
            }
        }
        // Every span query is handled by the dedicated span walker.
        ScoringExpression::Span(span_ast) => collect_span_terms(span_ast, searcher, terms),
        // Compound queries: recurse into the positive children only.
        // `must_not` is deliberately left out; exclusion criteria are
        // not highlight targets.
        ScoringExpression::Bool {
            must,
            should,
            filter,
            ..
        } => {
            for clause in must.iter().chain(should).chain(filter) {
                collect_terms(clause, searcher, terms);
            }
        }
        ScoringExpression::DisMax { queries, .. } => {
            for alternative in queries {
                collect_terms(alternative, searcher, terms);
            }
        }
        ScoringExpression::ConstantScore { query, .. }
        | ScoringExpression::Boost { query, .. }
        | ScoringExpression::Nested { query, .. } => collect_terms(query, searcher, terms),
        ScoringExpression::ScriptScore { query, .. }
        | ScoringExpression::FunctionScore { query, .. } => collect_terms(query, searcher, terms),
        ScoringExpression::Boosting { positive, .. } => collect_terms(positive, searcher, terms),
        // Leaves that carry nothing to highlight.
        ScoringExpression::Exists { .. }
        | ScoringExpression::Range { .. }
        | ScoringExpression::GeoDistance { .. }
        | ScoringExpression::GeoBoundingBox { .. }
        | ScoringExpression::GeoShape { .. }
        | ScoringExpression::Regexp { .. }
        | ScoringExpression::Knn { .. }
        | ScoringExpression::MatchAll
        | ScoringExpression::MatchNone => {}
    }
}

/// Span-query counterpart of [`collect_terms`].
///
/// Records the literal terms of `span_term` and `span_near` under their
/// field, and recurses into the wrapped query of `span_first` and the
/// `include` side of `span_not` (the excluded side is a negation and is
/// therefore not highlighted). `searcher` is only threaded through the
/// recursion.
fn collect_span_terms(
    ast: &SpanExpression,
    searcher: &crate::search::searcher::Searcher,
    terms: &mut HashMap<String, HashSet<String>>,
) {
    match ast {
        SpanExpression::SpanTerm { field, value } => {
            let bucket = terms.entry(field.clone()).or_default();
            bucket.insert(value.clone());
        }
        SpanExpression::SpanNear {
            field,
            terms: near_terms,
            ..
        } => {
            let bucket = terms.entry(field.clone()).or_default();
            bucket.extend(near_terms.iter().cloned());
        }
        // Only the included side can produce highlight targets.
        SpanExpression::SpanNot { include, .. } => collect_span_terms(include, searcher, terms),
        SpanExpression::SpanFirst { query, .. } => collect_span_terms(query, searcher, terms),
    }
}

// --- Token matching ---

/// Tokenize `text` with `analyzer` and keep the positions of tokens whose
/// text is one of `query_terms`.
///
/// Matching is exact equality on the analyzed token text. The returned
/// positions follow the order in which the analyzer emitted the tokens.
fn find_matching_tokens(
    text: &str,
    query_terms: &HashSet<String>,
    analyzer: &Analyzer,
) -> Vec<MatchedToken> {
    let mut hits = Vec::new();
    for token in analyzer.analyze(text) {
        if query_terms.contains(&token.text) {
            hits.push(MatchedToken {
                offset_from: token.offset_from,
                offset_to: token.offset_to,
            });
        }
    }
    hits
}

// --- Fragment selection ---

/// Pick up to `number_of_fragments` non-overlapping windows of `text`.
///
/// One candidate window is built per match: `fragment_size` bytes laid
/// around the match's midpoint, with both edges snapped to a nearby
/// boundary character. A window's score is the number of matches lying
/// entirely inside it. Candidates are taken greedily, best score first
/// (earlier start wins ties), skipping any that overlap an already
/// chosen window.
///
/// Returns `(start, end, score)` triples, ordered by descending score for
/// [`HighlightOrder::Score`] or by start offset for
/// [`HighlightOrder::None`]. With no matches the result is empty; with
/// `number_of_fragments == 0` the whole text is returned as a single
/// window with score `1.0`.
fn select_fragments(
    text: &str,
    matches: &[MatchedToken],
    fragment_size: usize,
    number_of_fragments: usize,
    order: HighlightOrder,
) -> Vec<(usize, usize, f32)> {
    if matches.is_empty() {
        return Vec::new();
    }
    if number_of_fragments == 0 {
        return vec![(0, text.len(), 1.0)];
    }

    // One candidate window per match, centred on that match.
    let half_width = fragment_size / 2;
    let mut windows: Vec<(usize, usize, f32)> = matches
        .iter()
        .map(|anchor| {
            let midpoint = (anchor.offset_from + anchor.offset_to) / 2;
            let unsnapped_lo = midpoint.saturating_sub(half_width);
            let unsnapped_hi = text.len().min(unsnapped_lo + fragment_size);

            let lo = snap_to_boundary(text, unsnapped_lo, true);
            let hi = snap_to_boundary(text, unsnapped_hi, false);

            // Score = how many matches sit fully inside the window.
            let contained = matches
                .iter()
                .filter(|t| lo <= t.offset_from && t.offset_to <= hi)
                .count();

            (lo, hi, contained as f32)
        })
        .collect();

    // Best score first; earlier windows win ties.
    windows.sort_by(|x, y| match y.2.partial_cmp(&x.2) {
        Some(std::cmp::Ordering::Equal) | None => x.0.cmp(&y.0),
        Some(decided) => decided,
    });

    // Greedy pick: accept a window only if it is disjoint from all
    // previously accepted ones.
    let mut picked: Vec<(usize, usize, f32)> = Vec::new();
    for window in windows {
        if picked.len() >= number_of_fragments {
            break;
        }
        let (lo, hi, _) = window;
        let disjoint = picked.iter().all(|&(s, e, _)| lo >= e || hi <= s);
        if disjoint {
            picked.push(window);
        }
    }

    // Final presentation order.
    match order {
        HighlightOrder::None => picked.sort_by_key(|w| w.0),
        HighlightOrder::Score => picked.sort_by(|x, y| {
            y.2.partial_cmp(&x.2).unwrap_or(std::cmp::Ordering::Equal)
        }),
    }

    picked
}

/// Move a fragment edge to the closest boundary character, if one is near.
///
/// With `snap_left` the scan walks backwards from `offset` and returns
/// the position just after the first boundary character found; otherwise
/// it walks forwards and returns the position of the boundary character
/// itself. At most `BOUNDARY_MAX_SCAN` bytes are inspected, and bytes
/// that are not on a UTF-8 character boundary are skipped.
///
/// `offset` is returned unchanged when it is `0`, at or past the end of
/// `text`, or when no boundary character lies within range. In that last
/// case the returned offset is not guaranteed to be on a character
/// boundary.
fn snap_to_boundary(text: &str, offset: usize, snap_left: bool) -> usize {
    let len = text.len();
    if offset == 0 || offset >= len {
        return offset;
    }

    // The boundary character starting at `idx`, if there is one.
    let boundary_at = |idx: usize| -> Option<char> {
        if !text.is_char_boundary(idx) {
            return None;
        }
        text[idx..]
            .chars()
            .next()
            .filter(|c| BOUNDARY_CHARS.contains(c))
    };

    let snapped = if snap_left {
        let lowest = offset.saturating_sub(BOUNDARY_MAX_SCAN);
        (lowest..offset)
            .rev()
            .find_map(|idx| boundary_at(idx).map(|c| idx + c.len_utf8()))
    } else {
        let highest = (offset + BOUNDARY_MAX_SCAN).min(len);
        (offset..highest).find(|&idx| boundary_at(idx).is_some())
    };

    snapped.unwrap_or(offset)
}

// --- Top-level highlight function ---

/// Generate match spans for a single search hit.
///
/// For every configured field this reads the field's text from
/// `source`, re-analyses it with the field's index-time analyser (taken
/// from `mapping`, falling back to `"standard"`), keeps the tokens that
/// are query terms, and emits their positions as [`Highlight`] spans. No
/// rendering/tagging happens here; consumers decide presentation.
///
/// Which terms apply to a field depends on `config.require_field_match`:
/// when `true`, only the terms recorded for that field in `query_terms`
/// are used and fields with no entry are skipped; when `false`, the
/// terms of all fields are pooled and applied to every configured field.
///
/// The optional fragment-selection step (when
/// `number_of_fragments > 0`) caps output to spans within the top-N
/// fragment windows by score; otherwise all matches are returned in
/// positional order.
///
/// Fields are skipped when they are absent from `source`, are not a JSON
/// string, are empty, or contain no matching token.
///
/// Returns `None` when `source` is not a JSON object or when no field
/// produced any span; otherwise a map from field name to its spans.
///
/// See [[feature-search-highlight]].
pub fn highlight_hit(
    source: &serde_json::Value,
    config: &HighlightConfig,
    query_terms: &HashMap<String, HashSet<String>>,
    analyzers: &AnalyzerRegistry,
    mapping: Option<&crate::mapping::Mapping>,
) -> Option<HashMap<String, Vec<Highlight>>> {
    let obj = source.as_object()?;

    // The pooled term set does not depend on the field being processed,
    // so build it once instead of re-cloning every term per field.
    let pooled_terms: Option<HashSet<String>> = if config.require_field_match {
        None
    } else {
        Some(query_terms.values().flatten().cloned().collect())
    };

    let mut out: HashMap<String, Vec<Highlight>> = HashMap::new();

    for field_config in &config.fields {
        let field_name = &field_config.field;

        let effective_terms: &HashSet<String> = match &pooled_terms {
            Some(pooled) => pooled,
            None => match query_terms.get(field_name) {
                Some(own) => own,
                None => continue,
            },
        };

        if effective_terms.is_empty() {
            continue;
        }

        let text = match obj.get(field_name) {
            Some(serde_json::Value::String(s)) if !s.is_empty() => s.as_str(),
            _ => continue,
        };

        // For highlighting, tokenise with the field's index-time analyser.
        // Unmapped fields and fields without an explicit analyser use
        // "standard".
        let field_analyzer_name = mapping
            .and_then(|m| {
                let fid = m.field_id(field_name)?;
                m.field(fid).analyzer.as_deref()
            })
            .unwrap_or("standard");
        let analyzer = analyzers.get(field_analyzer_name);

        let matches = find_matching_tokens(text, effective_terms, analyzer);
        if matches.is_empty() {
            continue;
        }

        let spans = spans_for_field(text, &matches, field_config, config.order);
        if !spans.is_empty() {
            out.insert(field_name.clone(), spans);
        }
    }

    if out.is_empty() { None } else { Some(out) }
}

/// Turn a field's raw matches into [`Highlight`] spans, honouring the
/// field's fragment settings.
///
/// * `number_of_fragments == 0`: every match becomes a span.
/// * otherwise: the fragment selector chooses the windows and only the
///   matches lying entirely inside one of them are kept; spans with
///   identical `(start, end)` that end up adjacent are collapsed.
///
/// In both cases the result is sorted by start offset, so the returned
/// spans are positional regardless of `order` (which only influences the
/// order in which the selector reports its windows).
fn spans_for_field(
    text: &str,
    matches: &[MatchedToken],
    field_config: &HighlightFieldConfig,
    order: HighlightOrder,
) -> Vec<Highlight> {
    let to_span = |m: &MatchedToken| span_from_match(text, m);

    // Fragment selection disabled: emit everything.
    if field_config.number_of_fragments == 0 {
        let mut everything: Vec<Highlight> = matches.iter().map(to_span).collect();
        everything.sort_by_key(|span| span.start);
        return everything;
    }

    let windows = select_fragments(
        text,
        matches,
        field_config.fragment_size,
        field_config.number_of_fragments,
        order,
    );

    // Window by window, keep the matches that fit entirely inside.
    let mut kept: Vec<Highlight> = Vec::new();
    for &(lo, hi, _) in &windows {
        kept.extend(
            matches
                .iter()
                .filter(|m| lo <= m.offset_from && m.offset_to <= hi)
                .map(to_span),
        );
    }

    kept.sort_by_key(|span| span.start);
    kept.dedup_by_key(|span| (span.start, span.end));
    kept
}

/// Build the public [`Highlight`] for one matched token by slicing the
/// matched bytes out of `text`.
///
/// Panics if the token's offsets do not form a valid, in-bounds,
/// char-aligned byte range of `text`.
fn span_from_match(text: &str, m: &MatchedToken) -> Highlight {
    let (start, end) = (m.offset_from, m.offset_to);
    Highlight {
        text: text[start..end].to_owned(),
        start,
        end,
    }
}

// --- Request parsing ---

/// Parse a highlight config from the ES-shape search-request JSON.
///
/// Ignores ``pre_tags`` / ``post_tags`` / ``no_match_size`` / ``encoder``
/// keys: those are rendering concerns and no longer affect output. Kept
/// here for ES copy-paste compatibility — see
/// [[fix-strict-search-parsing]].
pub fn parse_highlight_config(json: &serde_json::Value) -> HighlightConfig {
    let require_field_match = json
        .get("require_field_match")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);

    let order = match json.get("order").and_then(|v| v.as_str()) {
        Some("score") => HighlightOrder::Score,
        _ => HighlightOrder::None,
    };

    let fields = json
        .get("fields")
        .and_then(|v| v.as_object())
        .map(|obj| {
            obj.iter()
                .map(|(name, field_json)| {
                    let fragment_size = field_json
                        .get("fragment_size")
                        .and_then(|v| v.as_u64())
                        .map(|v| v as usize)
                        .unwrap_or(100);
                    let number_of_fragments = field_json
                        .get("number_of_fragments")
                        .and_then(|v| v.as_u64())
                        .map(|v| v as usize)
                        .unwrap_or(5);
                    HighlightFieldConfig {
                        field: name.clone(),
                        fragment_size,
                        number_of_fragments,
                    }
                })
                .collect()
        })
        .unwrap_or_default();

    HighlightConfig {
        fields,
        require_field_match,
        order,
    }
}