use std::collections::{HashMap, HashSet};
use crate::analysis::{Analyzer, AnalyzerRegistry};
use crate::query::ast::{ScoringExpression, SpanExpression};
/// Highlighting options that apply to a whole request.
#[derive(Clone, Debug)]
pub struct HighlightConfig {
/// Per-field settings; `highlight_hit` only looks at the fields listed here.
pub fields: Vec<HighlightFieldConfig>,
/// When `true`, a field is highlighted only with the query terms collected for that same
/// field. When `false`, the terms of every queried field are pooled and used for each field.
pub require_field_match: bool,
/// Ordering passed to `select_fragments` for the chosen fragments.
pub order: HighlightOrder,
}
/// Highlighting settings for a single document field.
#[derive(Clone, Debug)]
pub struct HighlightFieldConfig {
/// Key of the field in the hit's source object.
pub field: String,
/// Target fragment width, measured in the same byte offsets as the token offsets.
pub fragment_size: usize,
/// Maximum number of fragments to select; `0` means "highlight every match in the field"
/// (see `spans_for_field`).
pub number_of_fragments: usize,
}
/// How `select_fragments` orders the fragments it returns.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum HighlightOrder {
/// Fragments are ordered by their start offset in the text.
None,
/// Fragments are ordered by descending score (number of matches they contain).
Score,
}
/// One highlighted span inside a field's text.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Highlight {
/// The matched slice of the field text, i.e. `field_text[start..end]`.
pub text: String,
/// Byte offset where the match begins.
pub start: usize,
/// Byte offset one past the end of the match.
pub end: usize,
}
/// Offsets of an analyzed token whose text equals one of the query terms.
#[derive(Clone, Debug)]
struct MatchedToken {
// Copied from the analyzer token's `offset_from`; used as a byte index into the field text.
offset_from: usize,
// Copied from the analyzer token's `offset_to`; exclusive end of the slice.
offset_to: usize,
}
/// Characters that `snap_to_boundary` accepts as fragment boundaries.
const BOUNDARY_CHARS: &[char] = &['.', ',', '!', '?', ' ', '\t', '\n'];
/// Maximum number of bytes `snap_to_boundary` scans away from the requested offset.
const BOUNDARY_MAX_SCAN: usize = 20;
/// Gathers the terms referenced by a query, grouped by the field they target.
///
/// Delegates the traversal of `ast` to `collect_terms` and returns the resulting map from
/// field name to the set of term strings recorded for that field.
pub(crate) fn extract_query_terms(
    ast: &ScoringExpression,
    searcher: &crate::search::searcher::Searcher,
) -> HashMap<String, HashSet<String>> {
    let mut by_field = HashMap::new();
    collect_terms(ast, searcher, &mut by_field);
    by_field
}
/// Recursively walks a scoring expression and records, per field, the terms that should be
/// highlighted.
///
/// * `Term`, `Prefix`, `Fuzzy`, `Wildcard` and `Terms` contribute their values verbatim.
/// * `Match`, `MatchPhrase`, `MatchBoolPrefix` and `MultiMatch` run the query text through the
///   search analyzer resolved for the field and contribute the resulting token texts.
/// * Compound queries recurse into their sub-queries. `must_not` clauses of `Bool` and anything
///   other than `positive` in `Boosting` are not visited.
/// * The remaining variants contribute nothing.
fn collect_terms(
    ast: &ScoringExpression,
    searcher: &crate::search::searcher::Searcher,
    terms: &mut HashMap<String, HashSet<String>>,
) {
    let analyzers = searcher.analyzers();
    match ast {
        // Single-value leaves: the raw value is stored as-is, without analysis.
        ScoringExpression::Term { field, value }
        | ScoringExpression::Prefix { field, value }
        | ScoringExpression::Fuzzy { field, value, .. }
        | ScoringExpression::Wildcard { field, value } => {
            terms
                .entry(field.clone())
                .or_default()
                .insert(value.clone());
        }
        ScoringExpression::Terms { field, values } => {
            terms
                .entry(field.clone())
                .or_default()
                .extend(values.iter().cloned());
        }
        // Analyzed single-field queries share one implementation.
        ScoringExpression::Match {
            field,
            query,
            analyzer,
        }
        | ScoringExpression::MatchPhrase {
            field,
            query,
            analyzer,
        }
        | ScoringExpression::MatchBoolPrefix {
            field,
            query,
            analyzer,
        } => {
            let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
            let tokens = analyzers.get(analyzer_name).analyze(query);
            terms
                .entry(field.clone())
                .or_default()
                .extend(tokens.into_iter().map(|token| token.text));
        }
        ScoringExpression::MultiMatch {
            fields,
            query,
            analyzer,
            ..
        } => {
            // Resolve the analyzer per field, exactly like `Match` does. Reusing the first
            // field's analyzer for every field would record tokens that can never equal the
            // tokens of a field analyzed differently.
            for f in fields {
                let analyzer_name = searcher.resolve_search_analyzer(f, analyzer.as_deref());
                let tokens = analyzers.get(analyzer_name).analyze(query);
                terms
                    .entry(f.clone())
                    .or_default()
                    .extend(tokens.into_iter().map(|token| token.text));
            }
        }
        ScoringExpression::Span(span_ast) => {
            collect_span_terms(span_ast, searcher, terms);
        }
        ScoringExpression::Bool {
            must,
            should,
            must_not: _,
            filter,
            ..
        } => {
            // `must_not` is deliberately skipped: its terms are not collected for highlighting.
            for q in must.iter().chain(should).chain(filter) {
                collect_terms(q, searcher, terms);
            }
        }
        ScoringExpression::DisMax { queries, .. } => {
            for q in queries {
                collect_terms(q, searcher, terms);
            }
        }
        ScoringExpression::ConstantScore { query, .. }
        | ScoringExpression::Boost { query, .. }
        | ScoringExpression::Nested { query, .. } => {
            collect_terms(query, searcher, terms);
        }
        ScoringExpression::ScriptScore { query, .. }
        | ScoringExpression::FunctionScore { query, .. } => {
            collect_terms(query, searcher, terms);
        }
        ScoringExpression::Boosting { positive, .. } => {
            collect_terms(positive, searcher, terms);
        }
        // No terms are collected for these variants.
        ScoringExpression::Exists { .. }
        | ScoringExpression::Range { .. }
        | ScoringExpression::GeoDistance { .. }
        | ScoringExpression::GeoBoundingBox { .. }
        | ScoringExpression::GeoShape { .. }
        | ScoringExpression::Regexp { .. }
        | ScoringExpression::Knn { .. }
        | ScoringExpression::MatchAll
        | ScoringExpression::MatchNone => {}
    }
}
/// Records the terms of a span query into `terms`, keyed by field.
///
/// `SpanTerm` contributes its single value and `SpanNear` contributes every listed term, all
/// verbatim. `SpanNot` and `SpanFirst` are wrappers: only their `include` / `query`
/// sub-expression is visited, and their remaining fields are ignored.
fn collect_span_terms(
    ast: &SpanExpression,
    searcher: &crate::search::searcher::Searcher,
    terms: &mut HashMap<String, HashSet<String>>,
) {
    match ast {
        SpanExpression::SpanTerm { field, value } => {
            let bucket = terms.entry(field.clone()).or_default();
            bucket.insert(value.clone());
        }
        SpanExpression::SpanNear {
            field,
            terms: near_terms,
            ..
        } => {
            terms
                .entry(field.clone())
                .or_default()
                .extend(near_terms.iter().cloned());
        }
        SpanExpression::SpanNot { include, .. } => collect_span_terms(include, searcher, terms),
        SpanExpression::SpanFirst { query, .. } => collect_span_terms(query, searcher, terms),
    }
}
/// Analyzes `text` and returns the offsets of every token whose text is in `query_terms`.
///
/// Tokens are reported in the order the analyzer produced them.
fn find_matching_tokens(
    text: &str,
    query_terms: &HashSet<String>,
    analyzer: &Analyzer,
) -> Vec<MatchedToken> {
    let mut hits = Vec::new();
    for token in analyzer.analyze(text) {
        if !query_terms.contains(&token.text) {
            continue;
        }
        hits.push(MatchedToken {
            offset_from: token.offset_from,
            offset_to: token.offset_to,
        });
    }
    hits
}
/// Chooses up to `number_of_fragments` non-overlapping windows of `text` around the matches.
///
/// Each match seeds one candidate window of roughly `fragment_size` bytes centred on the match.
/// The window edges are moved to a nearby boundary character by `snap_to_boundary`. A
/// candidate's score is the number of matches that lie entirely inside it. Candidates are
/// picked greedily by descending score (ties: earlier start first), skipping any that overlap
/// an already selected window.
///
/// Returns `(start, end, score)` triples with byte offsets into `text`:
/// * empty when `matches` is empty;
/// * a single window covering the whole text when `number_of_fragments` is `0`;
/// * otherwise the selected windows, ordered by descending score for
///   `HighlightOrder::Score` or by start offset for `HighlightOrder::None`.
fn select_fragments(
    text: &str,
    matches: &[MatchedToken],
    fragment_size: usize,
    number_of_fragments: usize,
    order: HighlightOrder,
) -> Vec<(usize, usize, f32)> {
    if matches.is_empty() {
        return Vec::new();
    }
    if number_of_fragments == 0 {
        return vec![(0, text.len(), 1.0)];
    }

    let half = fragment_size / 2;
    let mut candidates: Vec<(usize, usize, f32)> = matches
        .iter()
        .map(|m| {
            let center = (m.offset_from + m.offset_to) / 2;
            let raw_start = center.saturating_sub(half);
            let raw_end = raw_start.saturating_add(fragment_size).min(text.len());
            // A window must always contain the match it was built around. Without the
            // min/max a `fragment_size` smaller than the token (with no boundary character
            // in scanning range) yields a window that excludes its own match, scores 0 and
            // produces no highlight at all.
            let start = snap_to_boundary(text, raw_start, true).min(m.offset_from);
            let end = snap_to_boundary(text, raw_end, false).max(m.offset_to);
            let score = matches
                .iter()
                .filter(|t| t.offset_from >= start && t.offset_to <= end)
                .count() as f32;
            (start, end, score)
        })
        .collect();

    // Best score first; among equal scores prefer the window that starts earlier.
    candidates.sort_by(|a, b| {
        b.2.partial_cmp(&a.2)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.0.cmp(&b.0))
    });

    let mut selected: Vec<(usize, usize, f32)> = Vec::new();
    for (start, end, score) in candidates {
        if selected.len() >= number_of_fragments {
            break;
        }
        let overlaps = selected.iter().any(|(s, e, _)| start < *e && end > *s);
        if !overlaps {
            selected.push((start, end, score));
        }
    }

    match order {
        HighlightOrder::Score => {
            selected.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
        }
        HighlightOrder::None => {
            selected.sort_by_key(|f| f.0);
        }
    }
    selected
}
/// Moves `offset` to a nearby boundary character, scanning at most `BOUNDARY_MAX_SCAN` bytes.
///
/// * `snap_left == true`: scans backwards from just before `offset` and returns the position
///   right after the nearest boundary character.
/// * `snap_left == false`: scans forwards starting at `offset` and returns the position of the
///   first boundary character.
///
/// `offset` is returned unchanged when it is `0`, at or past the end of `text`, or when no
/// boundary character is found in the scanned range.
fn snap_to_boundary(text: &str, offset: usize, snap_left: bool) -> usize {
    if offset == 0 || offset >= text.len() {
        return offset;
    }

    // Yields the boundary character starting at byte `idx`, if any. Positions that fall
    // inside a multi-byte character are skipped so the slice below cannot panic.
    let boundary_char_at = |idx: usize| -> Option<char> {
        if !text.is_char_boundary(idx) {
            return None;
        }
        text[idx..]
            .chars()
            .next()
            .filter(|c| BOUNDARY_CHARS.contains(c))
    };

    if snap_left {
        let lowest = offset.saturating_sub(BOUNDARY_MAX_SCAN);
        (lowest..offset)
            .rev()
            .find_map(|idx| boundary_char_at(idx).map(|c| idx + c.len_utf8()))
            .unwrap_or(offset)
    } else {
        let limit = (offset + BOUNDARY_MAX_SCAN).min(text.len());
        (offset..limit)
            .find(|&idx| boundary_char_at(idx).is_some())
            .unwrap_or(offset)
    }
}
/// Computes the highlights for one search hit.
///
/// For every field in `config.fields` the field's text is re-analyzed and every token whose
/// text is one of the applicable query terms is reported, limited to the fragments chosen by
/// `spans_for_field`.
///
/// * `source` – the hit's source document; must be a JSON object.
/// * `query_terms` – terms per field, as produced by `extract_query_terms`.
/// * `analyzers` – registry used to look up the analyzer for each field.
/// * `mapping` – optional mapping supplying each field's analyzer name; `"standard"` is used
///   when no mapping or no analyzer is available for the field.
///
/// A field is skipped when it has no applicable terms, when its value is not a non-empty
/// string, or when nothing matches. Returns `None` when `source` is not an object or when no
/// field produced a highlight.
pub fn highlight_hit(
    source: &serde_json::Value,
    config: &HighlightConfig,
    query_terms: &HashMap<String, HashSet<String>>,
    analyzers: &AnalyzerRegistry,
    mapping: Option<&crate::mapping::Mapping>,
) -> Option<HashMap<String, Vec<Highlight>>> {
    let obj = source.as_object()?;
    if config.fields.is_empty() {
        return None;
    }

    // When field matching is not required every field uses the same pooled term set, so it
    // is built once here instead of being re-cloned for each configured field.
    let pooled_terms: Option<HashSet<String>> = if config.require_field_match {
        None
    } else {
        Some(query_terms.values().flatten().cloned().collect())
    };

    let mut out: HashMap<String, Vec<Highlight>> = HashMap::new();
    for field_config in &config.fields {
        let field_name = &field_config.field;

        // Pooled terms if present, otherwise only the terms recorded for this very field.
        let effective_terms = match pooled_terms
            .as_ref()
            .or_else(|| query_terms.get(field_name))
        {
            Some(t) => t,
            None => continue,
        };
        if effective_terms.is_empty() {
            continue;
        }

        let text = match obj.get(field_name) {
            Some(serde_json::Value::String(s)) if !s.is_empty() => s.as_str(),
            _ => continue,
        };

        let field_analyzer_name = mapping
            .and_then(|m| {
                let fid = m.field_id(field_name)?;
                m.field(fid).analyzer.as_deref()
            })
            .unwrap_or("standard");
        let analyzer = analyzers.get(field_analyzer_name);

        let matches = find_matching_tokens(text, effective_terms, analyzer);
        if matches.is_empty() {
            continue;
        }

        let spans = spans_for_field(text, &matches, field_config, config.order);
        if !spans.is_empty() {
            out.insert(field_name.clone(), spans);
        }
    }

    if out.is_empty() { None } else { Some(out) }
}
/// Turns the matches of one field into the list of highlights to report.
///
/// With `number_of_fragments == 0` every match becomes a highlight. Otherwise fragments are
/// chosen with `select_fragments` and only matches lying entirely inside a chosen fragment are
/// kept, with duplicate `(start, end)` spans removed. The result is sorted by start offset in
/// both cases.
fn spans_for_field(
    text: &str,
    matches: &[MatchedToken],
    field_config: &HighlightFieldConfig,
    order: HighlightOrder,
) -> Vec<Highlight> {
    if field_config.number_of_fragments == 0 {
        let mut everything: Vec<Highlight> = matches
            .iter()
            .map(|m| span_from_match(text, m))
            .collect();
        everything.sort_by_key(|h| h.start);
        return everything;
    }

    let windows = select_fragments(
        text,
        matches,
        field_config.fragment_size,
        field_config.number_of_fragments,
        order,
    );

    // Visit windows in the order returned, and matches in their original order within each.
    let mut inside: Vec<Highlight> = windows
        .iter()
        .flat_map(|&(lo, hi, _)| {
            matches
                .iter()
                .filter(move |m| m.offset_from >= lo && m.offset_to <= hi)
        })
        .map(|m| span_from_match(text, m))
        .collect();
    inside.sort_by_key(|h| h.start);
    inside.dedup_by_key(|h| (h.start, h.end));
    inside
}
/// Builds the `Highlight` for a matched token by copying `text[offset_from..offset_to]`.
///
/// Panics if the token offsets are not valid slice bounds for `text`.
fn span_from_match(text: &str, m: &MatchedToken) -> Highlight {
    let (start, end) = (m.offset_from, m.offset_to);
    let matched = String::from(&text[start..end]);
    Highlight {
        text: matched,
        start,
        end,
    }
}
pub fn parse_highlight_config(json: &serde_json::Value) -> HighlightConfig {
let require_field_match = json
.get("require_field_match")
.and_then(|v| v.as_bool())
.unwrap_or(true);
let order = match json.get("order").and_then(|v| v.as_str()) {
Some("score") => HighlightOrder::Score,
_ => HighlightOrder::None,
};
let fields = json
.get("fields")
.and_then(|v| v.as_object())
.map(|obj| {
obj.iter()
.map(|(name, field_json)| {
let fragment_size = field_json
.get("fragment_size")
.and_then(|v| v.as_u64())
.map(|v| v as usize)
.unwrap_or(100);
let number_of_fragments = field_json
.get("number_of_fragments")
.and_then(|v| v.as_u64())
.map(|v| v as usize)
.unwrap_or(5);
HighlightFieldConfig {
field: name.clone(),
fragment_size,
number_of_fragments,
}
})
.collect()
})
.unwrap_or_default();
HighlightConfig {
fields,
require_field_match,
order,
}
}