1use std::collections::{HashMap, HashSet};
9
10use crate::analysis::{Analyzer, AnalyzerRegistry};
11
12use crate::query::ast::{ScoringExpression, SpanExpression};
13
14#[derive(Clone, Debug)]
20pub struct HighlightConfig {
21 pub fields: Vec<HighlightFieldConfig>,
22 pub require_field_match: bool,
23 pub order: HighlightOrder,
24}
25
26#[derive(Clone, Debug)]
32pub struct HighlightFieldConfig {
33 pub field: String,
34 pub fragment_size: usize,
35 pub number_of_fragments: usize,
36}
37
38#[derive(Clone, Copy, Debug, PartialEq)]
39pub enum HighlightOrder {
40 None,
42 Score,
44}
45
46#[derive(Clone, Debug, PartialEq, Eq)]
56pub struct Highlight {
57 pub text: String,
59 pub start: usize,
61 pub end: usize,
63}
64
65#[derive(Clone, Debug)]
68struct MatchedToken {
69 offset_from: usize,
70 offset_to: usize,
71}
72
73const BOUNDARY_CHARS: &[char] = &['.', ',', '!', '?', ' ', '\t', '\n'];
76const BOUNDARY_MAX_SCAN: usize = 20;
77
78pub(crate) fn extract_query_terms(
88 ast: &ScoringExpression,
89 searcher: &crate::search::searcher::Searcher,
90) -> HashMap<String, HashSet<String>> {
91 let mut terms: HashMap<String, HashSet<String>> = HashMap::new();
92 collect_terms(ast, searcher, &mut terms);
93 terms
94}
95
96fn collect_terms(
97 ast: &ScoringExpression,
98 searcher: &crate::search::searcher::Searcher,
99 terms: &mut HashMap<String, HashSet<String>>,
100) {
101 let analyzers = searcher.analyzers();
102 match ast {
103 ScoringExpression::Term { field, value } => {
104 terms
105 .entry(field.clone())
106 .or_default()
107 .insert(value.clone());
108 }
109 ScoringExpression::Terms { field, values } => {
110 let set = terms.entry(field.clone()).or_default();
111 for v in values {
112 set.insert(v.clone());
113 }
114 }
115 ScoringExpression::Match {
116 field,
117 query,
118 analyzer,
119 } => {
120 let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
121 let a = analyzers.get(analyzer_name);
122 let tokens = a.analyze(query);
123 let set = terms.entry(field.clone()).or_default();
124 for token in tokens {
125 set.insert(token.text);
126 }
127 }
128 ScoringExpression::MatchPhrase {
129 field,
130 query,
131 analyzer,
132 } => {
133 let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
134 let a = analyzers.get(analyzer_name);
135 let tokens = a.analyze(query);
136 let set = terms.entry(field.clone()).or_default();
137 for token in tokens {
138 set.insert(token.text);
139 }
140 }
141 ScoringExpression::MatchBoolPrefix {
142 field,
143 query,
144 analyzer,
145 } => {
146 let analyzer_name = searcher.resolve_search_analyzer(field, analyzer.as_deref());
147 let a = analyzers.get(analyzer_name);
148 let tokens = a.analyze(query);
149 let set = terms.entry(field.clone()).or_default();
150 for token in tokens {
151 set.insert(token.text);
152 }
153 }
154 ScoringExpression::Prefix { field, value } => {
155 terms
156 .entry(field.clone())
157 .or_default()
158 .insert(value.clone());
159 }
160 ScoringExpression::Fuzzy { field, value, .. } => {
161 terms
162 .entry(field.clone())
163 .or_default()
164 .insert(value.clone());
165 }
166 ScoringExpression::Wildcard { field, value } => {
167 terms
168 .entry(field.clone())
169 .or_default()
170 .insert(value.clone());
171 }
172 ScoringExpression::MultiMatch {
173 fields,
174 query,
175 analyzer,
176 ..
177 } => {
178 let analyzer_name = if let Some(f) = fields.first() {
180 searcher.resolve_search_analyzer(f, analyzer.as_deref())
181 } else {
182 "standard"
183 };
184 let a = analyzers.get(analyzer_name);
185 let tokens = a.analyze(query);
186 for f in fields {
187 let set = terms.entry(f.clone()).or_default();
188 for token in &tokens {
189 set.insert(token.text.clone());
190 }
191 }
192 }
193 ScoringExpression::Span(span_ast) => {
196 collect_span_terms(span_ast, searcher, terms);
197 }
198 ScoringExpression::Bool {
200 must,
201 should,
202 must_not: _,
203 filter,
204 ..
205 } => {
206 for q in must.iter().chain(should).chain(filter) {
207 collect_terms(q, searcher, terms);
208 }
209 }
212 ScoringExpression::DisMax { queries, .. } => {
213 for q in queries {
214 collect_terms(q, searcher, terms);
215 }
216 }
217 ScoringExpression::ConstantScore { query, .. }
218 | ScoringExpression::Boost { query, .. }
219 | ScoringExpression::Nested { query, .. } => {
220 collect_terms(query, searcher, terms);
221 }
222 ScoringExpression::ScriptScore { query, .. }
223 | ScoringExpression::FunctionScore { query, .. } => {
224 collect_terms(query, searcher, terms);
225 }
226 ScoringExpression::Boosting { positive, .. } => {
227 collect_terms(positive, searcher, terms);
228 }
229 ScoringExpression::Exists { .. }
231 | ScoringExpression::Range { .. }
232 | ScoringExpression::GeoDistance { .. }
233 | ScoringExpression::GeoBoundingBox { .. }
234 | ScoringExpression::GeoShape { .. }
235 | ScoringExpression::Regexp { .. }
236 | ScoringExpression::Knn { .. }
237 | ScoringExpression::MatchAll
238 | ScoringExpression::MatchNone => {}
239 }
240}
241
242fn collect_span_terms(
245 ast: &SpanExpression,
246 searcher: &crate::search::searcher::Searcher,
247 terms: &mut HashMap<String, HashSet<String>>,
248) {
249 let _ = searcher;
250 match ast {
251 SpanExpression::SpanTerm { field, value } => {
252 terms
253 .entry(field.clone())
254 .or_default()
255 .insert(value.clone());
256 }
257 SpanExpression::SpanNear {
258 field,
259 terms: near_terms,
260 ..
261 } => {
262 let set = terms.entry(field.clone()).or_default();
263 for v in near_terms {
264 set.insert(v.clone());
265 }
266 }
267 SpanExpression::SpanNot { include, .. } => {
268 collect_span_terms(include, searcher, terms);
270 }
271 SpanExpression::SpanFirst { query, .. } => {
272 collect_span_terms(query, searcher, terms);
273 }
274 }
275}
276
277fn find_matching_tokens(
281 text: &str,
282 query_terms: &HashSet<String>,
283 analyzer: &Analyzer,
284) -> Vec<MatchedToken> {
285 let tokens = analyzer.analyze(text);
286 tokens
287 .into_iter()
288 .filter(|token| query_terms.contains(&token.text))
289 .map(|token| MatchedToken {
290 offset_from: token.offset_from,
291 offset_to: token.offset_to,
292 })
293 .collect()
294}
295
296fn select_fragments(
304 text: &str,
305 matches: &[MatchedToken],
306 fragment_size: usize,
307 number_of_fragments: usize,
308 order: HighlightOrder,
309) -> Vec<(usize, usize, f32)> {
310 if matches.is_empty() {
311 return Vec::new();
312 }
313 if number_of_fragments == 0 {
314 return vec![(0, text.len(), 1.0)];
315 }
316
317 let mut candidates: Vec<(usize, usize, f32)> = Vec::new();
319
320 for m in matches {
321 let center = (m.offset_from + m.offset_to) / 2;
322 let half = fragment_size / 2;
323 let raw_start = center.saturating_sub(half);
324 let raw_end = (raw_start + fragment_size).min(text.len());
325
326 let start = snap_to_boundary(text, raw_start, true);
327 let end = snap_to_boundary(text, raw_end, false);
328
329 let score: f32 = matches
331 .iter()
332 .filter(|t| t.offset_from >= start && t.offset_to <= end)
333 .count() as f32;
334
335 candidates.push((start, end, score));
336 }
337
338 candidates.sort_by(|a, b| {
340 b.2.partial_cmp(&a.2)
341 .unwrap_or(std::cmp::Ordering::Equal)
342 .then_with(|| a.0.cmp(&b.0))
343 });
344
345 let mut selected: Vec<(usize, usize, f32)> = Vec::new();
347 for (start, end, score) in candidates {
348 if selected.len() >= number_of_fragments {
349 break;
350 }
351 let overlaps = selected.iter().any(|(s, e, _)| start < *e && end > *s);
352 if !overlaps {
353 selected.push((start, end, score));
354 }
355 }
356
357 match order {
359 HighlightOrder::Score => {
360 selected.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
361 }
362 HighlightOrder::None => {
363 selected.sort_by_key(|f| f.0);
364 }
365 }
366
367 selected
368}
369
370fn snap_to_boundary(text: &str, offset: usize, snap_left: bool) -> usize {
372 if offset == 0 || offset >= text.len() {
373 return offset;
374 }
375 let scan_range = if snap_left {
376 offset.saturating_sub(BOUNDARY_MAX_SCAN)..offset
377 } else {
378 offset..(offset + BOUNDARY_MAX_SCAN).min(text.len())
379 };
380
381 if snap_left {
382 for i in (scan_range.start..scan_range.end).rev() {
383 if text.is_char_boundary(i) {
384 if let Some(c) = text[i..].chars().next() {
385 if BOUNDARY_CHARS.contains(&c) {
386 return i + c.len_utf8();
387 }
388 }
389 }
390 }
391 } else {
392 for i in scan_range {
393 if text.is_char_boundary(i) {
394 if let Some(c) = text[i..].chars().next() {
395 if BOUNDARY_CHARS.contains(&c) {
396 return i;
397 }
398 }
399 }
400 }
401 }
402
403 offset
404}
405
406pub fn highlight_hit(
422 source: &serde_json::Value,
423 config: &HighlightConfig,
424 query_terms: &HashMap<String, HashSet<String>>,
425 analyzers: &AnalyzerRegistry,
426 mapping: Option<&crate::mapping::Mapping>,
427) -> Option<HashMap<String, Vec<Highlight>>> {
428 let obj = source.as_object()?;
429 let mut out: HashMap<String, Vec<Highlight>> = HashMap::new();
430
431 for field_config in &config.fields {
432 let field_name = &field_config.field;
433
434 let all_terms: HashSet<String>;
435 let effective_terms = if config.require_field_match {
436 match query_terms.get(field_name) {
437 Some(t) => t,
438 None => continue,
439 }
440 } else {
441 all_terms = query_terms.values().flatten().cloned().collect();
442 &all_terms
443 };
444
445 if effective_terms.is_empty() {
446 continue;
447 }
448
449 let text = match obj.get(field_name) {
450 Some(serde_json::Value::String(s)) if !s.is_empty() => s.as_str(),
451 _ => continue,
452 };
453
454 let field_analyzer_name = mapping
456 .and_then(|m| m.field_id(field_name))
457 .and_then(|fid| mapping.unwrap().field(fid).analyzer.as_deref())
458 .unwrap_or("standard");
459 let analyzer = analyzers.get(field_analyzer_name);
460
461 let matches = find_matching_tokens(text, effective_terms, analyzer);
462 if matches.is_empty() {
463 continue;
464 }
465
466 let spans = spans_for_field(text, &matches, field_config, config.order);
467 if !spans.is_empty() {
468 out.insert(field_name.clone(), spans);
469 }
470 }
471
472 if out.is_empty() { None } else { Some(out) }
473}
474
475fn spans_for_field(
482 text: &str,
483 matches: &[MatchedToken],
484 field_config: &HighlightFieldConfig,
485 order: HighlightOrder,
486) -> Vec<Highlight> {
487 if field_config.number_of_fragments == 0 {
488 let mut spans: Vec<Highlight> = matches.iter().map(|m| span_from_match(text, m)).collect();
489 spans.sort_by_key(|s| s.start);
490 return spans;
491 }
492
493 let fragments = select_fragments(
494 text,
495 matches,
496 field_config.fragment_size,
497 field_config.number_of_fragments,
498 order,
499 );
500
501 let mut spans: Vec<Highlight> = Vec::new();
502 for (fstart, fend, _score) in &fragments {
503 for m in matches {
504 if m.offset_from >= *fstart && m.offset_to <= *fend {
505 spans.push(span_from_match(text, m));
506 }
507 }
508 }
509 spans.sort_by_key(|s| s.start);
510 spans.dedup_by_key(|s| (s.start, s.end));
511 spans
512}
513
514fn span_from_match(text: &str, m: &MatchedToken) -> Highlight {
515 Highlight {
516 text: text[m.offset_from..m.offset_to].to_string(),
517 start: m.offset_from,
518 end: m.offset_to,
519 }
520}
521
522pub fn parse_highlight_config(json: &serde_json::Value) -> HighlightConfig {
531 let require_field_match = json
532 .get("require_field_match")
533 .and_then(|v| v.as_bool())
534 .unwrap_or(true);
535
536 let order = match json.get("order").and_then(|v| v.as_str()) {
537 Some("score") => HighlightOrder::Score,
538 _ => HighlightOrder::None,
539 };
540
541 let fields = json
542 .get("fields")
543 .and_then(|v| v.as_object())
544 .map(|obj| {
545 obj.iter()
546 .map(|(name, field_json)| {
547 let fragment_size = field_json
548 .get("fragment_size")
549 .and_then(|v| v.as_u64())
550 .map(|v| v as usize)
551 .unwrap_or(100);
552 let number_of_fragments = field_json
553 .get("number_of_fragments")
554 .and_then(|v| v.as_u64())
555 .map(|v| v as usize)
556 .unwrap_or(5);
557 HighlightFieldConfig {
558 field: name.clone(),
559 fragment_size,
560 number_of_fragments,
561 }
562 })
563 .collect()
564 })
565 .unwrap_or_default();
566
567 HighlightConfig {
568 fields,
569 require_field_match,
570 order,
571 }
572}