hermes_core/query/
boolean.rs

1//! Boolean query with MUST, SHOULD, and MUST_NOT clauses
2
3use std::sync::Arc;
4
5use crate::segment::SegmentReader;
6use crate::structures::TERMINATED;
7use crate::{DocId, Score};
8
9use super::{
10    CountFuture, GlobalStats, MaxScoreExecutor, Query, ScoredDoc, Scorer, ScorerFuture,
11    SparseTermQueryInfo,
12};
13
14/// Boolean query with MUST, SHOULD, and MUST_NOT clauses
15///
16/// When all clauses are SHOULD term queries on the same field, automatically
17/// uses MaxScore optimization for efficient top-k retrieval.
18#[derive(Default, Clone)]
19pub struct BooleanQuery {
20    pub must: Vec<Arc<dyn Query>>,
21    pub should: Vec<Arc<dyn Query>>,
22    pub must_not: Vec<Arc<dyn Query>>,
23    /// Optional global statistics for cross-segment IDF
24    global_stats: Option<Arc<GlobalStats>>,
25}
26
27impl std::fmt::Debug for BooleanQuery {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        f.debug_struct("BooleanQuery")
30            .field("must_count", &self.must.len())
31            .field("should_count", &self.should.len())
32            .field("must_not_count", &self.must_not.len())
33            .field("has_global_stats", &self.global_stats.is_some())
34            .finish()
35    }
36}
37
38impl std::fmt::Display for BooleanQuery {
39    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40        write!(f, "Boolean(")?;
41        let mut first = true;
42        for q in &self.must {
43            if !first {
44                write!(f, " ")?;
45            }
46            write!(f, "+{}", q)?;
47            first = false;
48        }
49        for q in &self.should {
50            if !first {
51                write!(f, " ")?;
52            }
53            write!(f, "{}", q)?;
54            first = false;
55        }
56        for q in &self.must_not {
57            if !first {
58                write!(f, " ")?;
59            }
60            write!(f, "-{}", q)?;
61            first = false;
62        }
63        write!(f, ")")
64    }
65}
66
67impl BooleanQuery {
68    pub fn new() -> Self {
69        Self::default()
70    }
71
72    pub fn must(mut self, query: impl Query + 'static) -> Self {
73        self.must.push(Arc::new(query));
74        self
75    }
76
77    pub fn should(mut self, query: impl Query + 'static) -> Self {
78        self.should.push(Arc::new(query));
79        self
80    }
81
82    pub fn must_not(mut self, query: impl Query + 'static) -> Self {
83        self.must_not.push(Arc::new(query));
84        self
85    }
86
87    /// Set global statistics for cross-segment IDF
88    pub fn with_global_stats(mut self, stats: Arc<GlobalStats>) -> Self {
89        self.global_stats = Some(stats);
90        self
91    }
92}
93
94/// Compute IDF for a posting list, preferring global stats.
95fn compute_idf(
96    posting_list: &crate::structures::BlockPostingList,
97    field: crate::Field,
98    term: &[u8],
99    num_docs: f32,
100    global_stats: Option<&Arc<GlobalStats>>,
101) -> f32 {
102    if let Some(stats) = global_stats {
103        let global_idf = stats.text_idf(field, &String::from_utf8_lossy(term));
104        if global_idf > 0.0 {
105            return global_idf;
106        }
107    }
108    let doc_freq = posting_list.doc_count() as f32;
109    super::bm25_idf(doc_freq, num_docs)
110}
111
112/// Shared pre-check for text MaxScore: extract term infos, field, avg_field_len, num_docs.
113/// Returns None if not all SHOULD clauses are single-field term queries.
114fn prepare_text_maxscore(
115    should: &[Arc<dyn Query>],
116    reader: &SegmentReader,
117    global_stats: Option<&Arc<GlobalStats>>,
118) -> Option<(Vec<super::TermQueryInfo>, crate::Field, f32, f32)> {
119    let infos: Vec<_> = should
120        .iter()
121        .filter_map(|q| q.as_term_query_info())
122        .collect();
123    if infos.len() != should.len() {
124        return None;
125    }
126    let field = infos[0].field;
127    if !infos.iter().all(|t| t.field == field) {
128        return None;
129    }
130    let avg_field_len = global_stats
131        .map(|s| s.avg_field_len(field))
132        .unwrap_or_else(|| reader.avg_field_len(field));
133    let num_docs = reader.num_docs() as f32;
134    Some((infos, field, avg_field_len, num_docs))
135}
136
137/// Build a TopK scorer from fetched posting lists via text MaxScore.
138fn finish_text_maxscore<'a>(
139    posting_lists: Vec<(crate::structures::BlockPostingList, f32)>,
140    avg_field_len: f32,
141    limit: usize,
142) -> crate::Result<Box<dyn Scorer + 'a>> {
143    if posting_lists.is_empty() {
144        return Ok(Box::new(EmptyScorer) as Box<dyn Scorer + 'a>);
145    }
146    let results = MaxScoreExecutor::text(posting_lists, avg_field_len, limit).execute_sync()?;
147    Ok(Box::new(TopKResultScorer::new(results)) as Box<dyn Scorer + 'a>)
148}
149
150/// Try text MaxScore for pure OR queries (async).
151async fn try_maxscore_scorer<'a>(
152    should: &[Arc<dyn Query>],
153    reader: &'a SegmentReader,
154    limit: usize,
155    global_stats: Option<&Arc<GlobalStats>>,
156) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
157    let (mut infos, _field, avg_field_len, num_docs) =
158        match prepare_text_maxscore(should, reader, global_stats) {
159            Some(v) => v,
160            None => return Ok(None),
161        };
162    let mut posting_lists = Vec::with_capacity(infos.len());
163    for info in infos.drain(..) {
164        if let Some(pl) = reader.get_postings(info.field, &info.term).await? {
165            let idf = compute_idf(&pl, info.field, &info.term, num_docs, global_stats);
166            posting_lists.push((pl, idf));
167        }
168    }
169    Ok(Some(finish_text_maxscore(
170        posting_lists,
171        avg_field_len,
172        limit,
173    )?))
174}
175
/// Attempts the single-field text MaxScore path for a pure OR query (sync).
///
/// Blocking counterpart of `try_maxscore_scorer`: same eligibility rules,
/// posting lists fetched with `get_postings_sync`.
#[cfg(feature = "sync")]
fn try_maxscore_scorer_sync<'a>(
    should: &[Arc<dyn Query>],
    reader: &'a SegmentReader,
    limit: usize,
    global_stats: Option<&Arc<GlobalStats>>,
) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
    let Some((infos, _, avg_field_len, num_docs)) =
        prepare_text_maxscore(should, reader, global_stats)
    else {
        return Ok(None);
    };

    let mut scored_lists = Vec::with_capacity(infos.len());
    for info in infos {
        let Some(postings) = reader.get_postings_sync(info.field, &info.term)? else {
            continue;
        };
        let idf = compute_idf(&postings, info.field, &info.term, num_docs, global_stats);
        scored_lists.push((postings, idf));
    }

    finish_text_maxscore(scored_lists, avg_field_len, limit).map(Some)
}
202
203/// Shared grouping result for per-field MaxScore.
204struct PerFieldGrouping {
205    /// (field, avg_field_len, term_infos) for groups with 2+ terms
206    multi_term_groups: Vec<(crate::Field, f32, Vec<super::TermQueryInfo>)>,
207    /// Original indices of single-term and non-term SHOULD clauses (fallback scorers)
208    fallback_indices: Vec<usize>,
209    /// Limit per field group (over-fetched to compensate for cross-field scoring)
210    per_field_limit: usize,
211    num_docs: f32,
212}
213
214/// Group SHOULD clauses by field for per-field MaxScore.
215/// Returns None if no group has 2+ terms (no optimization benefit).
216fn prepare_per_field_grouping(
217    should: &[Arc<dyn Query>],
218    reader: &SegmentReader,
219    limit: usize,
220    global_stats: Option<&Arc<GlobalStats>>,
221) -> Option<PerFieldGrouping> {
222    let mut field_groups: rustc_hash::FxHashMap<crate::Field, Vec<(usize, super::TermQueryInfo)>> =
223        rustc_hash::FxHashMap::default();
224    let mut non_term_indices: Vec<usize> = Vec::new();
225
226    for (i, q) in should.iter().enumerate() {
227        if let Some(info) = q.as_term_query_info() {
228            field_groups.entry(info.field).or_default().push((i, info));
229        } else {
230            non_term_indices.push(i);
231        }
232    }
233
234    if !field_groups.values().any(|g| g.len() >= 2) {
235        return None;
236    }
237
238    let num_groups = field_groups.len() + non_term_indices.len();
239    let per_field_limit = limit * num_groups;
240    let num_docs = reader.num_docs() as f32;
241
242    let mut multi_term_groups = Vec::new();
243    let mut fallback_indices = non_term_indices;
244
245    for group in field_groups.into_values() {
246        if group.len() >= 2 {
247            let field = group[0].1.field;
248            let avg_field_len = global_stats
249                .map(|s| s.avg_field_len(field))
250                .unwrap_or_else(|| reader.avg_field_len(field));
251            let infos: Vec<_> = group.into_iter().map(|(_, info)| info).collect();
252            multi_term_groups.push((field, avg_field_len, infos));
253        } else {
254            fallback_indices.push(group[0].0);
255        }
256    }
257
258    Some(PerFieldGrouping {
259        multi_term_groups,
260        fallback_indices,
261        per_field_limit,
262        num_docs,
263    })
264}
265
266/// Build a SHOULD-only scorer from a vec of optimized scorers.
267fn build_should_scorer<'a>(scorers: Vec<Box<dyn Scorer + 'a>>) -> Box<dyn Scorer + 'a> {
268    if scorers.is_empty() {
269        return Box::new(EmptyScorer);
270    }
271    if scorers.len() == 1 {
272        return scorers.into_iter().next().unwrap();
273    }
274    let mut scorer = BooleanScorer {
275        must: vec![],
276        should: scorers,
277        must_not: vec![],
278        current_doc: 0,
279    };
280    scorer.current_doc = scorer.find_next_match();
281    Box::new(scorer)
282}
283
284/// Per-field MaxScore grouping for multi-field SHOULD queries (async).
285///
286/// When SHOULD clauses span multiple fields (e.g., "hello world" across title, body, desc),
287/// single-field MaxScore can't apply. This groups TermQuery clauses by field, runs MaxScore
288/// per group, and returns a compact scorer per field.
289async fn try_per_field_maxscore<'a>(
290    should: &[Arc<dyn Query>],
291    reader: &'a SegmentReader,
292    limit: usize,
293    global_stats: Option<&Arc<GlobalStats>>,
294) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
295    let grouping = match prepare_per_field_grouping(should, reader, limit, global_stats) {
296        Some(g) => g,
297        None => return Ok(None),
298    };
299
300    let mut scorers: Vec<Box<dyn Scorer + 'a>> = Vec::new();
301
302    for (field, avg_field_len, infos) in &grouping.multi_term_groups {
303        let mut posting_lists = Vec::with_capacity(infos.len());
304        for info in infos {
305            if let Some(pl) = reader.get_postings(info.field, &info.term).await? {
306                let idf = compute_idf(&pl, *field, &info.term, grouping.num_docs, global_stats);
307                posting_lists.push((pl, idf));
308            }
309        }
310        if !posting_lists.is_empty() {
311            scorers.push(finish_text_maxscore(
312                posting_lists,
313                *avg_field_len,
314                grouping.per_field_limit,
315            )?);
316        }
317    }
318
319    for &idx in &grouping.fallback_indices {
320        scorers.push(should[idx].scorer(reader, limit).await?);
321    }
322
323    Ok(Some(build_should_scorer(scorers)))
324}
325
/// Per-field MaxScore for SHOULD clauses that span several fields (sync).
///
/// Blocking counterpart of `try_per_field_maxscore`: posting lists are read
/// with `get_postings_sync` and fallback clauses use `scorer_sync`.
#[cfg(feature = "sync")]
fn try_per_field_maxscore_sync<'a>(
    should: &[Arc<dyn Query>],
    reader: &'a SegmentReader,
    limit: usize,
    global_stats: Option<&Arc<GlobalStats>>,
) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
    let Some(PerFieldGrouping {
        multi_term_groups,
        fallback_indices,
        per_field_limit,
        num_docs,
    }) = prepare_per_field_grouping(should, reader, limit, global_stats)
    else {
        return Ok(None);
    };

    let mut scorers: Vec<Box<dyn Scorer + 'a>> = Vec::new();

    for (field, avg_field_len, infos) in multi_term_groups {
        let mut scored_lists = Vec::with_capacity(infos.len());
        for info in &infos {
            let Some(postings) = reader.get_postings_sync(info.field, &info.term)? else {
                continue;
            };
            let idf = compute_idf(&postings, field, &info.term, num_docs, global_stats);
            scored_lists.push((postings, idf));
        }
        // A group whose terms are all absent from this segment contributes nothing.
        if scored_lists.is_empty() {
            continue;
        }
        scorers.push(finish_text_maxscore(
            scored_lists,
            avg_field_len,
            per_field_limit,
        )?);
    }

    for idx in fallback_indices {
        scorers.push(should[idx].scorer_sync(reader, limit)?);
    }

    Ok(Some(build_should_scorer(scorers)))
}
364
365/// Try to build a sparse MaxScoreExecutor from SHOULD clauses.
366/// Returns None if not eligible, Some(Err) for empty segment, Some(Ok) otherwise.
367fn prepare_sparse_maxscore<'a>(
368    should: &[Arc<dyn Query>],
369    reader: &'a SegmentReader,
370    limit: usize,
371) -> Option<Result<MaxScoreExecutor<'a>, Box<dyn Scorer + 'a>>> {
372    let infos: Vec<SparseTermQueryInfo> = should
373        .iter()
374        .filter_map(|q| q.as_sparse_term_query_info())
375        .collect();
376    if infos.len() != should.len() {
377        return None;
378    }
379    let field = infos[0].field;
380    if !infos.iter().all(|t| t.field == field) {
381        return None;
382    }
383    let si = match reader.sparse_index(field) {
384        Some(si) => si,
385        None => return Some(Err(Box::new(EmptyScorer))),
386    };
387    let query_terms: Vec<(u32, f32)> = infos
388        .iter()
389        .filter(|info| si.has_dimension(info.dim_id))
390        .map(|info| (info.dim_id, info.weight))
391        .collect();
392    if query_terms.is_empty() {
393        return Some(Err(Box::new(EmptyScorer)));
394    }
395    let executor_limit = (limit as f32 * infos[0].over_fetch_factor).ceil() as usize;
396    Some(Ok(MaxScoreExecutor::sparse(
397        si,
398        query_terms,
399        executor_limit,
400        infos[0].heap_factor,
401    )))
402}
403
404/// Combine raw MaxScore results with ordinal deduplication into a scorer.
405fn combine_sparse_results<'a>(
406    raw: Vec<ScoredDoc>,
407    combiner: super::MultiValueCombiner,
408    limit: usize,
409) -> Box<dyn Scorer + 'a> {
410    let combined = crate::segment::combine_ordinal_results(
411        raw.into_iter().map(|r| (r.doc_id, r.ordinal, r.score)),
412        combiner,
413        limit,
414    );
415    let scored: Vec<ScoredDoc> = combined
416        .into_iter()
417        .map(|r| ScoredDoc {
418            doc_id: r.doc_id,
419            score: r.score,
420            ordinal: 0,
421        })
422        .collect();
423    Box::new(TopKResultScorer::new(scored))
424}
425
426/// Build MaxScore scorer from sparse term infos (async).
427async fn try_sparse_maxscore_scorer<'a>(
428    should: &[Arc<dyn Query>],
429    reader: &'a SegmentReader,
430    limit: usize,
431) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
432    let executor = match prepare_sparse_maxscore(should, reader, limit) {
433        None => return Ok(None),
434        Some(Err(empty)) => return Ok(Some(empty)),
435        Some(Ok(e)) => e,
436    };
437    let combiner = should[0].as_sparse_term_query_info().unwrap().combiner;
438    let raw = executor.execute().await?;
439    Ok(Some(combine_sparse_results(raw, combiner, limit)))
440}
441
/// Attempts the sparse MaxScore path for a pure OR query (sync).
///
/// Blocking counterpart of `try_sparse_maxscore_scorer`; the executor is run
/// with `execute_sync`.
#[cfg(feature = "sync")]
fn try_sparse_maxscore_scorer_sync<'a>(
    should: &[Arc<dyn Query>],
    reader: &'a SegmentReader,
    limit: usize,
) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
    let Some(prepared) = prepare_sparse_maxscore(should, reader, limit) else {
        return Ok(None);
    };
    let executor = match prepared {
        Ok(executor) => executor,
        Err(empty_scorer) => return Ok(Some(empty_scorer)),
    };

    let combiner = should[0].as_sparse_term_query_info().unwrap().combiner;
    let hits = executor.execute_sync()?;
    let scorer = combine_sparse_results(hits, combiner, limit);
    Ok(Some(scorer))
}
458
459impl Query for BooleanQuery {
460    fn scorer<'a>(&self, reader: &'a SegmentReader, limit: usize) -> ScorerFuture<'a> {
461        // Clone Arc vectors - cheap reference counting
462        let must = self.must.clone();
463        let should = self.should.clone();
464        let must_not = self.must_not.clone();
465        let global_stats = self.global_stats.clone();
466
467        Box::pin(async move {
468            // Single-clause optimization: unwrap to inner scorer directly
469            if must_not.is_empty() {
470                if must.len() == 1 && should.is_empty() {
471                    return must[0].scorer(reader, limit).await;
472                }
473                if should.len() == 1 && must.is_empty() {
474                    return should[0].scorer(reader, limit).await;
475                }
476            }
477
478            // Check if this is a pure OR query eligible for MaxScore optimization
479            // Conditions: no MUST, no MUST_NOT, multiple SHOULD clauses, all same field
480            if must.is_empty() && must_not.is_empty() && should.len() >= 2 {
481                // Try text MaxScore first
482                if let Some(scorer) =
483                    try_maxscore_scorer(&should, reader, limit, global_stats.as_ref()).await?
484                {
485                    return Ok(scorer);
486                }
487                // Try sparse MaxScore
488                if let Some(scorer) = try_sparse_maxscore_scorer(&should, reader, limit).await? {
489                    return Ok(scorer);
490                }
491                // Try per-field MaxScore grouping for multi-field text queries
492                if let Some(scorer) =
493                    try_per_field_maxscore(&should, reader, limit, global_stats.as_ref()).await?
494                {
495                    return Ok(scorer);
496                }
497            }
498
499            // Fall back to standard boolean scoring
500            let mut must_scorers = Vec::with_capacity(must.len());
501            for q in &must {
502                must_scorers.push(q.scorer(reader, limit).await?);
503            }
504
505            let mut should_scorers = Vec::with_capacity(should.len());
506            for q in &should {
507                should_scorers.push(q.scorer(reader, limit).await?);
508            }
509
510            let mut must_not_scorers = Vec::with_capacity(must_not.len());
511            for q in &must_not {
512                must_not_scorers.push(q.scorer(reader, limit).await?);
513            }
514
515            let mut scorer = BooleanScorer {
516                must: must_scorers,
517                should: should_scorers,
518                must_not: must_not_scorers,
519                current_doc: 0,
520            };
521            // Initialize to first match
522            scorer.current_doc = scorer.find_next_match();
523            Ok(Box::new(scorer) as Box<dyn Scorer + 'a>)
524        })
525    }
526
527    #[cfg(feature = "sync")]
528    fn scorer_sync<'a>(
529        &self,
530        reader: &'a SegmentReader,
531        limit: usize,
532    ) -> crate::Result<Box<dyn Scorer + 'a>> {
533        // Single-clause optimization: unwrap to inner scorer directly
534        if self.must_not.is_empty() {
535            if self.must.len() == 1 && self.should.is_empty() {
536                return self.must[0].scorer_sync(reader, limit);
537            }
538            if self.should.len() == 1 && self.must.is_empty() {
539                return self.should[0].scorer_sync(reader, limit);
540            }
541        }
542
543        // MaxScore optimization for pure OR queries
544        if self.must.is_empty() && self.must_not.is_empty() && self.should.len() >= 2 {
545            if let Some(scorer) =
546                try_maxscore_scorer_sync(&self.should, reader, limit, self.global_stats.as_ref())?
547            {
548                return Ok(scorer);
549            }
550            if let Some(scorer) = try_sparse_maxscore_scorer_sync(&self.should, reader, limit)? {
551                return Ok(scorer);
552            }
553            // Try per-field MaxScore grouping for multi-field text queries
554            if let Some(scorer) = try_per_field_maxscore_sync(
555                &self.should,
556                reader,
557                limit,
558                self.global_stats.as_ref(),
559            )? {
560                return Ok(scorer);
561            }
562        }
563
564        // Fall back to standard boolean scoring
565        let mut must_scorers = Vec::with_capacity(self.must.len());
566        for q in &self.must {
567            must_scorers.push(q.scorer_sync(reader, limit)?);
568        }
569
570        let mut should_scorers = Vec::with_capacity(self.should.len());
571        for q in &self.should {
572            should_scorers.push(q.scorer_sync(reader, limit)?);
573        }
574
575        let mut must_not_scorers = Vec::with_capacity(self.must_not.len());
576        for q in &self.must_not {
577            must_not_scorers.push(q.scorer_sync(reader, limit)?);
578        }
579
580        let mut scorer = BooleanScorer {
581            must: must_scorers,
582            should: should_scorers,
583            must_not: must_not_scorers,
584            current_doc: 0,
585        };
586        scorer.current_doc = scorer.find_next_match();
587        Ok(Box::new(scorer) as Box<dyn Scorer + 'a>)
588    }
589
590    fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a> {
591        let must = self.must.clone();
592        let should = self.should.clone();
593
594        Box::pin(async move {
595            if !must.is_empty() {
596                let mut estimates = Vec::with_capacity(must.len());
597                for q in &must {
598                    estimates.push(q.count_estimate(reader).await?);
599                }
600                estimates
601                    .into_iter()
602                    .min()
603                    .ok_or_else(|| crate::Error::Corruption("Empty must clause".to_string()))
604            } else if !should.is_empty() {
605                let mut sum = 0u32;
606                for q in &should {
607                    sum = sum.saturating_add(q.count_estimate(reader).await?);
608                }
609                Ok(sum)
610            } else {
611                Ok(0)
612            }
613        })
614    }
615}
616
617struct BooleanScorer<'a> {
618    must: Vec<Box<dyn Scorer + 'a>>,
619    should: Vec<Box<dyn Scorer + 'a>>,
620    must_not: Vec<Box<dyn Scorer + 'a>>,
621    current_doc: DocId,
622}
623
624impl BooleanScorer<'_> {
625    fn find_next_match(&mut self) -> DocId {
626        if self.must.is_empty() && self.should.is_empty() {
627            return TERMINATED;
628        }
629
630        loop {
631            let candidate = if !self.must.is_empty() {
632                let mut max_doc = self
633                    .must
634                    .iter()
635                    .map(|s| s.doc())
636                    .max()
637                    .unwrap_or(TERMINATED);
638
639                if max_doc == TERMINATED {
640                    return TERMINATED;
641                }
642
643                loop {
644                    let mut all_match = true;
645                    for scorer in &mut self.must {
646                        let doc = scorer.seek(max_doc);
647                        if doc == TERMINATED {
648                            return TERMINATED;
649                        }
650                        if doc > max_doc {
651                            max_doc = doc;
652                            all_match = false;
653                            break;
654                        }
655                    }
656                    if all_match {
657                        break;
658                    }
659                }
660                max_doc
661            } else {
662                self.should
663                    .iter()
664                    .map(|s| s.doc())
665                    .filter(|&d| d != TERMINATED)
666                    .min()
667                    .unwrap_or(TERMINATED)
668            };
669
670            if candidate == TERMINATED {
671                return TERMINATED;
672            }
673
674            let excluded = self.must_not.iter_mut().any(|scorer| {
675                let doc = scorer.seek(candidate);
676                doc == candidate
677            });
678
679            if !excluded {
680                // Seek SHOULD scorers to candidate so score() can see their contributions
681                for scorer in &mut self.should {
682                    scorer.seek(candidate);
683                }
684                self.current_doc = candidate;
685                return candidate;
686            }
687
688            // Advance past excluded candidate
689            if !self.must.is_empty() {
690                for scorer in &mut self.must {
691                    scorer.advance();
692                }
693            } else {
694                // For SHOULD-only: seek all scorers past the excluded candidate
695                for scorer in &mut self.should {
696                    if scorer.doc() <= candidate && scorer.doc() != TERMINATED {
697                        scorer.seek(candidate + 1);
698                    }
699                }
700            }
701        }
702    }
703}
704
705impl super::docset::DocSet for BooleanScorer<'_> {
706    fn doc(&self) -> DocId {
707        self.current_doc
708    }
709
710    fn advance(&mut self) -> DocId {
711        if !self.must.is_empty() {
712            for scorer in &mut self.must {
713                scorer.advance();
714            }
715        } else {
716            for scorer in &mut self.should {
717                if scorer.doc() == self.current_doc {
718                    scorer.advance();
719                }
720            }
721        }
722
723        self.current_doc = self.find_next_match();
724        self.current_doc
725    }
726
727    fn seek(&mut self, target: DocId) -> DocId {
728        for scorer in &mut self.must {
729            scorer.seek(target);
730        }
731
732        for scorer in &mut self.should {
733            scorer.seek(target);
734        }
735
736        self.current_doc = self.find_next_match();
737        self.current_doc
738    }
739
740    fn size_hint(&self) -> u32 {
741        if !self.must.is_empty() {
742            self.must.iter().map(|s| s.size_hint()).min().unwrap_or(0)
743        } else {
744            self.should.iter().map(|s| s.size_hint()).sum()
745        }
746    }
747}
748
749impl Scorer for BooleanScorer<'_> {
750    fn score(&self) -> Score {
751        let mut total = 0.0;
752
753        for scorer in &self.must {
754            if scorer.doc() == self.current_doc {
755                total += scorer.score();
756            }
757        }
758
759        for scorer in &self.should {
760            if scorer.doc() == self.current_doc {
761                total += scorer.score();
762            }
763        }
764
765        total
766    }
767
768    fn matched_positions(&self) -> Option<super::MatchedPositions> {
769        let mut all_positions: super::MatchedPositions = Vec::new();
770
771        for scorer in &self.must {
772            if scorer.doc() == self.current_doc
773                && let Some(positions) = scorer.matched_positions()
774            {
775                all_positions.extend(positions);
776            }
777        }
778
779        for scorer in &self.should {
780            if scorer.doc() == self.current_doc
781                && let Some(positions) = scorer.matched_positions()
782            {
783                all_positions.extend(positions);
784            }
785        }
786
787        if all_positions.is_empty() {
788            None
789        } else {
790            Some(all_positions)
791        }
792    }
793}
794
795/// Scorer that iterates over pre-computed top-k results
796struct TopKResultScorer {
797    results: Vec<ScoredDoc>,
798    position: usize,
799}
800
801impl TopKResultScorer {
802    fn new(mut results: Vec<ScoredDoc>) -> Self {
803        // Sort by doc_id ascending — required for DocSet seek() correctness
804        results.sort_unstable_by_key(|r| r.doc_id);
805        Self {
806            results,
807            position: 0,
808        }
809    }
810}
811
812impl super::docset::DocSet for TopKResultScorer {
813    fn doc(&self) -> DocId {
814        if self.position < self.results.len() {
815            self.results[self.position].doc_id
816        } else {
817            TERMINATED
818        }
819    }
820
821    fn advance(&mut self) -> DocId {
822        self.position += 1;
823        self.doc()
824    }
825
826    fn seek(&mut self, target: DocId) -> DocId {
827        let remaining = &self.results[self.position..];
828        self.position += remaining.partition_point(|r| r.doc_id < target);
829        self.doc()
830    }
831
832    fn size_hint(&self) -> u32 {
833        (self.results.len() - self.position) as u32
834    }
835}
836
837impl Scorer for TopKResultScorer {
838    fn score(&self) -> Score {
839        if self.position < self.results.len() {
840            self.results[self.position].score
841        } else {
842            0.0
843        }
844    }
845}
846
847/// Empty scorer for when no terms match
848struct EmptyScorer;
849
850impl super::docset::DocSet for EmptyScorer {
851    fn doc(&self) -> DocId {
852        TERMINATED
853    }
854
855    fn advance(&mut self) -> DocId {
856        TERMINATED
857    }
858
859    fn seek(&mut self, _target: DocId) -> DocId {
860        TERMINATED
861    }
862
863    fn size_hint(&self) -> u32 {
864        0
865    }
866}
867
868impl Scorer for EmptyScorer {
869    fn score(&self) -> Score {
870        0.0
871    }
872}
873
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::Field;
    use crate::query::TermQuery;

    /// Term info for every SHOULD clause of `query` that exposes it.
    fn should_term_infos(query: &BooleanQuery) -> Vec<crate::query::TermQueryInfo> {
        query
            .should
            .iter()
            .filter_map(|clause| clause.as_term_query_info())
            .collect()
    }

    #[test]
    fn test_maxscore_eligible_pure_or_same_field() {
        // Three SHOULD terms on one field: the shape MaxScore is meant for.
        let query = BooleanQuery::new()
            .should(TermQuery::text(Field(0), "hello"))
            .should(TermQuery::text(Field(0), "world"))
            .should(TermQuery::text(Field(0), "foo"));

        // Every clause exposes term info.
        assert!(
            query
                .should
                .iter()
                .all(|clause| clause.as_term_query_info().is_some())
        );

        // And all of them point at the same field.
        let infos = should_term_infos(&query);
        assert_eq!(infos.len(), 3);
        assert!(infos.iter().all(|info| info.field == Field(0)));
    }

    #[test]
    fn test_maxscore_not_eligible_different_fields() {
        // SHOULD terms on two different fields must not use single-field MaxScore.
        let query = BooleanQuery::new()
            .should(TermQuery::text(Field(0), "hello"))
            .should(TermQuery::text(Field(1), "world"));

        let infos = should_term_infos(&query);
        assert_eq!(infos.len(), 2);
        // The fields differ, which is what rules MaxScore out.
        assert_ne!(infos[0].field, infos[1].field);
    }

    #[test]
    fn test_maxscore_not_eligible_with_must() {
        // A MUST clause disqualifies the query from the MaxScore path.
        let query = BooleanQuery::new()
            .must(TermQuery::text(Field(0), "required"))
            .should(TermQuery::text(Field(0), "hello"))
            .should(TermQuery::text(Field(0), "world"));

        assert!(!query.must.is_empty());
    }

    #[test]
    fn test_maxscore_not_eligible_with_must_not() {
        // A MUST_NOT clause disqualifies the query from the MaxScore path.
        let query = BooleanQuery::new()
            .should(TermQuery::text(Field(0), "hello"))
            .should(TermQuery::text(Field(0), "world"))
            .must_not(TermQuery::text(Field(0), "excluded"));

        assert!(!query.must_not.is_empty());
    }

    #[test]
    fn test_maxscore_not_eligible_single_term() {
        // A lone SHOULD clause gains nothing from MaxScore.
        let query = BooleanQuery::new().should(TermQuery::text(Field(0), "hello"));

        assert_eq!(query.should.len(), 1);
    }

    #[test]
    fn test_term_query_info_extraction() {
        let term_query = TermQuery::text(Field(42), "test");
        let extracted = term_query.as_term_query_info();

        assert!(extracted.is_some());
        let info = extracted.unwrap();
        assert_eq!(info.field, Field(42));
        assert_eq!(info.term, b"test");
    }

    #[test]
    fn test_boolean_query_no_term_info() {
        // A BooleanQuery is not itself a term query.
        let query = BooleanQuery::new().should(TermQuery::text(Field(0), "hello"));

        assert!(query.as_term_query_info().is_none());
    }
}
hermes_core/query/boolean.rs

hermes_core/query/
boolean.rs