hermes_core/query/
boolean.rs

1//! Boolean query with MUST, SHOULD, and MUST_NOT clauses
2
3use std::sync::Arc;
4
5use crate::segment::SegmentReader;
6use crate::structures::TERMINATED;
7use crate::{DocId, Score};
8
9use super::{
10    CountFuture, GlobalStats, MaxScoreExecutor, Query, ScoredDoc, Scorer, ScorerFuture,
11    SparseTermQueryInfo,
12};
13
14/// Boolean query with MUST, SHOULD, and MUST_NOT clauses
15///
16/// When all clauses are SHOULD term queries on the same field, automatically
17/// uses MaxScore optimization for efficient top-k retrieval.
18#[derive(Default, Clone)]
19pub struct BooleanQuery {
20    pub must: Vec<Arc<dyn Query>>,
21    pub should: Vec<Arc<dyn Query>>,
22    pub must_not: Vec<Arc<dyn Query>>,
23    /// Optional global statistics for cross-segment IDF
24    global_stats: Option<Arc<GlobalStats>>,
25}
26
27impl std::fmt::Debug for BooleanQuery {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        f.debug_struct("BooleanQuery")
30            .field("must_count", &self.must.len())
31            .field("should_count", &self.should.len())
32            .field("must_not_count", &self.must_not.len())
33            .field("has_global_stats", &self.global_stats.is_some())
34            .finish()
35    }
36}
37
38impl std::fmt::Display for BooleanQuery {
39    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40        write!(f, "Boolean(")?;
41        let mut first = true;
42        for q in &self.must {
43            if !first {
44                write!(f, " ")?;
45            }
46            write!(f, "+{}", q)?;
47            first = false;
48        }
49        for q in &self.should {
50            if !first {
51                write!(f, " ")?;
52            }
53            write!(f, "{}", q)?;
54            first = false;
55        }
56        for q in &self.must_not {
57            if !first {
58                write!(f, " ")?;
59            }
60            write!(f, "-{}", q)?;
61            first = false;
62        }
63        write!(f, ")")
64    }
65}
66
67impl BooleanQuery {
68    pub fn new() -> Self {
69        Self::default()
70    }
71
72    pub fn must(mut self, query: impl Query + 'static) -> Self {
73        self.must.push(Arc::new(query));
74        self
75    }
76
77    pub fn should(mut self, query: impl Query + 'static) -> Self {
78        self.should.push(Arc::new(query));
79        self
80    }
81
82    pub fn must_not(mut self, query: impl Query + 'static) -> Self {
83        self.must_not.push(Arc::new(query));
84        self
85    }
86
87    /// Set global statistics for cross-segment IDF
88    pub fn with_global_stats(mut self, stats: Arc<GlobalStats>) -> Self {
89        self.global_stats = Some(stats);
90        self
91    }
92}
93
94/// Compute IDF for a posting list, preferring global stats.
95fn compute_idf(
96    posting_list: &crate::structures::BlockPostingList,
97    field: crate::Field,
98    term: &[u8],
99    num_docs: f32,
100    global_stats: Option<&Arc<GlobalStats>>,
101) -> f32 {
102    if let Some(stats) = global_stats {
103        let global_idf = stats.text_idf(field, &String::from_utf8_lossy(term));
104        if global_idf > 0.0 {
105            return global_idf;
106        }
107    }
108    let doc_freq = posting_list.doc_count() as f32;
109    super::bm25_idf(doc_freq, num_docs)
110}
111
112/// Shared pre-check for text MaxScore: extract term infos, field, avg_field_len, num_docs.
113/// Returns None if not all SHOULD clauses are single-field term queries.
114fn prepare_text_maxscore(
115    should: &[Arc<dyn Query>],
116    reader: &SegmentReader,
117    global_stats: Option<&Arc<GlobalStats>>,
118) -> Option<(Vec<super::TermQueryInfo>, crate::Field, f32, f32)> {
119    let infos: Vec<_> = should
120        .iter()
121        .filter_map(|q| q.as_term_query_info())
122        .collect();
123    if infos.len() != should.len() {
124        return None;
125    }
126    let field = infos[0].field;
127    if !infos.iter().all(|t| t.field == field) {
128        return None;
129    }
130    let avg_field_len = global_stats
131        .map(|s| s.avg_field_len(field))
132        .unwrap_or_else(|| reader.avg_field_len(field));
133    let num_docs = reader.num_docs() as f32;
134    Some((infos, field, avg_field_len, num_docs))
135}
136
137/// Build a TopK scorer from fetched posting lists via text MaxScore.
138fn finish_text_maxscore<'a>(
139    posting_lists: Vec<(crate::structures::BlockPostingList, f32)>,
140    avg_field_len: f32,
141    limit: usize,
142) -> crate::Result<Box<dyn Scorer + 'a>> {
143    if posting_lists.is_empty() {
144        return Ok(Box::new(EmptyScorer) as Box<dyn Scorer + 'a>);
145    }
146    let results = MaxScoreExecutor::text(posting_lists, avg_field_len, limit).execute_sync()?;
147    Ok(Box::new(TopKResultScorer::new(results)) as Box<dyn Scorer + 'a>)
148}
149
150/// Try text MaxScore for pure OR queries (async).
151async fn try_maxscore_scorer<'a>(
152    should: &[Arc<dyn Query>],
153    reader: &'a SegmentReader,
154    limit: usize,
155    global_stats: Option<&Arc<GlobalStats>>,
156) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
157    let (mut infos, _field, avg_field_len, num_docs) =
158        match prepare_text_maxscore(should, reader, global_stats) {
159            Some(v) => v,
160            None => return Ok(None),
161        };
162    let mut posting_lists = Vec::with_capacity(infos.len());
163    for info in infos.drain(..) {
164        if let Some(pl) = reader.get_postings(info.field, &info.term).await? {
165            let idf = compute_idf(&pl, info.field, &info.term, num_docs, global_stats);
166            posting_lists.push((pl, idf));
167        }
168    }
169    Ok(Some(finish_text_maxscore(
170        posting_lists,
171        avg_field_len,
172        limit,
173    )?))
174}
175
176/// Try text MaxScore for pure OR queries (sync).
177#[cfg(feature = "sync")]
178fn try_maxscore_scorer_sync<'a>(
179    should: &[Arc<dyn Query>],
180    reader: &'a SegmentReader,
181    limit: usize,
182    global_stats: Option<&Arc<GlobalStats>>,
183) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
184    let (mut infos, _field, avg_field_len, num_docs) =
185        match prepare_text_maxscore(should, reader, global_stats) {
186            Some(v) => v,
187            None => return Ok(None),
188        };
189    let mut posting_lists = Vec::with_capacity(infos.len());
190    for info in infos.drain(..) {
191        if let Some(pl) = reader.get_postings_sync(info.field, &info.term)? {
192            let idf = compute_idf(&pl, info.field, &info.term, num_docs, global_stats);
193            posting_lists.push((pl, idf));
194        }
195    }
196    Ok(Some(finish_text_maxscore(
197        posting_lists,
198        avg_field_len,
199        limit,
200    )?))
201}
202
203/// Shared grouping result for per-field MaxScore.
204struct PerFieldGrouping {
205    /// (field, avg_field_len, term_infos) for groups with 2+ terms
206    multi_term_groups: Vec<(crate::Field, f32, Vec<super::TermQueryInfo>)>,
207    /// Original indices of single-term and non-term SHOULD clauses (fallback scorers)
208    fallback_indices: Vec<usize>,
209    /// Limit per field group (over-fetched to compensate for cross-field scoring)
210    per_field_limit: usize,
211    num_docs: f32,
212}
213
214/// Group SHOULD clauses by field for per-field MaxScore.
215/// Returns None if no group has 2+ terms (no optimization benefit).
216fn prepare_per_field_grouping(
217    should: &[Arc<dyn Query>],
218    reader: &SegmentReader,
219    limit: usize,
220    global_stats: Option<&Arc<GlobalStats>>,
221) -> Option<PerFieldGrouping> {
222    let mut field_groups: rustc_hash::FxHashMap<crate::Field, Vec<(usize, super::TermQueryInfo)>> =
223        rustc_hash::FxHashMap::default();
224    let mut non_term_indices: Vec<usize> = Vec::new();
225
226    for (i, q) in should.iter().enumerate() {
227        if let Some(info) = q.as_term_query_info() {
228            field_groups.entry(info.field).or_default().push((i, info));
229        } else {
230            non_term_indices.push(i);
231        }
232    }
233
234    if !field_groups.values().any(|g| g.len() >= 2) {
235        return None;
236    }
237
238    let num_groups = field_groups.len() + non_term_indices.len();
239    let per_field_limit = limit * num_groups;
240    let num_docs = reader.num_docs() as f32;
241
242    let mut multi_term_groups = Vec::new();
243    let mut fallback_indices = non_term_indices;
244
245    for group in field_groups.into_values() {
246        if group.len() >= 2 {
247            let field = group[0].1.field;
248            let avg_field_len = global_stats
249                .map(|s| s.avg_field_len(field))
250                .unwrap_or_else(|| reader.avg_field_len(field));
251            let infos: Vec<_> = group.into_iter().map(|(_, info)| info).collect();
252            multi_term_groups.push((field, avg_field_len, infos));
253        } else {
254            fallback_indices.push(group[0].0);
255        }
256    }
257
258    Some(PerFieldGrouping {
259        multi_term_groups,
260        fallback_indices,
261        per_field_limit,
262        num_docs,
263    })
264}
265
266/// Build a SHOULD-only scorer from a vec of optimized scorers.
267fn build_should_scorer<'a>(scorers: Vec<Box<dyn Scorer + 'a>>) -> Box<dyn Scorer + 'a> {
268    if scorers.is_empty() {
269        return Box::new(EmptyScorer);
270    }
271    if scorers.len() == 1 {
272        return scorers.into_iter().next().unwrap();
273    }
274    let mut scorer = BooleanScorer {
275        must: vec![],
276        should: scorers,
277        must_not: vec![],
278        current_doc: 0,
279    };
280    scorer.current_doc = scorer.find_next_match();
281    Box::new(scorer)
282}
283
284/// Per-field MaxScore grouping for multi-field SHOULD queries (async).
285///
286/// When SHOULD clauses span multiple fields (e.g., "hello world" across title, body, desc),
287/// single-field MaxScore can't apply. This groups TermQuery clauses by field, runs MaxScore
288/// per group, and returns a compact scorer per field.
289async fn try_per_field_maxscore<'a>(
290    should: &[Arc<dyn Query>],
291    reader: &'a SegmentReader,
292    limit: usize,
293    global_stats: Option<&Arc<GlobalStats>>,
294) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
295    let grouping = match prepare_per_field_grouping(should, reader, limit, global_stats) {
296        Some(g) => g,
297        None => return Ok(None),
298    };
299
300    let mut scorers: Vec<Box<dyn Scorer + 'a>> = Vec::new();
301
302    for (field, avg_field_len, infos) in &grouping.multi_term_groups {
303        let mut posting_lists = Vec::with_capacity(infos.len());
304        for info in infos {
305            if let Some(pl) = reader.get_postings(info.field, &info.term).await? {
306                let idf = compute_idf(&pl, *field, &info.term, grouping.num_docs, global_stats);
307                posting_lists.push((pl, idf));
308            }
309        }
310        if !posting_lists.is_empty() {
311            scorers.push(finish_text_maxscore(
312                posting_lists,
313                *avg_field_len,
314                grouping.per_field_limit,
315            )?);
316        }
317    }
318
319    for &idx in &grouping.fallback_indices {
320        scorers.push(should[idx].scorer(reader, limit).await?);
321    }
322
323    Ok(Some(build_should_scorer(scorers)))
324}
325
326/// Per-field MaxScore grouping for multi-field SHOULD queries (sync).
327#[cfg(feature = "sync")]
328fn try_per_field_maxscore_sync<'a>(
329    should: &[Arc<dyn Query>],
330    reader: &'a SegmentReader,
331    limit: usize,
332    global_stats: Option<&Arc<GlobalStats>>,
333) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
334    let grouping = match prepare_per_field_grouping(should, reader, limit, global_stats) {
335        Some(g) => g,
336        None => return Ok(None),
337    };
338
339    let mut scorers: Vec<Box<dyn Scorer + 'a>> = Vec::new();
340
341    for (field, avg_field_len, infos) in &grouping.multi_term_groups {
342        let mut posting_lists = Vec::with_capacity(infos.len());
343        for info in infos {
344            if let Some(pl) = reader.get_postings_sync(info.field, &info.term)? {
345                let idf = compute_idf(&pl, *field, &info.term, grouping.num_docs, global_stats);
346                posting_lists.push((pl, idf));
347            }
348        }
349        if !posting_lists.is_empty() {
350            scorers.push(finish_text_maxscore(
351                posting_lists,
352                *avg_field_len,
353                grouping.per_field_limit,
354            )?);
355        }
356    }
357
358    for &idx in &grouping.fallback_indices {
359        scorers.push(should[idx].scorer_sync(reader, limit)?);
360    }
361
362    Ok(Some(build_should_scorer(scorers)))
363}
364
365/// Try to build a sparse MaxScoreExecutor from SHOULD clauses.
366/// Returns None if not eligible, Some(Err) for empty segment, Some(Ok) otherwise.
367fn prepare_sparse_maxscore<'a>(
368    should: &[Arc<dyn Query>],
369    reader: &'a SegmentReader,
370    limit: usize,
371) -> Option<Result<MaxScoreExecutor<'a>, Box<dyn Scorer + 'a>>> {
372    let infos: Vec<SparseTermQueryInfo> = should
373        .iter()
374        .filter_map(|q| q.as_sparse_term_query_info())
375        .collect();
376    if infos.len() != should.len() {
377        return None;
378    }
379    let field = infos[0].field;
380    if !infos.iter().all(|t| t.field == field) {
381        return None;
382    }
383    let si = match reader.sparse_index(field) {
384        Some(si) => si,
385        None => return Some(Err(Box::new(EmptyScorer))),
386    };
387    let query_terms: Vec<(u32, f32)> = infos
388        .iter()
389        .filter(|info| si.has_dimension(info.dim_id))
390        .map(|info| (info.dim_id, info.weight))
391        .collect();
392    if query_terms.is_empty() {
393        return Some(Err(Box::new(EmptyScorer)));
394    }
395    let executor_limit = (limit as f32 * infos[0].over_fetch_factor).ceil() as usize;
396    Some(Ok(MaxScoreExecutor::sparse(
397        si,
398        query_terms,
399        executor_limit,
400        infos[0].heap_factor,
401    )))
402}
403
404/// Combine raw MaxScore results with ordinal deduplication into a scorer.
405fn combine_sparse_results<'a>(
406    raw: Vec<ScoredDoc>,
407    combiner: super::MultiValueCombiner,
408    field: crate::Field,
409    limit: usize,
410) -> Box<dyn Scorer + 'a> {
411    let combined = crate::segment::combine_ordinal_results(
412        raw.into_iter().map(|r| (r.doc_id, r.ordinal, r.score)),
413        combiner,
414        limit,
415    );
416    Box::new(VectorTopKResultScorer::new(combined, field.0))
417}
418
419/// Build MaxScore scorer from sparse term infos (async).
420async fn try_sparse_maxscore_scorer<'a>(
421    should: &[Arc<dyn Query>],
422    reader: &'a SegmentReader,
423    limit: usize,
424) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
425    let executor = match prepare_sparse_maxscore(should, reader, limit) {
426        None => return Ok(None),
427        Some(Err(empty)) => return Ok(Some(empty)),
428        Some(Ok(e)) => e,
429    };
430    let info = should[0].as_sparse_term_query_info().unwrap();
431    let raw = executor.execute().await?;
432    Ok(Some(combine_sparse_results(
433        raw,
434        info.combiner,
435        info.field,
436        limit,
437    )))
438}
439
440/// Build MaxScore scorer from sparse term infos (sync).
441#[cfg(feature = "sync")]
442fn try_sparse_maxscore_scorer_sync<'a>(
443    should: &[Arc<dyn Query>],
444    reader: &'a SegmentReader,
445    limit: usize,
446) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
447    let executor = match prepare_sparse_maxscore(should, reader, limit) {
448        None => return Ok(None),
449        Some(Err(empty)) => return Ok(Some(empty)),
450        Some(Ok(e)) => e,
451    };
452    let info = should[0].as_sparse_term_query_info().unwrap();
453    let raw = executor.execute_sync()?;
454    Ok(Some(combine_sparse_results(
455        raw,
456        info.combiner,
457        info.field,
458        limit,
459    )))
460}
461
462impl Query for BooleanQuery {
463    fn scorer<'a>(&self, reader: &'a SegmentReader, limit: usize) -> ScorerFuture<'a> {
464        // Clone Arc vectors - cheap reference counting
465        let must = self.must.clone();
466        let should = self.should.clone();
467        let must_not = self.must_not.clone();
468        let global_stats = self.global_stats.clone();
469
470        Box::pin(async move {
471            // Single-clause optimization: unwrap to inner scorer directly
472            if must_not.is_empty() {
473                if must.len() == 1 && should.is_empty() {
474                    return must[0].scorer(reader, limit).await;
475                }
476                if should.len() == 1 && must.is_empty() {
477                    return should[0].scorer(reader, limit).await;
478                }
479            }
480
481            // Check if this is a pure OR query eligible for MaxScore optimization
482            // Conditions: no MUST, no MUST_NOT, multiple SHOULD clauses, all same field
483            if must.is_empty() && must_not.is_empty() && should.len() >= 2 {
484                // Try text MaxScore first
485                if let Some(scorer) =
486                    try_maxscore_scorer(&should, reader, limit, global_stats.as_ref()).await?
487                {
488                    return Ok(scorer);
489                }
490                // Try sparse MaxScore
491                if let Some(scorer) = try_sparse_maxscore_scorer(&should, reader, limit).await? {
492                    return Ok(scorer);
493                }
494                // Try per-field MaxScore grouping for multi-field text queries
495                if let Some(scorer) =
496                    try_per_field_maxscore(&should, reader, limit, global_stats.as_ref()).await?
497                {
498                    return Ok(scorer);
499                }
500            }
501
502            // ── Planner: filter push-down ──────────────────────────────────
503            // When there are SHOULD clauses with enough precomputed results,
504            // flip the driver: SHOULD drives iteration, MUST/MUST_NOT become
505            // per-doc checks via O(1) predicates or seek()-based verification.
506            if !should.is_empty() && !must.is_empty() && limit < usize::MAX / 4 {
507                let should_scorer = if should.len() == 1 {
508                    should[0].scorer(reader, limit).await?
509                } else {
510                    let sub = BooleanQuery {
511                        must: Vec::new(),
512                        should: should.clone(),
513                        must_not: Vec::new(),
514                        global_stats: global_stats.clone(),
515                    };
516                    sub.scorer(reader, limit).await?
517                };
518
519                if should_scorer.size_hint() >= limit as u32 {
520                    // Split MUST into predicates (O(1)) vs verifier scorers (seek)
521                    let mut predicates: Vec<super::DocPredicate<'a>> = Vec::new();
522                    let mut must_verifiers: Vec<Box<dyn super::Scorer + 'a>> = Vec::new();
523                    let mut filter_score = 0.0f32;
524
525                    for q in &must {
526                        if let Some(pred) = q.as_doc_predicate(reader) {
527                            predicates.push(pred);
528                            filter_score += 1.0;
529                        } else {
530                            must_verifiers.push(q.scorer(reader, limit).await?);
531                        }
532                    }
533
534                    // Split MUST_NOT into negated predicates vs verifier scorers
535                    let mut must_not_verifiers: Vec<Box<dyn super::Scorer + 'a>> = Vec::new();
536                    for q in &must_not {
537                        if let Some(pred) = q.as_doc_predicate(reader) {
538                            let negated: super::DocPredicate<'a> =
539                                Box::new(move |doc_id| !pred(doc_id));
540                            predicates.push(negated);
541                        } else {
542                            must_not_verifiers.push(q.scorer(reader, limit).await?);
543                        }
544                    }
545
546                    log::debug!(
547                        "BooleanQuery planner: push-down {} predicates + {} must verifiers + {} must_not verifiers, {} SHOULD drive (size_hint={})",
548                        predicates.len(),
549                        must_verifiers.len(),
550                        must_not_verifiers.len(),
551                        should.len(),
552                        should_scorer.size_hint()
553                    );
554
555                    return Ok(Box::new(super::PredicatedScorer::new(
556                        should_scorer,
557                        predicates,
558                        must_verifiers,
559                        must_not_verifiers,
560                        filter_score,
561                    )));
562                }
563
564                // size_hint < limit — reuse already-built SHOULD scorer in
565                // the standard BooleanScorer to avoid rebuilding it.
566                let mut must_scorers = Vec::with_capacity(must.len());
567                for q in &must {
568                    must_scorers.push(q.scorer(reader, limit).await?);
569                }
570
571                let mut must_not_scorers = Vec::with_capacity(must_not.len());
572                for q in &must_not {
573                    must_not_scorers.push(q.scorer(reader, limit).await?);
574                }
575
576                let mut scorer = BooleanScorer {
577                    must: must_scorers,
578                    should: vec![should_scorer],
579                    must_not: must_not_scorers,
580                    current_doc: 0,
581                };
582                scorer.current_doc = scorer.find_next_match();
583                return Ok(Box::new(scorer));
584            }
585
586            // Fall back to standard boolean scoring
587            let mut must_scorers = Vec::with_capacity(must.len());
588            for q in &must {
589                must_scorers.push(q.scorer(reader, limit).await?);
590            }
591
592            let mut should_scorers = Vec::with_capacity(should.len());
593            for q in &should {
594                should_scorers.push(q.scorer(reader, limit).await?);
595            }
596
597            let mut must_not_scorers = Vec::with_capacity(must_not.len());
598            for q in &must_not {
599                must_not_scorers.push(q.scorer(reader, limit).await?);
600            }
601
602            let mut scorer = BooleanScorer {
603                must: must_scorers,
604                should: should_scorers,
605                must_not: must_not_scorers,
606                current_doc: 0,
607            };
608            // Initialize to first match
609            scorer.current_doc = scorer.find_next_match();
610            Ok(Box::new(scorer) as Box<dyn Scorer + 'a>)
611        })
612    }
613
614    #[cfg(feature = "sync")]
615    fn scorer_sync<'a>(
616        &self,
617        reader: &'a SegmentReader,
618        limit: usize,
619    ) -> crate::Result<Box<dyn Scorer + 'a>> {
620        // Single-clause optimization: unwrap to inner scorer directly
621        if self.must_not.is_empty() {
622            if self.must.len() == 1 && self.should.is_empty() {
623                return self.must[0].scorer_sync(reader, limit);
624            }
625            if self.should.len() == 1 && self.must.is_empty() {
626                return self.should[0].scorer_sync(reader, limit);
627            }
628        }
629
630        // MaxScore optimization for pure OR queries
631        if self.must.is_empty() && self.must_not.is_empty() && self.should.len() >= 2 {
632            if let Some(scorer) =
633                try_maxscore_scorer_sync(&self.should, reader, limit, self.global_stats.as_ref())?
634            {
635                return Ok(scorer);
636            }
637            if let Some(scorer) = try_sparse_maxscore_scorer_sync(&self.should, reader, limit)? {
638                return Ok(scorer);
639            }
640            // Try per-field MaxScore grouping for multi-field text queries
641            if let Some(scorer) = try_per_field_maxscore_sync(
642                &self.should,
643                reader,
644                limit,
645                self.global_stats.as_ref(),
646            )? {
647                return Ok(scorer);
648            }
649        }
650
651        // ── Planner: filter push-down ──────────────────────────────────
652        if !self.should.is_empty() && !self.must.is_empty() && limit < usize::MAX / 4 {
653            let should_scorer = if self.should.len() == 1 {
654                self.should[0].scorer_sync(reader, limit)?
655            } else {
656                let sub = BooleanQuery {
657                    must: Vec::new(),
658                    should: self.should.clone(),
659                    must_not: Vec::new(),
660                    global_stats: self.global_stats.clone(),
661                };
662                sub.scorer_sync(reader, limit)?
663            };
664
665            if should_scorer.size_hint() >= limit as u32 {
666                let mut predicates: Vec<super::DocPredicate<'a>> = Vec::new();
667                let mut must_verifiers: Vec<Box<dyn super::Scorer + 'a>> = Vec::new();
668                let mut filter_score = 0.0f32;
669
670                for q in &self.must {
671                    if let Some(pred) = q.as_doc_predicate(reader) {
672                        predicates.push(pred);
673                        filter_score += 1.0;
674                    } else {
675                        must_verifiers.push(q.scorer_sync(reader, limit)?);
676                    }
677                }
678
679                let mut must_not_verifiers: Vec<Box<dyn super::Scorer + 'a>> = Vec::new();
680                for q in &self.must_not {
681                    if let Some(pred) = q.as_doc_predicate(reader) {
682                        let negated: super::DocPredicate<'a> =
683                            Box::new(move |doc_id| !pred(doc_id));
684                        predicates.push(negated);
685                    } else {
686                        must_not_verifiers.push(q.scorer_sync(reader, limit)?);
687                    }
688                }
689
690                log::debug!(
691                    "BooleanQuery planner (sync): push-down {} predicates + {} must verifiers + {} must_not verifiers, {} SHOULD drive",
692                    predicates.len(),
693                    must_verifiers.len(),
694                    must_not_verifiers.len(),
695                    self.should.len()
696                );
697
698                return Ok(Box::new(super::PredicatedScorer::new(
699                    should_scorer,
700                    predicates,
701                    must_verifiers,
702                    must_not_verifiers,
703                    filter_score,
704                )));
705            }
706
707            // size_hint < limit — reuse already-built SHOULD scorer in
708            // the standard BooleanScorer to avoid rebuilding it.
709            let mut must_scorers = Vec::with_capacity(self.must.len());
710            for q in &self.must {
711                must_scorers.push(q.scorer_sync(reader, limit)?);
712            }
713
714            let mut must_not_scorers = Vec::with_capacity(self.must_not.len());
715            for q in &self.must_not {
716                must_not_scorers.push(q.scorer_sync(reader, limit)?);
717            }
718
719            let mut scorer = BooleanScorer {
720                must: must_scorers,
721                should: vec![should_scorer],
722                must_not: must_not_scorers,
723                current_doc: 0,
724            };
725            scorer.current_doc = scorer.find_next_match();
726            return Ok(Box::new(scorer));
727        }
728
729        // Fall back to standard boolean scoring
730        let mut must_scorers = Vec::with_capacity(self.must.len());
731        for q in &self.must {
732            must_scorers.push(q.scorer_sync(reader, limit)?);
733        }
734
735        let mut should_scorers = Vec::with_capacity(self.should.len());
736        for q in &self.should {
737            should_scorers.push(q.scorer_sync(reader, limit)?);
738        }
739
740        let mut must_not_scorers = Vec::with_capacity(self.must_not.len());
741        for q in &self.must_not {
742            must_not_scorers.push(q.scorer_sync(reader, limit)?);
743        }
744
745        let mut scorer = BooleanScorer {
746            must: must_scorers,
747            should: should_scorers,
748            must_not: must_not_scorers,
749            current_doc: 0,
750        };
751        scorer.current_doc = scorer.find_next_match();
752        Ok(Box::new(scorer) as Box<dyn Scorer + 'a>)
753    }
754
755    fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a> {
756        let must = self.must.clone();
757        let should = self.should.clone();
758
759        Box::pin(async move {
760            if !must.is_empty() {
761                let mut estimates = Vec::with_capacity(must.len());
762                for q in &must {
763                    estimates.push(q.count_estimate(reader).await?);
764                }
765                estimates
766                    .into_iter()
767                    .min()
768                    .ok_or_else(|| crate::Error::Corruption("Empty must clause".to_string()))
769            } else if !should.is_empty() {
770                let mut sum = 0u32;
771                for q in &should {
772                    sum = sum.saturating_add(q.count_estimate(reader).await?);
773                }
774                Ok(sum)
775            } else {
776                Ok(0)
777            }
778        })
779    }
780}
781
782struct BooleanScorer<'a> {
783    must: Vec<Box<dyn Scorer + 'a>>,
784    should: Vec<Box<dyn Scorer + 'a>>,
785    must_not: Vec<Box<dyn Scorer + 'a>>,
786    current_doc: DocId,
787}
788
789impl BooleanScorer<'_> {
790    fn find_next_match(&mut self) -> DocId {
791        if self.must.is_empty() && self.should.is_empty() {
792            return TERMINATED;
793        }
794
795        loop {
796            let candidate = if !self.must.is_empty() {
797                let mut max_doc = self
798                    .must
799                    .iter()
800                    .map(|s| s.doc())
801                    .max()
802                    .unwrap_or(TERMINATED);
803
804                if max_doc == TERMINATED {
805                    return TERMINATED;
806                }
807
808                loop {
809                    let mut all_match = true;
810                    for scorer in &mut self.must {
811                        let doc = scorer.seek(max_doc);
812                        if doc == TERMINATED {
813                            return TERMINATED;
814                        }
815                        if doc > max_doc {
816                            max_doc = doc;
817                            all_match = false;
818                            break;
819                        }
820                    }
821                    if all_match {
822                        break;
823                    }
824                }
825                max_doc
826            } else {
827                self.should
828                    .iter()
829                    .map(|s| s.doc())
830                    .filter(|&d| d != TERMINATED)
831                    .min()
832                    .unwrap_or(TERMINATED)
833            };
834
835            if candidate == TERMINATED {
836                return TERMINATED;
837            }
838
839            let excluded = self.must_not.iter_mut().any(|scorer| {
840                let doc = scorer.seek(candidate);
841                doc == candidate
842            });
843
844            if !excluded {
845                // Seek SHOULD scorers to candidate so score() can see their contributions
846                for scorer in &mut self.should {
847                    scorer.seek(candidate);
848                }
849                self.current_doc = candidate;
850                return candidate;
851            }
852
853            // Advance past excluded candidate
854            if !self.must.is_empty() {
855                for scorer in &mut self.must {
856                    scorer.advance();
857                }
858            } else {
859                // For SHOULD-only: seek all scorers past the excluded candidate
860                for scorer in &mut self.should {
861                    if scorer.doc() <= candidate && scorer.doc() != TERMINATED {
862                        scorer.seek(candidate + 1);
863                    }
864                }
865            }
866        }
867    }
868}
869
870impl super::docset::DocSet for BooleanScorer<'_> {
871    fn doc(&self) -> DocId {
872        self.current_doc
873    }
874
875    fn advance(&mut self) -> DocId {
876        if !self.must.is_empty() {
877            for scorer in &mut self.must {
878                scorer.advance();
879            }
880        } else {
881            for scorer in &mut self.should {
882                if scorer.doc() == self.current_doc {
883                    scorer.advance();
884                }
885            }
886        }
887
888        self.current_doc = self.find_next_match();
889        self.current_doc
890    }
891
892    fn seek(&mut self, target: DocId) -> DocId {
893        for scorer in &mut self.must {
894            scorer.seek(target);
895        }
896
897        for scorer in &mut self.should {
898            scorer.seek(target);
899        }
900
901        self.current_doc = self.find_next_match();
902        self.current_doc
903    }
904
905    fn size_hint(&self) -> u32 {
906        if !self.must.is_empty() {
907            self.must.iter().map(|s| s.size_hint()).min().unwrap_or(0)
908        } else {
909            self.should.iter().map(|s| s.size_hint()).sum()
910        }
911    }
912}
913
914impl Scorer for BooleanScorer<'_> {
915    fn score(&self) -> Score {
916        let mut total = 0.0;
917
918        for scorer in &self.must {
919            if scorer.doc() == self.current_doc {
920                total += scorer.score();
921            }
922        }
923
924        for scorer in &self.should {
925            if scorer.doc() == self.current_doc {
926                total += scorer.score();
927            }
928        }
929
930        total
931    }
932
933    fn matched_positions(&self) -> Option<super::MatchedPositions> {
934        let mut all_positions: super::MatchedPositions = Vec::new();
935
936        for scorer in &self.must {
937            if scorer.doc() == self.current_doc
938                && let Some(positions) = scorer.matched_positions()
939            {
940                all_positions.extend(positions);
941            }
942        }
943
944        for scorer in &self.should {
945            if scorer.doc() == self.current_doc
946                && let Some(positions) = scorer.matched_positions()
947            {
948                all_positions.extend(positions);
949            }
950        }
951
952        if all_positions.is_empty() {
953            None
954        } else {
955            Some(all_positions)
956        }
957    }
958}
959
960/// Scorer that iterates over pre-computed top-k results
961struct TopKResultScorer {
962    results: Vec<ScoredDoc>,
963    position: usize,
964}
965
966impl TopKResultScorer {
967    fn new(mut results: Vec<ScoredDoc>) -> Self {
968        // Sort by doc_id ascending — required for DocSet seek() correctness
969        results.sort_unstable_by_key(|r| r.doc_id);
970        Self {
971            results,
972            position: 0,
973        }
974    }
975}
976
977impl super::docset::DocSet for TopKResultScorer {
978    fn doc(&self) -> DocId {
979        if self.position < self.results.len() {
980            self.results[self.position].doc_id
981        } else {
982            TERMINATED
983        }
984    }
985
986    fn advance(&mut self) -> DocId {
987        self.position += 1;
988        self.doc()
989    }
990
991    fn seek(&mut self, target: DocId) -> DocId {
992        let remaining = &self.results[self.position..];
993        self.position += remaining.partition_point(|r| r.doc_id < target);
994        self.doc()
995    }
996
997    fn size_hint(&self) -> u32 {
998        (self.results.len() - self.position) as u32
999    }
1000}
1001
1002impl Scorer for TopKResultScorer {
1003    fn score(&self) -> Score {
1004        if self.position < self.results.len() {
1005            self.results[self.position].score
1006        } else {
1007            0.0
1008        }
1009    }
1010}
1011
1012/// Scorer that iterates over pre-computed vector results with ordinal information.
1013/// Used by sparse MaxScore path to preserve per-ordinal scores for matched_positions().
1014struct VectorTopKResultScorer {
1015    results: Vec<crate::segment::VectorSearchResult>,
1016    position: usize,
1017    field_id: u32,
1018}
1019
1020impl VectorTopKResultScorer {
1021    fn new(mut results: Vec<crate::segment::VectorSearchResult>, field_id: u32) -> Self {
1022        results.sort_unstable_by_key(|r| r.doc_id);
1023        Self {
1024            results,
1025            position: 0,
1026            field_id,
1027        }
1028    }
1029}
1030
1031impl super::docset::DocSet for VectorTopKResultScorer {
1032    fn doc(&self) -> DocId {
1033        if self.position < self.results.len() {
1034            self.results[self.position].doc_id
1035        } else {
1036            TERMINATED
1037        }
1038    }
1039
1040    fn advance(&mut self) -> DocId {
1041        self.position += 1;
1042        self.doc()
1043    }
1044
1045    fn seek(&mut self, target: DocId) -> DocId {
1046        let remaining = &self.results[self.position..];
1047        self.position += remaining.partition_point(|r| r.doc_id < target);
1048        self.doc()
1049    }
1050
1051    fn size_hint(&self) -> u32 {
1052        (self.results.len() - self.position) as u32
1053    }
1054}
1055
1056impl Scorer for VectorTopKResultScorer {
1057    fn score(&self) -> Score {
1058        if self.position < self.results.len() {
1059            self.results[self.position].score
1060        } else {
1061            0.0
1062        }
1063    }
1064
1065    fn matched_positions(&self) -> Option<super::MatchedPositions> {
1066        if self.position >= self.results.len() {
1067            return None;
1068        }
1069        let result = &self.results[self.position];
1070        let scored_positions: Vec<super::ScoredPosition> = result
1071            .ordinals
1072            .iter()
1073            .map(|&(ordinal, score)| super::ScoredPosition::new(ordinal, score))
1074            .collect();
1075        Some(vec![(self.field_id, scored_positions)])
1076    }
1077}
1078
1079/// Empty scorer for when no terms match
1080struct EmptyScorer;
1081
1082impl super::docset::DocSet for EmptyScorer {
1083    fn doc(&self) -> DocId {
1084        TERMINATED
1085    }
1086
1087    fn advance(&mut self) -> DocId {
1088        TERMINATED
1089    }
1090
1091    fn seek(&mut self, _target: DocId) -> DocId {
1092        TERMINATED
1093    }
1094
1095    fn size_hint(&self) -> u32 {
1096        0
1097    }
1098}
1099
1100impl Scorer for EmptyScorer {
1101    fn score(&self) -> Score {
1102        0.0
1103    }
1104}
1105
1106#[cfg(test)]
1107mod tests {
1108    use super::*;
1109    use crate::dsl::Field;
1110    use crate::query::TermQuery;
1111
1112    #[test]
1113    fn test_maxscore_eligible_pure_or_same_field() {
1114        // Pure OR query with multiple terms in same field should be MaxScore-eligible
1115        let query = BooleanQuery::new()
1116            .should(TermQuery::text(Field(0), "hello"))
1117            .should(TermQuery::text(Field(0), "world"))
1118            .should(TermQuery::text(Field(0), "foo"));
1119
1120        // All clauses should return term info
1121        assert!(
1122            query
1123                .should
1124                .iter()
1125                .all(|q| q.as_term_query_info().is_some())
1126        );
1127
1128        // All should be same field
1129        let infos: Vec<_> = query
1130            .should
1131            .iter()
1132            .filter_map(|q| q.as_term_query_info())
1133            .collect();
1134        assert_eq!(infos.len(), 3);
1135        assert!(infos.iter().all(|i| i.field == Field(0)));
1136    }
1137
1138    #[test]
1139    fn test_maxscore_not_eligible_different_fields() {
1140        // OR query with terms in different fields should NOT use MaxScore
1141        let query = BooleanQuery::new()
1142            .should(TermQuery::text(Field(0), "hello"))
1143            .should(TermQuery::text(Field(1), "world")); // Different field!
1144
1145        let infos: Vec<_> = query
1146            .should
1147            .iter()
1148            .filter_map(|q| q.as_term_query_info())
1149            .collect();
1150        assert_eq!(infos.len(), 2);
1151        // Fields are different, MaxScore should not be used
1152        assert!(infos[0].field != infos[1].field);
1153    }
1154
1155    #[test]
1156    fn test_maxscore_not_eligible_with_must() {
1157        // Query with MUST clause should NOT use MaxScore optimization
1158        let query = BooleanQuery::new()
1159            .must(TermQuery::text(Field(0), "required"))
1160            .should(TermQuery::text(Field(0), "hello"))
1161            .should(TermQuery::text(Field(0), "world"));
1162
1163        // Has MUST clause, so MaxScore optimization should not kick in
1164        assert!(!query.must.is_empty());
1165    }
1166
1167    #[test]
1168    fn test_maxscore_not_eligible_with_must_not() {
1169        // Query with MUST_NOT clause should NOT use MaxScore optimization
1170        let query = BooleanQuery::new()
1171            .should(TermQuery::text(Field(0), "hello"))
1172            .should(TermQuery::text(Field(0), "world"))
1173            .must_not(TermQuery::text(Field(0), "excluded"));
1174
1175        // Has MUST_NOT clause, so MaxScore optimization should not kick in
1176        assert!(!query.must_not.is_empty());
1177    }
1178
1179    #[test]
1180    fn test_maxscore_not_eligible_single_term() {
1181        // Single SHOULD clause should NOT use MaxScore (no benefit)
1182        let query = BooleanQuery::new().should(TermQuery::text(Field(0), "hello"));
1183
1184        // Only one term, MaxScore not beneficial
1185        assert_eq!(query.should.len(), 1);
1186    }
1187
1188    #[test]
1189    fn test_term_query_info_extraction() {
1190        let term_query = TermQuery::text(Field(42), "test");
1191        let info = term_query.as_term_query_info();
1192
1193        assert!(info.is_some());
1194        let info = info.unwrap();
1195        assert_eq!(info.field, Field(42));
1196        assert_eq!(info.term, b"test");
1197    }
1198
1199    #[test]
1200    fn test_boolean_query_no_term_info() {
1201        // BooleanQuery itself should not return term info
1202        let query = BooleanQuery::new().should(TermQuery::text(Field(0), "hello"));
1203
1204        assert!(query.as_term_query_info().is_none());
1205    }
1206}
hermes_core/query/boolean.rs

hermes_core/query/
boolean.rs