hermes_core/query/
boolean.rs

1//! Boolean query with MUST, SHOULD, and MUST_NOT clauses
2
3use std::sync::Arc;
4
5use crate::segment::SegmentReader;
6use crate::structures::TERMINATED;
7use crate::{DocId, Score};
8
9use super::{
10    CountFuture, GlobalStats, MaxScoreExecutor, Query, ScoredDoc, Scorer, ScorerFuture,
11    SparseTermQueryInfo,
12};
13
14/// Boolean query with MUST, SHOULD, and MUST_NOT clauses
15///
16/// When all clauses are SHOULD term queries on the same field, automatically
17/// uses MaxScore optimization for efficient top-k retrieval.
18#[derive(Default, Clone)]
19pub struct BooleanQuery {
20    pub must: Vec<Arc<dyn Query>>,
21    pub should: Vec<Arc<dyn Query>>,
22    pub must_not: Vec<Arc<dyn Query>>,
23    /// Optional global statistics for cross-segment IDF
24    global_stats: Option<Arc<GlobalStats>>,
25}
26
27impl std::fmt::Debug for BooleanQuery {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        f.debug_struct("BooleanQuery")
30            .field("must_count", &self.must.len())
31            .field("should_count", &self.should.len())
32            .field("must_not_count", &self.must_not.len())
33            .field("has_global_stats", &self.global_stats.is_some())
34            .finish()
35    }
36}
37
38impl std::fmt::Display for BooleanQuery {
39    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40        write!(f, "Boolean(")?;
41        let mut first = true;
42        for q in &self.must {
43            if !first {
44                write!(f, " ")?;
45            }
46            write!(f, "+{}", q)?;
47            first = false;
48        }
49        for q in &self.should {
50            if !first {
51                write!(f, " ")?;
52            }
53            write!(f, "{}", q)?;
54            first = false;
55        }
56        for q in &self.must_not {
57            if !first {
58                write!(f, " ")?;
59            }
60            write!(f, "-{}", q)?;
61            first = false;
62        }
63        write!(f, ")")
64    }
65}
66
67impl BooleanQuery {
68    pub fn new() -> Self {
69        Self::default()
70    }
71
72    pub fn must(mut self, query: impl Query + 'static) -> Self {
73        self.must.push(Arc::new(query));
74        self
75    }
76
77    pub fn should(mut self, query: impl Query + 'static) -> Self {
78        self.should.push(Arc::new(query));
79        self
80    }
81
82    pub fn must_not(mut self, query: impl Query + 'static) -> Self {
83        self.must_not.push(Arc::new(query));
84        self
85    }
86
87    /// Set global statistics for cross-segment IDF
88    pub fn with_global_stats(mut self, stats: Arc<GlobalStats>) -> Self {
89        self.global_stats = Some(stats);
90        self
91    }
92}
93
94/// Compute IDF for a posting list, preferring global stats.
95fn compute_idf(
96    posting_list: &crate::structures::BlockPostingList,
97    field: crate::Field,
98    term: &[u8],
99    num_docs: f32,
100    global_stats: Option<&Arc<GlobalStats>>,
101) -> f32 {
102    if let Some(stats) = global_stats {
103        let global_idf = stats.text_idf(field, &String::from_utf8_lossy(term));
104        if global_idf > 0.0 {
105            return global_idf;
106        }
107    }
108    let doc_freq = posting_list.doc_count() as f32;
109    super::bm25_idf(doc_freq, num_docs)
110}
111
112/// Shared pre-check for text MaxScore: extract term infos, field, avg_field_len, num_docs.
113/// Returns None if not all SHOULD clauses are single-field term queries.
114fn prepare_text_maxscore(
115    should: &[Arc<dyn Query>],
116    reader: &SegmentReader,
117    global_stats: Option<&Arc<GlobalStats>>,
118) -> Option<(Vec<super::TermQueryInfo>, crate::Field, f32, f32)> {
119    let infos: Vec<_> = should
120        .iter()
121        .filter_map(|q| q.as_term_query_info())
122        .collect();
123    if infos.len() != should.len() {
124        return None;
125    }
126    let field = infos[0].field;
127    if !infos.iter().all(|t| t.field == field) {
128        return None;
129    }
130    let avg_field_len = global_stats
131        .map(|s| s.avg_field_len(field))
132        .unwrap_or_else(|| reader.avg_field_len(field));
133    let num_docs = reader.num_docs() as f32;
134    Some((infos, field, avg_field_len, num_docs))
135}
136
137/// Build a TopK scorer from fetched posting lists via text MaxScore.
138fn finish_text_maxscore<'a>(
139    posting_lists: Vec<(crate::structures::BlockPostingList, f32)>,
140    avg_field_len: f32,
141    limit: usize,
142) -> crate::Result<Box<dyn Scorer + 'a>> {
143    if posting_lists.is_empty() {
144        return Ok(Box::new(EmptyScorer) as Box<dyn Scorer + 'a>);
145    }
146    let results = MaxScoreExecutor::text(posting_lists, avg_field_len, limit).execute_sync()?;
147    Ok(Box::new(TopKResultScorer::new(results)) as Box<dyn Scorer + 'a>)
148}
149
150/// Try text MaxScore for pure OR queries (async).
151async fn try_maxscore_scorer<'a>(
152    should: &[Arc<dyn Query>],
153    reader: &'a SegmentReader,
154    limit: usize,
155    global_stats: Option<&Arc<GlobalStats>>,
156) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
157    let (mut infos, _field, avg_field_len, num_docs) =
158        match prepare_text_maxscore(should, reader, global_stats) {
159            Some(v) => v,
160            None => return Ok(None),
161        };
162    let mut posting_lists = Vec::with_capacity(infos.len());
163    for info in infos.drain(..) {
164        if let Some(pl) = reader.get_postings(info.field, &info.term).await? {
165            let idf = compute_idf(&pl, info.field, &info.term, num_docs, global_stats);
166            posting_lists.push((pl, idf));
167        }
168    }
169    Ok(Some(finish_text_maxscore(
170        posting_lists,
171        avg_field_len,
172        limit,
173    )?))
174}
175
176/// Try text MaxScore for pure OR queries (sync).
177#[cfg(feature = "sync")]
178fn try_maxscore_scorer_sync<'a>(
179    should: &[Arc<dyn Query>],
180    reader: &'a SegmentReader,
181    limit: usize,
182    global_stats: Option<&Arc<GlobalStats>>,
183) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
184    let (mut infos, _field, avg_field_len, num_docs) =
185        match prepare_text_maxscore(should, reader, global_stats) {
186            Some(v) => v,
187            None => return Ok(None),
188        };
189    let mut posting_lists = Vec::with_capacity(infos.len());
190    for info in infos.drain(..) {
191        if let Some(pl) = reader.get_postings_sync(info.field, &info.term)? {
192            let idf = compute_idf(&pl, info.field, &info.term, num_docs, global_stats);
193            posting_lists.push((pl, idf));
194        }
195    }
196    Ok(Some(finish_text_maxscore(
197        posting_lists,
198        avg_field_len,
199        limit,
200    )?))
201}
202
203/// Shared grouping result for per-field MaxScore.
204struct PerFieldGrouping {
205    /// (field, avg_field_len, term_infos) for groups with 2+ terms
206    multi_term_groups: Vec<(crate::Field, f32, Vec<super::TermQueryInfo>)>,
207    /// Original indices of single-term and non-term SHOULD clauses (fallback scorers)
208    fallback_indices: Vec<usize>,
209    /// Limit per field group (over-fetched to compensate for cross-field scoring)
210    per_field_limit: usize,
211    num_docs: f32,
212}
213
214/// Group SHOULD clauses by field for per-field MaxScore.
215/// Returns None if no group has 2+ terms (no optimization benefit).
216fn prepare_per_field_grouping(
217    should: &[Arc<dyn Query>],
218    reader: &SegmentReader,
219    limit: usize,
220    global_stats: Option<&Arc<GlobalStats>>,
221) -> Option<PerFieldGrouping> {
222    let mut field_groups: rustc_hash::FxHashMap<crate::Field, Vec<(usize, super::TermQueryInfo)>> =
223        rustc_hash::FxHashMap::default();
224    let mut non_term_indices: Vec<usize> = Vec::new();
225
226    for (i, q) in should.iter().enumerate() {
227        if let Some(info) = q.as_term_query_info() {
228            field_groups.entry(info.field).or_default().push((i, info));
229        } else {
230            non_term_indices.push(i);
231        }
232    }
233
234    if !field_groups.values().any(|g| g.len() >= 2) {
235        return None;
236    }
237
238    let num_groups = field_groups.len() + non_term_indices.len();
239    let per_field_limit = limit * num_groups;
240    let num_docs = reader.num_docs() as f32;
241
242    let mut multi_term_groups = Vec::new();
243    let mut fallback_indices = non_term_indices;
244
245    for group in field_groups.into_values() {
246        if group.len() >= 2 {
247            let field = group[0].1.field;
248            let avg_field_len = global_stats
249                .map(|s| s.avg_field_len(field))
250                .unwrap_or_else(|| reader.avg_field_len(field));
251            let infos: Vec<_> = group.into_iter().map(|(_, info)| info).collect();
252            multi_term_groups.push((field, avg_field_len, infos));
253        } else {
254            fallback_indices.push(group[0].0);
255        }
256    }
257
258    Some(PerFieldGrouping {
259        multi_term_groups,
260        fallback_indices,
261        per_field_limit,
262        num_docs,
263    })
264}
265
266/// Build a SHOULD-only scorer from a vec of optimized scorers.
267fn build_should_scorer<'a>(scorers: Vec<Box<dyn Scorer + 'a>>) -> Box<dyn Scorer + 'a> {
268    if scorers.is_empty() {
269        return Box::new(EmptyScorer);
270    }
271    if scorers.len() == 1 {
272        return scorers.into_iter().next().unwrap();
273    }
274    let mut scorer = BooleanScorer {
275        must: vec![],
276        should: scorers,
277        must_not: vec![],
278        current_doc: 0,
279    };
280    scorer.current_doc = scorer.find_next_match();
281    Box::new(scorer)
282}
283
284/// Per-field MaxScore grouping for multi-field SHOULD queries (async).
285///
286/// When SHOULD clauses span multiple fields (e.g., "hello world" across title, body, desc),
287/// single-field MaxScore can't apply. This groups TermQuery clauses by field, runs MaxScore
288/// per group, and returns a compact scorer per field.
289async fn try_per_field_maxscore<'a>(
290    should: &[Arc<dyn Query>],
291    reader: &'a SegmentReader,
292    limit: usize,
293    global_stats: Option<&Arc<GlobalStats>>,
294) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
295    let grouping = match prepare_per_field_grouping(should, reader, limit, global_stats) {
296        Some(g) => g,
297        None => return Ok(None),
298    };
299
300    let mut scorers: Vec<Box<dyn Scorer + 'a>> = Vec::new();
301
302    for (field, avg_field_len, infos) in &grouping.multi_term_groups {
303        let mut posting_lists = Vec::with_capacity(infos.len());
304        for info in infos {
305            if let Some(pl) = reader.get_postings(info.field, &info.term).await? {
306                let idf = compute_idf(&pl, *field, &info.term, grouping.num_docs, global_stats);
307                posting_lists.push((pl, idf));
308            }
309        }
310        if !posting_lists.is_empty() {
311            scorers.push(finish_text_maxscore(
312                posting_lists,
313                *avg_field_len,
314                grouping.per_field_limit,
315            )?);
316        }
317    }
318
319    for &idx in &grouping.fallback_indices {
320        scorers.push(should[idx].scorer(reader, limit).await?);
321    }
322
323    Ok(Some(build_should_scorer(scorers)))
324}
325
326/// Per-field MaxScore grouping for multi-field SHOULD queries (sync).
327#[cfg(feature = "sync")]
328fn try_per_field_maxscore_sync<'a>(
329    should: &[Arc<dyn Query>],
330    reader: &'a SegmentReader,
331    limit: usize,
332    global_stats: Option<&Arc<GlobalStats>>,
333) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
334    let grouping = match prepare_per_field_grouping(should, reader, limit, global_stats) {
335        Some(g) => g,
336        None => return Ok(None),
337    };
338
339    let mut scorers: Vec<Box<dyn Scorer + 'a>> = Vec::new();
340
341    for (field, avg_field_len, infos) in &grouping.multi_term_groups {
342        let mut posting_lists = Vec::with_capacity(infos.len());
343        for info in infos {
344            if let Some(pl) = reader.get_postings_sync(info.field, &info.term)? {
345                let idf = compute_idf(&pl, *field, &info.term, grouping.num_docs, global_stats);
346                posting_lists.push((pl, idf));
347            }
348        }
349        if !posting_lists.is_empty() {
350            scorers.push(finish_text_maxscore(
351                posting_lists,
352                *avg_field_len,
353                grouping.per_field_limit,
354            )?);
355        }
356    }
357
358    for &idx in &grouping.fallback_indices {
359        scorers.push(should[idx].scorer_sync(reader, limit)?);
360    }
361
362    Ok(Some(build_should_scorer(scorers)))
363}
364
365/// Try to build a sparse MaxScoreExecutor from SHOULD clauses.
366/// Returns None if not eligible, Some(Err) for empty segment, Some(Ok) otherwise.
367fn prepare_sparse_maxscore<'a>(
368    should: &[Arc<dyn Query>],
369    reader: &'a SegmentReader,
370    limit: usize,
371) -> Option<Result<MaxScoreExecutor<'a>, Box<dyn Scorer + 'a>>> {
372    let infos: Vec<SparseTermQueryInfo> = should
373        .iter()
374        .filter_map(|q| q.as_sparse_term_query_info())
375        .collect();
376    if infos.len() != should.len() {
377        return None;
378    }
379    let field = infos[0].field;
380    if !infos.iter().all(|t| t.field == field) {
381        return None;
382    }
383    let si = match reader.sparse_index(field) {
384        Some(si) => si,
385        None => return Some(Err(Box::new(EmptyScorer))),
386    };
387    let query_terms: Vec<(u32, f32)> = infos
388        .iter()
389        .filter(|info| si.has_dimension(info.dim_id))
390        .map(|info| (info.dim_id, info.weight))
391        .collect();
392    if query_terms.is_empty() {
393        return Some(Err(Box::new(EmptyScorer)));
394    }
395    let executor_limit = (limit as f32 * infos[0].over_fetch_factor).ceil() as usize;
396    Some(Ok(MaxScoreExecutor::sparse(
397        si,
398        query_terms,
399        executor_limit,
400        infos[0].heap_factor,
401    )))
402}
403
404/// Combine raw MaxScore results with ordinal deduplication into a scorer.
405fn combine_sparse_results<'a>(
406    raw: Vec<ScoredDoc>,
407    combiner: super::MultiValueCombiner,
408    field: crate::Field,
409    limit: usize,
410) -> Box<dyn Scorer + 'a> {
411    let combined = crate::segment::combine_ordinal_results(
412        raw.into_iter().map(|r| (r.doc_id, r.ordinal, r.score)),
413        combiner,
414        limit,
415    );
416    Box::new(VectorTopKResultScorer::new(combined, field.0))
417}
418
419/// Build MaxScore scorer from sparse term infos (async).
420async fn try_sparse_maxscore_scorer<'a>(
421    should: &[Arc<dyn Query>],
422    reader: &'a SegmentReader,
423    limit: usize,
424) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
425    let executor = match prepare_sparse_maxscore(should, reader, limit) {
426        None => return Ok(None),
427        Some(Err(empty)) => return Ok(Some(empty)),
428        Some(Ok(e)) => e,
429    };
430    let info = should[0].as_sparse_term_query_info().unwrap();
431    let raw = executor.execute().await?;
432    Ok(Some(combine_sparse_results(
433        raw,
434        info.combiner,
435        info.field,
436        limit,
437    )))
438}
439
440/// Build MaxScore scorer from sparse term infos (sync).
441#[cfg(feature = "sync")]
442fn try_sparse_maxscore_scorer_sync<'a>(
443    should: &[Arc<dyn Query>],
444    reader: &'a SegmentReader,
445    limit: usize,
446) -> crate::Result<Option<Box<dyn Scorer + 'a>>> {
447    let executor = match prepare_sparse_maxscore(should, reader, limit) {
448        None => return Ok(None),
449        Some(Err(empty)) => return Ok(Some(empty)),
450        Some(Ok(e)) => e,
451    };
452    let info = should[0].as_sparse_term_query_info().unwrap();
453    let raw = executor.execute_sync()?;
454    Ok(Some(combine_sparse_results(
455        raw,
456        info.combiner,
457        info.field,
458        limit,
459    )))
460}
461
462/// Try to compile all MUST filter queries (and optionally MUST_NOT) into a
463/// single combined predicate. Returns `None` if:
464/// - any MUST clause is not a filter (`is_filter() == false`)
465/// - any MUST clause can't produce a `DocPredicate`
466/// - any MUST_NOT clause can't produce a `DocPredicate`
467///
468/// On success returns `(predicate, filter_score)`.
469fn compile_filter_predicates<'a>(
470    must: &[std::sync::Arc<dyn Query>],
471    must_not: &[std::sync::Arc<dyn Query>],
472    reader: &'a SegmentReader,
473) -> Option<(super::DocPredicate<'a>, f32)> {
474    if must.is_empty() || !must.iter().all(|q| q.is_filter()) {
475        return None;
476    }
477
478    let predicates: Vec<_> = must
479        .iter()
480        .filter_map(|q| q.as_doc_predicate(reader))
481        .collect();
482    if predicates.len() != must.len() {
483        return None;
484    }
485
486    let filter_score = must.len() as f32;
487
488    let combined: super::DocPredicate<'a> = if predicates.len() == 1 {
489        predicates.into_iter().next().unwrap()
490    } else {
491        Box::new(move |doc_id| predicates.iter().all(|p| p(doc_id)))
492    };
493
494    if must_not.is_empty() {
495        return Some((combined, filter_score));
496    }
497
498    // All MUST_NOT must also be convertible to predicates; otherwise
499    // we can't guarantee correctness and must fall back to BooleanScorer.
500    let not_predicates: Vec<_> = must_not
501        .iter()
502        .filter_map(|q| q.as_doc_predicate(reader))
503        .collect();
504    if not_predicates.len() != must_not.len() {
505        return None;
506    }
507
508    Some((
509        Box::new(move |doc_id| combined(doc_id) && not_predicates.iter().all(|p| !p(doc_id))),
510        filter_score,
511    ))
512}
513
514impl Query for BooleanQuery {
515    fn scorer<'a>(&self, reader: &'a SegmentReader, limit: usize) -> ScorerFuture<'a> {
516        // Clone Arc vectors - cheap reference counting
517        let must = self.must.clone();
518        let should = self.should.clone();
519        let must_not = self.must_not.clone();
520        let global_stats = self.global_stats.clone();
521
522        Box::pin(async move {
523            // Single-clause optimization: unwrap to inner scorer directly
524            if must_not.is_empty() {
525                if must.len() == 1 && should.is_empty() {
526                    return must[0].scorer(reader, limit).await;
527                }
528                if should.len() == 1 && must.is_empty() {
529                    return should[0].scorer(reader, limit).await;
530                }
531            }
532
533            // Check if this is a pure OR query eligible for MaxScore optimization
534            // Conditions: no MUST, no MUST_NOT, multiple SHOULD clauses, all same field
535            if must.is_empty() && must_not.is_empty() && should.len() >= 2 {
536                // Try text MaxScore first
537                if let Some(scorer) =
538                    try_maxscore_scorer(&should, reader, limit, global_stats.as_ref()).await?
539                {
540                    return Ok(scorer);
541                }
542                // Try sparse MaxScore
543                if let Some(scorer) = try_sparse_maxscore_scorer(&should, reader, limit).await? {
544                    return Ok(scorer);
545                }
546                // Try per-field MaxScore grouping for multi-field text queries
547                if let Some(scorer) =
548                    try_per_field_maxscore(&should, reader, limit, global_stats.as_ref()).await?
549                {
550                    return Ok(scorer);
551                }
552            }
553
554            // ── Planner: filter push-down for MUST filters + SHOULD scoring ──
555            // When ALL MUST clauses are pure filters (score=1.0) and there are
556            // scoring SHOULD clauses, flip the driver: SHOULD drives iteration,
557            // MUST filters become O(1) per-doc predicates.
558            if !should.is_empty()
559                && limit < usize::MAX / 4
560                && let Some((predicate, filter_score)) =
561                    compile_filter_predicates(&must, &must_not, reader)
562            {
563                let should_scorer = if should.len() == 1 {
564                    should[0].scorer(reader, limit).await?
565                } else {
566                    let sub = BooleanQuery {
567                        must: Vec::new(),
568                        should: should.clone(),
569                        must_not: Vec::new(),
570                        global_stats: global_stats.clone(),
571                    };
572                    sub.scorer(reader, limit).await?
573                };
574
575                // Only use push-down when SHOULD has enough results to
576                // fill top-k. Otherwise fall through to standard scoring.
577                if should_scorer.size_hint() >= limit as u32 {
578                    log::debug!(
579                        "BooleanQuery planner: filter push-down, {} MUST filters → predicate, {} SHOULD scorers drive (size_hint={})",
580                        must.len(),
581                        should.len(),
582                        should_scorer.size_hint()
583                    );
584                    return Ok(Box::new(super::PredicatedScorer::new(
585                        should_scorer,
586                        predicate,
587                        filter_score,
588                    )));
589                }
590            }
591
592            // Fall back to standard boolean scoring
593            let mut must_scorers = Vec::with_capacity(must.len());
594            for q in &must {
595                must_scorers.push(q.scorer(reader, limit).await?);
596            }
597
598            let mut should_scorers = Vec::with_capacity(should.len());
599            for q in &should {
600                should_scorers.push(q.scorer(reader, limit).await?);
601            }
602
603            let mut must_not_scorers = Vec::with_capacity(must_not.len());
604            for q in &must_not {
605                must_not_scorers.push(q.scorer(reader, limit).await?);
606            }
607
608            let mut scorer = BooleanScorer {
609                must: must_scorers,
610                should: should_scorers,
611                must_not: must_not_scorers,
612                current_doc: 0,
613            };
614            // Initialize to first match
615            scorer.current_doc = scorer.find_next_match();
616            Ok(Box::new(scorer) as Box<dyn Scorer + 'a>)
617        })
618    }
619
620    #[cfg(feature = "sync")]
621    fn scorer_sync<'a>(
622        &self,
623        reader: &'a SegmentReader,
624        limit: usize,
625    ) -> crate::Result<Box<dyn Scorer + 'a>> {
626        // Single-clause optimization: unwrap to inner scorer directly
627        if self.must_not.is_empty() {
628            if self.must.len() == 1 && self.should.is_empty() {
629                return self.must[0].scorer_sync(reader, limit);
630            }
631            if self.should.len() == 1 && self.must.is_empty() {
632                return self.should[0].scorer_sync(reader, limit);
633            }
634        }
635
636        // MaxScore optimization for pure OR queries
637        if self.must.is_empty() && self.must_not.is_empty() && self.should.len() >= 2 {
638            if let Some(scorer) =
639                try_maxscore_scorer_sync(&self.should, reader, limit, self.global_stats.as_ref())?
640            {
641                return Ok(scorer);
642            }
643            if let Some(scorer) = try_sparse_maxscore_scorer_sync(&self.should, reader, limit)? {
644                return Ok(scorer);
645            }
646            // Try per-field MaxScore grouping for multi-field text queries
647            if let Some(scorer) = try_per_field_maxscore_sync(
648                &self.should,
649                reader,
650                limit,
651                self.global_stats.as_ref(),
652            )? {
653                return Ok(scorer);
654            }
655        }
656
657        // ── Planner: filter push-down for MUST filters + SHOULD scoring ──
658        if !self.should.is_empty()
659            && limit < usize::MAX / 4
660            && let Some((predicate, filter_score)) =
661                compile_filter_predicates(&self.must, &self.must_not, reader)
662        {
663            let should_scorer = if self.should.len() == 1 {
664                self.should[0].scorer_sync(reader, limit)?
665            } else {
666                let sub = BooleanQuery {
667                    must: Vec::new(),
668                    should: self.should.clone(),
669                    must_not: Vec::new(),
670                    global_stats: self.global_stats.clone(),
671                };
672                sub.scorer_sync(reader, limit)?
673            };
674
675            if should_scorer.size_hint() >= limit as u32 {
676                log::debug!(
677                    "BooleanQuery planner (sync): filter push-down, {} MUST filters → predicate, {} SHOULD scorers drive",
678                    self.must.len(),
679                    self.should.len()
680                );
681                return Ok(Box::new(super::PredicatedScorer::new(
682                    should_scorer,
683                    predicate,
684                    filter_score,
685                )));
686            }
687        }
688
689        // Fall back to standard boolean scoring
690        let mut must_scorers = Vec::with_capacity(self.must.len());
691        for q in &self.must {
692            must_scorers.push(q.scorer_sync(reader, limit)?);
693        }
694
695        let mut should_scorers = Vec::with_capacity(self.should.len());
696        for q in &self.should {
697            should_scorers.push(q.scorer_sync(reader, limit)?);
698        }
699
700        let mut must_not_scorers = Vec::with_capacity(self.must_not.len());
701        for q in &self.must_not {
702            must_not_scorers.push(q.scorer_sync(reader, limit)?);
703        }
704
705        let mut scorer = BooleanScorer {
706            must: must_scorers,
707            should: should_scorers,
708            must_not: must_not_scorers,
709            current_doc: 0,
710        };
711        scorer.current_doc = scorer.find_next_match();
712        Ok(Box::new(scorer) as Box<dyn Scorer + 'a>)
713    }
714
715    fn count_estimate<'a>(&self, reader: &'a SegmentReader) -> CountFuture<'a> {
716        let must = self.must.clone();
717        let should = self.should.clone();
718
719        Box::pin(async move {
720            if !must.is_empty() {
721                let mut estimates = Vec::with_capacity(must.len());
722                for q in &must {
723                    estimates.push(q.count_estimate(reader).await?);
724                }
725                estimates
726                    .into_iter()
727                    .min()
728                    .ok_or_else(|| crate::Error::Corruption("Empty must clause".to_string()))
729            } else if !should.is_empty() {
730                let mut sum = 0u32;
731                for q in &should {
732                    sum = sum.saturating_add(q.count_estimate(reader).await?);
733                }
734                Ok(sum)
735            } else {
736                Ok(0)
737            }
738        })
739    }
740}
741
742struct BooleanScorer<'a> {
743    must: Vec<Box<dyn Scorer + 'a>>,
744    should: Vec<Box<dyn Scorer + 'a>>,
745    must_not: Vec<Box<dyn Scorer + 'a>>,
746    current_doc: DocId,
747}
748
749impl BooleanScorer<'_> {
750    fn find_next_match(&mut self) -> DocId {
751        if self.must.is_empty() && self.should.is_empty() {
752            return TERMINATED;
753        }
754
755        loop {
756            let candidate = if !self.must.is_empty() {
757                let mut max_doc = self
758                    .must
759                    .iter()
760                    .map(|s| s.doc())
761                    .max()
762                    .unwrap_or(TERMINATED);
763
764                if max_doc == TERMINATED {
765                    return TERMINATED;
766                }
767
768                loop {
769                    let mut all_match = true;
770                    for scorer in &mut self.must {
771                        let doc = scorer.seek(max_doc);
772                        if doc == TERMINATED {
773                            return TERMINATED;
774                        }
775                        if doc > max_doc {
776                            max_doc = doc;
777                            all_match = false;
778                            break;
779                        }
780                    }
781                    if all_match {
782                        break;
783                    }
784                }
785                max_doc
786            } else {
787                self.should
788                    .iter()
789                    .map(|s| s.doc())
790                    .filter(|&d| d != TERMINATED)
791                    .min()
792                    .unwrap_or(TERMINATED)
793            };
794
795            if candidate == TERMINATED {
796                return TERMINATED;
797            }
798
799            let excluded = self.must_not.iter_mut().any(|scorer| {
800                let doc = scorer.seek(candidate);
801                doc == candidate
802            });
803
804            if !excluded {
805                // Seek SHOULD scorers to candidate so score() can see their contributions
806                for scorer in &mut self.should {
807                    scorer.seek(candidate);
808                }
809                self.current_doc = candidate;
810                return candidate;
811            }
812
813            // Advance past excluded candidate
814            if !self.must.is_empty() {
815                for scorer in &mut self.must {
816                    scorer.advance();
817                }
818            } else {
819                // For SHOULD-only: seek all scorers past the excluded candidate
820                for scorer in &mut self.should {
821                    if scorer.doc() <= candidate && scorer.doc() != TERMINATED {
822                        scorer.seek(candidate + 1);
823                    }
824                }
825            }
826        }
827    }
828}
829
830impl super::docset::DocSet for BooleanScorer<'_> {
831    fn doc(&self) -> DocId {
832        self.current_doc
833    }
834
835    fn advance(&mut self) -> DocId {
836        if !self.must.is_empty() {
837            for scorer in &mut self.must {
838                scorer.advance();
839            }
840        } else {
841            for scorer in &mut self.should {
842                if scorer.doc() == self.current_doc {
843                    scorer.advance();
844                }
845            }
846        }
847
848        self.current_doc = self.find_next_match();
849        self.current_doc
850    }
851
852    fn seek(&mut self, target: DocId) -> DocId {
853        for scorer in &mut self.must {
854            scorer.seek(target);
855        }
856
857        for scorer in &mut self.should {
858            scorer.seek(target);
859        }
860
861        self.current_doc = self.find_next_match();
862        self.current_doc
863    }
864
865    fn size_hint(&self) -> u32 {
866        if !self.must.is_empty() {
867            self.must.iter().map(|s| s.size_hint()).min().unwrap_or(0)
868        } else {
869            self.should.iter().map(|s| s.size_hint()).sum()
870        }
871    }
872}
873
874impl Scorer for BooleanScorer<'_> {
875    fn score(&self) -> Score {
876        let mut total = 0.0;
877
878        for scorer in &self.must {
879            if scorer.doc() == self.current_doc {
880                total += scorer.score();
881            }
882        }
883
884        for scorer in &self.should {
885            if scorer.doc() == self.current_doc {
886                total += scorer.score();
887            }
888        }
889
890        total
891    }
892
893    fn matched_positions(&self) -> Option<super::MatchedPositions> {
894        let mut all_positions: super::MatchedPositions = Vec::new();
895
896        for scorer in &self.must {
897            if scorer.doc() == self.current_doc
898                && let Some(positions) = scorer.matched_positions()
899            {
900                all_positions.extend(positions);
901            }
902        }
903
904        for scorer in &self.should {
905            if scorer.doc() == self.current_doc
906                && let Some(positions) = scorer.matched_positions()
907            {
908                all_positions.extend(positions);
909            }
910        }
911
912        if all_positions.is_empty() {
913            None
914        } else {
915            Some(all_positions)
916        }
917    }
918}
919
920/// Scorer that iterates over pre-computed top-k results
921struct TopKResultScorer {
922    results: Vec<ScoredDoc>,
923    position: usize,
924}
925
926impl TopKResultScorer {
927    fn new(mut results: Vec<ScoredDoc>) -> Self {
928        // Sort by doc_id ascending — required for DocSet seek() correctness
929        results.sort_unstable_by_key(|r| r.doc_id);
930        Self {
931            results,
932            position: 0,
933        }
934    }
935}
936
937impl super::docset::DocSet for TopKResultScorer {
938    fn doc(&self) -> DocId {
939        if self.position < self.results.len() {
940            self.results[self.position].doc_id
941        } else {
942            TERMINATED
943        }
944    }
945
946    fn advance(&mut self) -> DocId {
947        self.position += 1;
948        self.doc()
949    }
950
951    fn seek(&mut self, target: DocId) -> DocId {
952        let remaining = &self.results[self.position..];
953        self.position += remaining.partition_point(|r| r.doc_id < target);
954        self.doc()
955    }
956
957    fn size_hint(&self) -> u32 {
958        (self.results.len() - self.position) as u32
959    }
960}
961
962impl Scorer for TopKResultScorer {
963    fn score(&self) -> Score {
964        if self.position < self.results.len() {
965            self.results[self.position].score
966        } else {
967            0.0
968        }
969    }
970}
971
972/// Scorer that iterates over pre-computed vector results with ordinal information.
973/// Used by sparse MaxScore path to preserve per-ordinal scores for matched_positions().
974struct VectorTopKResultScorer {
975    results: Vec<crate::segment::VectorSearchResult>,
976    position: usize,
977    field_id: u32,
978}
979
980impl VectorTopKResultScorer {
981    fn new(mut results: Vec<crate::segment::VectorSearchResult>, field_id: u32) -> Self {
982        results.sort_unstable_by_key(|r| r.doc_id);
983        Self {
984            results,
985            position: 0,
986            field_id,
987        }
988    }
989}
990
991impl super::docset::DocSet for VectorTopKResultScorer {
992    fn doc(&self) -> DocId {
993        if self.position < self.results.len() {
994            self.results[self.position].doc_id
995        } else {
996            TERMINATED
997        }
998    }
999
1000    fn advance(&mut self) -> DocId {
1001        self.position += 1;
1002        self.doc()
1003    }
1004
1005    fn seek(&mut self, target: DocId) -> DocId {
1006        let remaining = &self.results[self.position..];
1007        self.position += remaining.partition_point(|r| r.doc_id < target);
1008        self.doc()
1009    }
1010
1011    fn size_hint(&self) -> u32 {
1012        (self.results.len() - self.position) as u32
1013    }
1014}
1015
1016impl Scorer for VectorTopKResultScorer {
1017    fn score(&self) -> Score {
1018        if self.position < self.results.len() {
1019            self.results[self.position].score
1020        } else {
1021            0.0
1022        }
1023    }
1024
1025    fn matched_positions(&self) -> Option<super::MatchedPositions> {
1026        if self.position >= self.results.len() {
1027            return None;
1028        }
1029        let result = &self.results[self.position];
1030        let scored_positions: Vec<super::ScoredPosition> = result
1031            .ordinals
1032            .iter()
1033            .map(|&(ordinal, score)| super::ScoredPosition::new(ordinal, score))
1034            .collect();
1035        Some(vec![(self.field_id, scored_positions)])
1036    }
1037}
1038
1039/// Empty scorer for when no terms match
1040struct EmptyScorer;
1041
1042impl super::docset::DocSet for EmptyScorer {
1043    fn doc(&self) -> DocId {
1044        TERMINATED
1045    }
1046
1047    fn advance(&mut self) -> DocId {
1048        TERMINATED
1049    }
1050
1051    fn seek(&mut self, _target: DocId) -> DocId {
1052        TERMINATED
1053    }
1054
1055    fn size_hint(&self) -> u32 {
1056        0
1057    }
1058}
1059
1060impl Scorer for EmptyScorer {
1061    fn score(&self) -> Score {
1062        0.0
1063    }
1064}
1065
1066#[cfg(test)]
1067mod tests {
1068    use super::*;
1069    use crate::dsl::Field;
1070    use crate::query::TermQuery;
1071
1072    #[test]
1073    fn test_maxscore_eligible_pure_or_same_field() {
1074        // Pure OR query with multiple terms in same field should be MaxScore-eligible
1075        let query = BooleanQuery::new()
1076            .should(TermQuery::text(Field(0), "hello"))
1077            .should(TermQuery::text(Field(0), "world"))
1078            .should(TermQuery::text(Field(0), "foo"));
1079
1080        // All clauses should return term info
1081        assert!(
1082            query
1083                .should
1084                .iter()
1085                .all(|q| q.as_term_query_info().is_some())
1086        );
1087
1088        // All should be same field
1089        let infos: Vec<_> = query
1090            .should
1091            .iter()
1092            .filter_map(|q| q.as_term_query_info())
1093            .collect();
1094        assert_eq!(infos.len(), 3);
1095        assert!(infos.iter().all(|i| i.field == Field(0)));
1096    }
1097
1098    #[test]
1099    fn test_maxscore_not_eligible_different_fields() {
1100        // OR query with terms in different fields should NOT use MaxScore
1101        let query = BooleanQuery::new()
1102            .should(TermQuery::text(Field(0), "hello"))
1103            .should(TermQuery::text(Field(1), "world")); // Different field!
1104
1105        let infos: Vec<_> = query
1106            .should
1107            .iter()
1108            .filter_map(|q| q.as_term_query_info())
1109            .collect();
1110        assert_eq!(infos.len(), 2);
1111        // Fields are different, MaxScore should not be used
1112        assert!(infos[0].field != infos[1].field);
1113    }
1114
1115    #[test]
1116    fn test_maxscore_not_eligible_with_must() {
1117        // Query with MUST clause should NOT use MaxScore optimization
1118        let query = BooleanQuery::new()
1119            .must(TermQuery::text(Field(0), "required"))
1120            .should(TermQuery::text(Field(0), "hello"))
1121            .should(TermQuery::text(Field(0), "world"));
1122
1123        // Has MUST clause, so MaxScore optimization should not kick in
1124        assert!(!query.must.is_empty());
1125    }
1126
1127    #[test]
1128    fn test_maxscore_not_eligible_with_must_not() {
1129        // Query with MUST_NOT clause should NOT use MaxScore optimization
1130        let query = BooleanQuery::new()
1131            .should(TermQuery::text(Field(0), "hello"))
1132            .should(TermQuery::text(Field(0), "world"))
1133            .must_not(TermQuery::text(Field(0), "excluded"));
1134
1135        // Has MUST_NOT clause, so MaxScore optimization should not kick in
1136        assert!(!query.must_not.is_empty());
1137    }
1138
1139    #[test]
1140    fn test_maxscore_not_eligible_single_term() {
1141        // Single SHOULD clause should NOT use MaxScore (no benefit)
1142        let query = BooleanQuery::new().should(TermQuery::text(Field(0), "hello"));
1143
1144        // Only one term, MaxScore not beneficial
1145        assert_eq!(query.should.len(), 1);
1146    }
1147
1148    #[test]
1149    fn test_term_query_info_extraction() {
1150        let term_query = TermQuery::text(Field(42), "test");
1151        let info = term_query.as_term_query_info();
1152
1153        assert!(info.is_some());
1154        let info = info.unwrap();
1155        assert_eq!(info.field, Field(42));
1156        assert_eq!(info.term, b"test");
1157    }
1158
1159    #[test]
1160    fn test_boolean_query_no_term_info() {
1161        // BooleanQuery itself should not return term info
1162        let query = BooleanQuery::new().should(TermQuery::text(Field(0), "hello"));
1163
1164        assert!(query.as_term_query_info().is_none());
1165    }
1166}
hermes_core/query/boolean.rs

hermes_core/query/
boolean.rs