// lucisearch 0.8.0
//
// Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
// Documentation
//! Shared per-segment supplier for constant-score multi-term queries.
//!
//! Used by `prefix`, `wildcard`, `fuzzy`, and `regexp`. Each calling
//! query is responsible for enumerating its own matching terms in the
//! segment (via `terms_with_prefix`, `automaton_search`, etc.); this
//! module provides the post-enumeration scoring infrastructure:
//!
//! 1. Open each term's postings on demand (one FST lookup per term).
//! 2. Wrap in [`FilterScorer`] (no BM25 setup, no norms I/O).
//! 3. Union via [`BufferedUnionScorer`] (windowed bitset).
//!
//! Both `FilterScorer::score()` and `BufferedUnionScorer::score()`
//! return a constant 1.0 — matches Lucene's
//! `MultiTermQueryConstantScoreBlendedWrapper` semantics where the
//! disjunction is wrapped in a `ConstantScoreScorer`.
//!
//! See [[optimization-multiterm-constant-score-rewrite]] and
//! [[fix-disjunction-heap-inefficiency]].

use crate::core::{DocId, FieldId, NO_MORE_DOCS, Result, Scorer, TwoPhaseIterator};

use crate::query::ScorerSupplier;
use crate::query::term::FilterScorer;
use crate::search::buffered_union::BufferedUnionScorer;
use crate::segment::reader::SegmentReader;

/// Per-segment supplier for constant-score multi-term queries.
///
/// Holds the already-enumerated terms for one field of one segment and
/// defers opening their postings until a scorer is actually requested.
pub(crate) struct ConstantScoreMultiTermSupplier {
    /// Field whose postings are looked up for every term.
    field_id: FieldId,
    /// `(term, doc_freq)` pairs as discovered by the calling query.
    /// `doc_freq` is used only for cost estimation, not for scoring.
    terms: Vec<(String, u32)>,
    /// Cost estimate: the sum of `doc_freq` over all `terms`, computed
    /// once at construction time.
    cost: u64,
    /// Raw pointer to segment reader. Safe because the supplier's
    /// lifetime is bounded by the search call which holds the reader.
    // NOTE(review): nothing in this type enforces that invariant; it is
    // a contract on callers — confirm every construction site upholds it.
    segment_data: *const SegmentReader,
}

// SAFETY: Only used within a single search call where the
// SegmentReader outlives the supplier.
//
// The manual impl is needed because the `*const SegmentReader` field
// stops the compiler from deriving `Send` automatically.
// NOTE(review): moving the supplier to another thread means the
// pointed-to `SegmentReader` is then read from that thread, so this
// presumably relies on `SegmentReader` being `Sync` — confirm.
unsafe impl Send for ConstantScoreMultiTermSupplier {}

impl ConstantScoreMultiTermSupplier {
    /// Builds a supplier over `terms` of field `field_id` in the segment
    /// served by `reader`.
    ///
    /// The cost estimate is precomputed as the total of every term's
    /// `doc_freq`. Only the address of `reader` is retained, so the
    /// caller must keep the reader alive for as long as the supplier
    /// (and any scorer built from it) is in use.
    pub(crate) fn new(
        reader: &SegmentReader,
        field_id: FieldId,
        terms: Vec<(String, u32)>,
    ) -> Self {
        // A shared reference coerces to a raw const pointer implicitly.
        let segment_data: *const SegmentReader = reader;
        let cost = terms
            .iter()
            .fold(0u64, |total, &(_, doc_freq)| total + u64::from(doc_freq));
        Self {
            field_id,
            terms,
            cost,
            segment_data,
        }
    }
}

impl ScorerSupplier for ConstantScoreMultiTermSupplier {
    fn cost(&self) -> u64 {
        self.cost
    }

    fn scorer(self: Box<Self>) -> Result<Box<dyn Scorer>> {
        // SAFETY: the segment reader outlives this scorer supplier
        let reader = unsafe { &*self.segment_data };

        let mut scorers: Vec<Box<dyn Scorer>> = Vec::with_capacity(self.terms.len());
        for (term, _) in &self.terms {
            if let Some(postings) = reader.postings(self.field_id, term) {
                scorers.push(Box::new(FilterScorer::new(postings)));
            }
        }

        if scorers.is_empty() {
            return Ok(Box::new(EmptyScorer));
        }
        // Single-term fast path: return the FilterScorer directly. Its
        // score() already returns the constant 1.0 — no wrapper needed.
        if scorers.len() == 1 {
            return Ok(scorers.pop().unwrap());
        }
        // Multi-term path: union via BufferedUnionScorer (windowed bitset).
        Ok(Box::new(BufferedUnionScorer::new(scorers)))
    }
}

/// Empty scorer for the no-matches case.
///
/// Returned by `ConstantScoreMultiTermSupplier::scorer` when none of the
/// supplied terms has postings in the segment. It carries no state.
pub(crate) struct EmptyScorer;

/// A scorer that is exhausted from the start: every positioning call
/// reports [`NO_MORE_DOCS`].
impl Scorer for EmptyScorer {
    /// Always `NO_MORE_DOCS`; there is never a current document.
    fn doc_id(&self) -> DocId {
        NO_MORE_DOCS
    }

    /// Always `NO_MORE_DOCS`; there is nothing to step to.
    fn next(&mut self) -> DocId {
        NO_MORE_DOCS
    }

    /// Always `NO_MORE_DOCS`, whatever the requested target.
    fn advance(&mut self, _target: DocId) -> DocId {
        NO_MORE_DOCS
    }

    /// Always `0.0`.
    fn score(&mut self) -> f32 {
        0.0
    }

    /// No two-phase iterator is provided.
    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
        None
    }
}