lucisearch 0.8.0

//! BM25 scorer implementing `luci-core::Scorer`.
//!
//! Implements the Okapi BM25 scoring function as used by Lucene and
//! Elasticsearch:
//!
//! ```text
//! score = IDF * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl))
//! IDF = ln(1 + (N - n + 0.5) / (n + 0.5))
//! ```
//!
//! Parameters: k1=1.2, b=0.75 (ES defaults).
//!
//! See [[best-matching-25]] and [[architecture-query-execution]].

use crate::core::{DocId, NO_MORE_DOCS, Scorer, TwoPhaseIterator};

use crate::inverted::norms::FieldNormsReader;
use crate::inverted::postings::{BlockMaxPostingListReader, PostingListReader};

/// BM25 term-frequency saturation parameter `k1` (Elasticsearch default).
///
/// As TF grows, the per-term score approaches `idf * (K1 + 1)`.
const K1: f32 = 1.2;
/// BM25 length-normalization parameter `b` (Elasticsearch default).
///
/// Weights the `dl / avgdl` ratio in the score denominator; the remaining
/// `1 - B` share is independent of document length.
const B: f32 = 0.75;

/// Compute BM25 IDF: `ln(1 + (N - n + 0.5) / (n + 0.5))`.
///
/// - `total_docs`: total documents in the index (N)
/// - `doc_freq`: number of documents containing the term (n)
///
/// The result is always finite and non-negative. `doc_freq` is clamped to
/// `total_docs` before evaluation: inconsistent statistics with `n > N`
/// would otherwise push the logarithm's argument below 1 and yield a
/// negative IDF, and therefore negative scores. For consistent statistics
/// (`n <= N`) the clamp is a no-op.
pub fn bm25_idf(total_docs: u32, doc_freq: u32) -> f32 {
    let big_n = f64::from(total_docs);
    let n = f64::from(doc_freq.min(total_docs));
    // Evaluate in f64 and narrow to f32 once, at the very end.
    (1.0 + (big_n - n + 0.5) / (n + 0.5)).ln() as f32
}

/// Score one document for one term with BM25.
///
/// - `idf`: precomputed inverse document frequency of the term
/// - `tf`: how often the term occurs in this document
/// - `dl`: length of this document's field, in tokens
/// - `avgdl`: mean field length over the index
///
/// Returns `idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl))`.
pub fn bm25_score(idf: f32, tf: f32, dl: f32, avgdl: f32) -> f32 {
    // Length normalization: 1.0 for an average-length document, larger for
    // longer documents, smaller for shorter ones.
    let length_factor = 1.0 - B + B * dl / avgdl;
    let saturated_tf = {
        let numerator = tf * (K1 + 1.0);
        let denominator = tf + K1 * length_factor;
        numerator / denominator
    };
    idf * saturated_tf
}

/// Precomputed segment-independent BM25 statistics.
///
/// Created once per query term, then used to create per-segment scorers.
/// Includes a 256-entry norm-to-score lookup table matching Lucene's
/// `BM25Similarity.scorer()` approach. See [[optimization-collector-scaling]].
#[derive(Clone)]
pub struct Bm25Weight {
    /// Inverse document frequency of the term, as computed by `bm25_idf`.
    pub idf: f32,
    /// Average field length (`avgdl`) used for length normalization.
    pub avg_field_length: f32,
    /// Precomputed: for each possible norm byte (0-255), the length-dependent
    /// denominator term `k1 * (1 - b + b * decode(byte) / avgdl)`.
    /// Per-doc scoring becomes `idf * (tf * (k1+1)) / (tf + norm_cache[byte])`,
    /// so a single array lookup replaces the norm decode and the length
    /// normalization arithmetic.
    norm_cache: [f32; 256],
}

impl Bm25Weight {
    /// Build the weight for one query term.
    ///
    /// - `total_docs`: number of documents in the index (N)
    /// - `doc_freq`: number of documents containing the term (n)
    /// - `avg_field_length`: average field length (`avgdl`)
    ///
    /// Computes the IDF once and fills the 256-entry norm table, so scoring a
    /// document needs neither a norm decode nor the length normalization.
    pub fn new(total_docs: u32, doc_freq: u32, avg_field_length: f32) -> Self {
        let idf = bm25_idf(total_docs, doc_freq);

        // Slot `byte` holds the length-dependent part of the denominator:
        //   k1 * (1 - b + b * decode(byte) / avgdl)
        // which leaves score = idf * (tf * (k1+1)) / (tf + norm_cache[byte]).
        let mut norm_cache = [0.0f32; 256];
        for (slot, byte) in norm_cache.iter_mut().zip(0u8..=u8::MAX) {
            let field_len = crate::inverted::norms::decode_norm(byte);
            *slot = K1 * (1.0 - B + B * field_len / avg_field_length);
        }

        Self {
            idf,
            avg_field_length,
            norm_cache,
        }
    }

    /// Score ceiling for any document whose TF does not exceed `max_tf`.
    ///
    /// Evaluated at a field length of 1.0, treated here as the shortest
    /// possible document, which maximizes the length-normalized score.
    pub fn max_score_for_tf(&self, max_tf: f32) -> f32 {
        const SHORTEST_DOC_LEN: f32 = 1.0;
        bm25_score(self.idf, max_tf, SHORTEST_DOC_LEN, self.avg_field_length)
    }

    /// Score ceiling when nothing is known about TF.
    ///
    /// BM25 saturates as TF → ∞; the limit is `IDF * (k1 + 1)`.
    pub fn max_score_unbounded(&self) -> f32 {
        (K1 + 1.0) * self.idf
    }
}

/// BM25 scorer that implements `luci-core::Scorer`.
///
/// Wraps a `PostingListReader` for doc ID iteration and a `FieldNormsReader`
/// for field length lookups.
pub struct Bm25Scorer<'a> {
    /// Per-term statistics: IDF, average field length, and the norm lookup table.
    weight: Bm25Weight,
    /// Source of `(doc_id, tf)` postings for the term.
    postings: PostingListReader<'a>,
    /// Per-document norm bytes for the scored field.
    norms: FieldNormsReader<'a>,
    /// Document the scorer is positioned on; `NO_MORE_DOCS` once exhausted.
    current_doc_id: DocId,
    /// Term frequency in the current document (reset to 0 once exhausted).
    current_tf: u32,
    /// Precomputed score for TF=1 when norms are uniform (keyword fields).
    constant_score: Option<f32>,
}

impl<'a> Bm25Scorer<'a> {
    /// Build a scorer positioned on the first posting, or on `NO_MORE_DOCS`
    /// when the posting list yields nothing.
    pub fn new(
        weight: Bm25Weight,
        postings: PostingListReader<'a>,
        norms: FieldNormsReader<'a>,
    ) -> Self {
        // If the norms reader reports one uniform norm, the TF=1 score is the
        // same for every document, so compute it once up front.
        let (idf, avgdl) = (weight.idf, weight.avg_field_length);
        let constant_score = norms
            .uniform_norm()
            .map(|uniform_len| bm25_score(idf, 1.0, uniform_len, avgdl));

        let mut this = Self {
            weight,
            postings,
            norms,
            // Placeholder position; replaced by the priming read below.
            current_doc_id: DocId::new(0),
            current_tf: 0,
            constant_score,
        };
        this.read_next();
        this
    }

    /// Pull the next posting into `current_doc_id` / `current_tf`, or mark
    /// the scorer exhausted (`NO_MORE_DOCS`, TF 0) when the list has run out.
    fn read_next(&mut self) {
        let (doc_id, tf) = self.postings.next().unwrap_or((NO_MORE_DOCS, 0));
        self.current_doc_id = doc_id;
        self.current_tf = tf;
    }
}

impl Scorer for Bm25Scorer<'_> {
    /// Document the scorer is currently positioned on.
    fn doc_id(&self) -> DocId {
        self.current_doc_id
    }

    /// Step to the following posting and return its doc ID
    /// (`NO_MORE_DOCS` once the list is exhausted).
    fn next(&mut self) -> DocId {
        self.read_next();
        self.current_doc_id
    }

    /// Move forward to the first posting at or after `target`.
    ///
    /// Postings are consumed one at a time. If the scorer already sits at or
    /// beyond `target`, or is exhausted, it does not move.
    fn advance(&mut self, target: DocId) -> DocId {
        loop {
            let here = self.current_doc_id;
            if here >= target || here == NO_MORE_DOCS {
                return here;
            }
            self.read_next();
        }
    }

    /// BM25 score of the current document.
    fn score(&mut self) -> f32 {
        // Fast path: with uniform norms every TF=1 hit has the same score.
        if let (1, Some(precomputed)) = (self.current_tf, self.constant_score) {
            return precomputed;
        }

        // General path: the norm byte indexes straight into the precomputed
        // length-normalization table, so no norm decoding happens here.
        // NOTE(review): this evaluates `(idf * num) / denom`, whereas
        // `bm25_score` evaluates `idf * (num / denom)`; the two may differ in
        // the last bit. Confirm that is acceptable next to `constant_score`.
        let tf = self.current_tf as f32;
        let norm_byte = self.norms.raw_byte(self.current_doc_id);
        let length_term = self.weight.norm_cache[norm_byte as usize];
        self.weight.idf * (tf * (K1 + 1.0)) / (tf + length_term)
    }

    /// This scorer has no two-phase (approximation + verification) form.
    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
        None
    }

    /// Upper bound over every document: the TF → ∞ limit of BM25.
    fn max_score(&self) -> f32 {
        self.weight.max_score_unbounded()
    }
}

/// BM25 scorer backed by a block-max posting list.
///
/// Provides tight per-term and per-block max score bounds for WAND optimization.
/// See [[architecture-query-execution#WAND / MaxScore Optimization]].
pub struct BlockMaxBm25Scorer<'a> {
    /// Per-term statistics: IDF, average field length, and the norm lookup table.
    weight: Bm25Weight,
    /// Block-structured source of `(doc_id, tf)` postings; also reports each
    /// block's maximum TF.
    postings: BlockMaxPostingListReader<'a>,
    /// Per-document norm bytes for the scored field.
    norms: FieldNormsReader<'a>,
    /// Document the scorer is positioned on; `NO_MORE_DOCS` once exhausted.
    current_doc_id: DocId,
    /// Term frequency in the current document (reset to 0 once exhausted).
    current_tf: u32,
    /// Maximum BM25 score across all blocks (computed once at construction).
    term_max_score: f32,
    /// Per-block max scores, indexed by block number.
    block_max_scores: Vec<f32>,
    /// Precomputed score for TF=1 when norms are uniform (keyword fields).
    constant_score: Option<f32>,
}

impl<'a> BlockMaxBm25Scorer<'a> {
    /// Build a block-max scorer positioned on the first posting, or on
    /// `NO_MORE_DOCS` when the posting list yields nothing.
    ///
    /// All score bounds are derived here, once, from the per-block max TF
    /// values the posting reader exposes.
    pub fn new(
        weight: Bm25Weight,
        postings: BlockMaxPostingListReader<'a>,
        norms: FieldNormsReader<'a>,
    ) -> Self {
        // One upper bound per block, from that block's largest TF.
        let block_max_scores: Vec<f32> = (0..postings.num_blocks())
            .map(|block| weight.max_score_for_tf(postings.block_max_tf(block) as f32))
            .collect();

        // The term-wide bound is the largest block bound, floored at 0.0.
        let term_max_score = block_max_scores
            .iter()
            .copied()
            .fold(0.0f32, |best, bound| if bound > best { bound } else { best });

        // If the norms reader reports one uniform norm, the TF=1 score is the
        // same for every document, so compute it once up front.
        let (idf, avgdl) = (weight.idf, weight.avg_field_length);
        let constant_score = norms
            .uniform_norm()
            .map(|uniform_len| bm25_score(idf, 1.0, uniform_len, avgdl));

        let mut this = Self {
            weight,
            postings,
            norms,
            // Placeholder position; replaced by the priming read below.
            current_doc_id: DocId::new(0),
            current_tf: 0,
            term_max_score,
            block_max_scores,
            constant_score,
        };
        this.read_next();
        this
    }

    /// Pull the next posting into `current_doc_id` / `current_tf`, or mark
    /// the scorer exhausted (`NO_MORE_DOCS`, TF 0) when the list has run out.
    fn read_next(&mut self) {
        let (doc_id, tf) = self.postings.next().unwrap_or((NO_MORE_DOCS, 0));
        self.current_doc_id = doc_id;
        self.current_tf = tf;
    }
}

impl Scorer for BlockMaxBm25Scorer<'_> {
    /// Document the scorer is currently positioned on.
    fn doc_id(&self) -> DocId {
        self.current_doc_id
    }

    /// Step to the following posting and return its doc ID
    /// (`NO_MORE_DOCS` once the list is exhausted).
    fn next(&mut self) -> DocId {
        self.read_next();
        self.current_doc_id
    }

    /// Move forward to the first posting at or after `target`.
    ///
    /// If the scorer already sits at or beyond `target`, or is exhausted, it
    /// does not move. Otherwise the reader is asked to skip at block level
    /// first, and postings are then read until one reaches `target`.
    fn advance(&mut self, target: DocId) -> DocId {
        if self.current_doc_id >= target || self.current_doc_id == NO_MORE_DOCS {
            return self.current_doc_id;
        }

        // Presumably positions the reader at the block that may hold `target`;
        // the exact contract lives in `BlockMaxPostingListReader`.
        self.postings.advance_to_block(target);

        // Walk postings until the first one at or past `target`.
        while let Some((doc_id, tf)) = self.postings.next() {
            self.current_doc_id = doc_id;
            self.current_tf = tf;
            if doc_id >= target {
                return doc_id;
            }
        }

        // Ran off the end of the posting list.
        self.current_doc_id = NO_MORE_DOCS;
        self.current_tf = 0;
        NO_MORE_DOCS
    }

    /// BM25 score of the current document.
    fn score(&mut self) -> f32 {
        // Fast path: with uniform norms every TF=1 hit has the same score.
        if let (1, Some(precomputed)) = (self.current_tf, self.constant_score) {
            return precomputed;
        }

        // General path: the norm byte indexes straight into the precomputed
        // length-normalization table.
        // NOTE(review): this evaluates `(idf * num) / denom`, whereas the
        // bounds come from `bm25_score`, which evaluates `idf * (num / denom)`;
        // the two may differ in the last bit. Confirm that is acceptable for
        // `max_score` / `block_max_score`.
        let tf = self.current_tf as f32;
        let norm_byte = self.norms.raw_byte(self.current_doc_id);
        let length_term = self.weight.norm_cache[norm_byte as usize];
        self.weight.idf * (tf * (K1 + 1.0)) / (tf + length_term)
    }

    /// This scorer has no two-phase (approximation + verification) form.
    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
        None
    }

    /// Upper bound over the whole posting list (largest per-block bound).
    fn max_score(&self) -> f32 {
        self.term_max_score
    }

    /// Upper bound for the block the reader selects for `doc`.
    ///
    /// Falls back to the term-wide bound when the reader reports a block
    /// index with no precomputed entry.
    fn block_max_score(&mut self, doc: DocId) -> f32 {
        self.postings.advance_shallow(doc);
        let block = self.postings.current_block_idx() as usize;
        self.block_max_scores
            .get(block)
            .copied()
            .unwrap_or(self.term_max_score)
    }
}

// Unit tests for the BM25 formulas and for `Bm25Scorer` iteration/scoring.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::FieldId;
    use crate::inverted::norms::FieldNormsWriter;
    use crate::inverted::postings::PostingListWriter;

    /// Serialize `(doc_id, tf)` pairs into posting-list bytes.
    fn make_postings(docs: &[(u32, u32)]) -> Vec<u8> {
        let mut writer = PostingListWriter::new();
        for &(doc_id, tf) in docs {
            writer.add(DocId::new(doc_id), tf);
        }
        writer.finish()
    }

    /// Serialize field lengths into norms bytes.
    /// Presumably one entry per doc ID, in order — confirm against `FieldNormsWriter`.
    fn make_norms(lengths: &[u32]) -> Vec<u8> {
        let mut writer = FieldNormsWriter::new(FieldId::new(0));
        for &len in lengths {
            writer.add(len);
        }
        writer.finish()
    }

    // A term in fewer documents must get a larger IDF.
    #[test]
    fn idf_rare_term_higher() {
        let common_idf = bm25_idf(1000, 900); // appears in 90% of docs
        let rare_idf = bm25_idf(1000, 10); // appears in 1% of docs
        assert!(rare_idf > common_idf);
    }

    // doc_freq == 0 must not divide by zero or overflow.
    #[test]
    fn idf_zero_doc_freq() {
        let idf = bm25_idf(1000, 0);
        assert!(idf > 0.0);
        assert!(idf.is_finite());
    }

    #[test]
    fn idf_all_docs_match() {
        let idf = bm25_idf(1000, 1000);
        // When all docs match, IDF should be low but non-negative
        assert!(idf >= 0.0);
    }

    // Score is strictly increasing in TF at fixed document length.
    #[test]
    fn higher_tf_higher_score() {
        let idf = bm25_idf(100, 10);
        let s1 = bm25_score(idf, 1.0, 10.0, 10.0);
        let s2 = bm25_score(idf, 5.0, 10.0, 10.0);
        let s3 = bm25_score(idf, 20.0, 10.0, 10.0);
        assert!(s2 > s1, "higher TF should give higher score");
        assert!(s3 > s2, "even higher TF should give higher score");
    }

    #[test]
    fn tf_saturation() {
        let idf = bm25_idf(100, 10);
        // Scores should grow with diminishing returns
        let s10 = bm25_score(idf, 10.0, 10.0, 10.0);
        let s100 = bm25_score(idf, 100.0, 10.0, 10.0);
        let s1000 = bm25_score(idf, 1000.0, 10.0, 10.0);
        assert!(s100 > s10);
        assert!(s1000 > s100);
        // Diminishing returns: increments get smaller
        assert!((s100 - s10) > (s1000 - s100));
    }

    // Score is strictly decreasing in document length at fixed TF.
    #[test]
    fn longer_docs_lower_score() {
        let idf = bm25_idf(100, 10);
        let short = bm25_score(idf, 2.0, 5.0, 10.0);
        let avg = bm25_score(idf, 2.0, 10.0, 10.0);
        let long = bm25_score(idf, 2.0, 20.0, 10.0);
        assert!(short > avg, "shorter doc should score higher");
        assert!(avg > long, "average doc should score higher than long");
    }

    // The scorer starts on the first posting and `next()` walks the rest in order.
    #[test]
    fn scorer_iterates_docs() {
        let postings_data = make_postings(&[(0, 1), (5, 2), (10, 1)]);
        let norms_data = make_norms(&[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]);

        let weight = Bm25Weight::new(100, 3, 3.0);
        let reader = PostingListReader::new(&postings_data);
        let norms = FieldNormsReader::open(&norms_data);
        let mut scorer = Bm25Scorer::new(weight, reader, norms);

        assert_eq!(scorer.doc_id(), DocId::new(0));
        assert_eq!(scorer.next(), DocId::new(5));
        assert_eq!(scorer.next(), DocId::new(10));
        assert_eq!(scorer.next(), NO_MORE_DOCS);
    }

    // `advance` lands on an exact match, on the next larger doc, or on NO_MORE_DOCS.
    #[test]
    fn scorer_advance() {
        let postings_data = make_postings(&[(0, 1), (5, 2), (10, 1), (20, 3)]);
        let norms_data = make_norms(&(0..21).map(|_| 5u32).collect::<Vec<_>>());

        let weight = Bm25Weight::new(100, 4, 5.0);
        let reader = PostingListReader::new(&postings_data);
        let norms = FieldNormsReader::open(&norms_data);
        let mut scorer = Bm25Scorer::new(weight, reader, norms);

        assert_eq!(scorer.advance(DocId::new(5)), DocId::new(5));
        assert_eq!(scorer.advance(DocId::new(15)), DocId::new(20));
        assert_eq!(scorer.advance(DocId::new(21)), NO_MORE_DOCS);
    }

    // A target beyond the last posting exhausts the scorer.
    #[test]
    fn scorer_advance_past_end() {
        let postings_data = make_postings(&[(0, 1), (1, 1)]);
        let norms_data = make_norms(&[5, 5]);

        let weight = Bm25Weight::new(10, 2, 5.0);
        let reader = PostingListReader::new(&postings_data);
        let norms = FieldNormsReader::open(&norms_data);
        let mut scorer = Bm25Scorer::new(weight, reader, norms);

        assert_eq!(scorer.advance(DocId::new(100)), NO_MORE_DOCS);
    }

    #[test]
    fn scorer_scores_correctly() {
        // Simple corpus: 10 docs, term in 2 docs
        let postings_data = make_postings(&[(0, 3), (5, 1)]);
        let norms_data = make_norms(&[10, 10, 10, 10, 10, 5, 10, 10, 10, 10]);

        let avg_dl = 10.0 * 9.0 / 10.0 + 5.0 / 10.0; // 9.5
        let weight = Bm25Weight::new(10, 2, avg_dl);
        let reader = PostingListReader::new(&postings_data);
        let norms = FieldNormsReader::open(&norms_data);
        let mut scorer = Bm25Scorer::new(weight, reader, norms);

        // Doc 0: tf=3, dl=10
        assert_eq!(scorer.doc_id(), DocId::new(0));
        let score0 = scorer.score();

        // Doc 5: tf=1, dl=5
        scorer.next();
        assert_eq!(scorer.doc_id(), DocId::new(5));
        let score5 = scorer.score();

        // Both should be positive
        assert!(score0 > 0.0);
        assert!(score5 > 0.0);

        // Doc 0 has higher TF but longer length. Doc 5 has lower TF but shorter.
        // The exact comparison depends on parameters, but we can verify they're different.
        assert_ne!(score0, score5);
    }

    // `Bm25Scorer` never exposes a two-phase iterator.
    #[test]
    fn scorer_no_two_phase() {
        let postings_data = make_postings(&[(0, 1)]);
        let norms_data = make_norms(&[5]);

        let weight = Bm25Weight::new(10, 1, 5.0);
        let reader = PostingListReader::new(&postings_data);
        let norms = FieldNormsReader::open(&norms_data);
        let mut scorer = Bm25Scorer::new(weight, reader, norms);

        assert!(scorer.two_phase().is_none());
    }

    #[test]
    fn hand_computed_bm25() {
        // Verify against hand computation:
        // N=100, n=10, tf=2, dl=15, avgdl=10
        // IDF = ln(1 + (100 - 10 + 0.5) / (10 + 0.5)) = ln(1 + 90.5/10.5) = ln(9.619) ≈ 2.264
        // tf_norm = (2 * 2.2) / (2 + 1.2 * (1 - 0.75 + 0.75 * 15/10))
        //         = 4.4 / (2 + 1.2 * (0.25 + 1.125))
        //         = 4.4 / (2 + 1.65)
        //         = 4.4 / 3.65 ≈ 1.205
        // score = 2.264 * 1.205 ≈ 2.728

        let idf = bm25_idf(100, 10);
        let score = bm25_score(idf, 2.0, 15.0, 10.0);

        // Allow some floating point tolerance
        let expected_idf = ((1.0 + 90.5 / 10.5) as f64).ln() as f32;
        assert!(
            (idf - expected_idf).abs() < 0.001,
            "idf={idf} expected={expected_idf}"
        );

        // Same formula as the comment above, spelled out with literal k1 and b.
        let expected = expected_idf * (2.0 * 2.2) / (2.0 + 1.2 * (0.25 + 0.75 * 15.0 / 10.0));
        assert!(
            (score - expected).abs() < 0.01,
            "score={score} expected={expected}"
        );
    }
}