ripvec-core 1.0.0

//! BM25 with ripvec's stem-doubled path enrichment.
//!
//! Port of `~/src/semble/src/semble/index/sparse.py` (`enrich_for_bm25`
//! and `selector_to_mask`) plus the BM25 scoring loop used in
//! `~/src/semble/src/semble/search.py:search_bm25`. The enrichment
//! appends the file stem twice and the last three directory components
//! to chunk content before tokenization, so path-based queries hit
//! even when the query terms aren't in the chunk text.
//!
//! Python uses the `bm25s` library; this port hand-rolls Okapi BM25
//! (k1=1.5, b=0.75) to avoid another dependency. The output ordering
//! matches `bm25s`'s descending-score semantics with zero-score
//! exclusion as in `search.py:search_bm25`.

use std::path::Path;

use lasso::{Spur, ThreadedRodeo};
use rayon::prelude::*;
use rustc_hash::{FxBuildHasher, FxHashMap};

use crate::chunk::CodeChunk;
use crate::encoder::ripvec::tokens::tokenize;

/// Okapi BM25 free parameter — term-frequency saturation.
const K1: f32 = 1.5;
/// Okapi BM25 free parameter — document-length normalization.
const B: f32 = 0.75;

/// Append the file stem (twice, for up-weight) and the last three
/// directory components to a chunk's text content. Mirrors
/// `enrich_for_bm25` from `sparse.py:18`.
///
/// Assumes `chunk.file_path` is already repo-relative so
/// machine-specific directory components don't leak into the index.
#[must_use]
pub fn enrich_for_bm25(chunk: &CodeChunk) -> String {
    let path = Path::new(&chunk.file_path);
    let stem = path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or_default();
    let dir_parts: Vec<&str> = path
        .parent()
        .into_iter()
        .flat_map(|p| p.iter())
        .filter_map(|os| os.to_str())
        .filter(|part| *part != "." && *part != "/")
        .collect();
    // Last 3 directory components (mirrors Python's dir_parts[-3:]).
    let tail_len = dir_parts.len().min(3);
    let dir_text = dir_parts[dir_parts.len() - tail_len..].join(" ");
    format!("{} {stem} {stem} {dir_text}", chunk.content)
}

/// Hand-rolled Okapi BM25 index over a set of enriched documents.
///
/// Built once via [`Bm25Index::build`]; queried repeatedly via
/// [`Bm25Index::score`]. Document order matches the chunk-index
/// convention used elsewhere in the ripvec port.
pub struct Bm25Index {
    /// String interner. All term `String`s in the corpus deduplicate to
    /// a `Spur` (32-bit ID). A 92K-file linux corpus has ~250K chunks ×
    /// ~50 unique terms each = ~12.5M term references; before interning
    /// each was a separately-allocated `String` (~500 MB of duplicated
    /// keys). After interning the keys are 4-byte IDs and only ~500K
    /// unique strings live in the rodeo (~10 MB).
    rodeo: ThreadedRodeo<Spur, FxBuildHasher>,
    /// Per-document term frequencies: doc_tfs[doc][term_id] = count.
    /// `FxHashMap<Spur, u32>` — Spur is a 32-bit `NonZeroU32`, so
    /// hashing is a single-multiply (vs SipHash on a String pointer
    /// chain) and the map is half the size.
    doc_tfs: Vec<FxHashMap<Spur, u32>>,
    /// Per-document length (token count).
    doc_lengths: Vec<u32>,
    /// Average document length across the corpus.
    avgdl: f32,
    /// Inverted index: term_id -> (doc_frequency, idf).
    df_idf: FxHashMap<Spur, (u32, f32)>,
}

impl Bm25Index {
    /// Build an index over enriched chunks. Tokenization uses
    /// `crate::encoder::ripvec::tokens::tokenize`.
    ///
    /// Three-pass build:
    ///
    /// 1. **par_iter (tokenize + intern + TF)**: each chunk is enriched,
    ///    tokenized, and its tokens interned into a shared
    ///    `ThreadedRodeo`. The per-doc TF map keys on the `Spur` ID
    ///    instead of `String`, eliminating the duplicated-string
    ///    storage that dominated memory + hashing in the previous
    ///    version.
    /// 2. **serial DF merge**: walk per-doc TF maps and increment a
    ///    global `Spur`-keyed counter. With `Spur` keys (4-byte
    ///    `NonZeroU32`), FxHash lookups are a single multiply.
    /// 3. **serial IDF compute**: produce the final df_idf map.
    ///
    /// On a 92K-file linux corpus (~250K chunks): bm25_build drops
    /// from 35s serial → ~14s parallel without interning → ~7s with
    /// interning.
    #[must_use]
    pub fn build(chunks: &[CodeChunk]) -> Self {
        let n = chunks.len();
        let rodeo: ThreadedRodeo<Spur, FxBuildHasher> = ThreadedRodeo::with_hasher(FxBuildHasher);
        if n == 0 {
            return Self {
                rodeo,
                doc_tfs: Vec::new(),
                doc_lengths: Vec::new(),
                avgdl: 0.0,
                df_idf: FxHashMap::default(),
            };
        }

        // Stage 1: par_iter — produce per-doc (tfs, token_count) pairs.
        // `ThreadedRodeo::get_or_intern` is lock-free for the common
        // (already-interned) case and uses a sharded lock only on
        // first insert. Worker threads share `&rodeo` safely.
        let per_doc: Vec<(FxHashMap<Spur, u32>, u32)> = chunks
            .par_iter()
            .map(|chunk| {
                let enriched = enrich_for_bm25(chunk);
                let tokens = tokenize(&enriched);
                let token_count = u32::try_from(tokens.len()).unwrap_or(u32::MAX);
                let mut tfs: FxHashMap<Spur, u32> =
                    FxHashMap::with_capacity_and_hasher(tokens.len(), FxBuildHasher);
                for tok in &tokens {
                    let id = rodeo.get_or_intern(tok);
                    *tfs.entry(id).or_insert(0) += 1;
                }
                (tfs, token_count)
            })
            .collect();

        // Stage 2: serial — extract doc_tfs/doc_lengths and accumulate df.
        // With Spur keys this is just a u32 counter increment; no
        // allocations on the hot path.
        let mut doc_tfs: Vec<FxHashMap<Spur, u32>> = Vec::with_capacity(n);
        let mut doc_lengths: Vec<u32> = Vec::with_capacity(n);
        let mut df: FxHashMap<Spur, u32> = FxHashMap::default();
        for (tfs, len) in per_doc {
            for term_id in tfs.keys() {
                *df.entry(*term_id).or_insert(0) += 1;
            }
            doc_lengths.push(len);
            doc_tfs.push(tfs);
        }

        let total_len: u64 = doc_lengths.iter().map(|&l| u64::from(l)).sum();
        #[expect(
            clippy::cast_precision_loss,
            reason = "doc counts are bounded; f32 precision is sufficient for avgdl"
        )]
        let avgdl = (total_len as f32) / (n as f32);

        // BM25 idf with the "plus 1" smoothing used by bm25s:
        //   idf(t) = ln( (N - df + 0.5) / (df + 0.5) + 1 )
        #[expect(
            clippy::cast_precision_loss,
            reason = "doc counts are bounded; f32 precision is sufficient for idf"
        )]
        let n_f = n as f32;
        let df_idf: FxHashMap<Spur, (u32, f32)> = df
            .into_iter()
            .map(|(term_id, df_count)| {
                #[expect(
                    clippy::cast_precision_loss,
                    reason = "df is u32; f32 precision sufficient for idf"
                )]
                let df_f = df_count as f32;
                let idf = ((n_f - df_f + 0.5) / (df_f + 0.5) + 1.0).ln();
                (term_id, (df_count, idf))
            })
            .collect();

        Self {
            rodeo,
            doc_tfs,
            doc_lengths,
            avgdl,
            df_idf,
        }
    }

    /// Number of indexed documents.
    #[must_use]
    pub fn len(&self) -> usize {
        self.doc_tfs.len()
    }

    /// Whether the index has zero documents.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.doc_tfs.is_empty()
    }

    /// Compute BM25 scores for `query` against every document.
    /// Returns a `Vec<f32>` of length `self.len()` (one score per doc).
    /// Zero scores indicate no query terms matched.
    #[must_use]
    pub fn score(&self, query: &str) -> Vec<f32> {
        let q_tokens = tokenize(query);
        if q_tokens.is_empty() || self.doc_tfs.is_empty() {
            return vec![0.0; self.doc_tfs.len()];
        }
        // Resolve query terms to interned IDs, dropping unknown terms
        // (they can't possibly score) and deduplicating. `self.rodeo.get`
        // is a read-only lookup — never modifies the interner.
        let mut query_ids: Vec<Spur> = Vec::with_capacity(q_tokens.len());
        let mut seen: rustc_hash::FxHashSet<Spur> = rustc_hash::FxHashSet::default();
        for term in &q_tokens {
            if let Some(id) = self.rodeo.get(term)
                && seen.insert(id)
            {
                query_ids.push(id);
            }
        }
        if query_ids.is_empty() {
            return vec![0.0; self.doc_tfs.len()];
        }

        let mut scores = vec![0.0_f32; self.doc_tfs.len()];
        #[expect(
            clippy::cast_precision_loss,
            reason = "tf/dl are u32 counts; f32 precision sufficient"
        )]
        for &term_id in &query_ids {
            let Some(&(_, idf)) = self.df_idf.get(&term_id) else {
                continue;
            };
            for (doc_idx, tfs) in self.doc_tfs.iter().enumerate() {
                let Some(&tf) = tfs.get(&term_id) else {
                    continue;
                };
                let tf_f = tf as f32;
                let dl = self.doc_lengths[doc_idx] as f32;
                let norm = if self.avgdl > 0.0 {
                    dl / self.avgdl
                } else {
                    0.0
                };
                let denom = tf_f + K1 * (1.0 - B + B * norm);
                scores[doc_idx] += idf * tf_f * (K1 + 1.0) / denom.max(f32::EPSILON);
            }
        }
        scores
    }
}

/// Convert a sparse selector (chunk indices to keep) into a dense
/// boolean mask of `size`. Mirrors `selector_to_mask` from
/// `sparse.py:9`. Returns `None` when `selector` is `None`.
#[must_use]
pub fn selector_to_mask(selector: Option<&[usize]>, size: usize) -> Option<Vec<bool>> {
    selector.map(|sel| {
        let mut mask = vec![false; size];
        for &i in sel {
            if i < size {
                mask[i] = true;
            }
        }
        mask
    })
}

/// Top-k BM25 search with optional selector mask and zero-score
/// exclusion. Mirrors `search.py:search_bm25`.
///
/// Returns `(chunk_index, score)` pairs sorted by score descending.
#[must_use]
pub fn search_bm25(
    query: &str,
    index: &Bm25Index,
    top_k: usize,
    selector: Option<&[usize]>,
) -> Vec<(usize, f32)> {
    if index.is_empty() || top_k == 0 {
        return Vec::new();
    }
    let mask = selector_to_mask(selector, index.len());
    let mut scores = index.score(query);
    if let Some(m) = &mask {
        for (i, allowed) in m.iter().enumerate() {
            if !allowed {
                scores[i] = 0.0;
            }
        }
    }
    let mut indexed: Vec<(usize, f32)> = scores
        .into_iter()
        .enumerate()
        .filter(|(_, s)| *s > 0.0)
        .collect();
    indexed.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    indexed.truncate(top_k);
    indexed
}

#[cfg(test)]
mod tests {
    use super::*;

    fn chunk(path: &str, content: &str) -> CodeChunk {
        CodeChunk {
            file_path: path.to_string(),
            name: String::new(),
            kind: String::new(),
            start_line: 1,
            end_line: 1,
            content: content.to_string(),
            enriched_content: content.to_string(),
        }
    }

    /// `test:bm25-enrich-stem-doubled` — the file stem appears twice in
    /// the enriched text so BM25 up-weights stem matches.
    #[test]
    fn bm25_enrich_stem_doubled() {
        let c = chunk("src/foo.rs", "fn run() {}");
        let enriched = enrich_for_bm25(&c);
        let occurrences = enriched.matches("foo").count();
        assert_eq!(occurrences, 2, "expected 'foo' twice; got: {enriched}");
    }

    /// `test:bm25-enrich-last-3-dir-parts` — only the last 3 directory
    /// components are appended (mirrors Python's `dir_parts[-3:]`).
    #[test]
    fn bm25_enrich_last_3_dir_parts() {
        let c = chunk("a/b/c/d/e/foo.rs", "");
        let enriched = enrich_for_bm25(&c);
        // The dir part text should include the last three dirs c, d, e
        // (in path order), not a or b.
        assert!(enriched.contains("c d e"), "got: {enriched:?}");
        assert!(!enriched.contains(" b "), "got: {enriched:?}");
    }

    /// `test:bm25-selector-mask-excludes-non-selected` — masked chunks
    /// receive zero score even when they contain query terms.
    #[test]
    fn bm25_selector_mask_excludes_non_selected() {
        let chunks = vec![
            chunk("src/a.rs", "alpha bravo"),
            chunk("src/b.rs", "alpha gamma"),
        ];
        let idx = Bm25Index::build(&chunks);
        // Without mask both docs match "alpha".
        let all = search_bm25("alpha", &idx, 10, None);
        assert_eq!(all.len(), 2);
        // With selector [0], only doc 0 is allowed.
        let masked = search_bm25("alpha", &idx, 10, Some(&[0]));
        assert_eq!(masked.len(), 1);
        assert_eq!(masked[0].0, 0);
    }

    /// `test:bm25-zero-score-excluded` — documents with no query-term
    /// matches don't appear in the results.
    #[test]
    fn bm25_zero_score_excluded() {
        let chunks = vec![chunk("src/a.rs", "alpha"), chunk("src/b.rs", "bravo")];
        let idx = Bm25Index::build(&chunks);
        let r = search_bm25("alpha", &idx, 10, None);
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].0, 0);
    }

    #[test]
    fn empty_query_returns_empty() {
        let chunks = vec![chunk("src/a.rs", "alpha")];
        let idx = Bm25Index::build(&chunks);
        assert!(search_bm25("", &idx, 10, None).is_empty());
    }

    /// Stem appears doubled even when the chunk doesn't otherwise
    /// mention it; this lets a query like "foo" hit `foo.rs` files.
    #[test]
    fn stem_hits_via_enrichment_only() {
        let chunks = vec![
            chunk("src/foo.rs", "alpha bravo"),
            chunk("src/bar.rs", "alpha bravo"),
        ];
        let idx = Bm25Index::build(&chunks);
        let r = search_bm25("foo", &idx, 10, None);
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].0, 0);
    }
}