ripvec-core 1.0.3

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
//! Cross-encoder reranker for top-K refinement.
//!
//! ## Why this module exists
//!
//! ripvec's bi-encoder retrieval (BERT or semble) embeds query and
//! documents into a shared vector space and ranks by cosine. That's
//! cheap to scale, but the model can't express cross-token
//! interactions between query and document — each side is encoded
//! independently. On natural-language and prose corpora this caps
//! quality.
//!
//! A cross-encoder concatenates the pair `[CLS] query [SEP] doc [SEP]`
//! and runs full attention across both, producing a single relevance
//! score. Quality is meaningfully higher but cost is O(candidates),
//! so it's used only as a reranker on the bi-encoder's top-K.
//!
//! ## Architecture
//!
//! This module is a thin orchestrator: tokenize `(query, doc)` pairs,
//! delegate scoring to a [`RerankBackend`](crate::backend::RerankBackend)
//! (currently [`crate::backend::cpu::CpuRerankBackend`] — same BERT
//! trunk as the bi-encoder, plus a `Linear(hidden -> 1)` classifier
//! head + sigmoid).
//!
//! Adding GPU rerankers later is mechanical: implement
//! `RerankBackend` for Metal/CUDA/MLX, mirror `load_reranker_cpu` in
//! `backend/mod.rs`, route through `Reranker::from_pretrained`.

use anyhow::anyhow;
use tokenizers::Tokenizer;

use crate::backend::{Encoding, RerankBackend};

/// Default cross-encoder model.
///
/// `cross-encoder/ms-marco-MiniLM-L-12-v2` is 33MB, ~10ms per
/// query/doc pair on CPU, NDCG@10 = 74.5 on MS MARCO dev. Picked over
/// the smaller L-6 (22MB, NDCG 74.3) because the 4-corpus benchmark
/// matrix showed L-12 added meaningful target-hit lift across both
/// prose (Gutenberg) and code (Tokio) — and the ~5ms/pair extra is
/// invisible against the indexing budget on any non-trivial corpus.
pub const DEFAULT_RERANK_MODEL: &str = "cross-encoder/ms-marco-MiniLM-L-12-v2";

/// Default cap on candidates passed to the reranker.
///
/// Cost is linear in candidates; 100 is the standard top-K in the
/// retrieve-then-rerank literature. At the ~10ms/pair quoted for the
/// default MiniLM-L-12 ([`DEFAULT_RERANK_MODEL`]) that is ~1s per
/// query; on the smaller MiniLM-L-6 (~5ms/pair) it is ~500ms, the
/// upper edge of interactive.
pub const DEFAULT_RERANK_CANDIDATES: usize = 100;

/// Cross-encoder reranker orchestrator.
///
/// Owns a `RerankBackend` (model trunk + classifier head) and the
/// tokenizer that produced the encodings the backend expects.
///
/// Construct via [`Self::from_pretrained`]. Use [`score_pairs`] to
/// rank candidate `(query, doc)` text pairs.
///
/// [`score_pairs`]: Self::score_pairs
pub struct Reranker {
    /// Scoring backend; queried for its `max_tokens` limit and handed
    /// the tokenized batch via `score_batch`.
    backend: Box<dyn RerankBackend>,
    /// Tokenizer used to encode each `(query, doc)` pair before it is
    /// passed to the backend.
    tokenizer: Tokenizer,
}

impl Reranker {
    /// Build a reranker from a `HuggingFace` repo ID.
    ///
    /// The backend is obtained from [`crate::backend::load_reranker_cpu`]
    /// (the only route for now; GPU paths are meant to slot in here as
    /// feature-gated branches), and the tokenizer is loaded for the same
    /// `model_repo`. The tokenizer is downloaded via the same `hf-hub`
    /// cache, so multiple sub-agent MCP processes share weights through
    /// `~/.cache/huggingface/hub/`.
    ///
    /// # Errors
    ///
    /// Returns an error if the model can't be downloaded, lacks a
    /// classifier head (i.e., a bi-encoder was supplied by mistake),
    /// or fails to load.
    pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
        // Field initializers run in source order: backend first, then
        // tokenizer, so a backend failure is reported before any
        // tokenizer load is attempted.
        Ok(Self {
            backend: crate::backend::load_reranker_cpu(model_repo)?,
            tokenizer: crate::tokenize::load_tokenizer(model_repo)?,
        })
    }

    /// Score every `(query, document)` pair in `pairs`.
    ///
    /// Each pair is tokenized as a `(query, doc)` tuple with special
    /// tokens enabled, so `token_type_ids` are 0 for the query side and
    /// 1 for the doc side — the convention BERT cross-encoders are
    /// trained on. The result holds one sigmoid-activated score in
    /// `[0, 1]` per input pair, in input order. An empty `pairs` slice
    /// yields an empty vector without touching the backend.
    ///
    /// # Errors
    ///
    /// Propagates tokenization or forward-pass errors. Tokenization
    /// stops at the first pair that fails.
    pub fn score_pairs(&self, pairs: &[(&str, &str)]) -> crate::Result<Vec<f32>> {
        if pairs.is_empty() {
            return Ok(Vec::new());
        }

        let limit = self.backend.max_tokens();

        // Keep the first `keep` entries of a tokenizer column and widen
        // them from `u32` to the integer type the backend encoding uses.
        let clip = |column: &[u32], keep: usize| column[..keep].iter().copied().map(i64::from).collect();

        let mut batch = Vec::with_capacity(pairs.len());
        for &(query, doc) in pairs {
            let encoded = self
                .tokenizer
                .encode((query, doc), true)
                .map_err(|e| crate::Error::Other(anyhow!("rerank tokenize failed: {e}")))?;

            // Prefix truncation: only the first `limit` positions survive.
            // NOTE(review): anything the tokenizer placed past `limit`
            // (including a trailing special token, if present) is dropped
            // here — confirm whether `load_tokenizer` already configures
            // truncation so this is a no-op in practice.
            let keep = encoded.get_ids().len().min(limit);

            batch.push(Encoding {
                input_ids: clip(encoded.get_ids(), keep),
                attention_mask: clip(encoded.get_attention_mask(), keep),
                token_type_ids: clip(encoded.get_type_ids(), keep),
            });
        }

        self.backend.score_batch(&batch)
    }

    /// Longest token sequence the underlying backend accepts.
    ///
    /// This is the same limit [`Self::score_pairs`] truncates to.
    #[must_use]
    pub fn max_tokens(&self) -> usize {
        self.backend.max_tokens()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `Reranker::from_pretrained` works end-to-end on the default model.
    /// Marked `#[ignore]` (run with `--ignored`) since it downloads
    /// weights from `HuggingFace`.
    ///
    /// Verifies the two structural claims:
    /// 1. The cross-encoder ranks a relevant doc higher than an
    ///    irrelevant one for the same query.
    /// 2. Scores are in `[0, 1]` (sigmoid range).
    #[test]
    // Size matches the default L-12 model (33MB), not the smaller L-6 (22MB).
    #[ignore = "requires network + model download (~33MB)"]
    fn loads_and_ranks_default_cross_encoder() {
        let rr = Reranker::from_pretrained(DEFAULT_RERANK_MODEL)
            .expect("default cross-encoder should load");
        // Same query against an on-topic and an off-topic document.
        let scores = rr
            .score_pairs(&[
                (
                    "how to make pasta",
                    "Boil water, add salt, cook pasta for 8 minutes.",
                ),
                (
                    "how to make pasta",
                    "The mitochondria is the powerhouse of the cell.",
                ),
            ])
            .expect("scoring should succeed");
        // One score per pair, in input order.
        assert_eq!(scores.len(), 2);
        assert!(scores.iter().all(|&s| (0.0..=1.0).contains(&s)));
        assert!(
            scores[0] > scores[1],
            "relevant doc ({}) should beat irrelevant ({})",
            scores[0],
            scores[1]
        );
    }
}