ripvec-core 3.0.2

//! Cross-encoder reranker for top-K refinement.
//!
//! ## Why this module exists
//!
//! ripvec's bi-encoder retrieval (BERT or semble) embeds query and
//! documents into a shared vector space and ranks by cosine. That's
//! cheap to scale, but the model can't express cross-token
//! interactions between query and document — each side is encoded
//! independently. On natural-language and prose corpora this caps
//! quality.
//!
//! A cross-encoder concatenates the pair `[CLS] query [SEP] doc [SEP]`
//! and runs full attention across both, producing a single relevance
//! score. Quality is meaningfully higher but cost is O(candidates),
//! so it's used only as a reranker on the bi-encoder's top-K.
//!
//! ## Architecture
//!
//! This module is a thin orchestrator: it tokenizes `(query, doc)` pairs
//! and delegates scoring to a [`RerankBackend`](crate::backend::RerankBackend)
//! (currently [`crate::backend::cpu::CpuRerankBackend`] — same BERT
//! trunk as the bi-encoder, plus a `Linear(hidden -> 1)` classifier
//! head). Scores are surfaced as raw logits, with no sigmoid applied;
//! see [`Reranker::score_pairs`].
//!
//! Only the CPU rerank backend is wired today. Adding GPU rerankers
//! later would require implementing `RerankBackend` for the target
//! device, mirroring `load_reranker_cpu` in `backend/mod.rs`, and
//! routing through `Reranker::from_pretrained`.

use anyhow::anyhow;
use tokenizers::{Tokenizer, TruncationDirection, TruncationParams, TruncationStrategy};

use crate::backend::{Encoding, RerankBackend};

/// Default cross-encoder model.
///
/// `cross-encoder/ms-marco-TinyBERT-L-2-v2` (~5 MB, 2-layer
/// distilled-from-BERT-base) replaced the prior MiniLM-L-12-v2
/// default after a model sweep on the gutenberg prose benchmark
/// (15 NL queries) showed it bit-identical on NDCG@10 / recall@10
/// while running 20x faster on the warm-query path:
///
/// ```text
///   model                              NDCG@10  recall@10  p50
///   ms-marco-MiniLM-L-12-v2 (old)      1.0000   1.000      671 ms
///   ms-marco-MiniLM-L-6-v2             1.0000   1.000      344 ms
///   ms-marco-MiniLM-L-2-v2             0.9508   1.000      125 ms  <- quality drop
///   ms-marco-TinyBERT-L-2-v2 (new)     1.0000   1.000       33 ms
/// ```
///
/// The distinction is distillation: TinyBERT-L-2 was trained with
/// teacher-distillation to preserve the larger model's behavior at
/// 2 layers, whereas plain MiniLM-L-2 sheds layers without that
/// regularization and loses precision. Two layers vs twelve cuts
/// inference cost ~6x; combined with the smaller embedding dim it
/// lands at 20x in practice.
///
/// Override via the CLI flag or by calling
/// [`Reranker::from_pretrained`] directly when a corpus needs more
/// capacity (e.g. fine-grained domain reranking).
pub const DEFAULT_RERANK_MODEL: &str = "cross-encoder/ms-marco-TinyBERT-L-2-v2";

/// Default cap on candidates passed to the reranker.
///
/// Cost is linear in candidates. The retrieve-then-rerank literature
/// suggests 100 as a safe upper bound, but empirically — on the
/// gutenberg prose benchmark with the L-12 ms-marco cross-encoder —
/// NDCG@10 is bit-identical from K=100 all the way down to K=20
/// (recall stays at 1.000; the bi-encoder + ranking layer already
/// puts the relevant doc at rank 1 in every test query, so the
/// reranker's job is confirmation rather than reordering). 50 is a
/// 2x speedup over the literature default with enough headroom for
/// corpora where the bi-encoder is less confident; users on
/// high-confidence corpora can drop further (CLI: `--candidates 30`).
///
/// Bench (gutenberg, 15 NL queries, scope=docs, NDCG=1.000 throughout):
///
/// ```text
/// K=100  p50 1335 ms
/// K=50   p50  676 ms
/// K=30   p50  418 ms
/// K=20   p50  275 ms
/// ```
///
/// NOTE(review): the K=50 figure (676 ms) is in line with the p50
/// quoted for the L-12 model on [`DEFAULT_RERANK_MODEL`] (671 ms), so
/// these timings appear to predate the switch to the TinyBERT
/// default — re-measure before relying on the absolute numbers.
pub const DEFAULT_RERANK_CANDIDATES: usize = 50;

/// Cross-encoder reranker orchestrator.
///
/// Owns a `RerankBackend` (model trunk + classifier head) and the
/// tokenizer that produced the encodings the backend expects.
///
/// Construct via [`Self::from_pretrained`]. Use [`score_pairs`] to
/// rank candidate `(query, doc)` text pairs.
///
/// ## cfg-gating
///
/// The `backend` field type is cfg-gated by the `collapse-rerank-trait`
/// Cargo feature per the mandated pattern in
/// `docs/surgery/backend_trait_microbench.md` Section 4
/// (@Lampson (1983) "Hints for Computer System Design"):
///
/// - **Variant T** (the default; `collapse-rerank-trait` off):
///   `Box<dyn RerankBackend>` — heap-allocated vtable dispatch; future
///   GPU rerankers slot in here.
/// - **Variant C** (`collapse-rerank-trait` on): [`crate::backend::cpu::CpuRerankBackend`]
///   held directly — monomorphic static dispatch; LLVM may inline through.
///
/// The call site `self.backend.score_batch(...)` is identical in source for
/// both variants; the compiler generates an indirect vtable call for Variant T
/// and a direct call for Variant C. This structural difference is what the
/// microbench measures. Anti-patterns (enum wrapping, type alias, two parallel
/// structs) are explicitly avoided per Section 4 to prevent LLVM from
/// collapsing both variants to zero overhead by construction.
///
/// [`score_pairs`]: Self::score_pairs
pub struct Reranker {
    /// Under default build: heap-allocated trait object for future GPU reranker extensibility.
    /// Under `collapse-rerank-trait`: concrete `CpuRerankBackend` for direct static dispatch.
    #[cfg(not(feature = "collapse-rerank-trait"))]
    backend: Box<dyn RerankBackend>,
    #[cfg(feature = "collapse-rerank-trait")]
    backend: crate::backend::cpu::CpuRerankBackend,
    /// Tokenizer loaded for the same model repo as `backend`;
    /// [`Self::from_pretrained`] configures its truncation before storing it.
    tokenizer: Tokenizer,
}

impl Reranker {
    /// Load a cross-encoder by `HuggingFace` repo ID.
    ///
    /// Under default builds routes through [`crate::backend::load_reranker_cpu`]
    /// which boxes the result as `Box<dyn RerankBackend>`. Under
    /// `collapse-rerank-trait` calls [`crate::backend::cpu::CpuRerankBackend::load`]
    /// directly, storing the concrete type without boxing — this is the structural
    /// difference the `collapse-rerank-trait` microbench exploits
    /// (@Lampson (1983) "Hints for Computer System Design").
    ///
    /// The tokenizer is downloaded via the same `hf-hub` cache, so
    /// multiple sub-agent MCP processes share weights through
    /// `~/.cache/huggingface/hub/`.
    ///
    /// # Errors
    ///
    /// Returns an error if the model can't be downloaded, lacks a
    /// classifier head (i.e., a bi-encoder was supplied by mistake),
    /// or fails to load.
    pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
        #[cfg(not(feature = "collapse-rerank-trait"))]
        let backend = crate::backend::load_reranker_cpu(model_repo)?;
        #[cfg(feature = "collapse-rerank-trait")]
        let backend = crate::backend::cpu::CpuRerankBackend::load(model_repo)?;
        let mut tokenizer = crate::tokenize::load_tokenizer(model_repo)?;
        // Configure `LongestFirst` truncation against the model's
        // declared max sequence length. Without this the tokenizer
        // returns full-length encodings and ripvec used to head-truncate
        // the already-joined `[CLS] q [SEP] d [SEP]` sequence, which
        // can drop the trailing `[SEP]` and let the doc tail overflow
        // into garbage on long inputs. With `LongestFirst` the
        // tokenizer trims whichever of (query, doc) is longer until
        // the joined sequence fits, preserving special tokens.
        let max_tokens = backend.max_tokens();
        tokenizer
            .with_truncation(Some(TruncationParams {
                max_length: max_tokens,
                strategy: TruncationStrategy::LongestFirst,
                stride: 0,
                direction: TruncationDirection::Right,
            }))
            .map_err(|e| crate::Error::Other(anyhow!("rerank tokenizer truncation: {e}")))?;
        Ok(Self { backend, tokenizer })
    }

    /// Score a batch of `(query, document)` pairs.
    ///
    /// Returns raw logits (sentence-transformers `Identity` activation —
    /// the canonical public score for ms-marco cross-encoders), one
    /// per input pair, in input order. Tokenizes with a `(query, doc)`
    /// tuple so `token_type_ids` are 0 for the query side, 1 for the
    /// doc side — the convention BERT cross-encoders are trained on.
    /// The tokenizer is pre-configured with `LongestFirst` truncation
    /// at the model's `max_position_embeddings`, so callers don't need
    /// to clip outputs.
    ///
    /// # Errors
    ///
    /// Propagates tokenization or forward-pass errors.
    pub fn score_pairs(&self, pairs: &[(&str, &str)]) -> crate::Result<Vec<f32>> {
        if pairs.is_empty() {
            return Ok(Vec::new());
        }
        let encodings: crate::Result<Vec<Encoding>> = pairs
            .iter()
            .map(|(q, d)| {
                // The tokenizer is configured with LongestFirst
                // truncation in from_pretrained; the returned encoding
                // already fits within max_position_embeddings and
                // preserves [CLS] / [SEP] tokens at the correct
                // positions.
                let enc = self
                    .tokenizer
                    .encode((*q, *d), true)
                    .map_err(|e| crate::Error::Other(anyhow!("rerank tokenize failed: {e}")))?;
                Ok(Encoding {
                    input_ids: enc.get_ids().iter().map(|&x| i64::from(x)).collect(),
                    attention_mask: enc
                        .get_attention_mask()
                        .iter()
                        .map(|&x| i64::from(x))
                        .collect(),
                    token_type_ids: enc.get_type_ids().iter().map(|&x| i64::from(x)).collect(),
                })
            })
            .collect();
        let encodings = encodings?;
        self.backend.score_batch(&encodings)
    }

    /// Max sequence length supported by the underlying model.
    #[must_use]
    pub fn max_tokens(&self) -> usize {
        self.backend.max_tokens()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end smoke test: load the default cross-encoder and rank
    /// one relevant and one irrelevant document for the same query.
    ///
    /// Marked `#[ignore]` because it downloads weights from
    /// `HuggingFace`; run it explicitly with `--ignored`.
    ///
    /// What is asserted:
    /// 1. Exactly one score comes back per input pair.
    /// 2. The relevant doc outscores the irrelevant one by more than
    ///    1.0. A gap that wide also rules out scores squashed into
    ///    `[0, 1]`, i.e. the outputs are raw logits (the reference
    ///    spread for this model is roughly [-11, +5]).
    #[test]
    #[ignore = "requires network + model download (~22MB)"]
    fn loads_and_ranks_default_cross_encoder() {
        let query = "how to make pasta";
        let candidates = [
            (query, "Boil water, add salt, cook pasta for 8 minutes."),
            (query, "The mitochondria is the powerhouse of the cell."),
        ];

        let reranker = Reranker::from_pretrained(DEFAULT_RERANK_MODEL)
            .expect("default cross-encoder should load");
        let scores = reranker
            .score_pairs(&candidates)
            .expect("scoring should succeed");

        assert_eq!(scores.len(), 2);
        let (relevant, irrelevant) = (scores[0], scores[1]);
        assert!(
            relevant > irrelevant + 1.0,
            "relevant doc ({}) should beat irrelevant ({}) by a clear logit margin",
            relevant,
            irrelevant
        );
    }
}