ripvec-core 3.0.2

//! Backend abstraction layer.
//!
//! Post-v3.0.0 the only backend is the CPU cross-encoder reranker
//! ([`cpu::CpuRerankBackend`], backed by [`cpu::CpuBertModel`]). The
//! [`RerankBackend`] trait, [`Encoding`] input type, and [`BackendKind`]
//! discriminant survive; the `EmbedBackend` trait and the bi-encoder
//! `load_backend` / `detect_backends` entry points were removed
//! together with the transformer engines.

pub mod blas_info;
// `cpu` covers CpuBertModel + CpuRerankBackend (both keep-anchors per
// the surgery's backend_split.md §3). The CpuBackend wrapper struct
// was removed with the bi-encoder backends; the trunk + reranker survived.
// Gate widened from `cfg(feature = "cpu")` so the macOS default build
// (which uses `cpu-accelerate`) gets the reranker.
#[cfg(any(feature = "cpu", feature = "cpu-accelerate"))]
pub mod cpu;

/// Pre-tokenized encoding ready for inference.
///
/// Token IDs, attention mask, and token type IDs must all have the same length.
/// The fields are public, so the struct itself cannot enforce this; code that
/// builds an `Encoding` by hand is responsible for upholding it.
/// Token count is capped at `MODEL_MAX_TOKENS` (512) by the tokenizer before
/// reaching the backend.
#[derive(Debug, Clone)]
pub struct Encoding {
    /// Token IDs produced by the tokenizer.
    pub input_ids: Vec<i64>,
    /// Attention mask (1 for real tokens, 0 for padding).
    pub attention_mask: Vec<i64>,
    /// Token type IDs (0 for single-sequence models). For query/document
    /// pairs, [`RerankBackend::score_batch`] expects 0 on the query side
    /// and 1 on the document side.
    pub token_type_ids: Vec<i64>,
}

/// Backend abstraction for cross-encoder rerankers.
///
/// An implementation runs a forward pass over the concatenated
/// `[CLS] query [SEP] doc [SEP]` sequence and ends in a scalar relevance
/// score per pair rather than a pooled vector. It is the second stage of
/// a retrieve-then-rerank pipeline: a cheaper first-stage retriever
/// (originally the bi-encoder behind the since-removed `EmbedBackend`
/// trait) picks the top-K candidates, and the reranker re-scores those K
/// with the cross-encoder's higher-quality cross-attention.
///
/// # Why a separate trait
///
/// This trait was designed as a sibling of the bi-encoder `EmbedBackend`
/// trait, which was deleted in v3.0.0. Cross-encoders share BERT's trunk
/// with bi-encoders, but the head and pooling differ: bi-encoder = CLS
/// pool + L2-normalize, cross-encoder = CLS pool + linear(hidden -> 1) +
/// sigmoid. The two return shapes are incompatible (`Vec<Vec<f32>>` vs
/// `Vec<f32>`), so a single unified trait would have forced every caller
/// to handle an awkward sum type; sibling traits kept both call sites
/// direct.
pub trait RerankBackend: Send + Sync {
    /// Score a batch of pre-tokenized query/document pairs.
    ///
    /// Returns one score per input encoding. Scores are
    /// sigmoid-activated and lie in `[0, 1]`.
    ///
    /// Each encoding's `token_type_ids` should be 0 on the query side
    /// and 1 on the doc side (the standard BERT pair convention), which
    /// is what `tokenizers::Tokenizer::encode((query, doc), ..)`
    /// produces.
    ///
    /// # Errors
    ///
    /// Returns an error if tensor construction or the forward pass fails.
    fn score_batch(&self, encodings: &[Encoding]) -> crate::Result<Vec<f32>>;

    /// Maximum token count this model supports.
    ///
    /// The provided implementation returns 512; implementors may
    /// override it.
    fn max_tokens(&self) -> usize {
        const DEFAULT_MAX_TOKENS: usize = 512;
        DEFAULT_MAX_TOKENS
    }

    /// Whether this backend runs on a GPU.
    fn is_gpu(&self) -> bool;

    /// Short human-readable label for this backend.
    ///
    /// The provided implementation derives the label from
    /// [`RerankBackend::is_gpu`]: `"GPU"` when it returns `true`,
    /// `"CPU"` otherwise.
    fn name(&self) -> &'static str {
        if !self.is_gpu() {
            return "CPU";
        }
        "GPU"
    }
}

// NOTE(review): a doc comment for the removed bi-encoder loader stub used to
// be attached to this function (it mentioned follow-up "B6" pruning
// `embed.rs` / `cache/reindex.rs` and a `server.rs` caller). That stub no
// longer exists in this file; confirm B6 status before dropping this note.

/// Load a cross-encoder rerank model for CPU inference.
///
/// MS-MARCO family rerankers (the default
/// `cross-encoder/ms-marco-MiniLM-L-6-v2`) are ClassicBert-shaped, so
/// they route through [`cpu::CpuRerankBackend`] - same trunk as the
/// bi-encoder, plus a `Linear(hidden -> 1)` classifier head.
///
/// Unlike the (now-deleted) embedding backends, `load_reranker_cpu` is
/// available in every build: the rerank path is load-bearing for the
/// document-search use case (cacheless prose queries). This variant is
/// compiled when `feature = "cpu"` or `feature = "cpu-accelerate"` is
/// enabled; otherwise a fallback with the same signature returns an
/// error. The underlying `CpuRerankBackend` uses the same ndarray BLAS
/// setup as the former `CpuBackend`, so it works wherever the CPU
/// embedding backend did.
///
/// # Errors
///
/// Returns an error if the model cannot be downloaded, if it lacks a
/// classifier head (i.e., the caller pointed at a bi-encoder by
/// mistake), or if the weights fail to parse.
#[cfg(any(feature = "cpu", feature = "cpu-accelerate"))]
pub fn load_reranker_cpu(model_repo: &str) -> crate::Result<Box<dyn RerankBackend>> {
    let reranker = cpu::CpuRerankBackend::load(model_repo)?;
    // Boxing erases the concrete type so callers only depend on the trait.
    Ok(Box::new(reranker))
}

#[cfg(not(any(feature = "cpu", feature = "cpu-accelerate")))]
pub fn load_reranker_cpu(_model_repo: &str) -> crate::Result<Box<dyn RerankBackend>> {
    Err(crate::Error::Other(anyhow::anyhow!(
        "cross-encoder rerank requires building with --features cpu \
         or --features cpu-accelerate"
    )))
}

#[cfg(test)]
mod tests {
    use super::*;

    // The EmbedBackend object-safety and Send/Sync tests were dropped in
    // v3.0.0 because the trait itself was deleted (zero impls post-surgery).
    // No trait-object assertions are kept for the surviving RerankBackend
    // trait either: `load_reranker_cpu` returns `Box<dyn RerankBackend>`, so
    // object safety is already checked whenever this module compiles.

    /// Build a single-sequence `Encoding` from raw token IDs: every position
    /// is attended to (mask = 1) and belongs to segment 0.
    fn single_sequence(ids: &[i64]) -> Encoding {
        Encoding {
            input_ids: ids.to_vec(),
            attention_mask: vec![1; ids.len()],
            token_type_ids: vec![0; ids.len()],
        }
    }

    #[test]
    fn encoding_construction() {
        let enc = single_sequence(&[101, 2023, 2003, 1037, 3231, 102]);
        // All three parallel vectors must report the same token count.
        for field in [&enc.input_ids, &enc.attention_mask, &enc.token_type_ids] {
            assert_eq!(field.len(), 6);
        }
    }

    #[test]
    fn encoding_clone() {
        let original = single_sequence(&[101, 102]);
        let copy = original.clone();
        assert_eq!(original.input_ids, copy.input_ids);
    }
}