// ripvec-core 2.0.0

//! BERT-family encoder: wraps the existing `embed_all` pipeline.
//!
//! Bridges the [`VectorEncoder`] trait surface to the
//! [`embed::embed_all`](crate::embed::embed_all) function that powers
//! `--model bert` and `--model modernbert`. Owns the
//! `Vec<Box<dyn EmbedBackend>>` (one per detected GPU/CPU backend) and the
//! HuggingFace tokenizer, both required by the streaming pipeline.
//!
//! ## Why a thin wrapper, not a refactor
//!
//! The `embed_all` body — walk → chunk → tokenize → embed via streaming
//! pipeline — is non-trivial and rich in edge cases (rayon clones for CPU,
//! ring-buffer for GPU, sort-by-length batching, file-count threshold for
//! streaming vs batch). Wrapping rather than relocating keeps that
//! battle-tested code intact and reduces P0.3 to a small adapter,
//! deferring any deeper refactor until benefits emerge.
//!
//! See `docs/PLAN.md:P0.3` for the acceptance predicates.

use std::path::Path;

use crate::backend::EmbedBackend;
use crate::chunk::CodeChunk;
use crate::embed::{SearchConfig, embed_all};
use crate::encoder::VectorEncoder;
use crate::profile::Profiler;

/// BERT-family encoder implementation of [`VectorEncoder`].
///
/// Constructed by `main.rs::load_pipeline` with detected backends and the
/// model's HF tokenizer. The `model_repo` and `hidden_dim` are recorded
/// at construction time for cache keying and diagnostics; both are
/// known from the model selection in the CLI.
///
/// All fields are private. Read access goes through the inherent
/// accessors (`backends()`, `tokenizer()`) and the [`VectorEncoder`]
/// methods (`hidden_dim()`, `identity()`).
pub struct BertEncoder {
    /// Loaded embedding backends; lent to `embed_all` as `&dyn EmbedBackend`
    /// references on every `embed_root` call.
    backends: Vec<Box<dyn EmbedBackend>>,
    /// Tokenizer handed to `embed_all` together with the backends.
    tokenizer: tokenizers::Tokenizer,
    /// HuggingFace repo string; returned verbatim by `identity()`.
    model_repo: String,
    /// Embedding output dimension; returned by `hidden_dim()`.
    hidden_dim: usize,
}

impl BertEncoder {
    /// Build a [`BertEncoder`] from already-loaded backends and tokenizer.
    ///
    /// `model_repo` is the HuggingFace repo string for cache keying and
    /// `identity()`. `hidden_dim` is the embedding output dimension
    /// (384 for BGE-small-en-v1.5; 768 for ModernBERT-embed-base).
    #[must_use]
    pub fn new(
        backends: Vec<Box<dyn EmbedBackend>>,
        tokenizer: tokenizers::Tokenizer,
        model_repo: String,
        hidden_dim: usize,
    ) -> Self {
        Self {
            backends,
            tokenizer,
            model_repo,
            hidden_dim,
        }
    }

    /// Borrow the underlying backends.
    ///
    /// Useful for query-time embedding (e.g., interactive mode re-uses
    /// the loaded backend to embed a query string).
    #[must_use]
    pub fn backends(&self) -> &[Box<dyn EmbedBackend>] {
        &self.backends
    }

    /// Borrow the underlying tokenizer.
    ///
    /// Same query-time use case as [`backends`](Self::backends).
    #[must_use]
    pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
        &self.tokenizer
    }
}

impl VectorEncoder for BertEncoder {
    /// Embed everything under `root` by forwarding to `embed_all` with
    /// this encoder's backends and tokenizer.
    ///
    /// The result of `embed_all` — success or error — is returned
    /// unchanged.
    fn embed_root(
        &self,
        root: &Path,
        cfg: &SearchConfig,
        profiler: &Profiler,
    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
        // `embed_all` wants a slice of borrowed trait objects, while the
        // encoder stores owned boxes; build the borrowed view first.
        let mut backend_refs: Vec<&dyn EmbedBackend> = Vec::with_capacity(self.backends.len());
        for boxed in &self.backends {
            backend_refs.push(Box::as_ref(boxed));
        }
        embed_all(
            root,
            backend_refs.as_slice(),
            &self.tokenizer,
            cfg,
            profiler,
        )
    }

    /// Embedding output dimension recorded at construction time.
    fn hidden_dim(&self) -> usize {
        self.hidden_dim
    }

    /// The model repo string recorded at construction time.
    fn identity(&self) -> &str {
        self.model_repo.as_str()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time proof that `BertEncoder` satisfies `VectorEncoder`
    /// (and is additionally `Send + Sync`): it is supplied as the type
    /// argument of a generic helper carrying exactly those bounds. Should
    /// the impl stop matching the trait (e.g., a return type changes),
    /// this module no longer compiles.
    ///
    /// Maps to acceptance `test:bert-encoder-streaming-pipeline-intact`
    /// in `docs/PLAN.md:P0.3` — a stable trait surface is what lets
    /// downstream code (cache layer, SearchIndex) stay unchanged.
    #[test]
    fn bert_encoder_implements_vector_encoder() {
        fn require_encoder_bounds<T>()
        where
            T: VectorEncoder + Send + Sync,
        {
        }
        require_encoder_bounds::<BertEncoder>();
    }

    /// Compile-time proof that `embed_root`, called through the
    /// `VectorEncoder` trait on a `BertEncoder`, yields the
    /// `(Vec<CodeChunk>, Vec<Vec<f32>>)`-shaped `crate::Result` that
    /// callers of the existing pipeline already consume.
    ///
    /// Maps to acceptance
    /// `test:bert-encoder-preserves-embed-all-output` in
    /// `docs/PLAN.md:P0.3` — behavioral parity for the wrap is the goal
    /// of P0.3, and the wrap is a single forwarding call. End-to-end
    /// behavioral parity is left to the existing `embed::tests` suite
    /// plus the integration test at
    /// `crates/ripvec/tests/integration.rs` once P4.1 wires
    /// `BertEncoder` through `load_pipeline`.
    #[test]
    fn embed_root_return_type_matches_embed_all() {
        // The shape every caller of the pipeline expects back.
        type EmbedOutput = crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)>;

        // Returning the trait method's result under the alias makes the
        // compiler check the two types agree. A signature drift in
        // `embed_all` is caught by the impl block itself; this helper is
        // only a sentinel for the trait-level return shape.
        fn forward_embed_root<E>(
            encoder: &E,
            root: &Path,
            cfg: &SearchConfig,
            profiler: &Profiler,
        ) -> EmbedOutput
        where
            E: VectorEncoder,
        {
            encoder.embed_root(root, cfg, profiler)
        }

        // Instantiate the helper for `BertEncoder` so the check is
        // actually type-checked and survives dead-code analysis.
        let _ = forward_embed_root::<BertEncoder>;
    }
}