Skip to main content

ripvec_core/encoder/
bert.rs

//! BERT-family encoder: wraps the existing `embed_all` pipeline.
//!
//! Bridges the [`VectorEncoder`] trait surface to the
//! [`embed::embed_all`](crate::embed::embed_all) function that powers
//! `--model bert` and `--model modernbert`. Owns the
//! `Vec<Box<dyn EmbedBackend>>` (one per detected GPU/CPU backend) and the
//! HuggingFace tokenizer, both required by the streaming pipeline.
//!
//! ## Why a thin wrapper, not a refactor
//!
//! The `embed_all` body — walk → chunk → tokenize → embed via the streaming
//! pipeline — is non-trivial and rich in edge cases (rayon clones for CPU,
//! a ring buffer for GPU, sort-by-length batching, a file-count threshold
//! for streaming vs batch). Wrapping rather than relocating keeps that
//! battle-tested code intact and reduces P0.3 to a small adapter, deferring
//! any deeper refactor until benefits emerge.
//!
//! See `docs/PLAN.md:P0.3` for the acceptance predicates.
19
20use std::path::Path;
21
22use crate::backend::EmbedBackend;
23use crate::chunk::CodeChunk;
24use crate::embed::{SearchConfig, embed_all};
25use crate::encoder::VectorEncoder;
26use crate::profile::Profiler;
27
28/// BERT-family encoder implementation of [`VectorEncoder`].
29///
30/// Constructed by `main.rs::load_pipeline` with detected backends and the
31/// model's HF tokenizer. The `model_repo` and `hidden_dim` are recorded
32/// at construction time for cache keying and diagnostics; both are
33/// known from the model selection in the CLI.
34pub struct BertEncoder {
35    backends: Vec<Box<dyn EmbedBackend>>,
36    tokenizer: tokenizers::Tokenizer,
37    model_repo: String,
38    hidden_dim: usize,
39}
40
41impl BertEncoder {
42    /// Build a [`BertEncoder`] from already-loaded backends and tokenizer.
43    ///
44    /// `model_repo` is the HuggingFace repo string for cache keying and
45    /// `identity()`. `hidden_dim` is the embedding output dimension
46    /// (384 for BGE-small-en-v1.5; 768 for ModernBERT-embed-base).
47    #[must_use]
48    pub fn new(
49        backends: Vec<Box<dyn EmbedBackend>>,
50        tokenizer: tokenizers::Tokenizer,
51        model_repo: String,
52        hidden_dim: usize,
53    ) -> Self {
54        Self {
55            backends,
56            tokenizer,
57            model_repo,
58            hidden_dim,
59        }
60    }
61
62    /// Borrow the underlying backends.
63    ///
64    /// Useful for query-time embedding (e.g., interactive mode re-uses
65    /// the loaded backend to embed a query string).
66    #[must_use]
67    pub fn backends(&self) -> &[Box<dyn EmbedBackend>] {
68        &self.backends
69    }
70
71    /// Borrow the underlying tokenizer.
72    ///
73    /// Same query-time use case as [`backends`](Self::backends).
74    #[must_use]
75    pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
76        &self.tokenizer
77    }
78}
79
80impl VectorEncoder for BertEncoder {
81    fn embed_root(
82        &self,
83        root: &Path,
84        cfg: &SearchConfig,
85        profiler: &Profiler,
86    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
87        // embed_all takes `&[&dyn EmbedBackend]`; convert from owned boxes.
88        let backend_refs: Vec<&dyn EmbedBackend> = self.backends.iter().map(Box::as_ref).collect();
89        embed_all(root, &backend_refs, &self.tokenizer, cfg, profiler)
90    }
91
92    fn hidden_dim(&self) -> usize {
93        self.hidden_dim
94    }
95
96    fn identity(&self) -> &str {
97        &self.model_repo
98    }
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time proof that `BertEncoder` satisfies
    /// `VectorEncoder + Send + Sync`: it is passed as the type argument of
    /// a helper carrying exactly that bound. If the trait signature drifts
    /// (e.g., a return type changes) so that the impl no longer matches,
    /// this test stops compiling.
    ///
    /// Corresponds to acceptance `test:bert-encoder-streaming-pipeline-intact`
    /// in `docs/PLAN.md:P0.3` — preserving the trait surface is what keeps
    /// downstream code (cache layer, SearchIndex) unchanged.
    #[test]
    fn bert_encoder_implements_vector_encoder() {
        fn requires_encoder<T>()
        where
            T: VectorEncoder + Send + Sync,
        {
        }
        requires_encoder::<BertEncoder>();
    }

    /// `BertEncoder::embed_root` is a single forwarding call to
    /// `embed::embed_all`, so callers must keep seeing the
    /// `(Vec<CodeChunk>, Vec<Vec<f32>>)` result shape of the existing
    /// pipeline. This is checked by the compiler, not at runtime.
    ///
    /// Corresponds to acceptance
    /// `test:bert-encoder-preserves-embed-all-output` in
    /// `docs/PLAN.md:P0.3` — behavioral parity for the wrap is the
    /// goal of P0.3. End-to-end behavioral parity is covered by the
    /// existing `embed::tests` suite plus the integration test at
    /// `crates/ripvec/tests/integration.rs` once P4.1 wires
    /// `BertEncoder` through `load_pipeline`.
    #[test]
    fn embed_root_return_type_matches_embed_all() {
        /// The result shape the pipeline is expected to keep producing.
        type EmbedOutput = crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)>;

        // Sentinel: only type-checks while `embed_root` returns
        // `EmbedOutput`. Should `embed_all`'s signature drift, the
        // forwarding call in the impl block breaks first.
        fn forward<E>(
            encoder: &E,
            root: &Path,
            cfg: &SearchConfig,
            profiler: &Profiler,
        ) -> EmbedOutput
        where
            E: VectorEncoder,
        {
            encoder.embed_root(root, cfg, profiler)
        }

        // Naming the instantiation forces it to be type-checked and keeps
        // dead-code analysis from flagging the helper.
        let _ = forward::<BertEncoder>;
    }
}