ripvec_core/encoder/bert.rs
1//! BERT-family encoder: wraps the existing `embed_all` pipeline.
2//!
3//! Bridges the [`VectorEncoder`] trait surface to the
4//! [`embed::embed_all`](crate::embed::embed_all) function that powers
5//! `--model bert` and `--model modernbert`. Owns the
6//! `Vec<Box<dyn EmbedBackend>>` (one per detected GPU/CPU backend) and the
7//! HuggingFace tokenizer, both required by the streaming pipeline.
8//!
9//! ## Why a thin wrapper, not a refactor
10//!
11//! The `embed_all` body — walk → chunk → tokenize → embed via streaming
12//! pipeline — is non-trivial and rich in edge cases (rayon clones for CPU,
13//! ring-buffer for GPU, sort-by-length batching, file-count threshold for
14//! streaming vs batch). Wrapping rather than relocating keeps that battle-
15//! tested code intact and reduces P0.3 to a small adapter, deferring any
16//! deeper refactor until benefits emerge.
17//!
18//! See `docs/PLAN.md:P0.3` for the acceptance predicates.
19
20use std::path::Path;
21
22use crate::backend::EmbedBackend;
23use crate::chunk::CodeChunk;
24use crate::embed::{SearchConfig, embed_all};
25use crate::encoder::VectorEncoder;
26use crate::profile::Profiler;
27
28/// BERT-family encoder implementation of [`VectorEncoder`].
29///
30/// Constructed by `main.rs::load_pipeline` with detected backends and the
31/// model's HF tokenizer. The `model_repo` and `hidden_dim` are recorded
32/// at construction time for cache keying and diagnostics; both are
33/// known from the model selection in the CLI.
34pub struct BertEncoder {
35 backends: Vec<Box<dyn EmbedBackend>>,
36 tokenizer: tokenizers::Tokenizer,
37 model_repo: String,
38 hidden_dim: usize,
39}
40
41impl BertEncoder {
42 /// Build a [`BertEncoder`] from already-loaded backends and tokenizer.
43 ///
44 /// `model_repo` is the HuggingFace repo string for cache keying and
45 /// `identity()`. `hidden_dim` is the embedding output dimension
46 /// (384 for BGE-small-en-v1.5; 768 for ModernBERT-embed-base).
47 #[must_use]
48 pub fn new(
49 backends: Vec<Box<dyn EmbedBackend>>,
50 tokenizer: tokenizers::Tokenizer,
51 model_repo: String,
52 hidden_dim: usize,
53 ) -> Self {
54 Self {
55 backends,
56 tokenizer,
57 model_repo,
58 hidden_dim,
59 }
60 }
61
62 /// Borrow the underlying backends.
63 ///
64 /// Useful for query-time embedding (e.g., interactive mode re-uses
65 /// the loaded backend to embed a query string).
66 #[must_use]
67 pub fn backends(&self) -> &[Box<dyn EmbedBackend>] {
68 &self.backends
69 }
70
71 /// Borrow the underlying tokenizer.
72 ///
73 /// Same query-time use case as [`backends`](Self::backends).
74 #[must_use]
75 pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
76 &self.tokenizer
77 }
78}
79
80impl VectorEncoder for BertEncoder {
81 fn embed_root(
82 &self,
83 root: &Path,
84 cfg: &SearchConfig,
85 profiler: &Profiler,
86 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
87 // embed_all takes `&[&dyn EmbedBackend]`; convert from owned boxes.
88 let backend_refs: Vec<&dyn EmbedBackend> = self.backends.iter().map(Box::as_ref).collect();
89 embed_all(root, &backend_refs, &self.tokenizer, cfg, profiler)
90 }
91
92 fn hidden_dim(&self) -> usize {
93 self.hidden_dim
94 }
95
96 fn identity(&self) -> &str {
97 &self.model_repo
98 }
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time proof that `BertEncoder` satisfies
    /// `VectorEncoder + Send + Sync`: it is supplied as the type argument
    /// of a generic function carrying exactly those bounds. Should the
    /// trait's signature drift (e.g., a return type changes), the impl
    /// stops matching and this module no longer builds.
    ///
    /// Maps to acceptance `test:bert-encoder-streaming-pipeline-intact`
    /// in `docs/PLAN.md:P0.3` — an unchanged trait surface is what lets
    /// downstream code (cache layer, SearchIndex) stay as it is.
    #[test]
    fn bert_encoder_implements_vector_encoder() {
        fn requires_encoder<T>()
        where
            T: VectorEncoder + Send + Sync,
        {
        }

        requires_encoder::<BertEncoder>();
    }

    /// Compile-time proof that `embed_root` hands callers the same
    /// `(Vec<CodeChunk>, Vec<Vec<f32>>)` result shape the existing
    /// pipeline produced.
    ///
    /// Maps to acceptance
    /// `test:bert-encoder-preserves-embed-all-output` in
    /// `docs/PLAN.md:P0.3`. The wrap is a single forwarding call, so
    /// runtime parity is left to the existing `embed::tests` suite and
    /// to the integration test at `crates/ripvec/tests/integration.rs`
    /// once P4.1 wires `BertEncoder` through `load_pipeline`.
    #[test]
    fn embed_root_return_type_matches_embed_all() {
        // The result shape this sentinel pins down.
        type Embedded = crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)>;

        // Generic forwarder: it only type-checks while `embed_root`
        // returns exactly `Embedded`.
        fn forward<E>(
            encoder: &E,
            root: &Path,
            cfg: &SearchConfig,
            profiler: &Profiler,
        ) -> Embedded
        where
            E: VectorEncoder,
        {
            encoder.embed_root(root, cfg, profiler)
        }

        // Instantiate the forwarder for `BertEncoder` without calling it,
        // so the check survives dead-code analysis.
        let _ = forward::<BertEncoder>;
    }
}