ripvec_core/encoder/mod.rs
//! Encoder abstraction above [`EmbedBackend`](crate::backend::EmbedBackend).
//!
//! [`VectorEncoder`] hides the difference between transformer and static-table
//! encoders behind one interface, so downstream search code (CLI dispatch,
//! [`HybridIndex`](crate::hybrid::HybridIndex), cache layer) does not branch
//! on encoder family.
7//!
//! ## Two implementations
//!
//! - [`BertEncoder`] (P0.3) — wraps `Vec<Box<dyn EmbedBackend>>` + tokenizer.
//!   Used for `--model bert` and `--model modernbert`. Owns the existing
//!   walk/chunk/tokenize/embed streaming pipeline.
//!
//! - [`StaticEncoder`](crate::encoder::ripvec::dense::StaticEncoder) (P1.5) —
//!   wraps [`model2vec::Model2Vec`]. Used for `--model ripvec`. CPU-only;
//!   no batching or ring buffer (a table-lookup encoder is memory-bound, not
//!   compute-bound).
18//!
//! ## Design rationale
//!
//! Each implementation owns its full pipeline because transformer and static
//! encoders have fundamentally different compute shapes:
//!
//! | | BERT | static |
//! |---|---|---|
//! | Tokenizer | HuggingFace BPE/WordPiece | model2vec internal |
//! | Inference | multi-layer attention + GEMM | embedding-table lookup |
//! | Scheduler | rayon clones (CPU) / ring buffer (GPU) | single-threaded encode |
//! | Hidden dim | 384 / 768 | 256 |
30//!
//! Forcing a uniform "tokenize then encode" abstraction would either lie
//! about static encoders (no real tokens to expose) or impose transformer
//! ceremony on a lookup table. `VectorEncoder` instead abstracts at the
//! repo→(chunks, embeddings) boundary, where the shapes naturally agree.
//!
//! See `docs/PLAN.md` cluster P0 for the broader port architecture.
37
38use std::path::Path;
39
40use crate::chunk::CodeChunk;
41use crate::embed::SearchConfig;
42use crate::profile::Profiler;
43
44pub mod bert;
45pub mod ripvec;
46
47pub use bert::BertEncoder;
48
49/// Trait that abstracts text/chunks → embedding vectors.
50///
51/// Implementations own their full pipeline (walk, chunk, tokenize, encode)
52/// since transformer-family and static-table encoders have fundamentally
53/// different compute shapes (see module-level docs).
54///
55/// # Object safety
56///
57/// `dyn VectorEncoder` is constructible. Methods take `&self` and use only
58/// concrete return types — no associated types or generic methods.
59///
60/// # Thread safety
61///
62/// `Send + Sync` is required because the encoder is shared across the
63/// indexing pipeline's rayon and channel-based workers.
64pub trait VectorEncoder: Send + Sync {
65 /// Walk `root`, chunk every supported file, and embed every chunk.
66 ///
67 /// Returns the chunks and their embeddings in parallel order: chunk `i`
68 /// has embedding `embeddings[i]`. Implementations choose their own
69 /// chunker (BERT uses ripvec's tree-sitter chunker; ripvec uses
70 /// ripvec's AST-merge chunker — they emit different chunk shapes,
71 /// both projected onto [`CodeChunk`]).
72 ///
73 /// `cfg` carries pipeline tuning (batch size, token caps, walk filters).
74 /// Static encoders ignore the transformer-specific fields (`batch_size`,
75 /// `max_tokens`) but still consult walk-related fields.
76 ///
77 /// # Errors
78 ///
79 /// Returns an error if file walking, chunking, tokenization, or
80 /// inference fails.
81 fn embed_root(
82 &self,
83 root: &Path,
84 cfg: &SearchConfig,
85 profiler: &Profiler,
86 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)>;
87
88 /// Hidden dimension of the emitted embeddings.
89 ///
90 /// Used by [`SearchIndex`](crate::index::SearchIndex) for the embedding
91 /// matrix shape, and by the cache layer to refuse cross-family loads
92 /// (a 256-dim semble index cannot be queried by a 768-dim ModernBERT
93 /// query).
94 fn hidden_dim(&self) -> usize;
95
96 /// Stable identifier used as the cache-manifest key.
97 ///
98 /// For HuggingFace-backed encoders, the model repo string (e.g.
99 /// `"nomic-ai/modernbert-embed-base"`, `"minishlab/potion-code-16M"`).
100 /// The ripvec engine path does not write the cache; this is still consulted
101 /// for logging and diagnostics.
102 fn identity(&self) -> &str;
103}
104
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time marker: instantiable only for `Send` types.
    fn require_send<T: Send>() {}

    /// Compile-time marker: instantiable only for `Sync` types.
    fn require_sync<T: Sync>() {}

    /// `VectorEncoder` must be object-safe. The test body does nothing at
    /// runtime; it passes by compiling.
    #[test]
    fn trait_is_object_safe() {
        // Naming `dyn VectorEncoder` in a type annotation is the check
        // itself: the compiler rejects the annotation if the trait cannot be
        // made into an object. Binding to `_` avoids an unused-variable lint.
        let _: Option<&dyn VectorEncoder> = None;
    }

    /// An owned trait object, `Box<dyn VectorEncoder>`, must be both `Send`
    /// and `Sync`.
    #[test]
    fn trait_object_is_send_and_sync() {
        require_send::<Box<dyn VectorEncoder>>();
        require_sync::<Box<dyn VectorEncoder>>();
    }

    /// A shared borrow, `&dyn VectorEncoder`, must be `Send` so it can be
    /// handed to parallel pipelines.
    #[test]
    fn shared_reference_is_send() {
        require_send::<&dyn VectorEncoder>();
    }
}