Skip to main content

ripvec_core/encoder/
mod.rs

1//! Encoder abstraction above [`EmbedBackend`](crate::backend::EmbedBackend).
2//!
3//! [`VectorEncoder`] hides the difference between transformer and static-table
4//! encoders behind one interface, so downstream search code (CLI dispatch,
5//! [`HybridIndex`](crate::hybrid::HybridIndex), cache layer) does not branch
6//! on encoder family.
7//!
8//! ## Two implementations
9//!
10//! - [`BertEncoder`] (P0.3) — wraps `Vec<Box<dyn EmbedBackend>>` + tokenizer.
11//!   Used for `--model bert` and `--model modernbert`. Owns the existing
12//!   walk/chunk/tokenize/embed streaming pipeline.
13//!
14//! - [`StaticEncoder`](crate::encoder::ripvec::dense::StaticEncoder) (P1.5) —
15//!   wraps [`model2vec::Model2Vec`]. Used for `--model ripvec`. CPU-only;
16//!   no batching or ring buffer (table-lookup encoder is memory-bound, not
17//!   compute-bound).
18//!
19//! ## Design rationale
20//!
21//! Each implementation owns its full pipeline because transformer and static
22//! encoders have fundamentally different compute shapes:
23//!
24//! | | BERT | static |
25//! |---|---|---|
26//! | Tokenizer | HuggingFace BPE/WordPiece | model2vec internal |
27//! | Inference | multi-layer attention + GEMM | embedding-table lookup |
28//! | Scheduler | rayon clones (CPU) / ring buffer (GPU) | single-threaded encode |
29//! | Hidden dim | 384 / 768 | 256 |
30//!
31//! Forcing a uniform "tokenize then encode" abstraction would either lie
32//! about static encoders (no real tokens to expose) or impose transformer
33//! ceremony on a lookup table. `VectorEncoder` instead abstracts at the
34//! repo→(chunks, embeddings) boundary, where the shapes naturally agree.
35//!
36//! See `docs/PLAN.md` cluster P0 for the broader port architecture.
37
38use std::path::Path;
39
40use crate::chunk::CodeChunk;
41use crate::embed::SearchConfig;
42use crate::profile::Profiler;
43
44pub mod bert;
45pub mod ripvec;
46
47pub use bert::BertEncoder;
48
49/// Trait that abstracts text/chunks → embedding vectors.
50///
51/// Implementations own their full pipeline (walk, chunk, tokenize, encode)
52/// since transformer-family and static-table encoders have fundamentally
53/// different compute shapes (see module-level docs).
54///
55/// # Object safety
56///
57/// `dyn VectorEncoder` is constructible. Methods take `&self` and use only
58/// concrete return types — no associated types or generic methods.
59///
60/// # Thread safety
61///
62/// `Send + Sync` is required because the encoder is shared across the
63/// indexing pipeline's rayon and channel-based workers.
64pub trait VectorEncoder: Send + Sync {
65    /// Walk `root`, chunk every supported file, and embed every chunk.
66    ///
67    /// Returns the chunks and their embeddings in parallel order: chunk `i`
68    /// has embedding `embeddings[i]`. Implementations choose their own
69    /// chunker (BERT uses ripvec's tree-sitter chunker; ripvec uses
70    /// ripvec's AST-merge chunker — they emit different chunk shapes,
71    /// both projected onto [`CodeChunk`]).
72    ///
73    /// `cfg` carries pipeline tuning (batch size, token caps, walk filters).
74    /// Static encoders ignore the transformer-specific fields (`batch_size`,
75    /// `max_tokens`) but still consult walk-related fields.
76    ///
77    /// # Errors
78    ///
79    /// Returns an error if file walking, chunking, tokenization, or
80    /// inference fails.
81    fn embed_root(
82        &self,
83        root: &Path,
84        cfg: &SearchConfig,
85        profiler: &Profiler,
86    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)>;
87
88    /// Hidden dimension of the emitted embeddings.
89    ///
90    /// Used by [`SearchIndex`](crate::index::SearchIndex) for the embedding
91    /// matrix shape, and by the cache layer to refuse cross-family loads
92    /// (a 256-dim semble index cannot be queried by a 768-dim ModernBERT
93    /// query).
94    fn hidden_dim(&self) -> usize;
95
96    /// Stable identifier used as the cache-manifest key.
97    ///
98    /// For HuggingFace-backed encoders, the model repo string (e.g.
99    /// `"nomic-ai/modernbert-embed-base"`, `"minishlab/potion-code-16M"`).
100    /// The ripvec engine path does not write the cache; this is still consulted
101    /// for logging and diagnostics.
102    fn identity(&self) -> &str;
103}
104
#[cfg(test)]
mod tests {
    use super::VectorEncoder;

    /// Compile-time probe: instantiable only for `Send` types.
    fn require_send<T: Send>() {}

    /// Compile-time probe: instantiable only for `Send + Sync` types.
    fn require_send_sync<T: Send + Sync>() {}

    /// `VectorEncoder` must be object-safe. Naming `dyn VectorEncoder` in a
    /// signature fails to compile otherwise, so a successful build is the
    /// whole assertion.
    #[test]
    fn trait_is_object_safe() {
        fn takes_dyn(_encoder: &dyn VectorEncoder) {}
        // Binding the function item keeps the signature type-checked and
        // stops dead-code analysis from flagging the helper as unused.
        let _: fn(&dyn VectorEncoder) = takes_dyn;
    }

    /// An owned trait object (`Box<dyn VectorEncoder>`) must be `Send` and
    /// `Sync`.
    #[test]
    fn trait_object_is_send_and_sync() {
        require_send_sync::<Box<dyn VectorEncoder>>();
    }

    /// A shared borrow (`&dyn VectorEncoder`) must be `Send` so it can be
    /// handed to parallel pipeline workers.
    #[test]
    fn shared_reference_is_send() {
        require_send::<&dyn VectorEncoder>();
    }
}