ripvec_core/encoder/mod.rs
1//! Encoder abstraction for the ripvec static-table engine.
2//!
3//! [`VectorEncoder`] exposes the surviving ripvec engine behind one interface,
4//! so downstream search code (CLI dispatch,
5//! [`HybridIndex`](crate::hybrid::HybridIndex), cache layer) does not branch
6//! on encoder internals.
7//!
8//! ## Implementation
9//!
10//! - [`StaticEncoder`](crate::encoder::ripvec::dense::StaticEncoder) —
11//! static embedding-table lookup via the in-process Model2Vec engine.
12//! Used for `--model ripvec`. CPU-only; no batching or ring buffer
13//! (table-lookup encoder is memory-bound, not compute-bound).
14//!
15//! ## Design rationale
16//!
17//! `VectorEncoder` abstracts at the repo→(chunks, embeddings) boundary,
18//! where the concrete pipeline shape does not leak through. Callers receive
19//! a `(Vec<CodeChunk>, Vec<Vec<f32>>)` pair regardless of how the encoder
20//! implements walk, chunk, and embed internally.
21//!
22//! @Parnas (1972) — the module hides which engine is active; the trait is
23//! the stable interface boundary. @Postel (1980) — callers use the same
24//! `VectorEncoder` surface; no change at the call site after the transformer
25//! path was removed.
26//!
27//! See `docs/PLAN.md` cluster B6 for the surgery context.
28
29use std::path::Path;
30
31use crate::chunk::CodeChunk;
32use crate::embed::SearchConfig;
33use crate::profile::Profiler;
34
35pub mod ripvec;
36
37/// Trait that abstracts text/chunks → embedding vectors.
38///
39/// The implementation owns its full pipeline (walk, chunk, encode).
40///
41/// # Object safety
42///
43/// `dyn VectorEncoder` is constructible. Methods take `&self` and use only
44/// concrete return types — no associated types or generic methods.
45///
46/// # Thread safety
47///
48/// `Send + Sync` is required because the encoder is shared across the
49/// indexing pipeline's rayon and channel-based workers.
50pub trait VectorEncoder: Send + Sync {
51 /// Walk `root`, chunk every supported file, and embed every chunk.
52 ///
53 /// Returns the chunks and their embeddings in parallel order: chunk `i`
54 /// has embedding `embeddings[i]`. The ripvec engine uses an AST-merge
55 /// chunker and projects chunks onto [`CodeChunk`].
56 ///
57 /// `cfg` carries pipeline tuning (walk filters, etc.).
58 ///
59 /// # Errors
60 ///
61 /// Returns an error if file walking, chunking, or inference fails.
62 fn embed_root(
63 &self,
64 root: &Path,
65 cfg: &SearchConfig,
66 profiler: &Profiler,
67 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)>;
68
69 /// Hidden dimension of the emitted embeddings.
70 ///
71 /// Used by [`SearchIndex`](crate::index::SearchIndex) for the embedding
72 /// matrix shape and by the cache layer to refuse dimension-mismatched
73 /// loads.
74 fn hidden_dim(&self) -> usize;
75
76 /// Stable identifier used as the cache-manifest key.
77 ///
78 /// For the ripvec engine, the Model2Vec repo string (e.g.
79 /// `"minishlab/potion-code-16M"`). Consulted for logging and diagnostics.
80 fn identity(&self) -> &str;
81}
82
#[cfg(test)]
mod tests {
    use super::VectorEncoder;

    /// `VectorEncoder` must stay usable as a trait object.
    ///
    /// Nothing runs here: naming `dyn VectorEncoder` in a binding's type
    /// only compiles while the trait is dyn-compatible, so a successful
    /// build is the assertion.
    #[test]
    fn trait_is_object_safe() {
        let _witness: Option<&dyn VectorEncoder> = None;
    }

    /// An owned `Box<dyn VectorEncoder>` must be both `Send` and `Sync`.
    #[test]
    fn trait_object_is_send_and_sync() {
        fn require_send_sync<T>()
        where
            T: Send + Sync,
        {
        }

        require_send_sync::<Box<dyn VectorEncoder>>();
    }

    /// A shared `&dyn VectorEncoder` must be `Send` so it can be handed to
    /// parallel pipelines.
    #[test]
    fn shared_reference_is_send() {
        fn require_send<T>()
        where
            T: Send,
        {
        }

        require_send::<&dyn VectorEncoder>();
    }
}
110}