Skip to main content

ripvec_core/encoder/ripvec/
dense.rs

1//! Static encoder: in-process `StaticEmbedModel` reimplementation.
2//!
3//! Port of `~/src/semble/src/semble/index/dense.py`. Wraps
4//! [`StaticEmbedModel`] loaded with `minishlab/potion-base-32M`
5//! (256-dim, L2-normalized). Implements [`VectorEncoder`] for the
6//! `--model ripvec` path. CPU-only; no batching ring buffer.
7//!
8//! Default was bumped to `potion-base-32M` in v1.3.0 after the
9//! gutenberg + python-repos matrix showed 32M winning prose by
10//! 0.058 NDCG@10 while losing code by only 0.004 — a clear
11//! single-default win once the i64 mapping bug and the reranker
12//! pooler / sigmoid / truncation bugs were fixed. The code-tuned
13//! `potion-code-16M` is still available via `--model-repo`.
14//!
15//! ## Why not `model2vec-rs`?
16//!
//! The previous wave used the upstream `model2vec-rs` crate. Two real
//! problems pushed us to reimplement (see
//! `crates/ripvec-core/src/encoder/ripvec/static_model.rs` for the
//! full design rationale):
21//!
22//! 1. `model2vec_rs::StaticModel::encode_with_args` runs `pool_ids`
23//!    in a serial inner loop while `tokenizers::encode_batch_fast`
24//!    spawns its own rayon pool. Wrapping that path in our outer
25//!    `par_chunks` produced 60% `__psynch_cvwait` in the linux-corpus
26//!    profile — nested rayon scopes parking on each other. The
27//!    reimplementation does ONE big tokenize plus a `par_iter` over
28//!    `pool_ids` — no nested rayon, no parking.
29//! 2. `model2vec-rs 0.2` pinned `ndarray 0.15`; ripvec-core uses
30//!    `ndarray 0.17`. The two `Array2<f32>` types were not
31//!    interchangeable, forcing a `Vec<Vec<f32>>` shim. Owning the
32//!    load path eliminates the mismatch.
33
34use std::path::{Path, PathBuf};
35use std::sync::Mutex;
36
37use crossbeam_channel::bounded;
38use hf_hub::api::sync::Api;
39use rayon::prelude::*;
40
41use crate::chunk::CodeChunk;
42use crate::embed::SearchConfig;
43use crate::encoder::VectorEncoder;
44use crate::encoder::ripvec::chunking::{DEFAULT_DESIRED_CHUNK_CHARS, chunk_source};
45use crate::encoder::ripvec::static_model::StaticEmbedModel;
46use crate::languages::config_for_extension;
47use crate::profile::Profiler;
48use crate::walk::collect_files_with_options;
49
/// Number of `(CodeChunk, String)` pairs packed into one batch by the
/// streaming pipeline's accumulator stage before it is handed to the
/// encode worker. Intended to match `StaticEmbedModel`'s internal
/// `BATCH_SIZE` (defined in `static_model`, not visible from this
/// file) so each emitted batch is exactly one `encode_batch_fast`
/// call's worth of work.
const PIPELINE_BATCH_SIZE: usize = 1024;

/// Number of full batches allowed in-flight from chunker to encoder
/// (the capacity of the batch channel in `embed_root`).
/// Provides enough pipeline depth for the encoder to stay busy while
/// the chunker fills the next batch; small enough that peak memory
/// stays bounded.
const PIPELINE_RING_SIZE: usize = 4;

/// Default model repo identifier for the ripvec path.
///
/// [`StaticEncoder::from_pretrained`] accepts either a HuggingFace
/// repo ID such as this one (fetched through `hf-hub`) or the path of
/// an existing local directory. Whichever string the caller passes is
/// what `identity()` later reports.
pub const DEFAULT_MODEL_REPO: &str = "minishlab/potion-base-32M";

/// Default hidden dimension for [`DEFAULT_MODEL_REPO`].
pub const DEFAULT_HIDDEN_DIM: usize = 256;

/// Maximum source file size to read, in bytes (mirrors semble's
/// `_MAX_FILE_BYTES = 1_000_000` from `index/create.py:16`).
/// Files whose metadata reports a larger size are skipped by
/// `chunk_one_file` without being read.
const MAX_FILE_BYTES: u64 = 1_000_000;
72
/// CPU-only static encoder.
///
/// Owns a loaded [`StaticEmbedModel`] plus identity metadata. The
/// embedder is constructed by `main.rs::load_pipeline` via
/// [`StaticEncoder::from_pretrained`], passing either a local
/// directory containing the Model2Vec files or a HuggingFace repo ID
/// (in which case the files are fetched through `hf-hub`).
pub struct StaticEncoder {
    /// The loaded static embedding model used for all encoding.
    model: StaticEmbedModel,
    /// The string passed to `from_pretrained` (repo ID or local path);
    /// returned verbatim by `identity()`.
    model_repo: String,
    /// Embedding width, read from the model once at load time.
    hidden_dim: usize,
}
84
85impl StaticEncoder {
86    /// Encode a query string into a single embedding row.
87    ///
88    /// Used by `RipvecIndex::search` for hybrid/semantic dispatch.
89    #[must_use]
90    pub fn encode_query(&self, query: &str) -> Vec<f32> {
91        self.model.encode_query(query)
92    }
93
94    /// Load a model by HuggingFace repo ID or local path.
95    ///
96    /// Two acceptance shapes:
97    ///
98    /// 1. **Local path** — if `model_repo` names an existing directory,
99    ///    load directly from it. Used by the parity test fixture path
100    ///    (`/tmp/potion-base-32M`) and any user pre-staging files.
101    /// 2. **HuggingFace repo ID** — otherwise treat as `org/repo`,
102    ///    download `config.json` / `tokenizer.json` / `model.safetensors`
103    ///    via `hf-hub` into `~/.cache/huggingface/hub/`, and load from
104    ///    there. Matches `load_classic_cpu` / `load_modernbert_cpu`'s
105    ///    behaviour so the user-facing API is consistent: bare `--model
106    ///    ripvec` with no `--model-repo` flag works.
107    ///
108    /// # Errors
109    ///
110    /// Propagates the underlying I/O, download, or parse error if the
111    /// files cannot be obtained or the safetensors layout is
112    /// unrecognized.
113    pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
114        let resolved = Self::resolve_model_dir(model_repo)?;
115        let model = StaticEmbedModel::from_path(&resolved, Some(true))
116            .map_err(|e| crate::Error::Other(anyhow::anyhow!("static model load failed: {e}")))?;
117        let hidden_dim = model.hidden_dim();
118        Ok(Self {
119            model,
120            model_repo: model_repo.to_string(),
121            hidden_dim,
122        })
123    }
124
125    /// Resolve `model_repo` to a directory containing the model files.
126    ///
127    /// If `model_repo` is an existing local directory, returns it as-is.
128    /// Otherwise downloads via `hf-hub` and returns the cache directory.
129    fn resolve_model_dir(model_repo: &str) -> crate::Result<PathBuf> {
130        let local = Path::new(model_repo);
131        if local.is_dir() {
132            return Ok(local.to_path_buf());
133        }
134
135        // HuggingFace repo path. Download the three required files and
136        // return the directory `hf-hub` cached them into. All files
137        // land in the same snapshot directory.
138        let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
139        let repo = api.model(model_repo.to_string());
140        let _ = repo
141            .get("config.json")
142            .map_err(|e| crate::Error::Download(e.to_string()))?;
143        let _ = repo
144            .get("tokenizer.json")
145            .map_err(|e| crate::Error::Download(e.to_string()))?;
146        let weights_path = repo
147            .get("model.safetensors")
148            .map_err(|e| crate::Error::Download(e.to_string()))?;
149        // hf-hub returns the file path; the snapshot directory is its parent.
150        weights_path
151            .parent()
152            .map(std::path::Path::to_path_buf)
153            .ok_or_else(|| {
154                crate::Error::Other(anyhow::anyhow!(
155                    "hf-hub returned root path for {model_repo}; cannot resolve snapshot dir"
156                ))
157            })
158    }
159}
160
impl VectorEncoder for StaticEncoder {
    /// Walk `root`, chunk every collected file, and embed every chunk.
    ///
    /// Returns two vectors of equal length where `chunks[i]` pairs with
    /// `embeddings[i]`. Output order across files is not deterministic.
    /// An empty file list yields two empty vectors.
    ///
    /// Three-stage bounded-queue pipeline:
    ///
    /// 1. **Chunk producer** — rayon `par_iter` over the file list. Each
    ///    file is read, parsed by tree-sitter (or line-merged on
    ///    fallback), and emitted as `(CodeChunk, String)` pairs into a
    ///    bounded channel of capacity `PIPELINE_BATCH_SIZE * 8`.
    /// 2. **Batch accumulator** — a single scoped thread drains the
    ///    chunk channel, packs `PIPELINE_BATCH_SIZE` pairs per batch,
    ///    and forwards into a bounded channel of capacity
    ///    `PIPELINE_RING_SIZE`.
    /// 3. **Encode worker** — a single scoped thread receives batches
    ///    and calls `StaticEmbedModel::encode_batch`, whose internal
    ///    `par_iter` lights up rayon for the pool_ids kernel.
    ///
    /// Why this shape:
    ///
    /// - The previous "chunk all, then embed all" implementation held
    ///   the entire `Vec<String>` of chunk contents in memory between
    ///   phases. On the linux corpus that was ~400 MB peak. The
    ///   bounded queues cap in-flight memory at
    ///   `PIPELINE_BATCH_SIZE * 8 + PIPELINE_RING_SIZE * PIPELINE_BATCH_SIZE`
    ///   chunks regardless of corpus size — under 15 MB.
    /// - The chunk phase (13s on linux) is hidden inside the embed
    ///   phase (70s) instead of serializing before it. Pre-pipeline
    ///   profile showed user-time at 394s on 82s wall = 4.8x
    ///   parallelism on 12 cores; pipeline lets idle cores chew on
    ///   chunking while embed runs.
    /// - Mirrors `embed::embed_all_streaming`'s shape so the two
    ///   pipelines (BERT + semble) share architectural conventions.
    ///
    /// # Errors
    ///
    /// The only error returned from this function body is a failure to
    /// build the dedicated chunking thread pool. Files that are too
    /// large or unreadable are skipped by `chunk_one_file` rather than
    /// reported.
    ///
    /// # Panics
    ///
    /// Panics if the output mutex is poisoned. A panic inside any of the
    /// scoped stage threads is re-raised when `std::thread::scope`
    /// returns.
    fn embed_root(
        &self,
        root: &Path,
        cfg: &SearchConfig,
        profiler: &Profiler,
    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
        // Phase 1: walk (still serial-to-pipeline because we need the
        // full file list to par_iter over; the walk itself is rayon).
        let walk_options = cfg.walk_options();
        let file_paths = {
            // Guard is scoped to this block so the "walk" phase ends
            // as soon as the file list has been collected.
            let _guard = profiler.phase("walk");
            collect_files_with_options(root, &walk_options)
        };
        if file_paths.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        // Bounded channels. See module constants for the rationale on
        // PIPELINE_BATCH_SIZE and PIPELINE_RING_SIZE.
        let (chunk_tx, chunk_rx) = bounded::<(CodeChunk, String)>(PIPELINE_BATCH_SIZE * 8);
        let (batch_tx, batch_rx) = bounded::<Vec<(CodeChunk, String)>>(PIPELINE_RING_SIZE);

        // The encoder stage writes ordered output behind a Mutex. Order
        // across files isn't meaningful (RipvecIndex doesn't rely on
        // chunk order), only the chunk[i] <-> embedding[i] pairing
        // matters — which we preserve trivially by pushing in lockstep.
        let output: Mutex<Vec<(CodeChunk, Vec<f32>)>> = Mutex::new(Vec::new());
        let model = &self.model;

        // Stage 1 runs on a DEDICATED rayon thread pool. If we used
        // the global pool, Stage 1's par_iter workers would park on
        // full `chunk_tx.send()` calls, and Stage 3's
        // `encode_batch` → `pool_ids` par_iter would have no rayon
        // workers available (they're all parked). That's a classic
        // nested-rayon deadlock — observed in profiling as PID stuck
        // at 0% CPU with 16 parked threads.
        //
        // Half the cores for chunking, half remain in the global pool
        // for the encode worker's pool_ids. The chunk phase (tree-
        // sitter + I/O bound) doesn't need full parallelism to
        // pipeline cleanly behind embed.
        let num_cores = rayon::current_num_threads().max(2);
        let chunk_threads = (num_cores / 2).max(1);
        let chunk_pool = rayon::ThreadPoolBuilder::new()
            .num_threads(chunk_threads)
            .thread_name(|i| format!("semble-chunk-{i}"))
            .build()
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("chunk thread pool build: {e}")))?;

        // The "pipeline" phase guard lives until the end of the function,
        // so it also covers the final unzip of the collected output.
        let _phase_guard = profiler.phase("pipeline");
        // `thread::scope` joins all three stage threads before it
        // returns, which is what makes borrowing `output`, `model` and
        // `root` from the stages sound.
        std::thread::scope(|scope| {
            // Stage 1: chunk producer on the dedicated pool.
            // The sender is moved into the thread so that it is dropped
            // when the thread finishes; that drop is the end-of-stream
            // signal for Stage 2.
            let chunk_tx_owned = chunk_tx;
            scope.spawn(move || {
                chunk_pool.install(|| {
                    file_paths.par_iter().for_each(|full| {
                        let (chunks, contents) = chunk_one_file(root, full);
                        for (chunk, content) in chunks.into_iter().zip(contents) {
                            if chunk_tx_owned.send((chunk, content)).is_err() {
                                // The receiver is gone (downstream stage
                                // exited). This `return` only leaves the
                                // closure call for the current file; the
                                // remaining files are still visited and
                                // each bails out on its first send.
                                return;
                            }
                        }
                    });
                });
                // chunk_tx_owned drops here, closing the channel.
            });

            // Stage 2: batch accumulator.
            let batch_tx_owned = batch_tx;
            scope.spawn(move || {
                let mut buf: Vec<(CodeChunk, String)> = Vec::with_capacity(PIPELINE_BATCH_SIZE);
                // Iteration ends once Stage 1 has dropped its sender and
                // the chunk channel is drained.
                for pair in chunk_rx {
                    buf.push(pair);
                    if buf.len() >= PIPELINE_BATCH_SIZE {
                        // Swap in a fresh buffer and ship the full one.
                        let batch =
                            std::mem::replace(&mut buf, Vec::with_capacity(PIPELINE_BATCH_SIZE));
                        if batch_tx_owned.send(batch).is_err() {
                            // Encode worker is gone; returning drops
                            // `chunk_rx`, which in turn makes Stage 1's
                            // sends fail.
                            return;
                        }
                    }
                }
                // Flush the final, partially filled batch. A send error
                // is ignored because there is nothing left to do.
                if !buf.is_empty() {
                    let _ = batch_tx_owned.send(buf);
                }
                // batch_tx_owned drops here, closing the channel.
            });

            // Stage 3: encode worker.
            // Not a `move` closure: `output` and `model` are borrowed,
            // while `batch_rx` is consumed by the `for` loop.
            scope.spawn(|| {
                for batch in batch_rx {
                    if batch.is_empty() {
                        continue;
                    }
                    // Split the pairs so the texts can be passed to the
                    // model as a slice of `&str`.
                    let mut chunks = Vec::with_capacity(batch.len());
                    let mut texts: Vec<String> = Vec::with_capacity(batch.len());
                    for (chunk, text) in batch {
                        chunks.push(chunk);
                        texts.push(text);
                    }
                    let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
                    let embeddings = model.encode_batch(&text_refs);
                    // Checked in debug builds only. In release builds a
                    // length mismatch would make the `zip` below stop at
                    // the shorter side without any diagnostic.
                    debug_assert_eq!(embeddings.len(), chunks.len());
                    // The lock is taken after encoding, so it is held
                    // only for the pushes of this batch.
                    let mut out = output.lock().expect("output mutex poisoned");
                    for (chunk, emb) in chunks.into_iter().zip(embeddings) {
                        out.push((chunk, emb));
                    }
                }
            });
        });

        // All stage threads have been joined; unwrap the mutex and split
        // the pairs into the two parallel vectors the trait returns.
        let collected = output.into_inner().expect("output mutex poisoned");
        let mut chunks_out = Vec::with_capacity(collected.len());
        let mut embs_out = Vec::with_capacity(collected.len());
        for (chunk, emb) in collected {
            chunks_out.push(chunk);
            embs_out.push(emb);
        }
        Ok((chunks_out, embs_out))
    }

    /// Embedding width recorded when the model was loaded.
    fn hidden_dim(&self) -> usize {
        self.hidden_dim
    }

    /// The repo ID or local path string given to `from_pretrained`.
    fn identity(&self) -> &str {
        &self.model_repo
    }
}
319
320/// Chunk one file. Returns `(file_chunks, file_contents)` — empty
321/// when the file is too large, can't be read, or has no chunks.
322fn chunk_one_file(root: &Path, full: &Path) -> (Vec<CodeChunk>, Vec<String>) {
323    match std::fs::metadata(full) {
324        Ok(meta) if meta.len() > MAX_FILE_BYTES => return (Vec::new(), Vec::new()),
325        Err(_) => return (Vec::new(), Vec::new()),
326        _ => {}
327    }
328    let Ok(source) = std::fs::read_to_string(full) else {
329        return (Vec::new(), Vec::new());
330    };
331
332    let ext = full
333        .extension()
334        .and_then(|e| e.to_str())
335        .unwrap_or_default();
336    let lang_cfg = config_for_extension(ext);
337    let language = lang_cfg.as_ref().map(|c| &c.language);
338
339    let rel_path = full
340        .strip_prefix(root)
341        .unwrap_or(full)
342        .display()
343        .to_string();
344
345    let boundaries = chunk_source(&source, language, DEFAULT_DESIRED_CHUNK_CHARS);
346    let mut chunks = Vec::with_capacity(boundaries.len());
347    let mut contents = Vec::with_capacity(boundaries.len());
348    for b in boundaries {
349        let text = b.content(&source).to_string();
350        if text.trim().is_empty() {
351            continue;
352        }
353        contents.push(text.clone());
354        chunks.push(CodeChunk {
355            file_path: rel_path.clone(),
356            name: String::new(),
357            kind: String::new(),
358            start_line: b.start_line,
359            end_line: b.end_line,
360            content: text.clone(),
361            enriched_content: text,
362        });
363    }
364    (chunks, contents)
365}
366
#[cfg(test)]
mod tests {
    use super::*;
    use crate::encoder::VectorEncoder;

    /// Compile-time proof that `StaticEncoder` is a `VectorEncoder`
    /// that is also `Send + Sync`
    /// (`test:static-encoder-implements-vector-encoder`).
    #[test]
    fn static_encoder_implements_vector_encoder() {
        fn requires_encoder_bounds<T>()
        where
            T: VectorEncoder + Send + Sync,
        {
        }
        requires_encoder_bounds::<StaticEncoder>();
    }

    /// Loads the model found at `RIPVEC_SEMBLE_MODEL_PATH` and checks
    /// that it reports `DEFAULT_HIDDEN_DIM`, that `identity()` echoes
    /// the path it was given, and that a query embedding has unit L2
    /// norm. Ignored by default because it needs model files on disk;
    /// when the variable is unset the test returns early.
    ///
    /// Corresponds to acceptance `test:static-encoder-hidden-dim-256` and
    /// `test:static-encoder-loads-potion-code-16m` and
    /// `test:static-encoder-output-is-l2-normalized`.
    #[test]
    #[ignore = "requires local model files at RIPVEC_SEMBLE_MODEL_PATH"]
    fn static_encoder_loads_potion_code_16m() {
        let Ok(model_path) = std::env::var("RIPVEC_SEMBLE_MODEL_PATH") else {
            eprintln!("RIPVEC_SEMBLE_MODEL_PATH not set; skipping");
            return;
        };
        let encoder =
            StaticEncoder::from_pretrained(&model_path).expect("model load should succeed");
        assert_eq!(encoder.hidden_dim(), DEFAULT_HIDDEN_DIM);
        // `identity()` echoes the caller's argument, which here is the
        // local path under test.
        assert_eq!(encoder.identity(), model_path);

        // Check L2 normalization through the public query path.
        let row = encoder.encode_query("hello world");
        let squared_norm: f32 = row.iter().map(|x| x * x).sum();
        let norm = squared_norm.sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-3,
            "expected L2-normalized output; got norm={norm}"
        );
    }
}