ripvec_core/encoder/ripvec/index.rs
1//! `RipvecIndex` orchestrator and PageRank-layered ranking.
2//!
3//! Port of `~/src/semble/src/semble/index/index.py:RipvecIndex`. Owns
4//! the corpus state (chunks, file mapping, language mapping, BM25,
5//! dense embeddings, encoder) and dispatches search by mode.
6//!
7//! ## Port-plus-ripvec scope
8//!
9//! Per `docs/PLAN.md`, after the ripvec engine's own `rerank_topk` runs, ripvec's
10//! [`boost_with_pagerank`](crate::hybrid::boost_with_pagerank) is
11//! applied as a final ranking layer. The PageRank lookup is built from
12//! the repo graph and stored alongside the corpus when one is provided
13//! at construction; the layer no-ops when no graph is present.
14
15use std::collections::HashMap;
16use std::path::Path;
17
18use crate::chunk::CodeChunk;
19use crate::embed::SearchConfig;
20use crate::encoder::VectorEncoder;
21use crate::encoder::ripvec::bm25::{Bm25Index, search_bm25};
22use crate::encoder::ripvec::dense::StaticEncoder;
23use crate::encoder::ripvec::hybrid::{search_hybrid, search_semantic};
24use crate::hybrid::SearchMode;
25use crate::profile::Profiler;
26
27/// Combined orchestrator for the ripvec retrieval pipeline.
28///
29/// Constructed via [`RipvecIndex::from_root`] which walks files,
30/// chunks them with ripvec's chunker, embeds with the static encoder,
31/// and builds the BM25 index.
32pub struct RipvecIndex {
33 chunks: Vec<CodeChunk>,
34 /// Row-major contiguous embedding matrix; row `i` is the
35 /// L2-normalized embedding of chunk `i`. Held as `Array2<f32>` so
36 /// cosine queries (dot product over normalized rows) dispatch to
37 /// BLAS `sgemv` via ndarray's `cpu-accelerate` feature instead of
38 /// pointer-chasing through `Vec<Vec<f32>>`. The change is a
39 /// ~150x theoretical lift on per-query dense scoring at 1M chunks
40 /// (memory-bandwidth-bound).
41 embeddings: ndarray::Array2<f32>,
42 bm25: Bm25Index,
43 encoder: StaticEncoder,
44 file_mapping: HashMap<String, Vec<usize>>,
45 language_mapping: HashMap<String, Vec<usize>>,
46 pagerank_lookup: Option<std::sync::Arc<HashMap<String, f32>>>,
47 pagerank_alpha: f32,
48 corpus_class: CorpusClass,
49}
50
51/// Index-time classification of the corpus by file mix.
52///
53/// Drives the corpus-aware rerank gate: docs and mixed corpora get
54/// the L-12 cross-encoder fired (when the query is NL-shaped); pure
55/// code corpora skip it because the ms-marco-trained model is
56/// out-of-domain for code regardless of impl quality.
57#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
58#[serde(rename_all = "lowercase")]
59pub enum CorpusClass {
60 /// Less than 30% of chunks are in prose files. Pure or near-pure
61 /// code corpora — rerank skipped.
62 Code,
63 /// Between 30% and 70% prose chunks. Mixed corpora — rerank fires
64 /// on NL queries to recover the prose-dominant relevance signal.
65 Mixed,
66 /// At least 70% prose chunks. Documentation, book sets, knowledge
67 /// bases — rerank fires by default.
68 Docs,
69}
70
71impl CorpusClass {
72 /// Classify a chunk set by the fraction of chunks from prose files.
73 /// Empty input is classified as `Code` (degenerate but defined).
74 #[must_use]
75 pub fn classify(chunks: &[CodeChunk]) -> Self {
76 if chunks.is_empty() {
77 return Self::Code;
78 }
79 let prose = chunks
80 .iter()
81 .filter(|c| crate::encoder::ripvec::ranking::is_prose_path(&c.file_path))
82 .count();
83 #[expect(
84 clippy::cast_precision_loss,
85 reason = "chunk count never exceeds f32 mantissa precision in practice"
86 )]
87 let frac = prose as f32 / chunks.len() as f32;
88 if frac >= 0.7 {
89 Self::Docs
90 } else if frac >= 0.3 {
91 Self::Mixed
92 } else {
93 Self::Code
94 }
95 }
96
97 /// Whether the cross-encoder rerank should run on this corpus for
98 /// a non-symbol NL query. Pure code corpora skip rerank; mixed
99 /// and docs corpora enable it.
100 #[must_use]
101 pub fn rerank_eligible(self) -> bool {
102 matches!(self, Self::Mixed | Self::Docs)
103 }
104}
105
106impl RipvecIndex {
107 /// Build a [`RipvecIndex`] by walking `root` and indexing every
108 /// supported file. Uses `encoder.embed_root` (ripvec's chunker +
109 /// model2vec encode) and builds a fresh BM25 index over the
110 /// resulting chunks.
111 ///
112 /// `pagerank_lookup` is the optional structural-prior map (file
113 /// path → normalized PageRank) used by the final ranking layer;
114 /// pass `None` to disable. `pagerank_alpha` is the corresponding
115 /// boost strength.
116 ///
117 /// # Errors
118 ///
119 /// Returns the underlying error if `embed_root` fails.
120 pub fn from_root(
121 root: &Path,
122 encoder: StaticEncoder,
123 cfg: &SearchConfig,
124 profiler: &Profiler,
125 pagerank_lookup: Option<HashMap<String, f32>>,
126 pagerank_alpha: f32,
127 ) -> crate::Result<Self> {
128 // Wrap once at construction. The per-query `apply_pagerank_layer`
129 // path clones the Arc (pointer bump), not the HashMap (10K+ String
130 // allocs on a 1M-chunk corpus).
131 let pagerank_lookup = pagerank_lookup.map(std::sync::Arc::new);
132 let (chunks, embeddings_vec) = encoder.embed_root(root, cfg, profiler)?;
133 // Convert Vec<Vec<f32>> -> Array2<f32> at the boundary. The
134 // upstream embed_root produces ragged-friendly Vec<Vec<>>; we
135 // pack into one contiguous row-major buffer so BLAS sgemv can
136 // do per-query cosine in one call. Cost is a single sequential
137 // memcpy pass (~1 GB at memory bandwidth = ~5 ms on a 1M-chunk
138 // corpus) — negligible against the 60 s build phase.
139 let hidden_dim = embeddings_vec.first().map_or(0, std::vec::Vec::len);
140 let n_chunks = embeddings_vec.len();
141 let mut flat: Vec<f32> = Vec::with_capacity(n_chunks * hidden_dim);
142 for row in embeddings_vec {
143 debug_assert_eq!(
144 row.len(),
145 hidden_dim,
146 "ragged embeddings: row of {} vs expected {hidden_dim}",
147 row.len()
148 );
149 flat.extend(row);
150 }
151 let embeddings = ndarray::Array2::from_shape_vec((n_chunks, hidden_dim), flat)
152 .map_err(|e| crate::Error::Other(anyhow::anyhow!("embeddings reshape: {e}")))?;
153 let bm25 = {
154 let _g = profiler.phase("bm25_build");
155 Bm25Index::build(&chunks)
156 };
157 let (file_mapping, language_mapping) = {
158 let _g = profiler.phase("mappings");
159 build_mappings(&chunks)
160 };
161 let corpus_class = CorpusClass::classify(&chunks);
162 Ok(Self {
163 chunks,
164 embeddings,
165 bm25,
166 encoder,
167 file_mapping,
168 language_mapping,
169 pagerank_lookup,
170 pagerank_alpha,
171 corpus_class,
172 })
173 }
174
175 /// The index's corpus classification, computed at build time.
176 ///
177 /// Used by the MCP rerank gate to decide whether the L-12
178 /// cross-encoder fires on a given query.
179 #[must_use]
180 pub fn corpus_class(&self) -> CorpusClass {
181 self.corpus_class
182 }
183
184 /// Number of indexed chunks.
185 #[must_use]
186 pub fn len(&self) -> usize {
187 self.chunks.len()
188 }
189
190 /// Whether the index has zero chunks.
191 #[must_use]
192 pub fn is_empty(&self) -> bool {
193 self.chunks.is_empty()
194 }
195
196 /// Indexed chunks (read-only access).
197 #[must_use]
198 pub fn chunks(&self) -> &[CodeChunk] {
199 &self.chunks
200 }
201
202 /// Indexed embeddings (read-only access).
203 ///
204 /// `Array2<f32>` of shape `[n_chunks, hidden_dim]`, row-major. Row
205 /// `i` is the L2-normalized embedding of chunk `i`, so cosine
206 /// similarity reduces to a dot product. Callers that need their
207 /// own similarity arithmetic (`find_similar`, `find_duplicates`)
208 /// should use `embeddings.row(i)` for a single-row view or
209 /// `embeddings.dot(&query)` for a one-call BLAS GEMV.
210 #[must_use]
211 pub fn embeddings(&self) -> &ndarray::Array2<f32> {
212 &self.embeddings
213 }
214
215 /// Search the index and return ranked `(chunk_index, score)` pairs.
216 ///
217 /// `mode = SearchMode::Hybrid` (default) fuses semantic + BM25 via
218 /// RRF; `Semantic` and `Keyword` use one signal each.
219 ///
220 /// `filter_languages` and `filter_paths` build a selector mask
221 /// that restricts retrieval to chunks in the named files /
222 /// languages.
223 #[must_use]
224 pub fn search(
225 &self,
226 query: &str,
227 top_k: usize,
228 mode: SearchMode,
229 alpha: Option<f32>,
230 filter_languages: Option<&[String]>,
231 filter_paths: Option<&[String]>,
232 ) -> Vec<(usize, f32)> {
233 if self.is_empty() || query.trim().is_empty() {
234 return Vec::new();
235 }
236 let selector = self.build_selector(filter_languages, filter_paths);
237
238 let raw = match mode {
239 SearchMode::Keyword => search_bm25(query, &self.bm25, top_k, selector.as_deref()),
240 SearchMode::Semantic => {
241 let q_emb = self.encoder.encode_query(query);
242 search_semantic(&q_emb, &self.embeddings, top_k, selector.as_deref())
243 }
244 SearchMode::Hybrid => {
245 let q_emb = self.encoder.encode_query(query);
246 search_hybrid(
247 query,
248 &q_emb,
249 &self.embeddings,
250 &self.chunks,
251 &self.bm25,
252 top_k,
253 alpha,
254 selector.as_deref(),
255 )
256 }
257 };
258
259 self.apply_pagerank_layer(raw)
260 }
261
262 /// Build a selector mask from optional language/path filters.
263 /// Returns `None` when no filters are set (search runs over the
264 /// full corpus).
265 fn build_selector(
266 &self,
267 filter_languages: Option<&[String]>,
268 filter_paths: Option<&[String]>,
269 ) -> Option<Vec<usize>> {
270 let mut selector: Vec<usize> = Vec::new();
271 if let Some(langs) = filter_languages {
272 for lang in langs {
273 if let Some(ids) = self.language_mapping.get(lang) {
274 selector.extend(ids.iter().copied());
275 }
276 }
277 }
278 if let Some(paths) = filter_paths {
279 for path in paths {
280 if let Some(ids) = self.file_mapping.get(path) {
281 selector.extend(ids.iter().copied());
282 }
283 }
284 }
285 if selector.is_empty() {
286 None
287 } else {
288 selector.sort_unstable();
289 selector.dedup();
290 Some(selector)
291 }
292 }
293
294 /// Layer ripvec's PageRank boost on top of semble's ranked results.
295 ///
296 /// No-op when `pagerank_lookup` is `None` or the boost strength
297 /// is zero. Otherwise re-uses
298 /// [`crate::hybrid::boost_with_pagerank`] so the PageRank semantic
299 /// stays consistent with ripvec's other code paths.
300 fn apply_pagerank_layer(&self, mut results: Vec<(usize, f32)>) -> Vec<(usize, f32)> {
301 let Some(lookup) = &self.pagerank_lookup else {
302 return results;
303 };
304 if results.is_empty() || self.pagerank_alpha <= 0.0 {
305 return results;
306 }
307 // Uses the shared `ranking::PageRankBoost` layer for behavioral
308 // parity with the BERT CLI, MCP `search_code`, and LSP paths.
309 // All five callers now apply the same sigmoid-on-percentile
310 // curve.
311 // `lookup` is `Arc<HashMap<_,_>>`; cloning the Arc is a pointer
312 // bump, not a HashMap copy. The earlier `lookup.clone()` here
313 // cloned the entire map per query (~10K String allocations on
314 // a 1M-chunk corpus).
315 let layers: Vec<Box<dyn crate::ranking::RankingLayer>> = vec![Box::new(
316 crate::ranking::PageRankBoost::new(std::sync::Arc::clone(lookup), self.pagerank_alpha),
317 )];
318 crate::ranking::apply_chain(&mut results, &self.chunks, &layers);
319 results
320 }
321}
322
323impl crate::searchable::SearchableIndex for RipvecIndex {
324 fn chunks(&self) -> &[CodeChunk] {
325 RipvecIndex::chunks(self)
326 }
327
328 /// Trait-shape search: text-only, no engine-specific knobs.
329 ///
330 /// The trait surface is the LSP-callers' common ground. Filters
331 /// (language, path) and the alpha auto-detect override are not
332 /// surfaced through the trait because no LSP module uses them.
333 fn search(&self, query_text: &str, top_k: usize, mode: SearchMode) -> Vec<(usize, f32)> {
334 RipvecIndex::search(self, query_text, top_k, mode, None, None, None)
335 }
336
337 /// Use chunk `chunk_idx`'s own embedding as the query vector and
338 /// rank everything else by cosine similarity (semantic-only) or
339 /// blend with BM25 (hybrid). Falls back to text-only keyword
340 /// search when the chunk index is out of range.
341 ///
342 /// Mirrors the [`HybridIndex`] equivalent so `goto_definition`
343 /// and `goto_implementation` work identically across engines.
344 fn search_from_chunk(
345 &self,
346 chunk_idx: usize,
347 query_text: &str,
348 top_k: usize,
349 mode: SearchMode,
350 ) -> Vec<(usize, f32)> {
351 // RipvecIndex stores embeddings; if the source chunk is in
352 // range we can rank by similarity against its vector. Out of
353 // range or keyword-only mode: fall back to text search.
354 if chunk_idx >= self.embeddings().nrows() {
355 return RipvecIndex::search(
356 self,
357 query_text,
358 top_k,
359 SearchMode::Keyword,
360 None,
361 None,
362 None,
363 );
364 }
365 match mode {
366 SearchMode::Keyword => RipvecIndex::search(
367 self,
368 query_text,
369 top_k,
370 SearchMode::Keyword,
371 None,
372 None,
373 None,
374 ),
375 SearchMode::Semantic | SearchMode::Hybrid => {
376 // Cosine via dot product over L2-normalized rows.
377 // Parallel sgemv across row-shards to saturate
378 // aggregate memory bandwidth instead of the single-core
379 // sgemv ceiling.
380 let source = self.embeddings().row(chunk_idx);
381 let scores =
382 crate::encoder::ripvec::hybrid::parallel_sgemv(self.embeddings(), &source);
383 let mut scored: Vec<(usize, f32)> = scores
384 .iter()
385 .enumerate()
386 .filter(|(i, _)| *i != chunk_idx)
387 .map(|(i, &s)| (i, s))
388 .collect();
389 if scored.len() > top_k {
390 scored.select_nth_unstable_by(top_k - 1, |a, b| {
391 b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0))
392 });
393 scored.truncate(top_k);
394 }
395 scored.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
396 scored
397 }
398 }
399 }
400
401 fn as_any(&self) -> &dyn std::any::Any {
402 self
403 }
404}
405
406/// Build (file_path → chunk indices, language → chunk indices) mappings.
407fn build_mappings(
408 chunks: &[CodeChunk],
409) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
410 let mut file_to_id: HashMap<String, Vec<usize>> = HashMap::new();
411 let mut lang_to_id: HashMap<String, Vec<usize>> = HashMap::new();
412 for (i, chunk) in chunks.iter().enumerate() {
413 file_to_id
414 .entry(chunk.file_path.clone())
415 .or_default()
416 .push(i);
417 // The semble port's chunker stores language inferentially (via
418 // extension); the per-chunk `language` field isn't populated on
419 // this path. The mapping is keyed on file extension as a proxy
420 // so `filter_languages: Some(&["rs"])` works.
421 if let Some(ext) = Path::new(&chunk.file_path)
422 .extension()
423 .and_then(|e| e.to_str())
424 {
425 lang_to_id.entry(ext.to_string()).or_default().push(i);
426 }
427 }
428 (file_to_id, lang_to_id)
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// Type-level guard: fails to compile if `RipvecIndex::search` stops
    /// accepting the argument shape the CLI calls it with.
    #[test]
    fn semble_index_search_signature_compiles() {
        fn shape_check(
            index: &RipvecIndex,
            text: &str,
            limit: usize,
            search_mode: SearchMode,
        ) -> Vec<(usize, f32)> {
            index.search(text, limit, search_mode, None, None, None)
        }
        // Binding the function keeps it type-checked without needing a
        // real index to call it on.
        let _probe: fn(&RipvecIndex, &str, usize, SearchMode) -> Vec<(usize, f32)> = shape_check;
    }

    /// `behavior:pagerank-no-op-when-graph-absent`: an index built without
    /// a PageRank lookup passes results through the layer unchanged.
    #[test]
    fn pagerank_layer_no_op_when_graph_absent() {
        // Placeholder only. Building a `RipvecIndex` needs a real encoder
        // (and therefore a model download), so nothing is executed here.
        //
        // The invariant is checked by inspection: `apply_pagerank_layer`
        // opens with a `let ... else` guard that returns its input as-is
        // when `pagerank_lookup` is `None`. Behavioural verification is
        // part of P5.1's parity test.
        let _ = "see apply_pagerank_layer docs";
    }
}