ripvec_core/encoder/ripvec/
index.rs

1//! `RipvecIndex` orchestrator and PageRank-layered ranking.
2//!
3//! Port of `~/src/semble/src/semble/index/index.py:RipvecIndex`. Owns
4//! the corpus state (chunks, file mapping, language mapping, BM25,
5//! dense embeddings, encoder) and dispatches search by mode.
6//!
7//! ## Port-plus-ripvec scope
8//!
9//! Per `docs/PLAN.md`, after the ripvec engine's own `rerank_topk` runs, ripvec's
10//! [`boost_with_pagerank`](crate::hybrid::boost_with_pagerank) is
11//! applied as a final ranking layer. The PageRank lookup is built from
12//! the repo graph and stored alongside the corpus when one is provided
13//! at construction; the layer no-ops when no graph is present.
14
15use std::collections::HashMap;
16use std::path::{Path, PathBuf};
17
18use crate::chunk::CodeChunk;
19use crate::embed::SearchConfig;
20use crate::encoder::VectorEncoder;
21use crate::encoder::ripvec::bm25::{Bm25Index, search_bm25};
22use crate::encoder::ripvec::dense::StaticEncoder;
23use crate::encoder::ripvec::hybrid::{search_hybrid, search_semantic};
24use crate::encoder::ripvec::manifest::{Diff, FileEntry, Manifest, diff_against_walk};
25use crate::hybrid::SearchMode;
26use crate::profile::Profiler;
27use crate::walk::{WalkOptions, collect_files_with_options};
28
/// Combined orchestrator for the ripvec retrieval pipeline.
///
/// Constructed via [`RipvecIndex::from_root`] which walks files,
/// chunks them with ripvec's chunker, embeds with the static encoder,
/// and builds the BM25 index. [`RipvecIndex::apply_diff`] derives an
/// updated index from an existing one without re-embedding unchanged
/// files.
pub struct RipvecIndex {
    /// Indexed chunks. Position `i` here corresponds to row `i` of
    /// `embeddings` and to the chunk indices stored in the mappings.
    chunks: Vec<CodeChunk>,
    /// Row-major contiguous embedding matrix; row `i` is the
    /// L2-normalized embedding of chunk `i`. Held as `Array2<f32>` so
    /// cosine queries (dot product over normalized rows) dispatch to
    /// BLAS `sgemv` via ndarray's `cpu-accelerate` feature instead of
    /// pointer-chasing through `Vec<Vec<f32>>`. The change is a
    /// ~150x theoretical lift on per-query dense scoring at 1M chunks
    /// (memory-bandwidth-bound).
    embeddings: ndarray::Array2<f32>,
    /// Keyword index built over `chunks`; rebuilt from scratch whenever
    /// the chunk set changes.
    bm25: Bm25Index,
    /// Shared by `Arc` so [`Self::apply_diff`] can produce a new index
    /// that reuses the same loaded model without cloning the ~32 MB
    /// embedding table. The encoder is immutable after construction.
    encoder: std::sync::Arc<StaticEncoder>,
    /// Chunk indices grouped by file. Keys are the root-relative paths
    /// the chunker wrote (see the path handling in [`Self::apply_diff`]).
    file_mapping: HashMap<String, Vec<usize>>,
    /// Chunk indices grouped by language name; consulted when a search
    /// is restricted by language.
    language_mapping: HashMap<String, Vec<usize>>,
    /// Optional structural prior (file path -> normalized PageRank).
    /// `None` disables the final PageRank ranking layer. Shared by `Arc`
    /// so handing it to the ranking layer per query is a pointer bump.
    pagerank_lookup: Option<std::sync::Arc<HashMap<String, f32>>>,
    /// Boost strength for the PageRank layer; values `<= 0.0` disable it.
    pagerank_alpha: f32,
    /// Corpus classification computed from `chunks` at build time and
    /// recomputed by [`Self::apply_diff`].
    corpus_class: CorpusClass,
    /// Canonical root the index was built against. Used by
    /// [`RipvecIndex::diff_against_filesystem`] to walk the same tree
    /// for reconciliation.
    root: PathBuf,
    /// Walk filters captured at build time so reconciliation honors the
    /// same `.gitignore`, extension whitelist, ignore-pattern set as
    /// the original index.
    walk_options: WalkOptions,
    /// Per-file fingerprint table (mtime, size, inode, blake3) for
    /// online change detection. Built during [`Self::from_root`] and
    /// queried by [`Self::diff_against_filesystem`]. See
    /// [`crate::encoder::ripvec::manifest`] for the algorithm.
    manifest: Manifest,
}
68
/// Index-time classification of the corpus by file mix.
///
/// Drives the corpus-aware rerank gate: docs and mixed corpora get
/// the L-12 cross-encoder fired (when the query is NL-shaped); pure
/// code corpora skip it because the ms-marco-trained model is
/// out-of-domain for code regardless of impl quality.
///
/// The bands below are half-open: the lower bound of each band is
/// inclusive (see [`CorpusClass::classify`]). Serialized in lowercase
/// (`"code"`, `"mixed"`, `"docs"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum CorpusClass {
    /// Less than 30% of chunks are prose. Pure or near-pure code
    /// corpora — rerank skipped. Also the result for an empty corpus.
    Code,
    /// At least 30% but less than 70% prose chunks. Mixed corpora —
    /// rerank fires on NL queries to recover the prose-dominant
    /// relevance signal.
    Mixed,
    /// At least 70% prose chunks. Documentation, book sets, knowledge
    /// bases — rerank fires by default.
    Docs,
}
88
89impl CorpusClass {
90    /// Classify a chunk set by the fraction of chunks dominated by prose.
91    ///
92    /// A chunk counts as "prose" when **either** of the following holds:
93    /// - its file extension is in [`crate::encoder::ripvec::ranking::is_prose_path`]
94    ///   (e.g. `.md`, `.rst`, `.txt`), OR
95    /// - its content is dominated by docstring/comment text per
96    ///   [`chunk_is_prose_dominated`] (the I#64 / B-0028 path —
97    ///   docstring-heavy Python, JS-doc-heavy code, etc.).
98    ///
99    /// The second branch matters: a Mnemosyne-class Python corpus where
100    /// every class has a substantial docstring is classified as `Code`
101    /// by the file-extension test alone, even though >50% of its bytes
102    /// are prose. The within-chunk content test catches this case so
103    /// `Auto`-policy rerank fires on NL queries against such corpora.
104    ///
105    /// Empty input is classified as `Code` (degenerate but defined).
106    #[must_use]
107    pub fn classify(chunks: &[CodeChunk]) -> Self {
108        if chunks.is_empty() {
109            return Self::Code;
110        }
111        let prose = chunks
112            .iter()
113            .filter(|c| {
114                crate::encoder::ripvec::ranking::is_prose_path(&c.file_path)
115                    || chunk_is_prose_dominated(c)
116            })
117            .count();
118        #[expect(
119            clippy::cast_precision_loss,
120            reason = "chunk count never exceeds f32 mantissa precision in practice"
121        )]
122        let frac = prose as f32 / chunks.len() as f32;
123        if frac >= prose_density::CORPUS_DOCS_FRAC {
124            Self::Docs
125        } else if frac >= prose_density::CORPUS_MIXED_FRAC {
126            Self::Mixed
127        } else {
128            Self::Code
129        }
130    }
131
132    /// Whether the cross-encoder rerank should run on this corpus for
133    /// a non-symbol NL query. Pure code corpora skip rerank; mixed
134    /// and docs corpora enable it.
135    #[must_use]
136    pub fn rerank_eligible(self) -> bool {
137        matches!(self, Self::Mixed | Self::Docs)
138    }
139}
140
/// Tunables for the prose-density signal that decides whether the
/// cross-encoder reranker fires on `rerank=auto` queries.
///
/// All three thresholds gate the same I#64 decision chain, which runs
/// at two levels:
///
/// 1. **Per chunk** — [`CHUNK_DOMINANCE_FRAC`] is the fraction of a
///    single chunk's bytes that must lie inside docstring / comment
///    constructs before the chunk counts as "prose-dominated". The
///    comparison is strict (the fraction must exceed the threshold).
/// 2. **Per corpus** — once chunks are classified, the fraction of
///    prose chunks in the corpus (prose-dominated chunks plus chunks
///    from prose-path files) picks
///    [`super::CorpusClass`]: ≥ [`CORPUS_DOCS_FRAC`] → `Docs`,
///    ≥ [`CORPUS_MIXED_FRAC`] → `Mixed`, otherwise `Code`.
///
/// The numbers come from the I#64 issue text and the agent's first
/// implementation pass; they are knobs, not invariants. Tune here when
/// retrieval evidence (NDCG@10 against the mnemosyne / textual /
/// docstring-heavy corpora) suggests a different cut.
pub(super) mod prose_density {
    /// Per-chunk prose-byte fraction that promotes a chunk to
    /// "prose-dominated". 50% is the natural "more prose than code" cut.
    pub const CHUNK_DOMINANCE_FRAC: f32 = 0.5;

    /// Fraction of prose chunks that promotes a corpus to
    /// [`super::super::CorpusClass::Mixed`] — the threshold at which
    /// `rerank=auto` fires on Code / All corpus queries.
    pub const CORPUS_MIXED_FRAC: f32 = 0.3;

    /// Fraction of prose chunks that promotes a corpus to
    /// [`super::super::CorpusClass::Docs`] — at this density we treat
    /// the corpus as prose-first and always rerank.
    pub const CORPUS_DOCS_FRAC: f32 = 0.7;
}
173
174/// Whether this chunk's content is dominated by docstring / comment
175/// prose rather than code syntax.
176///
177/// Pragmatic, language-agnostic heuristic: scan the chunk's `content`
178/// (raw source, not enriched) and sum the bytes inside common prose
179/// constructs — Python triple-quoted strings (`"""..."""` / `'''...'''`),
180/// C-style block comments (`/* ... */`), and line comments
181/// (`#`, `//`, `///`). When that prose-byte fraction exceeds
182/// [`prose_density::CHUNK_DOMINANCE_FRAC`] (50%), the chunk reads more
183/// like a docstring than like code.
184///
185/// This is intentionally **not** a tree-sitter parse: the prose-density
186/// signal feeds a coarse corpus-level threshold (30% prose chunks → fire
187/// rerank), so per-chunk noise washes out. A tree-sitter parse here would
188/// add chunker dependencies and per-chunk parse cost for a coarse signal.
189///
190/// Whitespace-only and empty chunks return `false` (degenerate;
191/// they don't push the corpus toward "prose-dominated").
192#[must_use]
193fn chunk_is_prose_dominated(chunk: &CodeChunk) -> bool {
194    let total = chunk.content.len();
195    if total == 0 {
196        return false;
197    }
198    #[expect(
199        clippy::cast_precision_loss,
200        reason = "chunk content length never exceeds f32 mantissa precision in practice"
201    )]
202    let ratio = prose_byte_count(&chunk.content) as f32 / total as f32;
203    ratio > prose_density::CHUNK_DOMINANCE_FRAC
204}
205
/// Total the bytes of `source` that sit inside docstring / comment
/// constructs.
///
/// Constructs recognised, checked in this order at every position:
/// - Python triple-quoted strings (`"""..."""`, `'''...'''`) — the
///   docstring shape that motivates I#64.
/// - C-style block comments (`/* ... */`) — JS-doc, Rust block docs,
///   Java/C/C++.
/// - Line comments opened by `//` (which also covers `///`) or by `#`,
///   running up to but not including the next newline.
///
/// Delimiters are included in the count. A construct with no closing
/// delimiter extends to the end of `source`. The scan is a single
/// forward pass that jumps past each construct, so no byte is counted
/// twice. It is deliberately naive: escape sequences are ignored, a
/// `#` or `//` anywhere on a line (shebangs included) starts a line
/// comment, and string-vs-comment precedence is not modelled. It is
/// calibrated for the corpus-level "is this prose-dominated" question,
/// not for syntactic correctness.
#[must_use]
fn prose_byte_count(source: &str) -> usize {
    /// Offset just past the first `closer` found at or after `from`,
    /// or the end of `bytes` when no closer exists.
    fn delimited_end(bytes: &[u8], from: usize, closer: &[u8]) -> usize {
        bytes[from..]
            .windows(closer.len())
            .position(|window| window == closer)
            .map_or(bytes.len(), |offset| from + offset + closer.len())
    }

    /// Offset of the first newline at or after `from`, or the end of
    /// `bytes` when the line is unterminated.
    fn line_end(bytes: &[u8], from: usize) -> usize {
        bytes[from..]
            .iter()
            .position(|&byte| byte == b'\n')
            .map_or(bytes.len(), |offset| from + offset)
    }

    let bytes = source.as_bytes();
    let mut counted = 0usize;
    let mut cursor = 0usize;
    while cursor < bytes.len() {
        let tail = &bytes[cursor..];
        let construct_end = if tail.starts_with(b"\"\"\"") || tail.starts_with(b"'''") {
            // The closing delimiter must repeat the opening quote style.
            Some(delimited_end(bytes, cursor + 3, &tail[..3]))
        } else if tail.starts_with(b"/*") {
            Some(delimited_end(bytes, cursor + 2, b"*/"))
        } else if tail.starts_with(b"//") || tail[0] == b'#' {
            Some(line_end(bytes, cursor))
        } else {
            None
        };
        match construct_end {
            Some(end) => {
                counted += end - cursor;
                cursor = end;
            }
            None => cursor += 1,
        }
    }
    counted
}
280
281impl RipvecIndex {
282    /// Build a [`RipvecIndex`] by walking `root` and indexing every
283    /// supported file. Uses `encoder.embed_root` (ripvec's chunker +
284    /// model2vec encode) and builds a fresh BM25 index over the
285    /// resulting chunks.
286    ///
287    /// `pagerank_lookup` is the optional structural-prior map (file
288    /// path → normalized PageRank) used by the final ranking layer;
289    /// pass `None` to disable. `pagerank_alpha` is the corresponding
290    /// boost strength.
291    ///
292    /// # Errors
293    ///
294    /// Returns the underlying error if `embed_root` fails.
295    pub fn from_root(
296        root: &Path,
297        encoder: StaticEncoder,
298        cfg: &SearchConfig,
299        profiler: &Profiler,
300        pagerank_lookup: Option<HashMap<String, f32>>,
301        pagerank_alpha: f32,
302    ) -> crate::Result<Self> {
303        // Wrap once at construction. The per-query `apply_pagerank_layer`
304        // path clones the Arc (pointer bump), not the HashMap (10K+ String
305        // allocs on a 1M-chunk corpus).
306        let pagerank_lookup = pagerank_lookup.map(std::sync::Arc::new);
307        let (chunks, embeddings_vec) = encoder.embed_root(root, cfg, profiler)?;
308        // Convert Vec<Vec<f32>> -> Array2<f32> at the boundary. The
309        // upstream embed_root produces ragged-friendly Vec<Vec<>>; we
310        // pack into one contiguous row-major buffer so BLAS sgemv can
311        // do per-query cosine in one call. Cost is a single sequential
312        // memcpy pass (~1 GB at memory bandwidth = ~5 ms on a 1M-chunk
313        // corpus) — negligible against the 60 s build phase.
314        let hidden_dim = embeddings_vec.first().map_or(0, std::vec::Vec::len);
315        let n_chunks = embeddings_vec.len();
316        let mut flat: Vec<f32> = Vec::with_capacity(n_chunks * hidden_dim);
317        for row in embeddings_vec {
318            debug_assert_eq!(
319                row.len(),
320                hidden_dim,
321                "ragged embeddings: row of {} vs expected {hidden_dim}",
322                row.len()
323            );
324            flat.extend(row);
325        }
326        let embeddings = ndarray::Array2::from_shape_vec((n_chunks, hidden_dim), flat)
327            .map_err(|e| crate::Error::Other(anyhow::anyhow!("embeddings reshape: {e}")))?;
328        let bm25 = {
329            let _g = profiler.phase("bm25_build");
330            Bm25Index::build(&chunks)
331        };
332        let (file_mapping, language_mapping) = {
333            let _g = profiler.phase("mappings");
334            build_mappings(&chunks)
335        };
336        let corpus_class = CorpusClass::classify(&chunks);
337        // Capture walk options for future reconciles, and populate the
338        // manifest from the same file set the indexer consumed. We
339        // re-walk + re-read here because `embed_root` doesn't surface
340        // the per-file bytes back to us; the redundant read is paid
341        // once at index build time, not per query. On reconcile we
342        // only re-read files whose stat tuple changed.
343        let walk_options = cfg.walk_options();
344        let root_buf = root.to_path_buf();
345        let manifest = {
346            let _g = profiler.phase("manifest_build");
347            build_manifest(&root_buf, &walk_options)
348        };
349        Ok(Self {
350            chunks,
351            embeddings,
352            bm25,
353            encoder: std::sync::Arc::new(encoder),
354            file_mapping,
355            language_mapping,
356            pagerank_lookup,
357            pagerank_alpha,
358            corpus_class,
359            root: root_buf,
360            walk_options,
361            manifest,
362        })
363    }
364
365    /// Build a new index by incrementally applying `diff` against
366    /// `self`.
367    ///
368    /// **The selective-rebuild path that v3.1.0 punted on.** Re-embeds
369    /// only the dirty + new files, splices them into the existing
370    /// chunks/embeddings, drops deleted files' chunks, rebuilds BM25
371    /// and the per-file/per-language mappings from the new chunk set,
372    /// reclassifies the corpus, and refreshes the manifest entries
373    /// for the affected files.
374    ///
375    /// # Cost shape
376    ///
377    /// Roughly `O(|diff.dirty| + |diff.new|)` chunk + embed work plus
378    /// `O(|self.chunks|)` BM25 rebuild. On a 5000-chunk corpus with
379    /// one file changed: ~5-10 ms (embed one file) + ~50 ms (BM25
380    /// rebuild) = ~60 ms — vs. ~270 ms-1 s for a full
381    /// [`Self::from_root`] rebuild. The full-build cost is paid only
382    /// at cold start.
383    ///
384    /// # BM25
385    ///
386    /// BM25 is rebuilt from scratch over the new chunks vec rather
387    /// than incrementally updated. Inverted-postings incremental
388    /// update is correct but adds significant code; full rebuild at
389    /// our chunk counts is fast enough that the simpler path wins.
390    ///
391    /// # Errors
392    ///
393    /// Returns the underlying error if [`StaticEncoder::embed_paths`]
394    /// fails or if the embedding matrix shape is invalid.
395    pub fn apply_diff(&self, diff: &Diff, profiler: &Profiler) -> crate::Result<Self> {
396        use std::collections::HashSet;
397
398        // 1. Identify which existing chunk indices to drop. `file_mapping`
399        //    keys are the rel_paths the chunker wrote. Manifest paths are
400        //    absolute. Map manifest paths to rel_paths by stripping
401        //    `self.root` (the same operation `chunk_one_file` performs).
402        let rel_path_for = |p: &Path| -> String {
403            p.strip_prefix(&self.root)
404                .unwrap_or(p)
405                .display()
406                .to_string()
407        };
408        let mut removed_indices: HashSet<usize> = HashSet::new();
409        for path in diff
410            .deleted
411            .iter()
412            .chain(diff.dirty.iter())
413            .chain(diff.new.iter())
414        {
415            let rel = rel_path_for(path);
416            if let Some(indices) = self.file_mapping.get(&rel) {
417                removed_indices.extend(indices.iter().copied());
418            }
419        }
420
421        // 2. Build the kept chunks + embeddings from `self`. Cloning the
422        //    embedding rows is one allocation per kept chunk; for a 5k-
423        //    chunk corpus that's a single sequential pass over 5 MB.
424        let mut kept_chunks: Vec<CodeChunk> = Vec::with_capacity(self.chunks.len());
425        let mut kept_emb_rows: Vec<Vec<f32>> = Vec::with_capacity(self.chunks.len());
426        for (i, chunk) in self.chunks.iter().enumerate() {
427            if removed_indices.contains(&i) {
428                continue;
429            }
430            kept_chunks.push(chunk.clone());
431            kept_emb_rows.push(self.embeddings.row(i).to_vec());
432        }
433
434        // 3. Embed the dirty + new files. (Dirty files were already
435        //    dropped from `kept_chunks` above; their new chunks come in
436        //    here as fresh entries.)
437        let mut to_embed: Vec<std::path::PathBuf> = Vec::new();
438        to_embed.extend(diff.new.iter().cloned());
439        to_embed.extend(diff.dirty.iter().cloned());
440        let (new_chunks, new_embs) = if to_embed.is_empty() {
441            (Vec::new(), Vec::new())
442        } else {
443            let _g = profiler.phase("apply_diff_embed");
444            self.encoder.embed_paths(&self.root, &to_embed, profiler)?
445        };
446        kept_chunks.extend(new_chunks);
447        kept_emb_rows.extend(new_embs);
448
449        // 4. Re-pack embeddings into a contiguous Array2 so BLAS sgemv
450        //    still works at query time.
451        let n = kept_emb_rows.len();
452        let hidden_dim = kept_emb_rows
453            .first()
454            .map_or(self.embeddings.ncols(), Vec::len);
455        let mut flat: Vec<f32> = Vec::with_capacity(n * hidden_dim);
456        for row in kept_emb_rows {
457            flat.extend(row);
458        }
459        let embeddings = if n == 0 {
460            ndarray::Array2::<f32>::zeros((0, hidden_dim))
461        } else {
462            ndarray::Array2::from_shape_vec((n, hidden_dim), flat).map_err(|e| {
463                crate::Error::Other(anyhow::anyhow!("apply_diff embeddings reshape: {e}"))
464            })?
465        };
466
467        // 5. Rebuild BM25 from the new chunks (simpler than incremental
468        //    postings update; cheap at our chunk counts). Rebuild
469        //    mappings + corpus_class from the new chunks too.
470        let bm25 = {
471            let _g = profiler.phase("apply_diff_bm25");
472            Bm25Index::build(&kept_chunks)
473        };
474        let (file_mapping, language_mapping) = {
475            let _g = profiler.phase("apply_diff_mappings");
476            build_mappings(&kept_chunks)
477        };
478        let corpus_class = CorpusClass::classify(&kept_chunks);
479
480        // 6. Refresh manifest: drop deleted entries, refresh dirty
481        //    entries with new (mtime, size, ino, blake3), insert new
482        //    entries. blake3 requires the file bytes, so this re-reads
483        //    each changed file once. Negligible (~10 µs/file warm).
484        //
485        //    Also apply `diff.touched_clean`: these are files whose stat
486        //    tuple changed but whose content (blake3) is identical. The
487        //    `diff_against_filesystem` path clones `self.manifest` before
488        //    calling `diff_against_walk`, so the in-place stat-tuple
489        //    refresh inside `diff_against_walk` is discarded. Without this
490        //    step, every touched-but-unchanged file pays one blake3 read
491        //    per reconcile cycle instead of zero. Applying the entries here
492        //    — using the refreshed `FileEntry` already computed by
493        //    `diff_against_walk` — restores the "one blake3 then zero"
494        //    invariant on the new index.
495        let mut manifest = self.manifest.clone();
496        for path in &diff.deleted {
497            manifest.files.remove(path);
498        }
499        for path in diff.new.iter().chain(diff.dirty.iter()) {
500            if let Ok(entry) = FileEntry::from_path(path) {
501                manifest.insert(path.clone(), entry);
502            }
503        }
504        // Apply touched_clean refreshes: stat tuple already computed by
505        // diff_against_walk; no re-read or re-hash needed.
506        for (path, refreshed_entry) in &diff.touched_clean {
507            if let Some(entry_mut) = manifest.files.get_mut(path) {
508                entry_mut.mtime = refreshed_entry.mtime;
509                entry_mut.size = refreshed_entry.size;
510                entry_mut.ino = refreshed_entry.ino;
511                // blake3 is unchanged (that's the definition of touched_clean)
512                // but we overwrite defensively for consistency.
513                entry_mut.blake3 = refreshed_entry.blake3;
514            }
515        }
516
517        Ok(Self {
518            chunks: kept_chunks,
519            embeddings,
520            bm25,
521            encoder: std::sync::Arc::clone(&self.encoder),
522            file_mapping,
523            language_mapping,
524            pagerank_lookup: self.pagerank_lookup.clone(),
525            pagerank_alpha: self.pagerank_alpha,
526            corpus_class,
527            root: self.root.clone(),
528            walk_options: self.walk_options.clone(),
529            manifest,
530        })
531    }
532
533    /// Compare the manifest captured at build time against the current
534    /// filesystem state under [`Self::root`], using the same
535    /// [`WalkOptions`] used for the original index build.
536    ///
537    /// Returns a [`Diff`] enumerating dirty, new, and deleted files.
538    /// A zero-cost ([`Diff::is_empty`]) result means the index is
539    /// up-to-date and no rebuild is needed.
540    ///
541    /// # Cost
542    ///
543    /// Walk + per-file `stat()` for the cheap-path files (typically all
544    /// of them between successive queries). Blake3 verification is paid
545    /// only on the rare files where the stat tuple mismatches. On a
546    /// 200-file repo with no changes: sub-millisecond. On a 92k-file
547    /// repo with no changes: ~100-130 ms (the walk dominates).
548    ///
549    /// # Mutation
550    ///
551    /// This method takes `&self` and works on a clone of the manifest,
552    /// so the optimization of "refresh touched-but-unchanged stat
553    /// tuples" from [`diff_against_walk`] is discarded here. In
554    /// practice that means a file repeatedly touched without content
555    /// change pays one blake3 read per reconcile rather than zero —
556    /// negligible at our file sizes.
557    #[must_use]
558    pub fn diff_against_filesystem(&self) -> Diff {
559        let files = collect_files_with_options(&self.root, &self.walk_options);
560        let mut manifest = self.manifest.clone();
561        diff_against_walk(&mut manifest, &files)
562    }
563
564    /// Canonical root the index was built against.
565    #[must_use]
566    pub fn root(&self) -> &Path {
567        &self.root
568    }
569
570    /// Walk options captured at build time.
571    #[must_use]
572    pub fn walk_options(&self) -> &WalkOptions {
573        &self.walk_options
574    }
575
576    /// Manifest of tracked files (read-only access).
577    #[must_use]
578    pub fn manifest(&self) -> &Manifest {
579        &self.manifest
580    }
581
582    /// The index's corpus classification, computed at build time.
583    ///
584    /// Used by the MCP rerank gate to decide whether the L-12
585    /// cross-encoder fires on a given query.
586    #[must_use]
587    pub fn corpus_class(&self) -> CorpusClass {
588        self.corpus_class
589    }
590
591    /// Number of indexed chunks.
592    #[must_use]
593    pub fn len(&self) -> usize {
594        self.chunks.len()
595    }
596
597    /// Whether the index has zero chunks.
598    #[must_use]
599    pub fn is_empty(&self) -> bool {
600        self.chunks.is_empty()
601    }
602
603    /// Indexed chunks (read-only access).
604    #[must_use]
605    pub fn chunks(&self) -> &[CodeChunk] {
606        &self.chunks
607    }
608
609    /// Indexed embeddings (read-only access).
610    ///
611    /// `Array2<f32>` of shape `[n_chunks, hidden_dim]`, row-major. Row
612    /// `i` is the L2-normalized embedding of chunk `i`, so cosine
613    /// similarity reduces to a dot product. Callers that need their
614    /// own similarity arithmetic (`find_similar`, `find_duplicates`)
615    /// should use `embeddings.row(i)` for a single-row view or
616    /// `embeddings.dot(&query)` for a one-call BLAS GEMV.
617    #[must_use]
618    pub fn embeddings(&self) -> &ndarray::Array2<f32> {
619        &self.embeddings
620    }
621
622    /// Search the index and return ranked `(chunk_index, score)` pairs.
623    ///
624    /// `mode = SearchMode::Hybrid` (default) fuses semantic + BM25 via
625    /// RRF; `Semantic` and `Keyword` use one signal each.
626    ///
627    /// `filter_languages` and `filter_paths` build a selector mask
628    /// that restricts retrieval to chunks in the named files /
629    /// languages.
630    #[must_use]
631    pub fn search(
632        &self,
633        query: &str,
634        top_k: usize,
635        mode: SearchMode,
636        alpha: Option<f32>,
637        filter_languages: Option<&[String]>,
638        filter_paths: Option<&[String]>,
639    ) -> Vec<(usize, f32)> {
640        if self.is_empty() || query.trim().is_empty() {
641            return Vec::new();
642        }
643        let selector = self.build_selector(filter_languages, filter_paths);
644
645        let raw = match mode {
646            SearchMode::Keyword => search_bm25(query, &self.bm25, top_k, selector.as_deref()),
647            SearchMode::Semantic => {
648                let q_emb = self.encoder.encode_query(query);
649                search_semantic(&q_emb, &self.embeddings, top_k, selector.as_deref())
650            }
651            SearchMode::Hybrid => {
652                let q_emb = self.encoder.encode_query(query);
653                search_hybrid(
654                    query,
655                    &q_emb,
656                    &self.embeddings,
657                    &self.chunks,
658                    &self.bm25,
659                    top_k,
660                    alpha,
661                    selector.as_deref(),
662                )
663            }
664        };
665
666        self.apply_pagerank_layer(raw)
667    }
668
669    /// Build a selector mask from optional language/path filters.
670    /// Returns `None` when no filters are set (search runs over the
671    /// full corpus).
672    fn build_selector(
673        &self,
674        filter_languages: Option<&[String]>,
675        filter_paths: Option<&[String]>,
676    ) -> Option<Vec<usize>> {
677        let mut selector: Vec<usize> = Vec::new();
678        if let Some(langs) = filter_languages {
679            for lang in langs {
680                if let Some(ids) = self.language_mapping.get(lang) {
681                    selector.extend(ids.iter().copied());
682                }
683            }
684        }
685        if let Some(paths) = filter_paths {
686            for path in paths {
687                if let Some(ids) = self.file_mapping.get(path) {
688                    selector.extend(ids.iter().copied());
689                }
690            }
691        }
692        if selector.is_empty() {
693            None
694        } else {
695            selector.sort_unstable();
696            selector.dedup();
697            Some(selector)
698        }
699    }
700
701    /// Layer ripvec's PageRank boost on top of semble's ranked results.
702    ///
703    /// No-op when `pagerank_lookup` is `None` or the boost strength
704    /// is zero. Otherwise re-uses
705    /// [`crate::hybrid::boost_with_pagerank`] so the PageRank semantic
706    /// stays consistent with ripvec's other code paths.
707    fn apply_pagerank_layer(&self, mut results: Vec<(usize, f32)>) -> Vec<(usize, f32)> {
708        let Some(lookup) = &self.pagerank_lookup else {
709            return results;
710        };
711        if results.is_empty() || self.pagerank_alpha <= 0.0 {
712            return results;
713        }
714        // Uses the shared `ranking::PageRankBoost` layer for behavioral
715        // parity with the BERT CLI, MCP `search_code`, and LSP paths.
716        // All five callers now apply the same sigmoid-on-percentile
717        // curve.
718        // `lookup` is `Arc<HashMap<_,_>>`; cloning the Arc is a pointer
719        // bump, not a HashMap copy. The earlier `lookup.clone()` here
720        // cloned the entire map per query (~10K String allocations on
721        // a 1M-chunk corpus).
722        let layers: Vec<Box<dyn crate::ranking::RankingLayer>> = vec![Box::new(
723            crate::ranking::PageRankBoost::new(std::sync::Arc::clone(lookup), self.pagerank_alpha),
724        )];
725        crate::ranking::apply_chain(&mut results, &self.chunks, &layers);
726        results
727    }
728}
729
730impl crate::searchable::SearchableIndex for RipvecIndex {
731    fn chunks(&self) -> &[CodeChunk] {
732        RipvecIndex::chunks(self)
733    }
734
735    /// Trait-shape search: text-only, no engine-specific knobs.
736    ///
737    /// The trait surface is the LSP-callers' common ground. Filters
738    /// (language, path) and the alpha auto-detect override are not
739    /// surfaced through the trait because no LSP module uses them.
740    fn search(&self, query_text: &str, top_k: usize, mode: SearchMode) -> Vec<(usize, f32)> {
741        RipvecIndex::search(self, query_text, top_k, mode, None, None, None)
742    }
743
744    /// Use chunk `chunk_idx`'s own embedding as the query vector and
745    /// rank everything else by cosine similarity (semantic-only) or
746    /// blend with BM25 (hybrid). Falls back to text-only keyword
747    /// search when the chunk index is out of range.
748    ///
749    /// Mirrors the [`HybridIndex`] equivalent so `goto_definition`
750    /// and `goto_implementation` work identically across engines.
751    fn search_from_chunk(
752        &self,
753        chunk_idx: usize,
754        query_text: &str,
755        top_k: usize,
756        mode: SearchMode,
757    ) -> Vec<(usize, f32)> {
758        // RipvecIndex stores embeddings; if the source chunk is in
759        // range we can rank by similarity against its vector. Out of
760        // range or keyword-only mode: fall back to text search.
761        if chunk_idx >= self.embeddings().nrows() {
762            return RipvecIndex::search(
763                self,
764                query_text,
765                top_k,
766                SearchMode::Keyword,
767                None,
768                None,
769                None,
770            );
771        }
772        match mode {
773            SearchMode::Keyword => RipvecIndex::search(
774                self,
775                query_text,
776                top_k,
777                SearchMode::Keyword,
778                None,
779                None,
780                None,
781            ),
782            SearchMode::Semantic | SearchMode::Hybrid => {
783                // Cosine via dot product over L2-normalized rows.
784                // Parallel sgemv across row-shards to saturate
785                // aggregate memory bandwidth instead of the single-core
786                // sgemv ceiling.
787                let source = self.embeddings().row(chunk_idx);
788                let scores =
789                    crate::encoder::ripvec::hybrid::parallel_sgemv(self.embeddings(), &source);
790                let mut scored: Vec<(usize, f32)> = scores
791                    .iter()
792                    .enumerate()
793                    .filter(|(i, _)| *i != chunk_idx)
794                    .map(|(i, &s)| (i, s))
795                    .collect();
796                if scored.len() > top_k {
797                    scored.select_nth_unstable_by(top_k - 1, |a, b| {
798                        b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0))
799                    });
800                    scored.truncate(top_k);
801                }
802                scored.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
803                scored
804            }
805        }
806    }
807
808    fn as_any(&self) -> &dyn std::any::Any {
809        self
810    }
811}
812
813/// Locate the chunk index for a given file path and 1-based line number.
814///
815/// Used by `find_similar` (ripvec-mcp) to resolve an `lsp_location` whose
816/// `start_line` may be the symbol-identifier line (as returned by
817/// `get_repo_map`'s `symbols[].lsp_location`) rather than the chunk's own
818/// `start_line` (the block-start, which may precede the identifier by the
819/// length of doc-comments, attributes, or decorators).
820///
821/// # Lookup strategy (I#50 three-step spec)
822///
823/// 1. **Exact start-line match**: return the first chunk whose
824///    `start_line == target_line_1based`. Cheap O(n) scan that covers the
825///    common case where the caller already has a chunk-start coordinate.
826///
827/// 2. **Range containment**: return the first chunk whose closed interval
828///    `[start_line, end_line]` contains `target_line_1based`. Covers the
829///    I#50 failure case where `get_repo_map.symbols[].lsp_location.start_line`
830///    is the identifier line (inside the chunk) rather than the block start.
831///
832/// 3. **Miss**: return `None`. The caller is responsible for returning empty
833///    results — `find_similar` must NOT propagate this as an internal error.
834///
835/// # Path matching
836///
837/// Matches on a strict suffix: a chunk at path `a/b/c.rs` matches a query
838/// for `b/c.rs` or `c.rs` (the suffix is separated by `/`). Absolute paths
839/// are compared directly. This mirrors the convention used throughout
840/// `find_similar_chunk_idx` in `ripvec-mcp`.
841///
842/// # Arguments
843///
844/// * `chunks` — the indexed chunk slice (from `RipvecIndex::chunks()`).
845/// * `file_path` — path to match against `chunk.file_path`.
846/// * `target_line_1based` — the 1-based line to locate, matching the
847///   1-based `lsp_location.start_line` → `target_line_1based` convention.
848#[must_use]
849pub fn find_chunk_containing_line(
850    chunks: &[CodeChunk],
851    file_path: &str,
852    target_line_1based: usize,
853) -> Option<usize> {
854    let path_matches = |chunk: &CodeChunk| -> bool {
855        let cp = &chunk.file_path;
856        cp == file_path
857            || (cp.len() > file_path.len()
858                && cp.ends_with(file_path)
859                && cp.as_bytes()[cp.len() - file_path.len() - 1] == b'/')
860    };
861
862    // Step 1: exact start_line match.
863    if let Some(idx) = chunks
864        .iter()
865        .position(|c| path_matches(c) && c.start_line == target_line_1based)
866    {
867        return Some(idx);
868    }
869
870    // Step 2: range containment — find the first chunk whose [start_line,
871    // end_line] interval contains the target line.
872    chunks.iter().position(|c| {
873        path_matches(c) && c.start_line <= target_line_1based && target_line_1based <= c.end_line
874    })
875}
876
877/// Build (file_path → chunk indices, language → chunk indices) mappings.
878/// Build the per-file manifest by walking `root` with `walk_options`
879/// and stat + read + blake3 each file. Used at index construction; on
880/// reconcile, [`RipvecIndex::diff_against_filesystem`] uses the cheap
881/// stat-tuple path and only re-reads files whose tuple mismatches the
882/// stored entry.
883///
884/// Files that can't be read or stat'd are silently skipped; they will
885/// re-appear in the diff as `new` if they become readable later, or
886/// as missing on the next reconcile.
887fn build_manifest(root: &Path, walk_options: &WalkOptions) -> Manifest {
888    let mut manifest = Manifest::new();
889    let files = collect_files_with_options(root, walk_options);
890    for path in files {
891        let (Ok(metadata), Ok(bytes)) = (std::fs::metadata(&path), std::fs::read(&path)) else {
892            continue;
893        };
894        let entry = FileEntry::from_bytes(&metadata, &bytes);
895        manifest.insert(path, entry);
896    }
897    manifest
898}
899
900fn build_mappings(
901    chunks: &[CodeChunk],
902) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
903    let mut file_to_id: HashMap<String, Vec<usize>> = HashMap::new();
904    let mut lang_to_id: HashMap<String, Vec<usize>> = HashMap::new();
905    for (i, chunk) in chunks.iter().enumerate() {
906        file_to_id
907            .entry(chunk.file_path.clone())
908            .or_default()
909            .push(i);
910        // The semble port's chunker stores language inferentially (via
911        // extension); the per-chunk `language` field isn't populated on
912        // this path. The mapping is keyed on file extension as a proxy
913        // so `filter_languages: Some(&["rs"])` works.
914        if let Some(ext) = Path::new(&chunk.file_path)
915            .extension()
916            .and_then(|e| e.to_str())
917        {
918            lang_to_id.entry(ext.to_string()).or_default().push(i);
919        }
920    }
921    (file_to_id, lang_to_id)
922}
923
#[cfg(test)]
mod tests {
    use super::*;

    // ── I#64 / B-0028: within-chunk prose-density signal ─────────────────────

    /// Fixture: a `ContentKind::Code` chunk at `src/foo.py` whose raw and
    /// enriched content are both `content`. A `.py` path is classified as
    /// Code by `is_prose_path`, so whether the chunk counts as prose
    /// depends purely on the content passed in.
    fn py_chunk(content: &str) -> crate::chunk::CodeChunk {
        let body = content.to_string();
        crate::chunk::CodeChunk {
            file_path: "src/foo.py".to_string(),
            name: "test".to_string(),
            kind: "function_definition".to_string(),
            content_kind: crate::chunk::ContentKind::Code,
            start_line: 1,
            end_line: 10,
            symbol_line: 1,
            content: body.clone(),
            enriched_content: body,
            qualified_name: None,
        }
    }

    /// Fixture: `n_prose` clones of `prose` followed by `n_code` clones of
    /// `code`, in that order.
    fn corpus_of(
        prose: &crate::chunk::CodeChunk,
        n_prose: usize,
        code: &crate::chunk::CodeChunk,
        n_code: usize,
    ) -> Vec<crate::chunk::CodeChunk> {
        let mut corpus = vec![prose.clone(); n_prose];
        corpus.resize(n_prose + n_code, code.clone());
        corpus
    }

    /// A Python chunk whose triple-quoted docstring outweighs its body
    /// counts as prose.
    #[test]
    fn chunk_is_prose_dominated_python_docstring() {
        let docstring_heavy = py_chunk(
            "def handle_error(self, exc):\n    \"\"\"This is the docstring \
             that explains the error-handling contract. It dwarfs the body \
             and so the chunk is dominated by prose.\"\"\"\n    return None\n",
        );
        assert!(
            chunk_is_prose_dominated(&docstring_heavy),
            "Python triple-quoted docstring dominating the chunk must be \
             recognised as prose-dominated"
        );
    }

    /// A chunk made up of code syntax only does not count as prose.
    #[test]
    fn chunk_is_prose_dominated_pure_code_is_false() {
        let code_only = py_chunk("def f(x, y):\n    z = x * y + 2\n    return z * z - (x + y)\n");
        assert!(
            !chunk_is_prose_dominated(&code_only),
            "Pure code chunk (no docstring, no comments) must not be \
             prose-dominated"
        );
    }

    /// A chunk whose bytes are mostly `//` line comments counts as prose.
    #[test]
    fn chunk_is_prose_dominated_line_comments() {
        // Roughly 80% comment bytes against 20% code.
        let comment_heavy = py_chunk(
            "// This is a long-form explanation of why the function exists.\n\
             // It spans multiple lines and dominates the chunk by byte count.\n\
             // The actual code is a tiny one-liner.\n\
             fn f() { 1 }\n",
        );
        assert!(
            chunk_is_prose_dominated(&comment_heavy),
            "Chunk dominated by `//` line comments must be prose-dominated"
        );
    }

    /// A chunk whose bytes are mostly one `/* ... */` block comment counts
    /// as prose.
    #[test]
    fn chunk_is_prose_dominated_block_comment() {
        let block_comment_heavy = py_chunk(
            "/* JS-doc style block comment describing the function in detail \
             and taking up most of the chunk by byte volume. */\nfn g() {}\n",
        );
        assert!(
            chunk_is_prose_dominated(&block_comment_heavy),
            "Chunk dominated by `/* ... */` block comment must be \
             prose-dominated"
        );
    }

    /// Degenerate input: empty content must not be counted as prose.
    #[test]
    fn chunk_is_prose_dominated_empty_is_false() {
        let empty = py_chunk("");
        assert!(
            !chunk_is_prose_dominated(&empty),
            "Empty chunk content must classify as not-prose (degenerate case)"
        );
    }

    /// `CorpusClass::classify` on a synthetic 10 prose : 20 code corpus must
    /// yield `Mixed`, and `Mixed` must be rerank-eligible. This is the
    /// pure-unit shape from the issue text: given a chunk mix, assert the
    /// policy.
    #[test]
    fn corpus_class_classify_10_prose_20_code_is_mixed() {
        let prose = py_chunk(
            "def f():\n    \"\"\"A substantial docstring whose byte count \
             dominates the chunk.\"\"\"\n    pass\n",
        );
        let code = py_chunk("def g(x):\n    return x + 1\n");
        // Pre-condition: the per-chunk helper agrees with the labels.
        assert!(chunk_is_prose_dominated(&prose));
        assert!(!chunk_is_prose_dominated(&code));

        let chunks = corpus_of(&prose, 10, &code, 20);
        let class = CorpusClass::classify(&chunks);
        assert_eq!(
            class,
            CorpusClass::Mixed,
            "10 prose : 20 code (~33% prose chunks) must classify as Mixed; \
             threshold is >= 30% prose"
        );
        assert!(
            class.rerank_eligible(),
            "Mixed must be rerank-eligible — the I#64 / B-0028 fire path"
        );
    }

    /// `CorpusClass::classify` on a synthetic 20 prose : 10 code corpus must
    /// be rerank-eligible (the test asserts eligibility only, not the
    /// exact class).
    #[test]
    fn corpus_class_classify_20_prose_10_code_is_rerank_eligible() {
        let prose = py_chunk(
            "def f():\n    \"\"\"Substantial docstring that dominates the \
             chunk's bytes — a Mnemosyne-class signature.\"\"\"\n    pass\n",
        );
        let code = py_chunk("def g(x):\n    return x + 1\n");

        let chunks = corpus_of(&prose, 20, &code, 10);
        assert!(
            CorpusClass::classify(&chunks).rerank_eligible(),
            "20 prose : 10 code (~67% prose chunks) must be rerank-eligible"
        );
    }

    /// `CorpusClass::classify` on a synthetic 2 prose : 28 code corpus must
    /// yield `Code`: low prose density keeps the corpus in the code class.
    #[test]
    fn corpus_class_classify_low_prose_is_code() {
        let prose = py_chunk("def f():\n    \"\"\"Docstring dominating the chunk's bytes.\"\"\"\n");
        let code = py_chunk("def g(x):\n    return x + 1\n");

        let chunks = corpus_of(&prose, 2, &code, 28);
        assert_eq!(
            CorpusClass::classify(&chunks),
            CorpusClass::Code,
            "2:28 prose:code (~7% prose chunks) must classify as Code"
        );
    }

    /// The path signal and the content-density signal are independent and
    /// OR together: a `.md` chunk counts as prose whatever it contains,
    /// whereas a `.py` chunk counts as prose only if its content is
    /// prose-dominated.
    #[test]
    fn corpus_class_classify_path_and_content_signals_compose() {
        // Deliberately code-like text, so only the `.md` path can make it prose.
        let code_like = "function foo() { return 1; }";
        let md_chunk = crate::chunk::CodeChunk {
            file_path: "README.md".to_string(),
            name: "readme".to_string(),
            kind: "paragraph".to_string(),
            content_kind: crate::chunk::ContentKind::Docs,
            start_line: 1,
            end_line: 5,
            symbol_line: 1,
            content: code_like.to_string(),
            enriched_content: code_like.to_string(),
            qualified_name: None,
        };

        let chunks = vec![md_chunk];
        assert_eq!(
            CorpusClass::classify(&chunks),
            CorpusClass::Docs,
            "A .md path counts as prose under is_prose_path even if its \
             content is code-like — path and content signals OR together"
        );
    }

    // ── existing tests below ─────────────────────────────────────────────────

    /// Test-only constructor that skips `from_root`, letting unit tests
    /// inject pre-built state (chunks, embeddings, mappings, manifest).
    /// The BM25 index and corpus class are derived from `chunks`; the
    /// PageRank layer is left disabled (`None` lookup, alpha `0.0`).
    ///
    /// Tests that call `apply_diff` with a non-empty `diff.new` or
    /// `diff.dirty` must pass a real encoder, because `apply_diff` calls
    /// `encoder.embed_paths`.
    // One argument per injected field; a builder would be overkill for a
    // test-only helper.
    #[allow(clippy::too_many_arguments)]
    fn new_for_test(
        chunks: Vec<crate::chunk::CodeChunk>,
        embeddings: ndarray::Array2<f32>,
        encoder: std::sync::Arc<StaticEncoder>,
        file_mapping: HashMap<String, Vec<usize>>,
        language_mapping: HashMap<String, Vec<usize>>,
        manifest: Manifest,
        root: std::path::PathBuf,
        walk_options: WalkOptions,
    ) -> RipvecIndex {
        RipvecIndex {
            // Derived fields come first: they borrow `chunks` before it is moved.
            bm25: Bm25Index::build(&chunks),
            corpus_class: CorpusClass::classify(&chunks),
            chunks,
            embeddings,
            encoder,
            file_mapping,
            language_mapping,
            pagerank_lookup: None,
            pagerank_alpha: 0.0,
            root,
            walk_options,
            manifest,
        }
    }

    /// Compile-time check that `RipvecIndex::search` keeps the call shape
    /// the CLI relies on.
    #[test]
    fn semble_index_search_signature_compiles() {
        fn call_search(
            index: &RipvecIndex,
            query: &str,
            top_k: usize,
            mode: SearchMode,
        ) -> Vec<(usize, f32)> {
            index.search(query, top_k, mode, None, None, None)
        }
        // Taking the function's value keeps it type-checked despite never
        // being called.
        let _ = call_search;
    }

    /// `behavior:pagerank-no-op-when-graph-absent` — an index built without
    /// a PageRank lookup must pass results through unchanged.
    ///
    /// This test asserts nothing at runtime; see the comment in the body.
    #[test]
    fn pagerank_layer_no_op_when_graph_absent() {
        // Building a RipvecIndex needs a real encoder (i.e. a model
        // download), so the private method is not exercised here.
        //
        // Structural assertion only: `apply_pagerank_layer` opens with a
        // `let ... else` guard that returns its input as-is when
        // `pagerank_lookup` is `None`. That single-branch invariant is
        // verified by inspection; behavioural verification is part of
        // P5.1's parity test.
        let _ = "see apply_pagerank_layer docs";
    }

    /// Corner case: a file shows up in `diff.new` (it is absent from the
    /// manifest) while `file_mapping` still holds stale chunk indices for
    /// it from a prior partial reconcile. Without the R4.1 fix,
    /// `apply_diff` does not clear those stale chunks before re-embedding,
    /// which duplicates them.
    ///
    /// Gated `#[ignore]` because `apply_diff` calls `encoder.embed_paths`
    /// for files in `diff.new`, which requires the Model2Vec weights.
    /// Run once the model is cached:
    ///   `cargo test -p ripvec-core apply_diff_idempotent -- --ignored`
    #[test]
    #[ignore = "requires Model2Vec download (~32 MB on first run)"]
    fn apply_diff_idempotent_when_new_file_already_has_chunks() {
        use crate::encoder::ripvec::dense::DEFAULT_MODEL_REPO;

        let encoder_arc = std::sync::Arc::new(
            StaticEncoder::from_pretrained(DEFAULT_MODEL_REPO).expect("encoder load"),
        );

        // Temporary corpus holding a single file, file_a.rs.
        let tmp = tempfile::TempDir::new().unwrap();
        let file_a = tmp.path().join("file_a.rs");
        std::fs::write(
            &file_a,
            "pub fn alpha() -> u32 { 1 }\npub fn beta() -> u32 { 2 }\n",
        )
        .unwrap();

        // One embedding pass gives the canonical chunks/embeddings for it.
        let (real_chunks, real_embs) = encoder_arc
            .embed_paths(tmp.path(), std::slice::from_ref(&file_a), &Profiler::noop())
            .expect("embed_paths");
        let n_real = real_chunks.len();
        assert!(n_real > 0, "file_a.rs must produce at least one chunk");

        // Flatten the per-chunk rows into one row-major matrix.
        let hidden_dim = real_embs[0].len();
        let flat: Vec<f32> = real_embs
            .iter()
            .flat_map(|row| row.iter().copied())
            .collect();
        let embeddings = ndarray::Array2::from_shape_vec((n_real, hidden_dim), flat).unwrap();

        // file_mapping carries the stale indices that point at file_a.rs.
        let mut file_mapping: HashMap<String, Vec<usize>> = HashMap::new();
        file_mapping.insert("file_a.rs".to_string(), (0..n_real).collect());

        // The manifest is EMPTY: this simulates a prior reconcile whose
        // manifest update failed, so diff_against_filesystem reports
        // file_a.rs as "new" even though file_mapping still references it.
        let index = new_for_test(
            real_chunks,
            embeddings,
            std::sync::Arc::clone(&encoder_arc),
            file_mapping,
            HashMap::new(),
            Manifest::new(),
            tmp.path().to_path_buf(),
            WalkOptions::default(),
        );

        let diff = index.diff_against_filesystem();
        assert!(
            diff.new.iter().any(|p| p.ends_with("file_a.rs")),
            "file_a.rs must appear in diff.new when manifest is empty; got {:?}",
            diff.new
        );
        assert!(diff.dirty.is_empty(), "no dirty expected");
        assert!(diff.deleted.is_empty(), "no deleted expected");

        // With the fix (diff.new is also fed into removed_indices) the
        // stale chunks are dropped before re-embedding, so the count
        // equals a single fresh-embed pass. Without it, old and new
        // chunks both survive and the count doubles.
        let updated = index
            .apply_diff(&diff, &Profiler::noop())
            .expect("apply_diff");

        let file_a_count = updated
            .chunks()
            .iter()
            .filter(|chunk| chunk.file_path.ends_with("file_a.rs"))
            .count();

        assert_eq!(
            file_a_count, n_real,
            "file_a.rs chunk count must equal one fresh-embed pass ({n_real}); \
             got {file_a_count} — stale chunks from file_mapping not cleared"
        );
        assert_eq!(
            updated.embeddings().nrows(),
            updated.chunks().len(),
            "embeddings row count must match chunk count"
        );
    }

    /// Derived: two consecutive empty-diff reconciles must leave the chunk
    /// count where it started, i.e. no accumulation from repeated no-op
    /// passes.
    ///
    /// Gated `#[ignore]` because building a real index requires the
    /// Model2Vec encoder (~32 MB).
    #[test]
    #[ignore = "requires Model2Vec download (~32 MB on first run)"]
    fn apply_diff_no_duplicate_chunks_after_two_passes() {
        use crate::encoder::ripvec::dense::DEFAULT_MODEL_REPO;

        let tmp = tempfile::TempDir::new().unwrap();
        std::fs::write(
            tmp.path().join("main.rs"),
            "fn main() { println!(\"hello\"); }\n",
        )
        .unwrap();

        let encoder = StaticEncoder::from_pretrained(DEFAULT_MODEL_REPO).expect("encoder load");
        let cfg = SearchConfig {
            batch_size: 32,
            max_tokens: 512,
            chunk: crate::chunk::ChunkConfig {
                max_chunk_bytes: 4096,
                window_size: 2048,
                window_overlap: 512,
            },
            text_mode: false,
            cascade_dim: None,
            file_type: None,
            exclude_extensions: Vec::new(),
            include_extensions: Vec::new(),
            ignore_patterns: Vec::new(),
            corpus: crate::embed::Scope::All,
            mode: crate::hybrid::SearchMode::Hybrid,
        };
        let index = RipvecIndex::from_root(tmp.path(), encoder, &cfg, &Profiler::noop(), None, 0.0)
            .expect("from_root");

        let original_count = index.chunks().len();

        // First no-op reconcile, straight off the freshly built index.
        let first_diff = index.diff_against_filesystem();
        assert!(first_diff.is_empty(), "fresh index must yield empty diff");
        let pass1 = index
            .apply_diff(&first_diff, &Profiler::noop())
            .expect("apply_diff pass 1");
        assert_eq!(
            pass1.chunks().len(),
            original_count,
            "chunk count must be unchanged after empty-diff pass 1"
        );

        // Second no-op reconcile, applied to the result of the first.
        let second_diff = pass1.diff_against_filesystem();
        assert!(
            second_diff.is_empty(),
            "pass1 against unchanged FS must yield empty diff"
        );
        let pass2 = pass1
            .apply_diff(&second_diff, &Profiler::noop())
            .expect("apply_diff pass 2");
        assert_eq!(
            pass2.chunks().len(),
            original_count,
            "chunk count must be unchanged after empty-diff pass 2"
        );
    }
}
ripvec_core/encoder/ripvec/index.rs

ripvec_core/encoder/ripvec/
index.rs