ripvec_core/encoder/ripvec/index.rs
1//! `RipvecIndex` orchestrator and PageRank-layered ranking.
2//!
3//! Port of `~/src/semble/src/semble/index/index.py:RipvecIndex`. Owns
4//! the corpus state (chunks, file mapping, language mapping, BM25,
5//! dense embeddings, encoder) and dispatches search by mode.
6//!
7//! ## Port-plus-ripvec scope
8//!
9//! Per `docs/PLAN.md`, after the ripvec engine's own `rerank_topk` runs, ripvec's
10//! [`boost_with_pagerank`](crate::hybrid::boost_with_pagerank) is
11//! applied as a final ranking layer. The PageRank lookup is built from
12//! the repo graph and stored alongside the corpus when one is provided
13//! at construction; the layer no-ops when no graph is present.
14
15use std::collections::HashMap;
16use std::path::{Path, PathBuf};
17
18use crate::chunk::CodeChunk;
19use crate::embed::SearchConfig;
20use crate::encoder::VectorEncoder;
21use crate::encoder::ripvec::bm25::{Bm25Index, search_bm25};
22use crate::encoder::ripvec::dense::StaticEncoder;
23use crate::encoder::ripvec::hybrid::{search_hybrid, search_semantic};
24use crate::encoder::ripvec::manifest::{Diff, FileEntry, Manifest, diff_against_walk};
25use crate::hybrid::SearchMode;
26use crate::profile::Profiler;
27use crate::walk::{WalkOptions, collect_files_with_options};
28
29/// Combined orchestrator for the ripvec retrieval pipeline.
30///
31/// Constructed via [`RipvecIndex::from_root`] which walks files,
32/// chunks them with ripvec's chunker, embeds with the static encoder,
33/// and builds the BM25 index.
34pub struct RipvecIndex {
35 chunks: Vec<CodeChunk>,
36 /// Row-major contiguous embedding matrix; row `i` is the
37 /// L2-normalized embedding of chunk `i`. Held as `Array2<f32>` so
38 /// cosine queries (dot product over normalized rows) dispatch to
39 /// BLAS `sgemv` via ndarray's `cpu-accelerate` feature instead of
40 /// pointer-chasing through `Vec<Vec<f32>>`. The change is a
41 /// ~150x theoretical lift on per-query dense scoring at 1M chunks
42 /// (memory-bandwidth-bound).
43 embeddings: ndarray::Array2<f32>,
44 bm25: Bm25Index,
45 /// Shared by `Arc` so [`Self::apply_diff`] can produce a new index
46 /// that reuses the same loaded model without cloning the ~32 MB
47 /// embedding table. The encoder is immutable after construction.
48 encoder: std::sync::Arc<StaticEncoder>,
49 file_mapping: HashMap<String, Vec<usize>>,
50 language_mapping: HashMap<String, Vec<usize>>,
51 pagerank_lookup: Option<std::sync::Arc<HashMap<String, f32>>>,
52 pagerank_alpha: f32,
53 corpus_class: CorpusClass,
54 /// Canonical root the index was built against. Used by
55 /// [`RipvecIndex::diff_against_filesystem`] to walk the same tree
56 /// for reconciliation.
57 root: PathBuf,
58 /// Walk filters captured at build time so reconciliation honors the
59 /// same `.gitignore`, extension whitelist, ignore-pattern set as
60 /// the original index.
61 walk_options: WalkOptions,
62 /// Per-file fingerprint table (mtime, size, inode, blake3) for
63 /// online change detection. Built during [`Self::from_root`] and
64 /// queried by [`Self::diff_against_filesystem`]. See
65 /// [`crate::encoder::ripvec::manifest`] for the algorithm.
66 manifest: Manifest,
67}
68
69/// Index-time classification of the corpus by file mix.
70///
71/// Drives the corpus-aware rerank gate: docs and mixed corpora get
72/// the L-12 cross-encoder fired (when the query is NL-shaped); pure
73/// code corpora skip it because the ms-marco-trained model is
74/// out-of-domain for code regardless of impl quality.
75#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
76#[serde(rename_all = "lowercase")]
77pub enum CorpusClass {
78 /// Less than 30% of chunks are in prose files. Pure or near-pure
79 /// code corpora — rerank skipped.
80 Code,
81 /// Between 30% and 70% prose chunks. Mixed corpora — rerank fires
82 /// on NL queries to recover the prose-dominant relevance signal.
83 Mixed,
84 /// At least 70% prose chunks. Documentation, book sets, knowledge
85 /// bases — rerank fires by default.
86 Docs,
87}
88
89impl CorpusClass {
90 /// Classify a chunk set by the fraction of chunks dominated by prose.
91 ///
92 /// A chunk counts as "prose" when **either** of the following holds:
93 /// - its file extension is in [`crate::encoder::ripvec::ranking::is_prose_path`]
94 /// (e.g. `.md`, `.rst`, `.txt`), OR
95 /// - its content is dominated by docstring/comment text per
96 /// [`chunk_is_prose_dominated`] (the I#64 / B-0028 path —
97 /// docstring-heavy Python, JS-doc-heavy code, etc.).
98 ///
99 /// The second branch matters: a Mnemosyne-class Python corpus where
100 /// every class has a substantial docstring is classified as `Code`
101 /// by the file-extension test alone, even though >50% of its bytes
102 /// are prose. The within-chunk content test catches this case so
103 /// `Auto`-policy rerank fires on NL queries against such corpora.
104 ///
105 /// Empty input is classified as `Code` (degenerate but defined).
106 #[must_use]
107 pub fn classify(chunks: &[CodeChunk]) -> Self {
108 if chunks.is_empty() {
109 return Self::Code;
110 }
111 let prose = chunks
112 .iter()
113 .filter(|c| {
114 crate::encoder::ripvec::ranking::is_prose_path(&c.file_path)
115 || chunk_is_prose_dominated(c)
116 })
117 .count();
118 #[expect(
119 clippy::cast_precision_loss,
120 reason = "chunk count never exceeds f32 mantissa precision in practice"
121 )]
122 let frac = prose as f32 / chunks.len() as f32;
123 if frac >= prose_density::CORPUS_DOCS_FRAC {
124 Self::Docs
125 } else if frac >= prose_density::CORPUS_MIXED_FRAC {
126 Self::Mixed
127 } else {
128 Self::Code
129 }
130 }
131
132 /// Whether the cross-encoder rerank should run on this corpus for
133 /// a non-symbol NL query. Pure code corpora skip rerank; mixed
134 /// and docs corpora enable it.
135 #[must_use]
136 pub fn rerank_eligible(self) -> bool {
137 matches!(self, Self::Mixed | Self::Docs)
138 }
139}
140
141/// Tunables for the prose-density signal that decides whether the
142/// cross-encoder reranker fires on `rerank=auto` queries.
143///
144/// All three thresholds gate the same I#64 decision chain:
145///
146/// 1. **Per chunk** — [`CHUNK_DOMINANCE_FRAC`] is the fraction of a
147/// single chunk's bytes that must lie inside docstring / comment
148/// constructs before the chunk counts as "prose-dominated".
149/// 2. **Per corpus** — once chunks are classified, the fraction of
150/// prose-dominated chunks in the corpus picks
151/// [`super::CorpusClass`]: ≥ [`CORPUS_DOCS_FRAC`] → `Docs`,
152/// ≥ [`CORPUS_MIXED_FRAC`] → `Mixed`, otherwise `Code`.
153///
154/// The numbers come from the I#64 issue text and the agent's first
155/// implementation pass; they are knobs, not invariants. Tune here when
156/// retrieval evidence (NDCG@10 against the mnemosyne / textual /
157/// docstring-heavy corpora) suggests a different cut.
158pub(super) mod prose_density {
159 /// Per-chunk prose-byte fraction that promotes a chunk to
160 /// "prose-dominated". 50% is the natural "more prose than code" cut.
161 pub const CHUNK_DOMINANCE_FRAC: f32 = 0.5;
162
163 /// Fraction of prose-dominated chunks that promotes a corpus to
164 /// [`super::super::CorpusClass::Mixed`] — the threshold at which
165 /// `rerank=auto` fires on Code / All corpus queries.
166 pub const CORPUS_MIXED_FRAC: f32 = 0.3;
167
168 /// Fraction of prose-dominated chunks that promotes a corpus to
169 /// [`super::super::CorpusClass::Docs`] — at this density we treat
170 /// the corpus as prose-first and always rerank.
171 pub const CORPUS_DOCS_FRAC: f32 = 0.7;
172}
173
174/// Whether this chunk's content is dominated by docstring / comment
175/// prose rather than code syntax.
176///
177/// Pragmatic, language-agnostic heuristic: scan the chunk's `content`
178/// (raw source, not enriched) and sum the bytes inside common prose
179/// constructs — Python triple-quoted strings (`"""..."""` / `'''...'''`),
180/// C-style block comments (`/* ... */`), and line comments
181/// (`#`, `//`, `///`). When that prose-byte fraction exceeds
182/// [`prose_density::CHUNK_DOMINANCE_FRAC`] (50%), the chunk reads more
183/// like a docstring than like code.
184///
185/// This is intentionally **not** a tree-sitter parse: the prose-density
186/// signal feeds a coarse corpus-level threshold (30% prose chunks → fire
187/// rerank), so per-chunk noise washes out. A tree-sitter parse here would
188/// add chunker dependencies and per-chunk parse cost for a coarse signal.
189///
190/// Whitespace-only and empty chunks return `false` (degenerate;
191/// they don't push the corpus toward "prose-dominated").
192#[must_use]
193fn chunk_is_prose_dominated(chunk: &CodeChunk) -> bool {
194 let total = chunk.content.len();
195 if total == 0 {
196 return false;
197 }
198 #[expect(
199 clippy::cast_precision_loss,
200 reason = "chunk content length never exceeds f32 mantissa precision in practice"
201 )]
202 let ratio = prose_byte_count(&chunk.content) as f32 / total as f32;
203 ratio > prose_density::CHUNK_DOMINANCE_FRAC
204}
205
/// Total the bytes of `source` that fall inside docstring / comment
/// constructs.
///
/// Constructs recognised, in priority order at each position:
/// - Python triple-quoted strings (`"""..."""`, `'''...'''`) — the
///   docstring shape that motivated I#64. Delimiters are included in
///   the count.
/// - C-style block comments (`/* ... */`) — JS-doc, Rust block docs,
///   Java/C/C++. Delimiters are included in the count.
/// - Line comments opened by `//` (which also covers `///`) or `#` —
///   Python, shell, Rust, JS/TS, Go, etc. The count runs up to, but
///   does not include, the terminating newline. `#!` shebang lines are
///   counted like any other `#` line.
///
/// An unterminated triple-quote or block comment extends to the end of
/// `source`.
///
/// The scan is a single forward pass, so no byte is counted twice.
/// It is deliberately naive: escape sequences are not interpreted and
/// string-vs-comment precedence is not modelled per language. The
/// result is calibrated for the corpus-level "is this prose-dominated"
/// question, not for syntactic accuracy.
#[must_use]
fn prose_byte_count(source: &str) -> usize {
    // Offset one past the first `closer` found at or after
    // `body_start`, or `data.len()` when the construct never closes.
    fn delimited_end(data: &[u8], body_start: usize, closer: &[u8]) -> usize {
        data[body_start..]
            .windows(closer.len())
            .position(|window| window == closer)
            .map_or(data.len(), |offset| body_start + offset + closer.len())
    }

    // Offset of the first newline at or after `from`, or `data.len()`
    // when the line runs to the end of the input.
    fn line_end(data: &[u8], from: usize) -> usize {
        data[from..]
            .iter()
            .position(|&byte| byte == b'\n')
            .map_or(data.len(), |offset| from + offset)
    }

    let data = source.as_bytes();
    let mut counted = 0usize;
    let mut pos = 0usize;

    while pos < data.len() {
        let tail = &data[pos..];
        let construct_end = if tail.starts_with(b"\"\"\"") || tail.starts_with(b"'''") {
            // The closing delimiter must repeat the opening quote style.
            delimited_end(data, pos + 3, &tail[..3])
        } else if tail.starts_with(b"/*") {
            delimited_end(data, pos + 2, b"*/")
        } else if tail.starts_with(b"//") || tail[0] == b'#' {
            line_end(data, pos)
        } else {
            // Plain code byte: not prose, move on.
            pos += 1;
            continue;
        };
        counted += construct_end - pos;
        pos = construct_end;
    }

    counted
}
280
281impl RipvecIndex {
282 /// Build a [`RipvecIndex`] by walking `root` and indexing every
283 /// supported file. Uses `encoder.embed_root` (ripvec's chunker +
284 /// model2vec encode) and builds a fresh BM25 index over the
285 /// resulting chunks.
286 ///
287 /// `pagerank_lookup` is the optional structural-prior map (file
288 /// path → normalized PageRank) used by the final ranking layer;
289 /// pass `None` to disable. `pagerank_alpha` is the corresponding
290 /// boost strength.
291 ///
292 /// # Errors
293 ///
294 /// Returns the underlying error if `embed_root` fails.
295 pub fn from_root(
296 root: &Path,
297 encoder: StaticEncoder,
298 cfg: &SearchConfig,
299 profiler: &Profiler,
300 pagerank_lookup: Option<HashMap<String, f32>>,
301 pagerank_alpha: f32,
302 ) -> crate::Result<Self> {
303 // Wrap once at construction. The per-query `apply_pagerank_layer`
304 // path clones the Arc (pointer bump), not the HashMap (10K+ String
305 // allocs on a 1M-chunk corpus).
306 let pagerank_lookup = pagerank_lookup.map(std::sync::Arc::new);
307 let (chunks, embeddings_vec) = encoder.embed_root(root, cfg, profiler)?;
308 // Convert Vec<Vec<f32>> -> Array2<f32> at the boundary. The
309 // upstream embed_root produces ragged-friendly Vec<Vec<>>; we
310 // pack into one contiguous row-major buffer so BLAS sgemv can
311 // do per-query cosine in one call. Cost is a single sequential
312 // memcpy pass (~1 GB at memory bandwidth = ~5 ms on a 1M-chunk
313 // corpus) — negligible against the 60 s build phase.
314 let hidden_dim = embeddings_vec.first().map_or(0, std::vec::Vec::len);
315 let n_chunks = embeddings_vec.len();
316 let mut flat: Vec<f32> = Vec::with_capacity(n_chunks * hidden_dim);
317 for row in embeddings_vec {
318 debug_assert_eq!(
319 row.len(),
320 hidden_dim,
321 "ragged embeddings: row of {} vs expected {hidden_dim}",
322 row.len()
323 );
324 flat.extend(row);
325 }
326 let embeddings = ndarray::Array2::from_shape_vec((n_chunks, hidden_dim), flat)
327 .map_err(|e| crate::Error::Other(anyhow::anyhow!("embeddings reshape: {e}")))?;
328 let bm25 = {
329 let _g = profiler.phase("bm25_build");
330 Bm25Index::build(&chunks)
331 };
332 let (file_mapping, language_mapping) = {
333 let _g = profiler.phase("mappings");
334 build_mappings(&chunks)
335 };
336 let corpus_class = CorpusClass::classify(&chunks);
337 // Capture walk options for future reconciles, and populate the
338 // manifest from the same file set the indexer consumed. We
339 // re-walk + re-read here because `embed_root` doesn't surface
340 // the per-file bytes back to us; the redundant read is paid
341 // once at index build time, not per query. On reconcile we
342 // only re-read files whose stat tuple changed.
343 let walk_options = cfg.walk_options();
344 let root_buf = root.to_path_buf();
345 let manifest = {
346 let _g = profiler.phase("manifest_build");
347 build_manifest(&root_buf, &walk_options)
348 };
349 Ok(Self {
350 chunks,
351 embeddings,
352 bm25,
353 encoder: std::sync::Arc::new(encoder),
354 file_mapping,
355 language_mapping,
356 pagerank_lookup,
357 pagerank_alpha,
358 corpus_class,
359 root: root_buf,
360 walk_options,
361 manifest,
362 })
363 }
364
365 /// Build a new index by incrementally applying `diff` against
366 /// `self`.
367 ///
368 /// **The selective-rebuild path that v3.1.0 punted on.** Re-embeds
369 /// only the dirty + new files, splices them into the existing
370 /// chunks/embeddings, drops deleted files' chunks, rebuilds BM25
371 /// and the per-file/per-language mappings from the new chunk set,
372 /// reclassifies the corpus, and refreshes the manifest entries
373 /// for the affected files.
374 ///
375 /// # Cost shape
376 ///
377 /// Roughly `O(|diff.dirty| + |diff.new|)` chunk + embed work plus
378 /// `O(|self.chunks|)` BM25 rebuild. On a 5000-chunk corpus with
379 /// one file changed: ~5-10 ms (embed one file) + ~50 ms (BM25
380 /// rebuild) = ~60 ms — vs. ~270 ms-1 s for a full
381 /// [`Self::from_root`] rebuild. The full-build cost is paid only
382 /// at cold start.
383 ///
384 /// # BM25
385 ///
386 /// BM25 is rebuilt from scratch over the new chunks vec rather
387 /// than incrementally updated. Inverted-postings incremental
388 /// update is correct but adds significant code; full rebuild at
389 /// our chunk counts is fast enough that the simpler path wins.
390 ///
391 /// # Errors
392 ///
393 /// Returns the underlying error if [`StaticEncoder::embed_paths`]
394 /// fails or if the embedding matrix shape is invalid.
395 pub fn apply_diff(&self, diff: &Diff, profiler: &Profiler) -> crate::Result<Self> {
396 use std::collections::HashSet;
397
398 // 1. Identify which existing chunk indices to drop. `file_mapping`
399 // keys are the rel_paths the chunker wrote. Manifest paths are
400 // absolute. Map manifest paths to rel_paths by stripping
401 // `self.root` (the same operation `chunk_one_file` performs).
402 let rel_path_for = |p: &Path| -> String {
403 p.strip_prefix(&self.root)
404 .unwrap_or(p)
405 .display()
406 .to_string()
407 };
408 let mut removed_indices: HashSet<usize> = HashSet::new();
409 for path in diff
410 .deleted
411 .iter()
412 .chain(diff.dirty.iter())
413 .chain(diff.new.iter())
414 {
415 let rel = rel_path_for(path);
416 if let Some(indices) = self.file_mapping.get(&rel) {
417 removed_indices.extend(indices.iter().copied());
418 }
419 }
420
421 // 2. Build the kept chunks + embeddings from `self`. Cloning the
422 // embedding rows is one allocation per kept chunk; for a 5k-
423 // chunk corpus that's a single sequential pass over 5 MB.
424 let mut kept_chunks: Vec<CodeChunk> = Vec::with_capacity(self.chunks.len());
425 let mut kept_emb_rows: Vec<Vec<f32>> = Vec::with_capacity(self.chunks.len());
426 for (i, chunk) in self.chunks.iter().enumerate() {
427 if removed_indices.contains(&i) {
428 continue;
429 }
430 kept_chunks.push(chunk.clone());
431 kept_emb_rows.push(self.embeddings.row(i).to_vec());
432 }
433
434 // 3. Embed the dirty + new files. (Dirty files were already
435 // dropped from `kept_chunks` above; their new chunks come in
436 // here as fresh entries.)
437 let mut to_embed: Vec<std::path::PathBuf> = Vec::new();
438 to_embed.extend(diff.new.iter().cloned());
439 to_embed.extend(diff.dirty.iter().cloned());
440 let (new_chunks, new_embs) = if to_embed.is_empty() {
441 (Vec::new(), Vec::new())
442 } else {
443 let _g = profiler.phase("apply_diff_embed");
444 self.encoder.embed_paths(&self.root, &to_embed, profiler)?
445 };
446 kept_chunks.extend(new_chunks);
447 kept_emb_rows.extend(new_embs);
448
449 // 4. Re-pack embeddings into a contiguous Array2 so BLAS sgemv
450 // still works at query time.
451 let n = kept_emb_rows.len();
452 let hidden_dim = kept_emb_rows
453 .first()
454 .map_or(self.embeddings.ncols(), Vec::len);
455 let mut flat: Vec<f32> = Vec::with_capacity(n * hidden_dim);
456 for row in kept_emb_rows {
457 flat.extend(row);
458 }
459 let embeddings = if n == 0 {
460 ndarray::Array2::<f32>::zeros((0, hidden_dim))
461 } else {
462 ndarray::Array2::from_shape_vec((n, hidden_dim), flat).map_err(|e| {
463 crate::Error::Other(anyhow::anyhow!("apply_diff embeddings reshape: {e}"))
464 })?
465 };
466
467 // 5. Rebuild BM25 from the new chunks (simpler than incremental
468 // postings update; cheap at our chunk counts). Rebuild
469 // mappings + corpus_class from the new chunks too.
470 let bm25 = {
471 let _g = profiler.phase("apply_diff_bm25");
472 Bm25Index::build(&kept_chunks)
473 };
474 let (file_mapping, language_mapping) = {
475 let _g = profiler.phase("apply_diff_mappings");
476 build_mappings(&kept_chunks)
477 };
478 let corpus_class = CorpusClass::classify(&kept_chunks);
479
480 // 6. Refresh manifest: drop deleted entries, refresh dirty
481 // entries with new (mtime, size, ino, blake3), insert new
482 // entries. blake3 requires the file bytes, so this re-reads
483 // each changed file once. Negligible (~10 µs/file warm).
484 //
485 // Also apply `diff.touched_clean`: these are files whose stat
486 // tuple changed but whose content (blake3) is identical. The
487 // `diff_against_filesystem` path clones `self.manifest` before
488 // calling `diff_against_walk`, so the in-place stat-tuple
489 // refresh inside `diff_against_walk` is discarded. Without this
490 // step, every touched-but-unchanged file pays one blake3 read
491 // per reconcile cycle instead of zero. Applying the entries here
492 // — using the refreshed `FileEntry` already computed by
493 // `diff_against_walk` — restores the "one blake3 then zero"
494 // invariant on the new index.
495 let mut manifest = self.manifest.clone();
496 for path in &diff.deleted {
497 manifest.files.remove(path);
498 }
499 for path in diff.new.iter().chain(diff.dirty.iter()) {
500 if let Ok(entry) = FileEntry::from_path(path) {
501 manifest.insert(path.clone(), entry);
502 }
503 }
504 // Apply touched_clean refreshes: stat tuple already computed by
505 // diff_against_walk; no re-read or re-hash needed.
506 for (path, refreshed_entry) in &diff.touched_clean {
507 if let Some(entry_mut) = manifest.files.get_mut(path) {
508 entry_mut.mtime = refreshed_entry.mtime;
509 entry_mut.size = refreshed_entry.size;
510 entry_mut.ino = refreshed_entry.ino;
511 // blake3 is unchanged (that's the definition of touched_clean)
512 // but we overwrite defensively for consistency.
513 entry_mut.blake3 = refreshed_entry.blake3;
514 }
515 }
516
517 Ok(Self {
518 chunks: kept_chunks,
519 embeddings,
520 bm25,
521 encoder: std::sync::Arc::clone(&self.encoder),
522 file_mapping,
523 language_mapping,
524 pagerank_lookup: self.pagerank_lookup.clone(),
525 pagerank_alpha: self.pagerank_alpha,
526 corpus_class,
527 root: self.root.clone(),
528 walk_options: self.walk_options.clone(),
529 manifest,
530 })
531 }
532
533 /// Compare the manifest captured at build time against the current
534 /// filesystem state under [`Self::root`], using the same
535 /// [`WalkOptions`] used for the original index build.
536 ///
537 /// Returns a [`Diff`] enumerating dirty, new, and deleted files.
538 /// A zero-cost ([`Diff::is_empty`]) result means the index is
539 /// up-to-date and no rebuild is needed.
540 ///
541 /// # Cost
542 ///
543 /// Walk + per-file `stat()` for the cheap-path files (typically all
544 /// of them between successive queries). Blake3 verification is paid
545 /// only on the rare files where the stat tuple mismatches. On a
546 /// 200-file repo with no changes: sub-millisecond. On a 92k-file
547 /// repo with no changes: ~100-130 ms (the walk dominates).
548 ///
549 /// # Mutation
550 ///
551 /// This method takes `&self` and works on a clone of the manifest,
552 /// so the optimization of "refresh touched-but-unchanged stat
553 /// tuples" from [`diff_against_walk`] is discarded here. In
554 /// practice that means a file repeatedly touched without content
555 /// change pays one blake3 read per reconcile rather than zero —
556 /// negligible at our file sizes.
557 #[must_use]
558 pub fn diff_against_filesystem(&self) -> Diff {
559 let files = collect_files_with_options(&self.root, &self.walk_options);
560 let mut manifest = self.manifest.clone();
561 diff_against_walk(&mut manifest, &files)
562 }
563
564 /// Canonical root the index was built against.
565 #[must_use]
566 pub fn root(&self) -> &Path {
567 &self.root
568 }
569
570 /// Walk options captured at build time.
571 #[must_use]
572 pub fn walk_options(&self) -> &WalkOptions {
573 &self.walk_options
574 }
575
576 /// Manifest of tracked files (read-only access).
577 #[must_use]
578 pub fn manifest(&self) -> &Manifest {
579 &self.manifest
580 }
581
582 /// The index's corpus classification, computed at build time.
583 ///
584 /// Used by the MCP rerank gate to decide whether the L-12
585 /// cross-encoder fires on a given query.
586 #[must_use]
587 pub fn corpus_class(&self) -> CorpusClass {
588 self.corpus_class
589 }
590
591 /// Number of indexed chunks.
592 #[must_use]
593 pub fn len(&self) -> usize {
594 self.chunks.len()
595 }
596
597 /// Whether the index has zero chunks.
598 #[must_use]
599 pub fn is_empty(&self) -> bool {
600 self.chunks.is_empty()
601 }
602
603 /// Indexed chunks (read-only access).
604 #[must_use]
605 pub fn chunks(&self) -> &[CodeChunk] {
606 &self.chunks
607 }
608
609 /// Indexed embeddings (read-only access).
610 ///
611 /// `Array2<f32>` of shape `[n_chunks, hidden_dim]`, row-major. Row
612 /// `i` is the L2-normalized embedding of chunk `i`, so cosine
613 /// similarity reduces to a dot product. Callers that need their
614 /// own similarity arithmetic (`find_similar`, `find_duplicates`)
615 /// should use `embeddings.row(i)` for a single-row view or
616 /// `embeddings.dot(&query)` for a one-call BLAS GEMV.
617 #[must_use]
618 pub fn embeddings(&self) -> &ndarray::Array2<f32> {
619 &self.embeddings
620 }
621
622 /// Search the index and return ranked `(chunk_index, score)` pairs.
623 ///
624 /// `mode = SearchMode::Hybrid` (default) fuses semantic + BM25 via
625 /// RRF; `Semantic` and `Keyword` use one signal each.
626 ///
627 /// `filter_languages` and `filter_paths` build a selector mask
628 /// that restricts retrieval to chunks in the named files /
629 /// languages.
630 #[must_use]
631 pub fn search(
632 &self,
633 query: &str,
634 top_k: usize,
635 mode: SearchMode,
636 alpha: Option<f32>,
637 filter_languages: Option<&[String]>,
638 filter_paths: Option<&[String]>,
639 ) -> Vec<(usize, f32)> {
640 if self.is_empty() || query.trim().is_empty() {
641 return Vec::new();
642 }
643 let selector = self.build_selector(filter_languages, filter_paths);
644
645 let raw = match mode {
646 SearchMode::Keyword => search_bm25(query, &self.bm25, top_k, selector.as_deref()),
647 SearchMode::Semantic => {
648 let q_emb = self.encoder.encode_query(query);
649 search_semantic(&q_emb, &self.embeddings, top_k, selector.as_deref())
650 }
651 SearchMode::Hybrid => {
652 let q_emb = self.encoder.encode_query(query);
653 search_hybrid(
654 query,
655 &q_emb,
656 &self.embeddings,
657 &self.chunks,
658 &self.bm25,
659 top_k,
660 alpha,
661 selector.as_deref(),
662 )
663 }
664 };
665
666 self.apply_pagerank_layer(raw)
667 }
668
669 /// Build a selector mask from optional language/path filters.
670 /// Returns `None` when no filters are set (search runs over the
671 /// full corpus).
672 fn build_selector(
673 &self,
674 filter_languages: Option<&[String]>,
675 filter_paths: Option<&[String]>,
676 ) -> Option<Vec<usize>> {
677 let mut selector: Vec<usize> = Vec::new();
678 if let Some(langs) = filter_languages {
679 for lang in langs {
680 if let Some(ids) = self.language_mapping.get(lang) {
681 selector.extend(ids.iter().copied());
682 }
683 }
684 }
685 if let Some(paths) = filter_paths {
686 for path in paths {
687 if let Some(ids) = self.file_mapping.get(path) {
688 selector.extend(ids.iter().copied());
689 }
690 }
691 }
692 if selector.is_empty() {
693 None
694 } else {
695 selector.sort_unstable();
696 selector.dedup();
697 Some(selector)
698 }
699 }
700
701 /// Layer ripvec's PageRank boost on top of semble's ranked results.
702 ///
703 /// No-op when `pagerank_lookup` is `None` or the boost strength
704 /// is zero. Otherwise re-uses
705 /// [`crate::hybrid::boost_with_pagerank`] so the PageRank semantic
706 /// stays consistent with ripvec's other code paths.
707 fn apply_pagerank_layer(&self, mut results: Vec<(usize, f32)>) -> Vec<(usize, f32)> {
708 let Some(lookup) = &self.pagerank_lookup else {
709 return results;
710 };
711 if results.is_empty() || self.pagerank_alpha <= 0.0 {
712 return results;
713 }
714 // Uses the shared `ranking::PageRankBoost` layer for behavioral
715 // parity with the BERT CLI, MCP `search_code`, and LSP paths.
716 // All five callers now apply the same sigmoid-on-percentile
717 // curve.
718 // `lookup` is `Arc<HashMap<_,_>>`; cloning the Arc is a pointer
719 // bump, not a HashMap copy. The earlier `lookup.clone()` here
720 // cloned the entire map per query (~10K String allocations on
721 // a 1M-chunk corpus).
722 let layers: Vec<Box<dyn crate::ranking::RankingLayer>> = vec![Box::new(
723 crate::ranking::PageRankBoost::new(std::sync::Arc::clone(lookup), self.pagerank_alpha),
724 )];
725 crate::ranking::apply_chain(&mut results, &self.chunks, &layers);
726 results
727 }
728}
729
730impl crate::searchable::SearchableIndex for RipvecIndex {
731 fn chunks(&self) -> &[CodeChunk] {
732 RipvecIndex::chunks(self)
733 }
734
735 /// Trait-shape search: text-only, no engine-specific knobs.
736 ///
737 /// The trait surface is the LSP-callers' common ground. Filters
738 /// (language, path) and the alpha auto-detect override are not
739 /// surfaced through the trait because no LSP module uses them.
740 fn search(&self, query_text: &str, top_k: usize, mode: SearchMode) -> Vec<(usize, f32)> {
741 RipvecIndex::search(self, query_text, top_k, mode, None, None, None)
742 }
743
744 /// Use chunk `chunk_idx`'s own embedding as the query vector and
745 /// rank everything else by cosine similarity (semantic-only) or
746 /// blend with BM25 (hybrid). Falls back to text-only keyword
747 /// search when the chunk index is out of range.
748 ///
749 /// Mirrors the [`HybridIndex`] equivalent so `goto_definition`
750 /// and `goto_implementation` work identically across engines.
751 fn search_from_chunk(
752 &self,
753 chunk_idx: usize,
754 query_text: &str,
755 top_k: usize,
756 mode: SearchMode,
757 ) -> Vec<(usize, f32)> {
758 // RipvecIndex stores embeddings; if the source chunk is in
759 // range we can rank by similarity against its vector. Out of
760 // range or keyword-only mode: fall back to text search.
761 if chunk_idx >= self.embeddings().nrows() {
762 return RipvecIndex::search(
763 self,
764 query_text,
765 top_k,
766 SearchMode::Keyword,
767 None,
768 None,
769 None,
770 );
771 }
772 match mode {
773 SearchMode::Keyword => RipvecIndex::search(
774 self,
775 query_text,
776 top_k,
777 SearchMode::Keyword,
778 None,
779 None,
780 None,
781 ),
782 SearchMode::Semantic | SearchMode::Hybrid => {
783 // Cosine via dot product over L2-normalized rows.
784 // Parallel sgemv across row-shards to saturate
785 // aggregate memory bandwidth instead of the single-core
786 // sgemv ceiling.
787 let source = self.embeddings().row(chunk_idx);
788 let scores =
789 crate::encoder::ripvec::hybrid::parallel_sgemv(self.embeddings(), &source);
790 let mut scored: Vec<(usize, f32)> = scores
791 .iter()
792 .enumerate()
793 .filter(|(i, _)| *i != chunk_idx)
794 .map(|(i, &s)| (i, s))
795 .collect();
796 if scored.len() > top_k {
797 scored.select_nth_unstable_by(top_k - 1, |a, b| {
798 b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0))
799 });
800 scored.truncate(top_k);
801 }
802 scored.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
803 scored
804 }
805 }
806 }
807
808 fn as_any(&self) -> &dyn std::any::Any {
809 self
810 }
811}
812
813/// Locate the chunk index for a given file path and 1-based line number.
814///
815/// Used by `find_similar` (ripvec-mcp) to resolve an `lsp_location` whose
816/// `start_line` may be the symbol-identifier line (as returned by
817/// `get_repo_map`'s `symbols[].lsp_location`) rather than the chunk's own
818/// `start_line` (the block-start, which may precede the identifier by the
819/// length of doc-comments, attributes, or decorators).
820///
821/// # Lookup strategy (I#50 three-step spec)
822///
823/// 1. **Exact start-line match**: return the first chunk whose
824/// `start_line == target_line_1based`. Cheap O(n) scan that covers the
825/// common case where the caller already has a chunk-start coordinate.
826///
827/// 2. **Range containment**: return the first chunk whose closed interval
828/// `[start_line, end_line]` contains `target_line_1based`. Covers the
829/// I#50 failure case where `get_repo_map.symbols[].lsp_location.start_line`
830/// is the identifier line (inside the chunk) rather than the block start.
831///
832/// 3. **Miss**: return `None`. The caller is responsible for returning empty
833/// results — `find_similar` must NOT propagate this as an internal error.
834///
835/// # Path matching
836///
837/// Matches on a strict suffix: a chunk at path `a/b/c.rs` matches a query
838/// for `b/c.rs` or `c.rs` (the suffix is separated by `/`). Absolute paths
839/// are compared directly. This mirrors the convention used throughout
840/// `find_similar_chunk_idx` in `ripvec-mcp`.
841///
842/// # Arguments
843///
844/// * `chunks` — the indexed chunk slice (from `RipvecIndex::chunks()`).
845/// * `file_path` — path to match against `chunk.file_path`.
846/// * `target_line_1based` — the 1-based line to locate, matching the
847/// 1-based `lsp_location.start_line` → `target_line_1based` convention.
848#[must_use]
849pub fn find_chunk_containing_line(
850 chunks: &[CodeChunk],
851 file_path: &str,
852 target_line_1based: usize,
853) -> Option<usize> {
854 let path_matches = |chunk: &CodeChunk| -> bool {
855 let cp = &chunk.file_path;
856 cp == file_path
857 || (cp.len() > file_path.len()
858 && cp.ends_with(file_path)
859 && cp.as_bytes()[cp.len() - file_path.len() - 1] == b'/')
860 };
861
862 // Step 1: exact start_line match.
863 if let Some(idx) = chunks
864 .iter()
865 .position(|c| path_matches(c) && c.start_line == target_line_1based)
866 {
867 return Some(idx);
868 }
869
870 // Step 2: range containment — find the first chunk whose [start_line,
871 // end_line] interval contains the target line.
872 chunks.iter().position(|c| {
873 path_matches(c) && c.start_line <= target_line_1based && target_line_1based <= c.end_line
874 })
875}
876
877/// Build (file_path → chunk indices, language → chunk indices) mappings.
878/// Build the per-file manifest by walking `root` with `walk_options`
879/// and stat + read + blake3 each file. Used at index construction; on
880/// reconcile, [`RipvecIndex::diff_against_filesystem`] uses the cheap
881/// stat-tuple path and only re-reads files whose tuple mismatches the
882/// stored entry.
883///
884/// Files that can't be read or stat'd are silently skipped; they will
885/// re-appear in the diff as `new` if they become readable later, or
886/// as missing on the next reconcile.
887fn build_manifest(root: &Path, walk_options: &WalkOptions) -> Manifest {
888 let mut manifest = Manifest::new();
889 let files = collect_files_with_options(root, walk_options);
890 for path in files {
891 let (Ok(metadata), Ok(bytes)) = (std::fs::metadata(&path), std::fs::read(&path)) else {
892 continue;
893 };
894 let entry = FileEntry::from_bytes(&metadata, &bytes);
895 manifest.insert(path, entry);
896 }
897 manifest
898}
899
900fn build_mappings(
901 chunks: &[CodeChunk],
902) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
903 let mut file_to_id: HashMap<String, Vec<usize>> = HashMap::new();
904 let mut lang_to_id: HashMap<String, Vec<usize>> = HashMap::new();
905 for (i, chunk) in chunks.iter().enumerate() {
906 file_to_id
907 .entry(chunk.file_path.clone())
908 .or_default()
909 .push(i);
910 // The semble port's chunker stores language inferentially (via
911 // extension); the per-chunk `language` field isn't populated on
912 // this path. The mapping is keyed on file extension as a proxy
913 // so `filter_languages: Some(&["rs"])` works.
914 if let Some(ext) = Path::new(&chunk.file_path)
915 .extension()
916 .and_then(|e| e.to_str())
917 {
918 lang_to_id.entry(ext.to_string()).or_default().push(i);
919 }
920 }
921 (file_to_id, lang_to_id)
922}
923
924#[cfg(test)]
925mod tests {
926 use super::*;
927
928 // ── I#64 / B-0028: within-chunk prose-density signal ─────────────────────
929
930 /// Helper: build a chunk on a `.py` path with the given source content.
931 /// `.py` file-extension classifies as Code via `is_prose_path`, so the
932 /// resulting prose classification depends purely on chunk content.
933 fn py_chunk(content: &str) -> crate::chunk::CodeChunk {
934 crate::chunk::CodeChunk {
935 file_path: "src/foo.py".to_string(),
936 name: "test".to_string(),
937 kind: "function_definition".to_string(),
938 content_kind: crate::chunk::ContentKind::Code,
939 start_line: 1,
940 end_line: 10,
941 symbol_line: 1,
942 content: content.to_string(),
943 enriched_content: content.to_string(),
944 qualified_name: None,
945 }
946 }
947
948 /// A chunk dominated by a Python triple-quoted docstring is prose.
949 #[test]
950 fn chunk_is_prose_dominated_python_docstring() {
951 let c = py_chunk(
952 "def handle_error(self, exc):\n \"\"\"This is the docstring \
953 that explains the error-handling contract. It dwarfs the body \
954 and so the chunk is dominated by prose.\"\"\"\n return None\n",
955 );
956 assert!(
957 chunk_is_prose_dominated(&c),
958 "Python triple-quoted docstring dominating the chunk must be \
959 recognised as prose-dominated"
960 );
961 }
962
963 /// A chunk dominated by code syntax is not prose.
964 #[test]
965 fn chunk_is_prose_dominated_pure_code_is_false() {
966 let c = py_chunk("def f(x, y):\n z = x * y + 2\n return z * z - (x + y)\n");
967 assert!(
968 !chunk_is_prose_dominated(&c),
969 "Pure code chunk (no docstring, no comments) must not be \
970 prose-dominated"
971 );
972 }
973
974 /// A chunk dominated by `//` line comments is prose.
975 #[test]
976 fn chunk_is_prose_dominated_line_comments() {
977 // ~80% comment bytes vs ~20% code.
978 let c = py_chunk(
979 "// This is a long-form explanation of why the function exists.\n\
980 // It spans multiple lines and dominates the chunk by byte count.\n\
981 // The actual code is a tiny one-liner.\n\
982 fn f() { 1 }\n",
983 );
984 assert!(
985 chunk_is_prose_dominated(&c),
986 "Chunk dominated by `//` line comments must be prose-dominated"
987 );
988 }
989
990 /// A chunk dominated by a `/* ... */` block comment is prose.
991 #[test]
992 fn chunk_is_prose_dominated_block_comment() {
993 let c = py_chunk(
994 "/* JS-doc style block comment describing the function in detail \
995 and taking up most of the chunk by byte volume. */\nfn g() {}\n",
996 );
997 assert!(
998 chunk_is_prose_dominated(&c),
999 "Chunk dominated by `/* ... */` block comment must be \
1000 prose-dominated"
1001 );
1002 }
1003
1004 /// Empty chunk content does not push corpus toward prose.
1005 #[test]
1006 fn chunk_is_prose_dominated_empty_is_false() {
1007 let c = py_chunk("");
1008 assert!(
1009 !chunk_is_prose_dominated(&c),
1010 "Empty chunk content must classify as not-prose (degenerate case)"
1011 );
1012 }
1013
1014 /// `CorpusClass::classify`: synthetic (prose=10, code=20) → Mixed.
1015 /// Matches the issue text's pure-unit test shape — given a chunk-count
1016 /// summary, assert the policy.
1017 #[test]
1018 fn corpus_class_classify_10_prose_20_code_is_mixed() {
1019 let prose = py_chunk(
1020 "def f():\n \"\"\"A substantial docstring whose byte count \
1021 dominates the chunk.\"\"\"\n pass\n",
1022 );
1023 let code = py_chunk("def g(x):\n return x + 1\n");
1024 // Pre-condition: helper agrees.
1025 assert!(chunk_is_prose_dominated(&prose));
1026 assert!(!chunk_is_prose_dominated(&code));
1027
1028 let mut chunks = Vec::new();
1029 for _ in 0..10 {
1030 chunks.push(prose.clone());
1031 }
1032 for _ in 0..20 {
1033 chunks.push(code.clone());
1034 }
1035 assert_eq!(
1036 CorpusClass::classify(&chunks),
1037 CorpusClass::Mixed,
1038 "10 prose : 20 code (~33% prose chunks) must classify as Mixed; \
1039 threshold is >= 30% prose"
1040 );
1041 assert!(
1042 CorpusClass::classify(&chunks).rerank_eligible(),
1043 "Mixed must be rerank-eligible — the I#64 / B-0028 fire path"
1044 );
1045 }
1046
1047 /// `CorpusClass::classify`: synthetic (prose=20, code=10) → Mixed,
1048 /// rerank-eligible (still under the 70% Docs cut but well over the
1049 /// 30% Mixed cut).
1050 #[test]
1051 fn corpus_class_classify_20_prose_10_code_is_rerank_eligible() {
1052 let prose = py_chunk(
1053 "def f():\n \"\"\"Substantial docstring that dominates the \
1054 chunk's bytes — a Mnemosyne-class signature.\"\"\"\n pass\n",
1055 );
1056 let code = py_chunk("def g(x):\n return x + 1\n");
1057 let mut chunks = Vec::new();
1058 for _ in 0..20 {
1059 chunks.push(prose.clone());
1060 }
1061 for _ in 0..10 {
1062 chunks.push(code.clone());
1063 }
1064 let class = CorpusClass::classify(&chunks);
1065 assert!(
1066 class.rerank_eligible(),
1067 "20 prose : 10 code (~67% prose chunks) must be rerank-eligible"
1068 );
1069 }
1070
1071 /// `CorpusClass::classify`: synthetic (prose=2, code=28) → Code, not
1072 /// rerank-eligible. Tests that low prose density correctly stays off.
1073 #[test]
1074 fn corpus_class_classify_low_prose_is_code() {
1075 let prose = py_chunk("def f():\n \"\"\"Docstring dominating the chunk's bytes.\"\"\"\n");
1076 let code = py_chunk("def g(x):\n return x + 1\n");
1077 let mut chunks = Vec::new();
1078 for _ in 0..2 {
1079 chunks.push(prose.clone());
1080 }
1081 for _ in 0..28 {
1082 chunks.push(code.clone());
1083 }
1084 assert_eq!(
1085 CorpusClass::classify(&chunks),
1086 CorpusClass::Code,
1087 "2:28 prose:code (~7% prose chunks) must classify as Code"
1088 );
1089 }
1090
1091 /// `is_prose_path` and chunk-content density are independent signals
1092 /// and stack (a `.md` file is prose regardless of content; a `.py`
1093 /// file becomes prose iff its chunk content is prose-dominated).
1094 #[test]
1095 fn corpus_class_classify_path_and_content_signals_compose() {
1096 let md_chunk = crate::chunk::CodeChunk {
1097 file_path: "README.md".to_string(),
1098 name: "readme".to_string(),
1099 kind: "paragraph".to_string(),
1100 content_kind: crate::chunk::ContentKind::Docs,
1101 start_line: 1,
1102 end_line: 5,
1103 symbol_line: 1,
1104 content: "function foo() { return 1; }".to_string(), // code-like content
1105 enriched_content: "function foo() { return 1; }".to_string(),
1106 qualified_name: None,
1107 };
1108 // Even though the content is code-like, the .md extension means
1109 // this counts as prose under the path signal.
1110 let chunks = vec![md_chunk];
1111 assert_eq!(
1112 CorpusClass::classify(&chunks),
1113 CorpusClass::Docs,
1114 "A .md path counts as prose under is_prose_path even if its \
1115 content is code-like — path and content signals OR together"
1116 );
1117 }
1118
1119 // ── existing tests below ─────────────────────────────────────────────────
1120
1121 /// Test-only constructor that bypasses `from_root` to allow unit
1122 /// tests to inject pre-built state (chunks, embeddings, mappings,
1123 /// manifest) without requiring a real model download.
1124 ///
1125 /// For tests that call `apply_diff` with a non-empty `diff.new` or
1126 /// `diff.dirty`, the caller must supply a real encoder because
1127 /// `apply_diff` calls `encoder.embed_paths`.
1128 #[allow(clippy::too_many_arguments)]
1129 fn new_for_test(
1130 chunks: Vec<crate::chunk::CodeChunk>,
1131 embeddings: ndarray::Array2<f32>,
1132 encoder: std::sync::Arc<StaticEncoder>,
1133 file_mapping: HashMap<String, Vec<usize>>,
1134 language_mapping: HashMap<String, Vec<usize>>,
1135 manifest: Manifest,
1136 root: std::path::PathBuf,
1137 walk_options: WalkOptions,
1138 ) -> RipvecIndex {
1139 let bm25 = Bm25Index::build(&chunks);
1140 let corpus_class = CorpusClass::classify(&chunks);
1141 RipvecIndex {
1142 chunks,
1143 embeddings,
1144 bm25,
1145 encoder,
1146 file_mapping,
1147 language_mapping,
1148 pagerank_lookup: None,
1149 pagerank_alpha: 0.0,
1150 corpus_class,
1151 root,
1152 walk_options,
1153 manifest,
1154 }
1155 }
1156
1157 /// Compile-time check that `RipvecIndex` carries the right method
1158 /// shape for the CLI to call.
1159 #[test]
1160 fn semble_index_search_signature_compiles() {
1161 fn shape_check(
1162 idx: &RipvecIndex,
1163 query: &str,
1164 top_k: usize,
1165 mode: SearchMode,
1166 ) -> Vec<(usize, f32)> {
1167 idx.search(query, top_k, mode, None, None, None)
1168 }
1169 // Reference to keep type-check live across dead-code analysis.
1170 let _ = shape_check;
1171 }
1172
    /// `behavior:pagerank-no-op-when-graph-absent` — when constructed
    /// without a PageRank lookup, the layer is a pure pass-through.
    /// (Asserted via the `apply_pagerank_layer` early-return path.)
    ///
    /// NOTE(review): this test is a placeholder. Its body contains no
    /// assertion and never calls `apply_pagerank_layer`, so it passes
    /// unconditionally; the behaviour named above is not checked here.
    #[test]
    fn pagerank_layer_no_op_when_graph_absent() {
        // We can't easily build a RipvecIndex without a real encoder
        // (which requires a model download). Instead, exercise the
        // pass-through logic on a hand-built struct via the private
        // method. The function returns its input unchanged when
        // pagerank_lookup is None.
        //
        // Structural assertion: apply_pagerank_layer's first match
        // statement returns the input directly when lookup is None;
        // this is a single-branch invariant verified by inspection.
        // Behavioural verification is part of P5.1's parity test.
        //
        // The statement below only discards a string literal pointing the
        // reader at the docs; it exercises no code.
        let _ = "see apply_pagerank_layer docs";
    }
1190
    /// Corner case: a file appears in `diff.new` (absent from manifest)
    /// but `file_mapping` still holds stale chunk indices for it from a
    /// prior partial reconcile. Without the R4.1 fix, `apply_diff` skips
    /// clearing those stale chunks before re-embedding → duplicates.
    ///
    /// The test builds that inconsistent state by hand (populated chunks,
    /// embeddings and `file_mapping`, but an empty manifest), runs one
    /// `apply_diff`, and checks that the file's chunk count equals a single
    /// fresh embed and that embeddings rows still match the chunk count.
    ///
    /// Gated `#[ignore]` because `apply_diff` calls `encoder.embed_paths`
    /// for files in `diff.new`, which requires the Model2Vec weights.
    /// Run once model is cached:
    /// `cargo test -p ripvec-core apply_diff_idempotent -- --ignored`
    #[test]
    #[ignore = "requires Model2Vec download (~32 MB on first run)"]
    fn apply_diff_idempotent_when_new_file_already_has_chunks() {
        use crate::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
        use crate::profile::Profiler;
        use std::fs;

        // The encoder is shared (via `Arc`) between the direct `embed_paths`
        // call below and the index under test.
        let encoder = StaticEncoder::from_pretrained(DEFAULT_MODEL_REPO).expect("encoder load");
        let encoder_arc = std::sync::Arc::new(encoder);

        // Temporary corpus: one file (file_a.rs).
        let tmp = tempfile::TempDir::new().unwrap();
        let file_a = tmp.path().join("file_a.rs");
        fs::write(
            &file_a,
            "pub fn alpha() -> u32 { 1 }\npub fn beta() -> u32 { 2 }\n",
        )
        .unwrap();

        // Embed file_a.rs once to obtain its canonical chunks/embeddings.
        let (real_chunks, real_embs) = encoder_arc
            .embed_paths(tmp.path(), std::slice::from_ref(&file_a), &Profiler::noop())
            .expect("embed_paths");
        let n_real = real_chunks.len();
        assert!(n_real > 0, "file_a.rs must produce at least one chunk");

        // Flatten the per-chunk rows into one row-major buffer so it can be
        // reshaped into an `(n_real, hidden_dim)` matrix.
        let hidden_dim = real_embs[0].len();
        let mut flat: Vec<f32> = Vec::with_capacity(n_real * hidden_dim);
        for row in &real_embs {
            flat.extend(row);
        }
        let embeddings = ndarray::Array2::from_shape_vec((n_real, hidden_dim), flat).unwrap();

        // file_mapping holds stale indices pointing at file_a.rs chunks.
        let rel_key = "file_a.rs".to_string();
        let indices: Vec<usize> = (0..n_real).collect();
        let file_mapping = HashMap::from([(rel_key, indices)]);

        // Manifest is EMPTY: simulates a prior reconcile whose manifest
        // update failed, so diff_against_filesystem classifies file_a.rs
        // as "new" even though file_mapping still references its chunks.
        let manifest = Manifest::new();

        // The language mapping is passed empty (`HashMap::new()`); only the
        // file mapping and manifest matter for this scenario.
        let index = new_for_test(
            real_chunks,
            embeddings,
            std::sync::Arc::clone(&encoder_arc),
            file_mapping,
            HashMap::new(),
            manifest,
            tmp.path().to_path_buf(),
            WalkOptions::default(),
        );

        // Sanity-check the set-up: the diff must contain file_a.rs as new
        // and nothing else.
        let diff = index.diff_against_filesystem();
        assert!(
            diff.new.iter().any(|p| p.ends_with("file_a.rs")),
            "file_a.rs must appear in diff.new when manifest is empty; got {:?}",
            diff.new
        );
        assert!(diff.dirty.is_empty(), "no dirty expected");
        assert!(diff.deleted.is_empty(), "no deleted expected");

        // With the fix (diff.new also processed in removed_indices), stale
        // chunks are dropped before re-embedding → chunk count equals
        // one fresh-embed pass. Without the fix, old + new chunks both
        // survive → count is doubled.
        let updated = index
            .apply_diff(&diff, &Profiler::noop())
            .expect("apply_diff");

        let file_a_count = updated
            .chunks()
            .iter()
            .filter(|c| c.file_path.ends_with("file_a.rs"))
            .count();

        assert_eq!(
            file_a_count, n_real,
            "file_a.rs chunk count must equal one fresh-embed pass ({n_real}); \
             got {file_a_count} — stale chunks from file_mapping not cleared"
        );
        // Chunks and embedding rows must stay in lock-step after the update.
        assert_eq!(
            updated.embeddings().nrows(),
            updated.chunks().len(),
            "embeddings row count must match chunk count"
        );
    }
1288
    /// Derived: applying an empty diff twice must yield identical chunk
    /// counts — no accumulation from repeated no-op reconciles.
    ///
    /// Builds a real index over a one-file temporary corpus, then runs
    /// `diff_against_filesystem` + `apply_diff` twice in a row without
    /// touching the filesystem, checking after each pass that the diff was
    /// empty and the chunk count is unchanged.
    ///
    /// Gated `#[ignore]` because building a real index requires the
    /// Model2Vec encoder (~32 MB).
    #[test]
    #[ignore = "requires Model2Vec download (~32 MB on first run)"]
    fn apply_diff_no_duplicate_chunks_after_two_passes() {
        use crate::embed::SearchConfig;
        use crate::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
        use crate::profile::Profiler;
        use std::fs;

        // Temporary corpus: a single `main.rs`.
        let tmp = tempfile::TempDir::new().unwrap();
        fs::write(
            tmp.path().join("main.rs"),
            "fn main() { println!(\"hello\"); }\n",
        )
        .unwrap();

        let encoder = StaticEncoder::from_pretrained(DEFAULT_MODEL_REPO).expect("encoder load");
        // Every `SearchConfig` field is spelled out explicitly; no extension
        // filters or ignore patterns are set.
        let cfg = SearchConfig {
            batch_size: 32,
            max_tokens: 512,
            chunk: crate::chunk::ChunkConfig {
                max_chunk_bytes: 4096,
                window_size: 2048,
                window_overlap: 512,
            },
            text_mode: false,
            cascade_dim: None,
            file_type: None,
            exclude_extensions: Vec::new(),
            include_extensions: Vec::new(),
            ignore_patterns: Vec::new(),
            corpus: crate::embed::Scope::All,
            mode: crate::hybrid::SearchMode::Hybrid,
        };
        // NOTE(review): the trailing `None, 0.0` presumably mean "no PageRank
        // lookup" and "alpha 0.0" — confirm against `from_root`'s signature.
        let index = RipvecIndex::from_root(tmp.path(), encoder, &cfg, &Profiler::noop(), None, 0.0)
            .expect("from_root");

        // Baseline that both passes are compared against.
        let original_count = index.chunks().len();

        // Pass 1: nothing changed on disk since the index was built.
        let diff1 = index.diff_against_filesystem();
        assert!(diff1.is_empty(), "fresh index must yield empty diff");
        let pass1 = index
            .apply_diff(&diff1, &Profiler::noop())
            .expect("apply_diff pass 1");
        assert_eq!(
            pass1.chunks().len(),
            original_count,
            "chunk count must be unchanged after empty-diff pass 1"
        );

        // Pass 2: repeat against the result of pass 1.
        let diff2 = pass1.diff_against_filesystem();
        assert!(
            diff2.is_empty(),
            "pass1 against unchanged FS must yield empty diff"
        );
        let pass2 = pass1
            .apply_diff(&diff2, &Profiler::noop())
            .expect("apply_diff pass 2");
        assert_eq!(
            pass2.chunks().len(),
            original_count,
            "chunk count must be unchanged after empty-diff pass 2"
        );
    }
1357}