Skip to main content

ripvec_core/cache/
reindex.rs

1//! Incremental reindex orchestrator.
2//!
3//! Ties together the manifest, object store, diff, and embedding pipeline
4//! to provide a single `incremental_index` function that loads cached
5//! embeddings and only re-embeds changed files.
6
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Counters describing the outcome of one reindex run.
///
/// All fields are plain integers, so the struct is cheap to clone and can be
/// compared directly (useful in tests and when logging deltas between runs).
/// `Default` yields an all-zero value.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ReindexStats {
    /// Total chunks in the final index.
    pub chunks_total: usize,
    /// Chunks that were re-embedded (from dirty files).
    pub chunks_reembedded: usize,
    /// Files unchanged (loaded from cache).
    pub files_unchanged: usize,
    /// Files that were new or modified.
    pub files_changed: usize,
    /// Files removed since last index.
    pub files_deleted: usize,
    /// Wall-clock duration of the reindex, in milliseconds.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55    repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57    let start = Instant::now();
58
59    if backends.is_empty() {
60        return Err(crate::Error::Other(anyhow::anyhow!(
61            "no embedding backends provided"
62        )));
63    }
64
65    // When repo_level is requested, ensure .ripvec/config.toml exists
66    // so that resolve_cache_dir will find it and use the repo-local path.
67    if repo_level {
68        let ripvec_dir = root.join(".ripvec");
69        let config_path = ripvec_dir.join("config.toml");
70        if !config_path.exists() {
71            let config = crate::cache::config::RepoConfig::new(
72                model_repo,
73                crate::cache::manifest::MANIFEST_VERSION.to_string(),
74            );
75            config.save(&ripvec_dir)?;
76        }
77    }
78
79    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
80    let portable = is_repo_local(&cache_dir);
81    let manifest_path = cache_dir.join("manifest.json");
82    let objects_dir = cache_dir.join("objects");
83    let store = ObjectStore::new(&objects_dir);
84
85    // Try loading existing manifest
86    let existing_manifest = Manifest::load(&manifest_path).ok();
87
88    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
89        // Incremental path: diff → re-embed dirty → merge
90        incremental_path(
91            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
92            start, portable,
93        )
94    } else {
95        // Cold path: full embed
96        full_index_path(
97            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
98            portable,
99        )
100    }
101}
102
103/// Incremental reindex: diff, re-embed dirty files, merge with cached.
104#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
105#[expect(
106    clippy::cast_possible_truncation,
107    reason = "duration in ms won't exceed u64"
108)]
109fn incremental_path(
110    root: &Path,
111    backends: &[&dyn EmbedBackend],
112    tokenizer: &tokenizers::Tokenizer,
113    cfg: &SearchConfig,
114    profiler: &Profiler,
115    _model_repo: &str,
116    cache_dir: &Path,
117    store: &ObjectStore,
118    mut manifest: Manifest,
119    start: Instant,
120    portable: bool,
121) -> crate::Result<(HybridIndex, ReindexStats)> {
122    let diff_result = diff::compute_diff(root, &manifest)?;
123
124    let files_changed = diff_result.dirty.len();
125    let files_deleted = diff_result.deleted.len();
126    let files_unchanged = diff_result.unchanged;
127
128    // Remove deleted files from manifest
129    for deleted in &diff_result.deleted {
130        manifest.remove_file(deleted);
131    }
132
133    // Re-embed dirty files
134    let mut new_chunks_count = 0;
135    for dirty_path in &diff_result.dirty {
136        let relative = dirty_path
137            .strip_prefix(root)
138            .unwrap_or(dirty_path)
139            .to_string_lossy()
140            .to_string();
141
142        // Remove old entry if it exists
143        manifest.remove_file(&relative);
144
145        // Chunk this file
146        let Some(source) = crate::embed::read_source(dirty_path) else {
147            continue;
148        };
149
150        let ext = dirty_path
151            .extension()
152            .and_then(|e| e.to_str())
153            .unwrap_or("");
154        let chunks = if cfg.text_mode {
155            crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
156        } else {
157            match crate::languages::config_for_extension(ext) {
158                Some(lang_config) => {
159                    crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
160                }
161                None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
162            }
163        };
164
165        if chunks.is_empty() {
166            continue;
167        }
168
169        // Tokenize
170        let model_max = backends[0].max_tokens();
171        let encodings: Vec<Option<crate::backend::Encoding>> = chunks
172            .iter()
173            .map(|chunk| {
174                crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
175            })
176            .collect();
177
178        // Embed
179        let embeddings =
180            crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
181
182        // Filter out failed tokenizations
183        let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
184            .into_iter()
185            .zip(embeddings.into_iter())
186            .filter(|(_, emb)| !emb.is_empty())
187            .unzip();
188
189        let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
190
191        // Save to object store
192        let content_hash = diff::hash_file(dirty_path)?;
193        let file_cache = FileCache {
194            chunks: good_chunks.clone(),
195            embeddings: good_embeddings.iter().flatten().copied().collect(),
196            hidden_dim,
197        };
198        let bytes = if portable {
199            file_cache.to_portable_bytes()
200        } else {
201            file_cache.to_bytes()
202        };
203        store.write(&content_hash, &bytes)?;
204
205        // Update manifest
206        let mtime = diff::mtime_secs(dirty_path);
207        let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
208        manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
209        new_chunks_count += good_chunks.len();
210    }
211
212    // Heal stale mtimes (e.g., after git clone where all mtimes are wrong
213    // but content hashes match). This ensures the fast-path mtime check
214    // works on subsequent runs.
215    heal_manifest_mtimes(root, &mut manifest);
216
217    // Recompute Merkle hashes
218    manifest.recompute_hashes();
219
220    // GC unreferenced objects
221    let referenced = manifest.referenced_hashes();
222    store.gc(&referenced)?;
223
224    // Save manifest
225    manifest.save(&cache_dir.join("manifest.json"))?;
226
227    // Rebuild HybridIndex (semantic + BM25) from all cached objects
228    let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
229    let chunks_total = all_chunks.len();
230    let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
231
232    Ok((
233        hybrid,
234        ReindexStats {
235            chunks_total,
236            chunks_reembedded: new_chunks_count,
237            files_unchanged,
238            files_changed,
239            files_deleted,
240            duration_ms: start.elapsed().as_millis() as u64,
241        },
242    ))
243}
244
245/// Full index from scratch: embed everything, save to cache.
246#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
247#[expect(
248    clippy::cast_possible_truncation,
249    reason = "duration in ms won't exceed u64"
250)]
251fn full_index_path(
252    root: &Path,
253    backends: &[&dyn EmbedBackend],
254    tokenizer: &tokenizers::Tokenizer,
255    cfg: &SearchConfig,
256    profiler: &Profiler,
257    model_repo: &str,
258    cache_dir: &Path,
259    store: &ObjectStore,
260    start: Instant,
261    portable: bool,
262) -> crate::Result<(HybridIndex, ReindexStats)> {
263    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
264
265    let hidden_dim = embeddings.first().map_or(384, Vec::len);
266
267    // Group chunks and embeddings by file, save to store
268    let mut manifest = Manifest::new(model_repo);
269    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
270        std::collections::BTreeMap::new();
271
272    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
273        file_groups
274            .entry(chunk.file_path.clone())
275            .or_default()
276            .0
277            .push(chunk.clone());
278        file_groups
279            .entry(chunk.file_path.clone())
280            .or_default()
281            .1
282            .push(emb.clone());
283    }
284
285    for (file_path, (file_chunks, file_embeddings)) in &file_groups {
286        // file_path from CodeChunk is already an absolute or cwd-relative path
287        let file_path_buf = PathBuf::from(file_path);
288
289        let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
290            // File might not exist (e.g., generated content) — use chunk content hash
291            blake3::hash(file_chunks[0].content.as_bytes())
292                .to_hex()
293                .to_string()
294        });
295
296        let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
297        let fc = FileCache {
298            chunks: file_chunks.clone(),
299            embeddings: flat_emb,
300            hidden_dim,
301        };
302        let bytes = if portable {
303            fc.to_portable_bytes()
304        } else {
305            fc.to_bytes()
306        };
307        store.write(&content_hash, &bytes)?;
308
309        let relative = file_path_buf
310            .strip_prefix(root)
311            .unwrap_or(&file_path_buf)
312            .to_string_lossy()
313            .to_string();
314        let mtime = diff::mtime_secs(&file_path_buf);
315        let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
316        manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
317    }
318
319    manifest.recompute_hashes();
320    manifest.save(&cache_dir.join("manifest.json"))?;
321
322    let chunks_total = chunks.len();
323    let files_changed = file_groups.len();
324    let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
325
326    Ok((
327        hybrid,
328        ReindexStats {
329            chunks_total,
330            chunks_reembedded: chunks_total,
331            files_unchanged: 0,
332            files_changed,
333            files_deleted: 0,
334            duration_ms: start.elapsed().as_millis() as u64,
335        },
336    ))
337}
338
/// Report whether `cache_dir` contains a path component named exactly
/// `.ripvec`.
///
/// Whole components are compared, so a directory such as `.ripvec2` or
/// `my.ripvec` does not count.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    cache_dir.iter().any(|part| part == ".ripvec")
}
344
345/// Update manifest file mtimes to match the current filesystem.
346///
347/// After a git clone, all file mtimes are set to clone time, making the
348/// fast-path mtime check miss on every file. This function updates the
349/// manifest mtimes so subsequent diffs use the fast path.
350pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
351    for (relative, entry) in &mut manifest.files {
352        let file_path = root.join(relative);
353        let mtime = diff::mtime_secs(&file_path);
354        if mtime != entry.mtime_secs {
355            entry.mtime_secs = mtime;
356        }
357    }
358}
359
360/// Load a `FileCache` from bytes, auto-detecting the format.
361/// Checks for bitcode magic first (portable), then falls back to rkyv.
362fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
363    if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
364        FileCache::from_portable_bytes(bytes)
365    } else {
366        FileCache::from_bytes(bytes)
367    }
368}
369
370/// Load all cached chunks and embeddings from the object store.
371fn load_all_from_store(
372    store: &ObjectStore,
373    manifest: &Manifest,
374) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
375    let mut all_chunks = Vec::new();
376    let mut all_embeddings = Vec::new();
377
378    for entry in manifest.files.values() {
379        let bytes = store.read(&entry.content_hash)?;
380        let fc = load_file_cache(&bytes)?;
381        let dim = fc.hidden_dim;
382
383        for (i, chunk) in fc.chunks.into_iter().enumerate() {
384            let start = i * dim;
385            let end = start + dim;
386            if end <= fc.embeddings.len() {
387                all_embeddings.push(fc.embeddings[start..end].to_vec());
388                all_chunks.push(chunk);
389            }
390        }
391    }
392
393    Ok((all_chunks, all_embeddings))
394}
395
396/// Load a pre-built index from the disk cache without re-embedding.
397///
398/// This is the lightweight read path for processes that don't own the index
399/// (e.g., the LSP process reading caches built by the MCP process).
400/// Returns `None` if no compatible cache exists for this root.
401///
402/// Uses an advisory file lock on `manifest.lock` to avoid reading
403/// a half-written cache.
404#[must_use]
405pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
406    let cache_dir = resolve_cache_dir(root, model_repo, None);
407    let manifest_path = cache_dir.join("manifest.json");
408    let objects_dir = cache_dir.join("objects");
409    let lock_path = cache_dir.join("manifest.lock");
410
411    // Ensure cache dir exists (it might not if no index has been built)
412    if !manifest_path.exists() {
413        return None;
414    }
415
416    // Acquire a shared (read) lock — blocks if a writer holds the exclusive lock
417    let lock_file = std::fs::OpenOptions::new()
418        .create(true)
419        .truncate(false)
420        .write(true)
421        .read(true)
422        .open(&lock_path)
423        .ok()?;
424    let lock = fd_lock::RwLock::new(lock_file);
425    let _guard = lock.read().ok()?;
426
427    let manifest = Manifest::load(&manifest_path).ok()?;
428    if !manifest.is_compatible(model_repo) {
429        return None;
430    }
431
432    let store = ObjectStore::new(&objects_dir);
433    let (chunks, embeddings) = load_all_from_store(&store, &manifest).ok()?;
434    HybridIndex::new(chunks, &embeddings, None).ok()
435}
436
437/// Resolve the cache directory for a project + model combination.
438///
439/// Resolution priority:
440/// 1. `override_dir` parameter (highest)
441/// 2. `.ripvec/config.toml` in directory tree (repo-local)
442/// 3. `RIPVEC_CACHE` environment variable
443/// 4. XDG cache dir (`~/.cache/ripvec/`)
444///
445/// For repo-local, the cache lives at `.ripvec/cache/` directly (no project hash
446/// or version subdirectory — the config.toml pins the model and version).
447///
448/// For user-level cache, layout is `<base>/<project_hash>/v<VERSION>-<model_slug>/`.
449#[must_use]
450pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
451    // Priority 1: explicit override
452    if let Some(dir) = override_dir {
453        let project_hash = hash_project_root(root);
454        let version_dir = format_version_dir(model_repo);
455        return dir.join(&project_hash).join(version_dir);
456    }
457
458    // Priority 2: repo-local .ripvec/config.toml (with model validation)
459    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
460        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
461    {
462        if config.cache.model == model_repo {
463            return ripvec_dir.join("cache");
464        }
465        eprintln!(
466            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
467            config.cache.model, model_repo
468        );
469    }
470
471    // Priority 3+4: env var or XDG
472    let project_hash = hash_project_root(root);
473    let version_dir = format_version_dir(model_repo);
474
475    let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
476        PathBuf::from(env_dir).join(&project_hash)
477    } else {
478        dirs::cache_dir()
479            .unwrap_or_else(|| PathBuf::from("/tmp"))
480            .join("ripvec")
481            .join(&project_hash)
482    };
483
484    base.join(version_dir)
485}
486
487/// Blake3 hash of the canonical project root path.
488fn hash_project_root(root: &Path) -> String {
489    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
490    blake3::hash(canonical.to_string_lossy().as_bytes())
491        .to_hex()
492        .to_string()
493}
494
495/// Format the version subdirectory name from model repo.
496fn format_version_dir(model_repo: &str) -> String {
497    let model_slug = model_repo
498        .rsplit('/')
499        .next()
500        .unwrap_or(model_repo)
501        .to_lowercase();
502    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
503}
504
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A manifest entry with a bogus mtime is corrected to the on-disk mtime.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let dir = TempDir::new().unwrap();
        let content = "fn main() {}";
        let file_path = dir.path().join("test.rs");
        std::fs::write(&file_path, content).unwrap();

        // Manifest entry: correct content hash, deliberately wrong mtime.
        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            9_999_999,
            content.len() as u64,
            &content_hash,
            1,
        );

        heal_manifest_mtimes(dir.path(), &mut manifest);

        // The healed entry must now agree with the filesystem.
        let on_disk = diff::mtime_secs(&file_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, on_disk);
    }

    /// A matching `.ripvec/config.toml` selects the repo-local cache.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let dir = TempDir::new().unwrap();
        let repo_cfg = crate::cache::config::RepoConfig::new("nomic-ai/modernbert-embed-base", "3");
        repo_cfg.save(&dir.path().join(".ripvec")).unwrap();

        let resolved = resolve_cache_dir(dir.path(), "nomic-ai/modernbert-embed-base", None);
        let expected_prefix = dir.path().join(".ripvec").join("cache");
        assert!(
            resolved.starts_with(expected_prefix),
            "expected repo-local cache dir, got: {resolved:?}"
        );
    }

    /// Without a repo config the resolved path must not mention `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let dir = TempDir::new().unwrap();
        let resolved = resolve_cache_dir(dir.path(), "nomic-ai/modernbert-embed-base", None);
        let rendered = resolved.to_string_lossy();
        assert!(
            !rendered.contains(".ripvec"),
            "should not use repo-local without config, got: {resolved:?}"
        );
    }

    /// An explicit override directory beats an existing repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let dir = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();

        let repo_cfg = crate::cache::config::RepoConfig::new("nomic-ai/modernbert-embed-base", "3");
        repo_cfg.save(&dir.path().join(".ripvec")).unwrap();

        let resolved = resolve_cache_dir(
            dir.path(),
            "nomic-ai/modernbert-embed-base",
            Some(override_dir.path()),
        );
        let repo_local_root = dir.path().join(".ripvec");
        assert!(
            !resolved.starts_with(repo_local_root),
            "override should win over repo-local, got: {resolved:?}"
        );
    }
}