Skip to main content

ripvec_core/cache/
reindex.rs

1//! Incremental reindex orchestrator.
2//!
3//! Ties together the manifest, object store, diff, and embedding pipeline
4//! to provide a single `incremental_index` function that loads cached
5//! embeddings and only re-embeds changed files.
6
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Summary counters describing what a single reindex run did.
#[derive(Debug)]
pub struct ReindexStats {
    /// Number of chunks contained in the index once the run finished.
    pub chunks_total: usize,
    /// Number of chunks embedded during this run (those coming from dirty files).
    pub chunks_reembedded: usize,
    /// Number of files whose cached data was reused untouched.
    pub files_unchanged: usize,
    /// Number of files that were added or modified since the previous index.
    pub files_changed: usize,
    /// Number of files that disappeared since the previous index.
    pub files_deleted: usize,
    /// Elapsed wall-clock time of the run, in milliseconds.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55    repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57    let start = Instant::now();
58
59    if backends.is_empty() {
60        return Err(crate::Error::Other(anyhow::anyhow!(
61            "no embedding backends provided"
62        )));
63    }
64
65    // When repo_level is requested, ensure .ripvec/config.toml exists
66    // so that resolve_cache_dir will find it and use the repo-local path.
67    if repo_level {
68        let ripvec_dir = root.join(".ripvec");
69        let config_path = ripvec_dir.join("config.toml");
70        if !config_path.exists() {
71            let config = crate::cache::config::RepoConfig::new(
72                model_repo,
73                crate::cache::manifest::MANIFEST_VERSION.to_string(),
74            );
75            config.save(&ripvec_dir)?;
76        }
77        // Prevent merge conflicts: always keep "ours" for manifest.json.
78        // After merge, next `ripvec --index` reconciles from the filesystem.
79        let gitattributes_path = ripvec_dir.join(".gitattributes");
80        if !gitattributes_path.exists() {
81            let _ = std::fs::write(
82                &gitattributes_path,
83                "cache/manifest.json merge=ours\ncache/objects/** binary\n",
84            );
85        }
86    }
87
88    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
89    let portable = is_repo_local(&cache_dir);
90    let manifest_path = cache_dir.join("manifest.json");
91    let objects_dir = cache_dir.join("objects");
92    let store = ObjectStore::new(&objects_dir);
93
94    // Try loading existing manifest
95    let existing_manifest = Manifest::load(&manifest_path).ok();
96
97    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
98        // Incremental path: diff → re-embed dirty → merge
99        incremental_path(
100            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
101            start, portable,
102        )
103    } else {
104        // Cold path: full embed
105        full_index_path(
106            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
107            portable,
108        )
109    }
110}
111
112/// Incremental reindex: diff, re-embed dirty files, merge with cached.
113#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
114#[expect(
115    clippy::cast_possible_truncation,
116    reason = "duration in ms won't exceed u64"
117)]
118fn incremental_path(
119    root: &Path,
120    backends: &[&dyn EmbedBackend],
121    tokenizer: &tokenizers::Tokenizer,
122    cfg: &SearchConfig,
123    profiler: &Profiler,
124    _model_repo: &str,
125    cache_dir: &Path,
126    store: &ObjectStore,
127    mut manifest: Manifest,
128    start: Instant,
129    portable: bool,
130) -> crate::Result<(HybridIndex, ReindexStats)> {
131    let diff_result = diff::compute_diff(root, &manifest)?;
132
133    let files_changed = diff_result.dirty.len();
134    let files_deleted = diff_result.deleted.len();
135    let files_unchanged = diff_result.unchanged;
136
137    // Remove deleted files from manifest
138    for deleted in &diff_result.deleted {
139        manifest.remove_file(deleted);
140    }
141
142    // Re-embed dirty files
143    let mut new_chunks_count = 0;
144    for dirty_path in &diff_result.dirty {
145        let relative = dirty_path
146            .strip_prefix(root)
147            .unwrap_or(dirty_path)
148            .to_string_lossy()
149            .to_string();
150
151        // Remove old entry if it exists
152        manifest.remove_file(&relative);
153
154        // Chunk this file
155        let Some(source) = crate::embed::read_source(dirty_path) else {
156            continue;
157        };
158
159        let ext = dirty_path
160            .extension()
161            .and_then(|e| e.to_str())
162            .unwrap_or("");
163        let chunks = if cfg.text_mode {
164            crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
165        } else {
166            match crate::languages::config_for_extension(ext) {
167                Some(lang_config) => {
168                    crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
169                }
170                None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
171            }
172        };
173
174        if chunks.is_empty() {
175            continue;
176        }
177
178        // Tokenize
179        let model_max = backends[0].max_tokens();
180        let encodings: Vec<Option<crate::backend::Encoding>> = chunks
181            .iter()
182            .map(|chunk| {
183                crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
184            })
185            .collect();
186
187        // Embed
188        let embeddings =
189            crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
190
191        // Filter out failed tokenizations
192        let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
193            .into_iter()
194            .zip(embeddings.into_iter())
195            .filter(|(_, emb)| !emb.is_empty())
196            .unzip();
197
198        let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
199
200        // Save to object store
201        let content_hash = diff::hash_file(dirty_path)?;
202        let file_cache = FileCache {
203            chunks: good_chunks.clone(),
204            embeddings: good_embeddings.iter().flatten().copied().collect(),
205            hidden_dim,
206        };
207        let bytes = if portable {
208            file_cache.to_portable_bytes()
209        } else {
210            file_cache.to_bytes()
211        };
212        store.write(&content_hash, &bytes)?;
213
214        // Update manifest
215        let mtime = diff::mtime_secs(dirty_path);
216        let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
217        manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
218        new_chunks_count += good_chunks.len();
219    }
220
221    // Heal stale mtimes (e.g., after git clone where all mtimes are wrong
222    // but content hashes match). This ensures the fast-path mtime check
223    // works on subsequent runs.
224    heal_manifest_mtimes(root, &mut manifest);
225
226    // Recompute Merkle hashes
227    manifest.recompute_hashes();
228
229    // GC unreferenced objects
230    let referenced = manifest.referenced_hashes();
231    store.gc(&referenced)?;
232
233    // Save manifest
234    manifest.save(&cache_dir.join("manifest.json"))?;
235
236    // Rebuild HybridIndex (semantic + BM25) from all cached objects
237    let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
238    let chunks_total = all_chunks.len();
239    let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
240
241    Ok((
242        hybrid,
243        ReindexStats {
244            chunks_total,
245            chunks_reembedded: new_chunks_count,
246            files_unchanged,
247            files_changed,
248            files_deleted,
249            duration_ms: start.elapsed().as_millis() as u64,
250        },
251    ))
252}
253
254/// Full index from scratch: embed everything, save to cache.
255#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
256#[expect(
257    clippy::cast_possible_truncation,
258    reason = "duration in ms won't exceed u64"
259)]
260fn full_index_path(
261    root: &Path,
262    backends: &[&dyn EmbedBackend],
263    tokenizer: &tokenizers::Tokenizer,
264    cfg: &SearchConfig,
265    profiler: &Profiler,
266    model_repo: &str,
267    cache_dir: &Path,
268    store: &ObjectStore,
269    start: Instant,
270    portable: bool,
271) -> crate::Result<(HybridIndex, ReindexStats)> {
272    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
273
274    let hidden_dim = embeddings.first().map_or(384, Vec::len);
275
276    // Group chunks and embeddings by file, save to store
277    let mut manifest = Manifest::new(model_repo);
278    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
279        std::collections::BTreeMap::new();
280
281    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
282        file_groups
283            .entry(chunk.file_path.clone())
284            .or_default()
285            .0
286            .push(chunk.clone());
287        file_groups
288            .entry(chunk.file_path.clone())
289            .or_default()
290            .1
291            .push(emb.clone());
292    }
293
294    for (file_path, (file_chunks, file_embeddings)) in &file_groups {
295        // file_path from CodeChunk is already an absolute or cwd-relative path
296        let file_path_buf = PathBuf::from(file_path);
297
298        let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
299            // File might not exist (e.g., generated content) — use chunk content hash
300            blake3::hash(file_chunks[0].content.as_bytes())
301                .to_hex()
302                .to_string()
303        });
304
305        let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
306        let fc = FileCache {
307            chunks: file_chunks.clone(),
308            embeddings: flat_emb,
309            hidden_dim,
310        };
311        let bytes = if portable {
312            fc.to_portable_bytes()
313        } else {
314            fc.to_bytes()
315        };
316        store.write(&content_hash, &bytes)?;
317
318        let relative = file_path_buf
319            .strip_prefix(root)
320            .unwrap_or(&file_path_buf)
321            .to_string_lossy()
322            .to_string();
323        let mtime = diff::mtime_secs(&file_path_buf);
324        let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
325        manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
326    }
327
328    manifest.recompute_hashes();
329    manifest.save(&cache_dir.join("manifest.json"))?;
330
331    let chunks_total = chunks.len();
332    let files_changed = file_groups.len();
333    let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
334
335    Ok((
336        hybrid,
337        ReindexStats {
338            chunks_total,
339            chunks_reembedded: chunks_total,
340            files_unchanged: 0,
341            files_changed,
342            files_deleted: 0,
343            duration_ms: start.elapsed().as_millis() as u64,
344        },
345    ))
346}
347
/// Report whether `cache_dir` has a path component named exactly `.ripvec`,
/// i.e. whether the resolved cache lives inside a `.ripvec/` directory.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    let marker = std::ffi::OsStr::new(".ripvec");
    cache_dir.iter().any(|segment| segment == marker)
}
353
354/// Update manifest file mtimes to match the current filesystem.
355///
356/// After a git clone, all file mtimes are set to clone time, making the
357/// fast-path mtime check miss on every file. This function updates the
358/// manifest mtimes so subsequent diffs use the fast path.
359pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
360    for (relative, entry) in &mut manifest.files {
361        let file_path = root.join(relative);
362        let mtime = diff::mtime_secs(&file_path);
363        if mtime != entry.mtime_secs {
364            entry.mtime_secs = mtime;
365        }
366    }
367}
368
369/// Check whether `pull.autoStash` needs to be configured for a repo-local cache.
370///
371/// Returns `Some(message)` with a human-readable prompt if the setting has not
372/// been configured yet. Returns `None` if already configured (in git config or
373/// `.ripvec/config.toml`) or if the cache is not repo-local.
374#[must_use]
375pub fn check_auto_stash(root: &Path) -> Option<String> {
376    use std::process::Command;
377
378    let ripvec_dir = root.join(".ripvec");
379    let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
380    if !config.cache.local {
381        return None;
382    }
383
384    // Already decided via config.toml
385    if config.cache.auto_stash.is_some() {
386        return None;
387    }
388
389    // Already set in git config (by user or previous run)
390    let git_check = Command::new("git")
391        .args(["config", "--local", "pull.autoStash"])
392        .current_dir(root)
393        .stdout(std::process::Stdio::piped())
394        .stderr(std::process::Stdio::null())
395        .output()
396        .ok()?;
397    if git_check.status.success() {
398        // Sync the existing git setting into config.toml so we don't check again
399        let val = String::from_utf8_lossy(&git_check.stdout)
400            .trim()
401            .eq_ignore_ascii_case("true");
402        let _ = apply_auto_stash(root, val);
403        return None;
404    }
405
406    Some(
407        "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
408         Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
409            .to_string(),
410    )
411}
412
413/// Apply the user's `auto_stash` choice: set git config and save to `config.toml`.
414///
415/// When `enable` is true, runs `git config --local pull.autoStash true`.
416/// The choice is persisted to `.ripvec/config.toml` so the prompt is not repeated.
417///
418/// # Errors
419///
420/// Returns an error if `config.toml` cannot be read or written.
421pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
422    use std::process::Command;
423
424    let ripvec_dir = root.join(".ripvec");
425    let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
426    config.cache.auto_stash = Some(enable);
427    config.save(&ripvec_dir)?;
428
429    if enable {
430        let _ = Command::new("git")
431            .args(["config", "--local", "pull.autoStash", "true"])
432            .current_dir(root)
433            .stdout(std::process::Stdio::null())
434            .stderr(std::process::Stdio::null())
435            .status();
436    }
437
438    Ok(())
439}
440
441/// Load a `FileCache` from bytes, auto-detecting the format.
442/// Checks for bitcode magic first (portable), then falls back to rkyv.
443fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
444    if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
445        FileCache::from_portable_bytes(bytes)
446    } else {
447        FileCache::from_bytes(bytes)
448    }
449}
450
451/// Load all cached chunks and embeddings from the object store.
452fn load_all_from_store(
453    store: &ObjectStore,
454    manifest: &Manifest,
455) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
456    let mut all_chunks = Vec::new();
457    let mut all_embeddings = Vec::new();
458
459    for entry in manifest.files.values() {
460        let bytes = store.read(&entry.content_hash)?;
461        let fc = load_file_cache(&bytes)?;
462        let dim = fc.hidden_dim;
463
464        for (i, chunk) in fc.chunks.into_iter().enumerate() {
465            let start = i * dim;
466            let end = start + dim;
467            if end <= fc.embeddings.len() {
468                all_embeddings.push(fc.embeddings[start..end].to_vec());
469                all_chunks.push(chunk);
470            }
471        }
472    }
473
474    Ok((all_chunks, all_embeddings))
475}
476
477/// Load a pre-built index from the disk cache without re-embedding.
478///
479/// This is the lightweight read path for processes that don't own the index
480/// (e.g., the LSP process reading caches built by the MCP process).
481/// Returns `None` if no compatible cache exists for this root.
482///
483/// Uses an advisory file lock on `manifest.lock` to avoid reading
484/// a half-written cache.
485#[must_use]
486pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
487    let cache_dir = resolve_cache_dir(root, model_repo, None);
488    let manifest_path = cache_dir.join("manifest.json");
489    let objects_dir = cache_dir.join("objects");
490    let lock_path = cache_dir.join("manifest.lock");
491
492    // Ensure cache dir exists (it might not if no index has been built)
493    if !manifest_path.exists() {
494        return None;
495    }
496
497    // Acquire a shared (read) lock — blocks if a writer holds the exclusive lock
498    let lock_file = std::fs::OpenOptions::new()
499        .create(true)
500        .truncate(false)
501        .write(true)
502        .read(true)
503        .open(&lock_path)
504        .ok()?;
505    let lock = fd_lock::RwLock::new(lock_file);
506    let _guard = lock.read().ok()?;
507
508    let manifest = Manifest::load(&manifest_path).ok()?;
509    if !manifest.is_compatible(model_repo) {
510        return None;
511    }
512
513    let store = ObjectStore::new(&objects_dir);
514    let (chunks, embeddings) = load_all_from_store(&store, &manifest).ok()?;
515    HybridIndex::new(chunks, &embeddings, None).ok()
516}
517
518/// Resolve the cache directory for a project + model combination.
519///
520/// Resolution priority:
521/// 1. `override_dir` parameter (highest)
522/// 2. `.ripvec/config.toml` in directory tree (repo-local)
523/// 3. `RIPVEC_CACHE` environment variable
524/// 4. XDG cache dir (`~/.cache/ripvec/`)
525///
526/// For repo-local, the cache lives at `.ripvec/cache/` directly (no project hash
527/// or version subdirectory — the config.toml pins the model and version).
528///
529/// For user-level cache, layout is `<base>/<project_hash>/v<VERSION>-<model_slug>/`.
530#[must_use]
531pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
532    // Priority 1: explicit override
533    if let Some(dir) = override_dir {
534        let project_hash = hash_project_root(root);
535        let version_dir = format_version_dir(model_repo);
536        return dir.join(&project_hash).join(version_dir);
537    }
538
539    // Priority 2: repo-local .ripvec/config.toml (with model validation)
540    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
541        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
542    {
543        if config.cache.model == model_repo {
544            return ripvec_dir.join("cache");
545        }
546        eprintln!(
547            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
548            config.cache.model, model_repo
549        );
550    }
551
552    // Priority 3+4: env var or XDG
553    let project_hash = hash_project_root(root);
554    let version_dir = format_version_dir(model_repo);
555
556    let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
557        PathBuf::from(env_dir).join(&project_hash)
558    } else {
559        dirs::cache_dir()
560            .unwrap_or_else(|| PathBuf::from("/tmp"))
561            .join("ripvec")
562            .join(&project_hash)
563    };
564
565    base.join(version_dir)
566}
567
568/// Blake3 hash of the canonical project root path.
569fn hash_project_root(root: &Path) -> String {
570    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
571    blake3::hash(canonical.to_string_lossy().as_bytes())
572        .to_hex()
573        .to_string()
574}
575
576/// Format the version subdirectory name from model repo.
577fn format_version_dir(model_repo: &str) -> String {
578    let model_slug = model_repo
579        .rsplit('/')
580        .next()
581        .unwrap_or(model_repo)
582        .to_lowercase();
583    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
584}
585
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model identifier shared by the cache-resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// Write a repo-local `.ripvec/config.toml` for [`MODEL`] under `root`.
    fn write_repo_config(root: &std::path::Path) {
        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&root.join(".ripvec"))
            .unwrap();
    }

    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.rs");
        let content = "fn main() {}";
        std::fs::write(&file_path, content).unwrap();

        // The manifest entry has the right content hash but a bogus mtime.
        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            9_999_999, // deliberately wrong mtime
            content.len() as u64,
            &content_hash,
            1,
        );

        // Healing must replace the bogus mtime with the one on disk.
        heal_manifest_mtimes(dir.path(), &mut manifest);
        let on_disk = diff::mtime_secs(&file_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, on_disk);
    }

    #[test]
    fn resolve_uses_repo_local_when_present() {
        let dir = TempDir::new().unwrap();
        write_repo_config(dir.path());

        let result = resolve_cache_dir(dir.path(), MODEL, None);
        let expected_prefix = dir.path().join(".ripvec").join("cache");
        assert!(
            result.starts_with(expected_prefix),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let dir = TempDir::new().unwrap();

        let result = resolve_cache_dir(dir.path(), MODEL, None);
        assert!(
            !result.to_string_lossy().contains(".ripvec"),
            "should not use repo-local without config, got: {result:?}"
        );
    }

    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let dir = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        write_repo_config(dir.path());

        let result = resolve_cache_dir(dir.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(dir.path().join(".ripvec")),
            "override should win over repo-local, got: {result:?}"
        );
    }
}