Skip to main content

ripvec_core/cache/
reindex.rs

1//! Incremental reindex orchestrator.
2//!
3//! Ties together the manifest, object store, diff, and embedding pipeline
4//! to provide a single `incremental_index` function that loads cached
5//! embeddings and only re-embeds changed files.
6
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Statistics from an incremental reindex operation.
///
/// Every field is a plain counter, so the type is cheap to copy, compare,
/// and zero-initialise via `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct ReindexStats {
    /// Total chunks in the final index.
    pub chunks_total: usize,
    /// Chunks that were re-embedded (from dirty files). On a full rebuild
    /// this equals `chunks_total`.
    pub chunks_reembedded: usize,
    /// Files unchanged (loaded from cache).
    pub files_unchanged: usize,
    /// Files that were new or modified.
    pub files_changed: usize,
    /// Files removed since last index.
    pub files_deleted: usize,
    /// Wall-clock duration of the reindex, in milliseconds.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55    repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57    let start = Instant::now();
58    tracing::info!(root = %root.display(), model = model_repo, "incremental_index starting");
59
60    if backends.is_empty() {
61        return Err(crate::Error::Other(anyhow::anyhow!(
62            "no embedding backends provided"
63        )));
64    }
65
66    {
67        let guard = profiler.phase("cache_prepare");
68        // When repo_level is requested, ensure .ripvec/config.toml exists
69        // so that resolve_cache_dir will find it and use the repo-local path.
70        if repo_level {
71            let ripvec_dir = root.join(".ripvec");
72            let config_path = ripvec_dir.join("config.toml");
73            if !config_path.exists() {
74                let config = crate::cache::config::RepoConfig::new(
75                    model_repo,
76                    crate::cache::manifest::MANIFEST_VERSION.to_string(),
77                );
78                config.save(&ripvec_dir)?;
79            }
80            // Gitignore the manifest — it's rebuilt from objects on first use.
81            // Objects are content-addressed and never cause merge conflicts.
82            let gitignore_path = ripvec_dir.join(".gitignore");
83            if !gitignore_path.exists() {
84                let _ = std::fs::write(&gitignore_path, "cache/manifest.json\n");
85            }
86        }
87        guard.set_detail(format!("repo_level={repo_level}"));
88    }
89
90    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
91    let portable = is_repo_local(&cache_dir);
92    let manifest_path = cache_dir.join("manifest.json");
93    let objects_dir = cache_dir.join("objects");
94    let store = ObjectStore::new(&objects_dir);
95
96    tracing::info!(
97        cache_dir = %cache_dir.display(),
98        portable,
99        manifest = %manifest_path.display(),
100        "cache resolved"
101    );
102
103    // Try loading existing manifest, or rebuild from objects if missing.
104    let existing_manifest = {
105        let guard = profiler.phase("cache_manifest");
106        let manifest = Manifest::load(&manifest_path)
107            .ok()
108            .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
109        guard.set_detail(match &manifest {
110            Some(m) => format!("{} files", m.files.len()),
111            None => "none".to_string(),
112        });
113        manifest
114    };
115
116    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
117        tracing::info!(
118            files = manifest.files.len(),
119            "manifest loaded, running incremental diff"
120        );
121        // Incremental path: diff → re-embed dirty → merge
122        incremental_path(
123            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
124            start, portable,
125        )
126    } else {
127        // Cold path: full embed
128        full_index_path(
129            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
130            portable,
131        )
132    }
133}
134
135/// Incremental reindex: diff, re-embed dirty files, merge with cached.
136#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
137#[expect(
138    clippy::too_many_lines,
139    reason = "incremental cache pipeline orchestration with diagnostic phase boundaries"
140)]
141#[expect(
142    clippy::cast_possible_truncation,
143    reason = "duration in ms won't exceed u64"
144)]
145fn incremental_path(
146    root: &Path,
147    backends: &[&dyn EmbedBackend],
148    tokenizer: &tokenizers::Tokenizer,
149    cfg: &SearchConfig,
150    profiler: &Profiler,
151    _model_repo: &str,
152    cache_dir: &Path,
153    store: &ObjectStore,
154    mut manifest: Manifest,
155    start: Instant,
156    portable: bool,
157) -> crate::Result<(HybridIndex, ReindexStats)> {
158    let diff_result = {
159        let guard = profiler.phase("cache_diff");
160        let diff_result = diff::compute_diff(root, &manifest)?;
161        guard.set_detail(format!(
162            "{} changed, {} deleted, {} unchanged",
163            diff_result.dirty.len(),
164            diff_result.deleted.len(),
165            diff_result.unchanged,
166        ));
167        diff_result
168    };
169
170    let files_changed = diff_result.dirty.len();
171    let files_deleted = diff_result.deleted.len();
172    let files_unchanged = diff_result.unchanged;
173
174    tracing::info!(
175        changed = files_changed,
176        deleted = files_deleted,
177        unchanged = files_unchanged,
178        "diff complete"
179    );
180
181    // Remove deleted files from manifest
182    for deleted in &diff_result.deleted {
183        manifest.remove_file(deleted);
184    }
185
186    // Re-embed dirty files
187    let mut new_chunks_count = 0;
188    {
189        let guard = profiler.phase("reembed_dirty_files");
190        tracing::info!(files = files_changed, "re-embedding changed files");
191        for dirty_path in &diff_result.dirty {
192            let relative = dirty_path
193                .strip_prefix(root)
194                .unwrap_or(dirty_path)
195                .to_string_lossy()
196                .to_string();
197
198            // Remove old entry if it exists
199            manifest.remove_file(&relative);
200
201            // Chunk this file
202            let Some(source) = crate::embed::read_source(dirty_path) else {
203                continue;
204            };
205
206            let ext = dirty_path
207                .extension()
208                .and_then(|e| e.to_str())
209                .unwrap_or("");
210            let chunks = if cfg.text_mode {
211                crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
212            } else {
213                match crate::languages::config_for_extension(ext) {
214                    Some(lang_config) => {
215                        crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
216                    }
217                    None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
218                }
219            };
220
221            if chunks.is_empty() {
222                tracing::debug!(file = %relative, "dirty file produced no chunks");
223                continue;
224            }
225            tracing::debug!(file = %relative, chunks = chunks.len(), "embedding dirty file");
226
227            // Tokenize
228            let model_max = backends[0].max_tokens();
229            let encodings: Vec<Option<crate::backend::Encoding>> = chunks
230                .iter()
231                .map(|chunk| {
232                    crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max)
233                        .ok()
234                })
235                .collect();
236
237            // Embed
238            let embeddings =
239                crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
240
241            // Filter out failed tokenizations
242            let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
243                .into_iter()
244                .zip(embeddings)
245                .filter(|(_, emb)| !emb.is_empty())
246                .unzip();
247
248            let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
249
250            // Save to object store
251            let content_hash = diff::hash_file(dirty_path)?;
252            let file_cache = FileCache {
253                chunks: good_chunks.clone(),
254                embeddings: good_embeddings.iter().flatten().copied().collect(),
255                hidden_dim,
256            };
257            let bytes = if portable {
258                file_cache.to_portable_bytes()
259            } else {
260                file_cache.to_bytes()
261            };
262            store.write(&content_hash, &bytes)?;
263
264            // Update manifest
265            let mtime = diff::mtime_secs(dirty_path);
266            let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
267            manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
268            new_chunks_count += good_chunks.len();
269        }
270        guard.set_detail(format!("{files_changed} files, {new_chunks_count} chunks"));
271    }
272
273    // Heal stale mtimes (e.g., after git clone where all mtimes are wrong
274    // but content hashes match). This ensures the fast-path mtime check
275    // works on subsequent runs.
276    heal_manifest_mtimes(root, &mut manifest);
277
278    // Recompute Merkle hashes
279    manifest.recompute_hashes();
280
281    // Rebuild HybridIndex (semantic + BM25) from all cached objects.
282    // This prunes any manifest entries whose objects are missing/corrupt.
283    tracing::info!("loading cached objects from store");
284    let (all_chunks, all_embeddings) = {
285        let guard = profiler.phase("cache_load_objects");
286        let result = load_all_from_store(store, &mut manifest);
287        guard.set_detail(format!("{} chunks", result.0.len()));
288        result
289    };
290
291    // GC unreferenced objects (after pruning so dangling hashes are dropped)
292    {
293        let guard = profiler.phase("cache_gc");
294        let referenced = manifest.referenced_hashes();
295        store.gc(&referenced)?;
296        guard.set_detail(format!("{} referenced objects", referenced.len()));
297    }
298
299    // Save manifest (after pruning so the on-disk manifest is clean)
300    {
301        let guard = profiler.phase("cache_manifest_save");
302        manifest.save(&cache_dir.join("manifest.json"))?;
303        guard.set_detail(format!("{} files", manifest.files.len()));
304    }
305    let chunks_total = all_chunks.len();
306    tracing::info!(
307        chunks = chunks_total,
308        "building HybridIndex (BM25 + PolarQuant)"
309    );
310    let hybrid = {
311        let guard = profiler.phase("build_hybrid_index");
312        let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
313        guard.set_detail(format!("{chunks_total} chunks"));
314        hybrid
315    };
316    tracing::info!("HybridIndex ready");
317
318    Ok((
319        hybrid,
320        ReindexStats {
321            chunks_total,
322            chunks_reembedded: new_chunks_count,
323            files_unchanged,
324            files_changed,
325            files_deleted,
326            duration_ms: start.elapsed().as_millis() as u64,
327        },
328    ))
329}
330
331/// Full index from scratch: embed everything, save to cache.
332#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
333#[expect(
334    clippy::cast_possible_truncation,
335    reason = "duration in ms won't exceed u64"
336)]
337fn full_index_path(
338    root: &Path,
339    backends: &[&dyn EmbedBackend],
340    tokenizer: &tokenizers::Tokenizer,
341    cfg: &SearchConfig,
342    profiler: &Profiler,
343    model_repo: &str,
344    cache_dir: &Path,
345    store: &ObjectStore,
346    start: Instant,
347    portable: bool,
348) -> crate::Result<(HybridIndex, ReindexStats)> {
349    tracing::info!("no compatible manifest; building full index from source");
350    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
351
352    let hidden_dim = embeddings.first().map_or(384, Vec::len);
353
354    // Group chunks and embeddings by file, save to store
355    let mut manifest = Manifest::new(model_repo);
356    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
357        std::collections::BTreeMap::new();
358
359    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
360        file_groups
361            .entry(chunk.file_path.clone())
362            .or_default()
363            .0
364            .push(chunk.clone());
365        file_groups
366            .entry(chunk.file_path.clone())
367            .or_default()
368            .1
369            .push(emb.clone());
370    }
371
372    {
373        let guard = profiler.phase("cache_write_objects");
374        for (file_path, (file_chunks, file_embeddings)) in &file_groups {
375            // file_path from CodeChunk is already an absolute or cwd-relative path
376            let file_path_buf = PathBuf::from(file_path);
377
378            let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
379                // File might not exist (e.g., generated content) — use chunk content hash
380                blake3::hash(file_chunks[0].content.as_bytes())
381                    .to_hex()
382                    .to_string()
383            });
384
385            let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
386            let fc = FileCache {
387                chunks: file_chunks.clone(),
388                embeddings: flat_emb,
389                hidden_dim,
390            };
391            let bytes = if portable {
392                fc.to_portable_bytes()
393            } else {
394                fc.to_bytes()
395            };
396            store.write(&content_hash, &bytes)?;
397
398            let relative = file_path_buf
399                .strip_prefix(root)
400                .unwrap_or(&file_path_buf)
401                .to_string_lossy()
402                .to_string();
403            let mtime = diff::mtime_secs(&file_path_buf);
404            let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
405            manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
406        }
407        guard.set_detail(format!("{} files", file_groups.len()));
408    }
409
410    {
411        let guard = profiler.phase("cache_manifest_save");
412        manifest.recompute_hashes();
413        manifest.save(&cache_dir.join("manifest.json"))?;
414        guard.set_detail(format!("{} files", manifest.files.len()));
415    }
416
417    let chunks_total = chunks.len();
418    let files_changed = file_groups.len();
419    let hybrid = {
420        let guard = profiler.phase("build_hybrid_index");
421        let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
422        guard.set_detail(format!("{chunks_total} chunks"));
423        hybrid
424    };
425
426    Ok((
427        hybrid,
428        ReindexStats {
429            chunks_total,
430            chunks_reembedded: chunks_total,
431            files_unchanged: 0,
432            files_changed,
433            files_deleted: 0,
434            duration_ms: start.elapsed().as_millis() as u64,
435        },
436    ))
437}
438
/// Report whether `cache_dir` sits beneath a `.ripvec/` directory.
///
/// True when any single component of the path is exactly `.ripvec`.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    for part in cache_dir {
        if part == ".ripvec" {
            return true;
        }
    }
    false
}
444
445/// Update manifest file mtimes to match the current filesystem.
446///
447/// After a git clone, all file mtimes are set to clone time, making the
448/// fast-path mtime check miss on every file. This function updates the
449/// manifest mtimes so subsequent diffs use the fast path.
450pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
451    for (relative, entry) in &mut manifest.files {
452        let file_path = root.join(relative);
453        let mtime = diff::mtime_secs(&file_path);
454        if mtime != entry.mtime_secs {
455            entry.mtime_secs = mtime;
456        }
457    }
458}
459
460/// Check whether `pull.autoStash` needs to be configured for a repo-local cache.
461///
462/// Returns `Some(message)` with a human-readable prompt if the setting has not
463/// been configured yet. Returns `None` if already configured (in git config or
464/// `.ripvec/config.toml`) or if the cache is not repo-local.
465#[must_use]
466pub fn check_auto_stash(root: &Path) -> Option<String> {
467    use std::process::Command;
468
469    let ripvec_dir = root.join(".ripvec");
470    let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
471    if !config.cache.local {
472        return None;
473    }
474
475    // Already decided via config.toml
476    if config.cache.auto_stash.is_some() {
477        return None;
478    }
479
480    // Already set in git config (by user or previous run)
481    let git_check = Command::new("git")
482        .args(["config", "--local", "pull.autoStash"])
483        .current_dir(root)
484        .stdout(std::process::Stdio::piped())
485        .stderr(std::process::Stdio::null())
486        .output()
487        .ok()?;
488    if git_check.status.success() {
489        // Sync the existing git setting into config.toml so we don't check again
490        let val = String::from_utf8_lossy(&git_check.stdout)
491            .trim()
492            .eq_ignore_ascii_case("true");
493        let _ = apply_auto_stash(root, val);
494        return None;
495    }
496
497    Some(
498        "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
499         Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
500            .to_string(),
501    )
502}
503
504/// Apply the user's `auto_stash` choice: set git config and save to `config.toml`.
505///
506/// When `enable` is true, runs `git config --local pull.autoStash true`.
507/// The choice is persisted to `.ripvec/config.toml` so the prompt is not repeated.
508///
509/// # Errors
510///
511/// Returns an error if `config.toml` cannot be read or written.
512pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
513    use std::process::Command;
514
515    let ripvec_dir = root.join(".ripvec");
516    let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
517    config.cache.auto_stash = Some(enable);
518    config.save(&ripvec_dir)?;
519
520    if enable {
521        let _ = Command::new("git")
522            .args(["config", "--local", "pull.autoStash", "true"])
523            .current_dir(root)
524            .stdout(std::process::Stdio::null())
525            .stderr(std::process::Stdio::null())
526            .status();
527    }
528
529    Ok(())
530}
531
532/// Load a `FileCache` from bytes, auto-detecting the format.
533/// Checks for bitcode magic first (portable), then falls back to rkyv.
534fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
535    if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
536        FileCache::from_portable_bytes(bytes)
537    } else {
538        FileCache::from_bytes(bytes)
539    }
540}
541
542/// Load all cached chunks and embeddings from the object store.
543///
544/// Skips any manifest entry whose object is missing or corrupt, and prunes
545/// those entries from the manifest in place. This makes incremental indexing
546/// self-healing: an interrupted previous run or manually deleted cache file
547/// is treated as "file needs re-embedding" rather than a fatal error.
548fn load_all_from_store(
549    store: &ObjectStore,
550    manifest: &mut Manifest,
551) -> (Vec<CodeChunk>, Vec<Vec<f32>>) {
552    let mut all_chunks = Vec::new();
553    let mut all_embeddings = Vec::new();
554    let mut dangling: Vec<String> = Vec::new();
555
556    let total = manifest.files.len();
557    tracing::info!(objects = total, "reading cached objects");
558    for (idx, (path, entry)) in manifest.files.iter().enumerate() {
559        let current = idx + 1;
560        if current == 1 || current % 1000 == 0 || current == total {
561            tracing::debug!(current, total, path = %path, "reading cached object");
562        }
563        let bytes = match store.read(&entry.content_hash) {
564            Ok(b) => b,
565            Err(e) => {
566                tracing::warn!(
567                    path = %path,
568                    hash = %entry.content_hash,
569                    error = %e,
570                    "cache object missing or unreadable — will re-embed"
571                );
572                dangling.push(path.clone());
573                continue;
574            }
575        };
576        let fc = match load_file_cache(&bytes) {
577            Ok(fc) => fc,
578            Err(e) => {
579                tracing::warn!(
580                    path = %path,
581                    hash = %entry.content_hash,
582                    error = %e,
583                    "cache object corrupt — will re-embed"
584                );
585                dangling.push(path.clone());
586                continue;
587            }
588        };
589        let dim = fc.hidden_dim;
590
591        for (i, chunk) in fc.chunks.into_iter().enumerate() {
592            let start = i * dim;
593            let end = start + dim;
594            if end <= fc.embeddings.len() {
595                all_embeddings.push(fc.embeddings[start..end].to_vec());
596                all_chunks.push(chunk);
597            }
598        }
599    }
600
601    // Prune dangling manifest entries so the next diff pass treats these
602    // files as new and re-embeds them.
603    for path in &dangling {
604        manifest.files.remove(path);
605    }
606    if !dangling.is_empty() {
607        tracing::warn!(
608            count = dangling.len(),
609            "pruned dangling manifest entries; these files will be re-embedded on next run"
610        );
611    }
612
613    (all_chunks, all_embeddings)
614}
615
616/// Load a pre-built index from the disk cache without re-embedding.
617///
618/// This is the lightweight read path for processes that don't own the index
619/// (e.g., the LSP process reading caches built by the MCP process).
620/// Returns `None` if no compatible cache exists for this root.
621///
622/// Uses an advisory file lock on `manifest.lock` to avoid reading
623/// a half-written cache.
624#[must_use]
625pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
626    let cache_dir = resolve_cache_dir(root, model_repo, None);
627    let manifest_path = cache_dir.join("manifest.json");
628    let objects_dir = cache_dir.join("objects");
629    let lock_path = cache_dir.join("manifest.lock");
630
631    // Ensure cache dir exists (it might not if no index has been built)
632    if !manifest_path.exists() {
633        return None;
634    }
635
636    // Acquire a shared (read) lock — blocks if a writer holds the exclusive lock
637    let lock_file = std::fs::OpenOptions::new()
638        .create(true)
639        .truncate(false)
640        .write(true)
641        .read(true)
642        .open(&lock_path)
643        .ok()?;
644    let lock = fd_lock::RwLock::new(lock_file);
645    let _guard = lock.read().ok()?;
646
647    let mut manifest = Manifest::load(&manifest_path)
648        .ok()
649        .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo))?;
650    if !manifest.is_compatible(model_repo) {
651        return None;
652    }
653
654    let store = ObjectStore::new(&objects_dir);
655    let (chunks, embeddings) = load_all_from_store(&store, &mut manifest);
656    HybridIndex::new(chunks, &embeddings, None).ok()
657}
658
659/// Resolve the cache directory for a project + model combination.
660///
661/// Resolution priority:
662/// 1. `override_dir` parameter (highest)
663/// 2. `.ripvec/config.toml` in directory tree (repo-local)
664/// 3. `RIPVEC_CACHE` environment variable
665/// 4. XDG cache dir (`~/.cache/ripvec/`)
666///
667/// For repo-local, the cache lives at `.ripvec/cache/` directly (no project hash
668/// or version subdirectory — the config.toml pins the model and version).
669///
670/// For user-level cache, layout is `<base>/<project_hash>/v<VERSION>-<model_slug>/`.
671#[must_use]
672pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
673    // Priority 1: explicit override
674    if let Some(dir) = override_dir {
675        let project_hash = hash_project_root(root);
676        let version_dir = format_version_dir(model_repo);
677        return dir.join(&project_hash).join(version_dir);
678    }
679
680    // Priority 2: repo-local .ripvec/config.toml (with model validation)
681    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
682        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
683    {
684        if config.cache.model == model_repo {
685            return ripvec_dir.join("cache");
686        }
687        eprintln!(
688            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
689            config.cache.model, model_repo
690        );
691    }
692
693    // Priority 3+4: env var or XDG
694    let project_hash = hash_project_root(root);
695    let version_dir = format_version_dir(model_repo);
696
697    let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
698        PathBuf::from(env_dir).join(&project_hash)
699    } else {
700        dirs::cache_dir()
701            .unwrap_or_else(|| PathBuf::from("/tmp"))
702            .join("ripvec")
703            .join(&project_hash)
704    };
705
706    base.join(version_dir)
707}
708
709/// Blake3 hash of the canonical project root path.
710fn hash_project_root(root: &Path) -> String {
711    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
712    blake3::hash(canonical.to_string_lossy().as_bytes())
713        .to_hex()
714        .to_string()
715}
716
717/// Format the version subdirectory name from model repo.
718fn format_version_dir(model_repo: &str) -> String {
719    let model_slug = model_repo
720        .rsplit('/')
721        .next()
722        .unwrap_or(model_repo)
723        .to_lowercase();
724    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
725}
726
727/// Rebuild a manifest by scanning the object store and deserializing each object.
728///
729/// Used when `manifest.json` is gitignored and only the objects directory is
730/// committed. Scans every object, extracts the file path from the chunks,
731/// stats the source file for mtime/size, and constructs a valid manifest.
732///
733/// Returns `None` if the objects directory doesn't exist or is empty.
734#[must_use]
735pub fn rebuild_manifest_from_objects(
736    cache_dir: &std::path::Path,
737    root: &std::path::Path,
738    model_repo: &str,
739) -> Option<super::manifest::Manifest> {
740    use super::file_cache::FileCache;
741    use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
742    use super::store::ObjectStore;
743    use std::collections::BTreeMap;
744
745    let store = ObjectStore::new(&cache_dir.join("objects"));
746    let hashes = store.list_hashes();
747    if hashes.is_empty() {
748        return None;
749    }
750
751    tracing::info!(
752        objects = hashes.len(),
753        "rebuilding manifest from object store"
754    );
755
756    let mut files = BTreeMap::new();
757
758    for hash in &hashes {
759        let Ok(bytes) = store.read(hash) else {
760            continue;
761        };
762        let Ok(fc) =
763            FileCache::from_portable_bytes(&bytes).or_else(|_| FileCache::from_bytes(&bytes))
764        else {
765            continue;
766        };
767        let Some(first_chunk) = fc.chunks.first() else {
768            continue;
769        };
770
771        // The chunk's file_path may be absolute or relative.
772        // Try to make it relative to root for the manifest key.
773        let chunk_path = std::path::Path::new(&first_chunk.file_path);
774        let rel_path = chunk_path
775            .strip_prefix(root)
776            .unwrap_or(chunk_path)
777            .to_string_lossy()
778            .to_string();
779
780        // Stat the actual file for mtime/size.
781        let abs_path = root.join(&rel_path);
782        let (mtime_secs, size) = if let Ok(meta) = std::fs::metadata(&abs_path) {
783            let mtime = meta
784                .modified()
785                .ok()
786                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
787                .map_or(0, |d| d.as_secs());
788            (mtime, meta.len())
789        } else {
790            (0, 0) // file may not exist on this machine yet
791        };
792
793        files.insert(
794            rel_path,
795            FileEntry {
796                mtime_secs,
797                size,
798                content_hash: hash.clone(),
799                chunk_count: fc.chunks.len(),
800            },
801        );
802    }
803
804    if files.is_empty() {
805        return None;
806    }
807
808    let manifest = Manifest {
809        version: MANIFEST_VERSION,
810        model_repo: model_repo.to_string(),
811        root_hash: String::new(), // will be recomputed on next incremental_index
812        directories: BTreeMap::new(), // will be recomputed on next incremental_index
813        files,
814    };
815
816    tracing::info!(
817        files = manifest.files.len(),
818        "manifest rebuilt from objects"
819    );
820
821    // Write the rebuilt manifest to disk so subsequent runs use it.
822    let manifest_path = cache_dir.join("manifest.json");
823    if let Ok(json) = serde_json::to_string_pretty(&manifest) {
824        let _ = std::fs::write(&manifest_path, json);
825    }
826
827    Some(manifest)
828}
829
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model id shared by the cache-resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// Save a repo-local config pinning [`MODEL`] into `<root>/.ripvec`.
    fn write_repo_config(root: &std::path::Path) {
        let cfg = crate::cache::config::RepoConfig::new(MODEL, "3");
        cfg.save(&root.join(".ripvec")).unwrap();
    }

    /// A manifest entry with a bogus mtime is corrected to the on-disk mtime.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.rs");
        let content = "fn main() {}";
        std::fs::write(&file_path, content).unwrap();

        // Manifest entry: correct content hash, deliberately wrong mtime.
        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let wrong_mtime = 9_999_999;
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            wrong_mtime,
            content.len() as u64,
            &content_hash,
            1,
        );

        heal_manifest_mtimes(dir.path(), &mut manifest);

        // The healed mtime must equal what the filesystem reports.
        let actual_mtime = diff::mtime_secs(&file_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, actual_mtime);
    }

    /// A matching repo-local config routes the cache into `.ripvec/cache`.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let dir = TempDir::new().unwrap();
        write_repo_config(dir.path());

        let result = resolve_cache_dir(dir.path(), MODEL, None);
        let expected_prefix = dir.path().join(".ripvec").join("cache");
        assert!(
            result.starts_with(expected_prefix),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Without a repo config the resolved path must not mention `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let dir = TempDir::new().unwrap();
        let result = resolve_cache_dir(dir.path(), MODEL, None);
        assert!(
            !result.to_string_lossy().contains(".ripvec"),
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override beats a repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let dir = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        write_repo_config(dir.path());

        let result = resolve_cache_dir(dir.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(dir.path().join(".ripvec")),
            "override should win over repo-local, got: {result:?}"
        );
    }
}