Skip to main content

ripvec_core/cache/
reindex.rs

1//! Incremental reindex orchestrator.
2//!
3//! Ties together the manifest, object store, diff, and embedding pipeline
4//! to provide a single `incremental_index` function that loads cached
5//! embeddings and only re-embeds changed files.
6
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Statistics from an incremental reindex operation.
///
/// All counters default to zero, so `ReindexStats::default()` describes a
/// run that touched nothing. The type is cheap to clone and comparable,
/// which makes it convenient to assert on in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Total chunks in the final index.
    pub chunks_total: usize,
    /// Chunks that were re-embedded (from dirty files).
    pub chunks_reembedded: usize,
    /// Files unchanged (loaded from cache).
    pub files_unchanged: usize,
    /// Files that were new or modified.
    pub files_changed: usize,
    /// Files removed since last index.
    pub files_deleted: usize,
    /// Wall-clock duration of the reindex, in milliseconds.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55    repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57    let start = Instant::now();
58    tracing::info!(root = %root.display(), model = model_repo, "incremental_index starting");
59
60    if backends.is_empty() {
61        return Err(crate::Error::Other(anyhow::anyhow!(
62            "no embedding backends provided"
63        )));
64    }
65
66    let mut effective_cfg = cfg.clone();
67    effective_cfg.apply_repo_config(root);
68    let cfg = &effective_cfg;
69
70    {
71        let guard = profiler.phase("cache_prepare");
72        // When repo_level is requested, ensure .ripvec/config.toml exists
73        // so that resolve_cache_dir will find it and use the repo-local path.
74        if repo_level {
75            let ripvec_dir = root.join(".ripvec");
76            let config_path = ripvec_dir.join("config.toml");
77            if !config_path.exists() {
78                let config = crate::cache::config::RepoConfig::new(
79                    model_repo,
80                    crate::cache::manifest::MANIFEST_VERSION.to_string(),
81                );
82                config.save(&ripvec_dir)?;
83            }
84            // Gitignore the manifest — it's rebuilt from objects on first use.
85            // Objects are content-addressed and never cause merge conflicts.
86            let gitignore_path = ripvec_dir.join(".gitignore");
87            if !gitignore_path.exists() {
88                let _ = std::fs::write(&gitignore_path, "cache/manifest.json\n");
89            }
90        }
91        guard.set_detail(format!("repo_level={repo_level}"));
92    }
93
94    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
95    let portable = is_repo_local(&cache_dir);
96    let manifest_path = cache_dir.join("manifest.json");
97    let objects_dir = cache_dir.join("objects");
98    let store = ObjectStore::new(&objects_dir);
99
100    tracing::info!(
101        cache_dir = %cache_dir.display(),
102        portable,
103        manifest = %manifest_path.display(),
104        "cache resolved"
105    );
106
107    // Try loading existing manifest, or rebuild from objects if missing.
108    let existing_manifest = {
109        let guard = profiler.phase("cache_manifest");
110        let manifest = Manifest::load(&manifest_path)
111            .ok()
112            .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
113        guard.set_detail(match &manifest {
114            Some(m) => format!("{} files", m.files.len()),
115            None => "none".to_string(),
116        });
117        manifest
118    };
119
120    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
121        tracing::info!(
122            files = manifest.files.len(),
123            "manifest loaded, running incremental diff"
124        );
125        // Incremental path: diff → re-embed dirty → merge
126        incremental_path(
127            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
128            start, portable,
129        )
130    } else {
131        // Cold path: full embed
132        full_index_path(
133            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
134            portable,
135        )
136    }
137}
138
139/// Incremental reindex: diff, re-embed dirty files, merge with cached.
140#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
141#[expect(
142    clippy::too_many_lines,
143    reason = "incremental cache pipeline orchestration with diagnostic phase boundaries"
144)]
145#[expect(
146    clippy::cast_possible_truncation,
147    reason = "duration in ms won't exceed u64"
148)]
149fn incremental_path(
150    root: &Path,
151    backends: &[&dyn EmbedBackend],
152    tokenizer: &tokenizers::Tokenizer,
153    cfg: &SearchConfig,
154    profiler: &Profiler,
155    _model_repo: &str,
156    cache_dir: &Path,
157    store: &ObjectStore,
158    mut manifest: Manifest,
159    start: Instant,
160    portable: bool,
161) -> crate::Result<(HybridIndex, ReindexStats)> {
162    let diff_result = {
163        let guard = profiler.phase("cache_diff");
164        let walk_options = cfg.walk_options();
165        let diff_result = diff::compute_diff_with_options(root, &manifest, &walk_options)?;
166        guard.set_detail(format!(
167            "{} changed, {} deleted, {} unchanged",
168            diff_result.dirty.len(),
169            diff_result.deleted.len(),
170            diff_result.unchanged,
171        ));
172        diff_result
173    };
174
175    let files_changed = diff_result.dirty.len();
176    let files_deleted = diff_result.deleted.len();
177    let files_unchanged = diff_result.unchanged;
178
179    tracing::info!(
180        changed = files_changed,
181        deleted = files_deleted,
182        unchanged = files_unchanged,
183        "diff complete"
184    );
185
186    // Remove deleted files from manifest
187    for deleted in &diff_result.deleted {
188        manifest.remove_file(deleted);
189    }
190
191    // Re-embed dirty files
192    let mut new_chunks_count = 0;
193    {
194        let guard = profiler.phase("reembed_dirty_files");
195        tracing::info!(files = files_changed, "re-embedding changed files");
196        for dirty_path in &diff_result.dirty {
197            let relative = dirty_path
198                .strip_prefix(root)
199                .unwrap_or(dirty_path)
200                .to_string_lossy()
201                .to_string();
202
203            // Remove old entry if it exists
204            manifest.remove_file(&relative);
205
206            // Chunk this file
207            let Some(source) = crate::embed::read_source(dirty_path) else {
208                continue;
209            };
210
211            let chunks =
212                crate::chunk::chunk_source_for_path(dirty_path, &source, cfg.text_mode, &cfg.chunk);
213            profiler.chunk_thread_report(chunks.len());
214            profiler.chunk_batch(&chunks);
215
216            if chunks.is_empty() {
217                tracing::debug!(file = %relative, "dirty file produced no chunks");
218                continue;
219            }
220            tracing::debug!(file = %relative, chunks = chunks.len(), "embedding dirty file");
221
222            // Tokenize
223            let model_max = backends[0].max_tokens();
224            let encodings: Vec<Option<crate::backend::Encoding>> = chunks
225                .iter()
226                .map(|chunk| {
227                    crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max)
228                        .ok()
229                })
230                .collect();
231
232            // Embed
233            let embeddings =
234                crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
235
236            // Filter out failed tokenizations
237            let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
238                .into_iter()
239                .zip(embeddings)
240                .filter(|(_, emb)| !emb.is_empty())
241                .unzip();
242
243            let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
244
245            // Save to object store
246            let content_hash = diff::hash_file(dirty_path)?;
247            let file_cache = FileCache {
248                chunks: good_chunks.clone(),
249                embeddings: good_embeddings.iter().flatten().copied().collect(),
250                hidden_dim,
251            };
252            let bytes = if portable {
253                file_cache.to_portable_bytes()
254            } else {
255                file_cache.to_bytes()
256            };
257            store.write(&content_hash, &bytes)?;
258
259            // Update manifest
260            let mtime = diff::mtime_secs(dirty_path);
261            let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
262            manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
263            new_chunks_count += good_chunks.len();
264        }
265        guard.set_detail(format!("{files_changed} files, {new_chunks_count} chunks"));
266    }
267
268    // Heal stale mtimes (e.g., after git clone where all mtimes are wrong
269    // but content hashes match). This ensures the fast-path mtime check
270    // works on subsequent runs.
271    heal_manifest_mtimes(root, &mut manifest);
272
273    // Recompute Merkle hashes
274    manifest.recompute_hashes();
275
276    // Rebuild HybridIndex (semantic + BM25) from all cached objects.
277    // This prunes any manifest entries whose objects are missing/corrupt.
278    tracing::info!("loading cached objects from store");
279    let (all_chunks, all_embeddings) = {
280        let guard = profiler.phase("cache_load_objects");
281        let result = load_all_from_store(store, &mut manifest);
282        guard.set_detail(format!("{} chunks", result.0.len()));
283        result
284    };
285
286    // GC unreferenced objects (after pruning so dangling hashes are dropped)
287    {
288        let guard = profiler.phase("cache_gc");
289        let referenced = manifest.referenced_hashes();
290        store.gc(&referenced)?;
291        guard.set_detail(format!("{} referenced objects", referenced.len()));
292    }
293
294    // Save manifest (after pruning so the on-disk manifest is clean)
295    {
296        let guard = profiler.phase("cache_manifest_save");
297        manifest.save(&cache_dir.join("manifest.json"))?;
298        guard.set_detail(format!("{} files", manifest.files.len()));
299    }
300    let chunks_total = all_chunks.len();
301    tracing::info!(
302        chunks = chunks_total,
303        "building HybridIndex (BM25 + PolarQuant)"
304    );
305    let hybrid = {
306        let guard = profiler.phase("build_hybrid_index");
307        let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
308        guard.set_detail(format!("{chunks_total} chunks"));
309        hybrid
310    };
311    tracing::info!("HybridIndex ready");
312
313    Ok((
314        hybrid,
315        ReindexStats {
316            chunks_total,
317            chunks_reembedded: new_chunks_count,
318            files_unchanged,
319            files_changed,
320            files_deleted,
321            duration_ms: start.elapsed().as_millis() as u64,
322        },
323    ))
324}
325
326/// Full index from scratch: embed everything, save to cache.
327#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
328#[expect(
329    clippy::cast_possible_truncation,
330    reason = "duration in ms won't exceed u64"
331)]
332fn full_index_path(
333    root: &Path,
334    backends: &[&dyn EmbedBackend],
335    tokenizer: &tokenizers::Tokenizer,
336    cfg: &SearchConfig,
337    profiler: &Profiler,
338    model_repo: &str,
339    cache_dir: &Path,
340    store: &ObjectStore,
341    start: Instant,
342    portable: bool,
343) -> crate::Result<(HybridIndex, ReindexStats)> {
344    tracing::info!("no compatible manifest; building full index from source");
345    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
346
347    let hidden_dim = embeddings.first().map_or(384, Vec::len);
348
349    // Group chunks and embeddings by file, save to store
350    let mut manifest = Manifest::new(model_repo);
351    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
352        std::collections::BTreeMap::new();
353
354    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
355        file_groups
356            .entry(chunk.file_path.clone())
357            .or_default()
358            .0
359            .push(chunk.clone());
360        file_groups
361            .entry(chunk.file_path.clone())
362            .or_default()
363            .1
364            .push(emb.clone());
365    }
366
367    {
368        let guard = profiler.phase("cache_write_objects");
369        for (file_path, (file_chunks, file_embeddings)) in &file_groups {
370            // file_path from CodeChunk is already an absolute or cwd-relative path
371            let file_path_buf = PathBuf::from(file_path);
372
373            let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
374                // File might not exist (e.g., generated content) — use chunk content hash
375                blake3::hash(file_chunks[0].content.as_bytes())
376                    .to_hex()
377                    .to_string()
378            });
379
380            let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
381            let fc = FileCache {
382                chunks: file_chunks.clone(),
383                embeddings: flat_emb,
384                hidden_dim,
385            };
386            let bytes = if portable {
387                fc.to_portable_bytes()
388            } else {
389                fc.to_bytes()
390            };
391            store.write(&content_hash, &bytes)?;
392
393            let relative = file_path_buf
394                .strip_prefix(root)
395                .unwrap_or(&file_path_buf)
396                .to_string_lossy()
397                .to_string();
398            let mtime = diff::mtime_secs(&file_path_buf);
399            let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
400            manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
401        }
402        guard.set_detail(format!("{} files", file_groups.len()));
403    }
404
405    {
406        let guard = profiler.phase("cache_manifest_save");
407        manifest.recompute_hashes();
408        manifest.save(&cache_dir.join("manifest.json"))?;
409        guard.set_detail(format!("{} files", manifest.files.len()));
410    }
411
412    let chunks_total = chunks.len();
413    let files_changed = file_groups.len();
414    let hybrid = {
415        let guard = profiler.phase("build_hybrid_index");
416        let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
417        guard.set_detail(format!("{chunks_total} chunks"));
418        hybrid
419    };
420
421    Ok((
422        hybrid,
423        ReindexStats {
424            chunks_total,
425            chunks_reembedded: chunks_total,
426            files_unchanged: 0,
427            files_changed,
428            files_deleted: 0,
429            duration_ms: start.elapsed().as_millis() as u64,
430        },
431    ))
432}
433
/// Report whether any component of `cache_dir` is exactly `.ripvec`.
///
/// The comparison is per path component, so a directory such as `.ripvecx`
/// does not match.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    for segment in cache_dir {
        if segment == ".ripvec" {
            return true;
        }
    }
    false
}
439
440/// Update manifest file mtimes to match the current filesystem.
441///
442/// After a git clone, all file mtimes are set to clone time, making the
443/// fast-path mtime check miss on every file. This function updates the
444/// manifest mtimes so subsequent diffs use the fast path.
445pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
446    for (relative, entry) in &mut manifest.files {
447        let file_path = root.join(relative);
448        let mtime = diff::mtime_secs(&file_path);
449        if mtime != entry.mtime_secs {
450            entry.mtime_secs = mtime;
451        }
452    }
453}
454
455/// Check whether `pull.autoStash` needs to be configured for a repo-local cache.
456///
457/// Returns `Some(message)` with a human-readable prompt if the setting has not
458/// been configured yet. Returns `None` if already configured (in git config or
459/// `.ripvec/config.toml`) or if the cache is not repo-local.
460#[must_use]
461pub fn check_auto_stash(root: &Path) -> Option<String> {
462    use std::process::Command;
463
464    let ripvec_dir = root.join(".ripvec");
465    let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
466    if !config.cache.local {
467        return None;
468    }
469
470    // Already decided via config.toml
471    if config.cache.auto_stash.is_some() {
472        return None;
473    }
474
475    // Already set in git config (by user or previous run)
476    let git_check = Command::new("git")
477        .args(["config", "--local", "pull.autoStash"])
478        .current_dir(root)
479        .stdout(std::process::Stdio::piped())
480        .stderr(std::process::Stdio::null())
481        .output()
482        .ok()?;
483    if git_check.status.success() {
484        // Sync the existing git setting into config.toml so we don't check again
485        let val = String::from_utf8_lossy(&git_check.stdout)
486            .trim()
487            .eq_ignore_ascii_case("true");
488        let _ = apply_auto_stash(root, val);
489        return None;
490    }
491
492    Some(
493        "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
494         Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
495            .to_string(),
496    )
497}
498
499/// Apply the user's `auto_stash` choice: set git config and save to `config.toml`.
500///
501/// When `enable` is true, runs `git config --local pull.autoStash true`.
502/// The choice is persisted to `.ripvec/config.toml` so the prompt is not repeated.
503///
504/// # Errors
505///
506/// Returns an error if `config.toml` cannot be read or written.
507pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
508    use std::process::Command;
509
510    let ripvec_dir = root.join(".ripvec");
511    let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
512    config.cache.auto_stash = Some(enable);
513    config.save(&ripvec_dir)?;
514
515    if enable {
516        let _ = Command::new("git")
517            .args(["config", "--local", "pull.autoStash", "true"])
518            .current_dir(root)
519            .stdout(std::process::Stdio::null())
520            .stderr(std::process::Stdio::null())
521            .status();
522    }
523
524    Ok(())
525}
526
527/// Load a `FileCache` from bytes, auto-detecting the format.
528/// Checks for bitcode magic first (portable), then falls back to rkyv.
529fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
530    if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
531        FileCache::from_portable_bytes(bytes)
532    } else {
533        FileCache::from_bytes(bytes)
534    }
535}
536
537/// Load all cached chunks and embeddings from the object store.
538///
539/// Skips any manifest entry whose object is missing or corrupt, and prunes
540/// those entries from the manifest in place. This makes incremental indexing
541/// self-healing: an interrupted previous run or manually deleted cache file
542/// is treated as "file needs re-embedding" rather than a fatal error.
543fn load_all_from_store(
544    store: &ObjectStore,
545    manifest: &mut Manifest,
546) -> (Vec<CodeChunk>, Vec<Vec<f32>>) {
547    let mut all_chunks = Vec::new();
548    let mut all_embeddings = Vec::new();
549    let mut dangling: Vec<String> = Vec::new();
550
551    let total = manifest.files.len();
552    tracing::info!(objects = total, "reading cached objects");
553    for (idx, (path, entry)) in manifest.files.iter().enumerate() {
554        let current = idx + 1;
555        if current == 1 || current % 1000 == 0 || current == total {
556            tracing::debug!(current, total, path = %path, "reading cached object");
557        }
558        let bytes = match store.read(&entry.content_hash) {
559            Ok(b) => b,
560            Err(e) => {
561                tracing::warn!(
562                    path = %path,
563                    hash = %entry.content_hash,
564                    error = %e,
565                    "cache object missing or unreadable — will re-embed"
566                );
567                dangling.push(path.clone());
568                continue;
569            }
570        };
571        let fc = match load_file_cache(&bytes) {
572            Ok(fc) => fc,
573            Err(e) => {
574                tracing::warn!(
575                    path = %path,
576                    hash = %entry.content_hash,
577                    error = %e,
578                    "cache object corrupt — will re-embed"
579                );
580                dangling.push(path.clone());
581                continue;
582            }
583        };
584        let dim = fc.hidden_dim;
585
586        for (i, chunk) in fc.chunks.into_iter().enumerate() {
587            let start = i * dim;
588            let end = start + dim;
589            if end <= fc.embeddings.len() {
590                all_embeddings.push(fc.embeddings[start..end].to_vec());
591                all_chunks.push(chunk);
592            }
593        }
594    }
595
596    // Prune dangling manifest entries so the next diff pass treats these
597    // files as new and re-embeds them.
598    for path in &dangling {
599        manifest.files.remove(path);
600    }
601    if !dangling.is_empty() {
602        tracing::warn!(
603            count = dangling.len(),
604            "pruned dangling manifest entries; these files will be re-embedded on next run"
605        );
606    }
607
608    (all_chunks, all_embeddings)
609}
610
611/// Load a pre-built index from the disk cache without re-embedding.
612///
613/// This is the lightweight read path for processes that don't own the index
614/// (e.g., the LSP process reading caches built by the MCP process).
615/// Returns `None` if no compatible cache exists for this root.
616///
617/// Uses an advisory file lock on `manifest.lock` to avoid reading
618/// a half-written cache.
619#[must_use]
620pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
621    let cache_dir = resolve_cache_dir(root, model_repo, None);
622    let manifest_path = cache_dir.join("manifest.json");
623    let objects_dir = cache_dir.join("objects");
624    let lock_path = cache_dir.join("manifest.lock");
625
626    // Ensure cache dir exists (it might not if no index has been built)
627    if !manifest_path.exists() {
628        return None;
629    }
630
631    // Acquire a shared (read) lock — blocks if a writer holds the exclusive lock
632    let lock_file = std::fs::OpenOptions::new()
633        .create(true)
634        .truncate(false)
635        .write(true)
636        .read(true)
637        .open(&lock_path)
638        .ok()?;
639    let lock = fd_lock::RwLock::new(lock_file);
640    let _guard = lock.read().ok()?;
641
642    let mut manifest = Manifest::load(&manifest_path)
643        .ok()
644        .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo))?;
645    if !manifest.is_compatible(model_repo) {
646        return None;
647    }
648
649    let store = ObjectStore::new(&objects_dir);
650    let (chunks, embeddings) = load_all_from_store(&store, &mut manifest);
651    HybridIndex::new(chunks, &embeddings, None).ok()
652}
653
654/// Resolve the cache directory for a project + model combination.
655///
656/// Resolution priority:
657/// 1. `override_dir` parameter (highest)
658/// 2. `.ripvec/config.toml` in directory tree (repo-local)
659/// 3. `RIPVEC_CACHE` environment variable
660/// 4. XDG cache dir (`~/.cache/ripvec/`)
661///
662/// For repo-local, the cache lives at `.ripvec/cache/` directly (no project hash
663/// or version subdirectory — the config.toml pins the model and version).
664///
665/// For user-level cache, layout is `<base>/<project_hash>/v<VERSION>-<model_slug>/`.
666#[must_use]
667pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
668    // Priority 1: explicit override
669    if let Some(dir) = override_dir {
670        let project_hash = hash_project_root(root);
671        let version_dir = format_version_dir(model_repo);
672        return dir.join(&project_hash).join(version_dir);
673    }
674
675    // Priority 2: repo-local .ripvec/config.toml (with model validation)
676    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
677        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
678    {
679        if config.cache.model == model_repo {
680            return ripvec_dir.join("cache");
681        }
682        eprintln!(
683            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
684            config.cache.model, model_repo
685        );
686    }
687
688    // Priority 3+4: env var or XDG
689    let project_hash = hash_project_root(root);
690    let version_dir = format_version_dir(model_repo);
691
692    let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
693        PathBuf::from(env_dir).join(&project_hash)
694    } else {
695        dirs::cache_dir()
696            .unwrap_or_else(|| PathBuf::from("/tmp"))
697            .join("ripvec")
698            .join(&project_hash)
699    };
700
701    base.join(version_dir)
702}
703
704/// Blake3 hash of the canonical project root path.
705fn hash_project_root(root: &Path) -> String {
706    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
707    blake3::hash(canonical.to_string_lossy().as_bytes())
708        .to_hex()
709        .to_string()
710}
711
712/// Format the version subdirectory name from model repo.
713fn format_version_dir(model_repo: &str) -> String {
714    let model_slug = model_repo
715        .rsplit('/')
716        .next()
717        .unwrap_or(model_repo)
718        .to_lowercase();
719    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
720}
721
722/// Rebuild a manifest by scanning the object store and deserializing each object.
723///
724/// Used when `manifest.json` is gitignored and only the objects directory is
725/// committed. Scans every object, extracts the file path from the chunks,
726/// stats the source file for mtime/size, and constructs a valid manifest.
727///
728/// Returns `None` if the objects directory doesn't exist or is empty.
729#[must_use]
730pub fn rebuild_manifest_from_objects(
731    cache_dir: &std::path::Path,
732    root: &std::path::Path,
733    model_repo: &str,
734) -> Option<super::manifest::Manifest> {
735    use super::file_cache::FileCache;
736    use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
737    use super::store::ObjectStore;
738    use std::collections::BTreeMap;
739
740    let store = ObjectStore::new(&cache_dir.join("objects"));
741    let hashes = store.list_hashes();
742    if hashes.is_empty() {
743        return None;
744    }
745
746    tracing::info!(
747        objects = hashes.len(),
748        "rebuilding manifest from object store"
749    );
750
751    let mut files = BTreeMap::new();
752
753    for hash in &hashes {
754        let Ok(bytes) = store.read(hash) else {
755            continue;
756        };
757        let Ok(fc) =
758            FileCache::from_portable_bytes(&bytes).or_else(|_| FileCache::from_bytes(&bytes))
759        else {
760            continue;
761        };
762        let Some(first_chunk) = fc.chunks.first() else {
763            continue;
764        };
765
766        // The chunk's file_path may be absolute or relative.
767        // Try to make it relative to root for the manifest key.
768        let chunk_path = std::path::Path::new(&first_chunk.file_path);
769        let rel_path = chunk_path
770            .strip_prefix(root)
771            .unwrap_or(chunk_path)
772            .to_string_lossy()
773            .to_string();
774
775        // Stat the actual file for mtime/size.
776        let abs_path = root.join(&rel_path);
777        let (mtime_secs, size) = if let Ok(meta) = std::fs::metadata(&abs_path) {
778            let mtime = meta
779                .modified()
780                .ok()
781                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
782                .map_or(0, |d| d.as_secs());
783            (mtime, meta.len())
784        } else {
785            (0, 0) // file may not exist on this machine yet
786        };
787
788        files.insert(
789            rel_path,
790            FileEntry {
791                mtime_secs,
792                size,
793                content_hash: hash.clone(),
794                chunk_count: fc.chunks.len(),
795            },
796        );
797    }
798
799    if files.is_empty() {
800        return None;
801    }
802
803    let manifest = Manifest {
804        version: MANIFEST_VERSION,
805        model_repo: model_repo.to_string(),
806        root_hash: String::new(), // will be recomputed on next incremental_index
807        directories: BTreeMap::new(), // will be recomputed on next incremental_index
808        files,
809    };
810
811    tracing::info!(
812        files = manifest.files.len(),
813        "manifest rebuilt from objects"
814    );
815
816    // Write the rebuilt manifest to disk so subsequent runs use it.
817    let manifest_path = cache_dir.join("manifest.json");
818    if let Ok(json) = serde_json::to_string_pretty(&manifest) {
819        let _ = std::fs::write(&manifest_path, json);
820    }
821
822    Some(manifest)
823}
824
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model name shared by the cache-resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// A manifest entry with a wrong mtime is corrected to the on-disk value.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.rs");
        let content = "fn main() {}";
        std::fs::write(&file_path, content).unwrap();

        // Correct content hash, deliberately wrong mtime.
        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            9_999_999,
            content.len() as u64,
            &content_hash,
            1,
        );

        heal_manifest_mtimes(dir.path(), &mut manifest);

        let expected = diff::mtime_secs(&file_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, expected);
    }

    /// A matching `.ripvec/config.toml` selects the repo-local cache.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let dir = TempDir::new().unwrap();
        let ripvec_dir = dir.path().join(".ripvec");
        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&ripvec_dir)
            .unwrap();

        let result = resolve_cache_dir(dir.path(), MODEL, None);
        assert!(
            result.starts_with(ripvec_dir.join("cache")),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Without a repo config the resolved path is outside `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let dir = TempDir::new().unwrap();
        let result = resolve_cache_dir(dir.path(), MODEL, None);
        assert!(
            !result.to_string_lossy().contains(".ripvec"),
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override beats an existing repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let dir = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        let ripvec_dir = dir.path().join(".ripvec");

        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&ripvec_dir)
            .unwrap();

        let result = resolve_cache_dir(dir.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(&ripvec_dir),
            "override should win over repo-local, got: {result:?}"
        );
    }
}