Skip to main content

ripvec_core/cache/
reindex.rs

1//! Incremental reindex orchestrator.
2//!
3//! Ties together the manifest, object store, diff, and embedding pipeline
4//! to provide a single `incremental_index` function that loads cached
5//! embeddings and only re-embeds changed files.
6
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Statistics from an incremental reindex operation.
///
/// Every field is a plain counter, so the type is cheap to clone and can be
/// compared directly (handy in tests and when diffing two runs).
/// `ReindexStats::default()` yields all-zero counters.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Total chunks in the final index.
    pub chunks_total: usize,
    /// Chunks that were re-embedded (from dirty files).
    pub chunks_reembedded: usize,
    /// Files unchanged (loaded from cache).
    pub files_unchanged: usize,
    /// Files that were new or modified.
    pub files_changed: usize,
    /// Files removed since last index.
    pub files_deleted: usize,
    /// Wall-clock duration of the reindex, in milliseconds.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55    repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57    let start = Instant::now();
58    tracing::info!(root = %root.display(), model = model_repo, "incremental_index starting");
59
60    if backends.is_empty() {
61        return Err(crate::Error::Other(anyhow::anyhow!(
62            "no embedding backends provided"
63        )));
64    }
65
66    // When repo_level is requested, ensure .ripvec/config.toml exists
67    // so that resolve_cache_dir will find it and use the repo-local path.
68    if repo_level {
69        let ripvec_dir = root.join(".ripvec");
70        let config_path = ripvec_dir.join("config.toml");
71        if !config_path.exists() {
72            let config = crate::cache::config::RepoConfig::new(
73                model_repo,
74                crate::cache::manifest::MANIFEST_VERSION.to_string(),
75            );
76            config.save(&ripvec_dir)?;
77        }
78        // Gitignore the manifest — it's rebuilt from objects on first use.
79        // Objects are content-addressed and never cause merge conflicts.
80        let gitignore_path = ripvec_dir.join(".gitignore");
81        if !gitignore_path.exists() {
82            let _ = std::fs::write(&gitignore_path, "cache/manifest.json\n");
83        }
84    }
85
86    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
87    let portable = is_repo_local(&cache_dir);
88    let manifest_path = cache_dir.join("manifest.json");
89    let objects_dir = cache_dir.join("objects");
90    let store = ObjectStore::new(&objects_dir);
91
92    // Try loading existing manifest, or rebuild from objects if missing.
93    let existing_manifest = Manifest::load(&manifest_path)
94        .ok()
95        .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
96
97    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
98        tracing::info!(
99            files = manifest.files.len(),
100            "manifest loaded, running incremental diff"
101        );
102        // Incremental path: diff → re-embed dirty → merge
103        incremental_path(
104            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
105            start, portable,
106        )
107    } else {
108        // Cold path: full embed
109        full_index_path(
110            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
111            portable,
112        )
113    }
114}
115
116/// Incremental reindex: diff, re-embed dirty files, merge with cached.
117#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
118#[expect(
119    clippy::cast_possible_truncation,
120    reason = "duration in ms won't exceed u64"
121)]
122fn incremental_path(
123    root: &Path,
124    backends: &[&dyn EmbedBackend],
125    tokenizer: &tokenizers::Tokenizer,
126    cfg: &SearchConfig,
127    profiler: &Profiler,
128    _model_repo: &str,
129    cache_dir: &Path,
130    store: &ObjectStore,
131    mut manifest: Manifest,
132    start: Instant,
133    portable: bool,
134) -> crate::Result<(HybridIndex, ReindexStats)> {
135    let diff_result = diff::compute_diff(root, &manifest)?;
136
137    let files_changed = diff_result.dirty.len();
138    let files_deleted = diff_result.deleted.len();
139    let files_unchanged = diff_result.unchanged;
140
141    tracing::info!(
142        changed = files_changed,
143        deleted = files_deleted,
144        unchanged = files_unchanged,
145        "diff complete"
146    );
147
148    // Remove deleted files from manifest
149    for deleted in &diff_result.deleted {
150        manifest.remove_file(deleted);
151    }
152
153    // Re-embed dirty files
154    let mut new_chunks_count = 0;
155    for dirty_path in &diff_result.dirty {
156        let relative = dirty_path
157            .strip_prefix(root)
158            .unwrap_or(dirty_path)
159            .to_string_lossy()
160            .to_string();
161
162        // Remove old entry if it exists
163        manifest.remove_file(&relative);
164
165        // Chunk this file
166        let Some(source) = crate::embed::read_source(dirty_path) else {
167            continue;
168        };
169
170        let ext = dirty_path
171            .extension()
172            .and_then(|e| e.to_str())
173            .unwrap_or("");
174        let chunks = if cfg.text_mode {
175            crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
176        } else {
177            match crate::languages::config_for_extension(ext) {
178                Some(lang_config) => {
179                    crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
180                }
181                None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
182            }
183        };
184
185        if chunks.is_empty() {
186            continue;
187        }
188
189        // Tokenize
190        let model_max = backends[0].max_tokens();
191        let encodings: Vec<Option<crate::backend::Encoding>> = chunks
192            .iter()
193            .map(|chunk| {
194                crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
195            })
196            .collect();
197
198        // Embed
199        let embeddings =
200            crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
201
202        // Filter out failed tokenizations
203        let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
204            .into_iter()
205            .zip(embeddings)
206            .filter(|(_, emb)| !emb.is_empty())
207            .unzip();
208
209        let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
210
211        // Save to object store
212        let content_hash = diff::hash_file(dirty_path)?;
213        let file_cache = FileCache {
214            chunks: good_chunks.clone(),
215            embeddings: good_embeddings.iter().flatten().copied().collect(),
216            hidden_dim,
217        };
218        let bytes = if portable {
219            file_cache.to_portable_bytes()
220        } else {
221            file_cache.to_bytes()
222        };
223        store.write(&content_hash, &bytes)?;
224
225        // Update manifest
226        let mtime = diff::mtime_secs(dirty_path);
227        let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
228        manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
229        new_chunks_count += good_chunks.len();
230    }
231
232    // Heal stale mtimes (e.g., after git clone where all mtimes are wrong
233    // but content hashes match). This ensures the fast-path mtime check
234    // works on subsequent runs.
235    heal_manifest_mtimes(root, &mut manifest);
236
237    // Recompute Merkle hashes
238    manifest.recompute_hashes();
239
240    // Rebuild HybridIndex (semantic + BM25) from all cached objects.
241    // This prunes any manifest entries whose objects are missing/corrupt.
242    tracing::info!("loading cached objects from store");
243    let (all_chunks, all_embeddings) = load_all_from_store(store, &mut manifest);
244
245    // GC unreferenced objects (after pruning so dangling hashes are dropped)
246    let referenced = manifest.referenced_hashes();
247    store.gc(&referenced)?;
248
249    // Save manifest (after pruning so the on-disk manifest is clean)
250    manifest.save(&cache_dir.join("manifest.json"))?;
251    let chunks_total = all_chunks.len();
252    tracing::info!(
253        chunks = chunks_total,
254        "building HybridIndex (BM25 + PolarQuant)"
255    );
256    let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
257    tracing::info!("HybridIndex ready");
258
259    Ok((
260        hybrid,
261        ReindexStats {
262            chunks_total,
263            chunks_reembedded: new_chunks_count,
264            files_unchanged,
265            files_changed,
266            files_deleted,
267            duration_ms: start.elapsed().as_millis() as u64,
268        },
269    ))
270}
271
272/// Full index from scratch: embed everything, save to cache.
273#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
274#[expect(
275    clippy::cast_possible_truncation,
276    reason = "duration in ms won't exceed u64"
277)]
278fn full_index_path(
279    root: &Path,
280    backends: &[&dyn EmbedBackend],
281    tokenizer: &tokenizers::Tokenizer,
282    cfg: &SearchConfig,
283    profiler: &Profiler,
284    model_repo: &str,
285    cache_dir: &Path,
286    store: &ObjectStore,
287    start: Instant,
288    portable: bool,
289) -> crate::Result<(HybridIndex, ReindexStats)> {
290    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
291
292    let hidden_dim = embeddings.first().map_or(384, Vec::len);
293
294    // Group chunks and embeddings by file, save to store
295    let mut manifest = Manifest::new(model_repo);
296    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
297        std::collections::BTreeMap::new();
298
299    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
300        file_groups
301            .entry(chunk.file_path.clone())
302            .or_default()
303            .0
304            .push(chunk.clone());
305        file_groups
306            .entry(chunk.file_path.clone())
307            .or_default()
308            .1
309            .push(emb.clone());
310    }
311
312    for (file_path, (file_chunks, file_embeddings)) in &file_groups {
313        // file_path from CodeChunk is already an absolute or cwd-relative path
314        let file_path_buf = PathBuf::from(file_path);
315
316        let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
317            // File might not exist (e.g., generated content) — use chunk content hash
318            blake3::hash(file_chunks[0].content.as_bytes())
319                .to_hex()
320                .to_string()
321        });
322
323        let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
324        let fc = FileCache {
325            chunks: file_chunks.clone(),
326            embeddings: flat_emb,
327            hidden_dim,
328        };
329        let bytes = if portable {
330            fc.to_portable_bytes()
331        } else {
332            fc.to_bytes()
333        };
334        store.write(&content_hash, &bytes)?;
335
336        let relative = file_path_buf
337            .strip_prefix(root)
338            .unwrap_or(&file_path_buf)
339            .to_string_lossy()
340            .to_string();
341        let mtime = diff::mtime_secs(&file_path_buf);
342        let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
343        manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
344    }
345
346    manifest.recompute_hashes();
347    manifest.save(&cache_dir.join("manifest.json"))?;
348
349    let chunks_total = chunks.len();
350    let files_changed = file_groups.len();
351    let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
352
353    Ok((
354        hybrid,
355        ReindexStats {
356            chunks_total,
357            chunks_reembedded: chunks_total,
358            files_unchanged: 0,
359            files_changed,
360            files_deleted: 0,
361            duration_ms: start.elapsed().as_millis() as u64,
362        },
363    ))
364}
365
/// Report whether `cache_dir` lives inside a `.ripvec/` directory.
///
/// Returns `true` when any single path component of `cache_dir` is exactly
/// `.ripvec`; names that merely contain that text do not count.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    for part in cache_dir {
        if part == ".ripvec" {
            return true;
        }
    }
    false
}
371
372/// Update manifest file mtimes to match the current filesystem.
373///
374/// After a git clone, all file mtimes are set to clone time, making the
375/// fast-path mtime check miss on every file. This function updates the
376/// manifest mtimes so subsequent diffs use the fast path.
377pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
378    for (relative, entry) in &mut manifest.files {
379        let file_path = root.join(relative);
380        let mtime = diff::mtime_secs(&file_path);
381        if mtime != entry.mtime_secs {
382            entry.mtime_secs = mtime;
383        }
384    }
385}
386
387/// Check whether `pull.autoStash` needs to be configured for a repo-local cache.
388///
389/// Returns `Some(message)` with a human-readable prompt if the setting has not
390/// been configured yet. Returns `None` if already configured (in git config or
391/// `.ripvec/config.toml`) or if the cache is not repo-local.
392#[must_use]
393pub fn check_auto_stash(root: &Path) -> Option<String> {
394    use std::process::Command;
395
396    let ripvec_dir = root.join(".ripvec");
397    let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
398    if !config.cache.local {
399        return None;
400    }
401
402    // Already decided via config.toml
403    if config.cache.auto_stash.is_some() {
404        return None;
405    }
406
407    // Already set in git config (by user or previous run)
408    let git_check = Command::new("git")
409        .args(["config", "--local", "pull.autoStash"])
410        .current_dir(root)
411        .stdout(std::process::Stdio::piped())
412        .stderr(std::process::Stdio::null())
413        .output()
414        .ok()?;
415    if git_check.status.success() {
416        // Sync the existing git setting into config.toml so we don't check again
417        let val = String::from_utf8_lossy(&git_check.stdout)
418            .trim()
419            .eq_ignore_ascii_case("true");
420        let _ = apply_auto_stash(root, val);
421        return None;
422    }
423
424    Some(
425        "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
426         Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
427            .to_string(),
428    )
429}
430
431/// Apply the user's `auto_stash` choice: set git config and save to `config.toml`.
432///
433/// When `enable` is true, runs `git config --local pull.autoStash true`.
434/// The choice is persisted to `.ripvec/config.toml` so the prompt is not repeated.
435///
436/// # Errors
437///
438/// Returns an error if `config.toml` cannot be read or written.
439pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
440    use std::process::Command;
441
442    let ripvec_dir = root.join(".ripvec");
443    let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
444    config.cache.auto_stash = Some(enable);
445    config.save(&ripvec_dir)?;
446
447    if enable {
448        let _ = Command::new("git")
449            .args(["config", "--local", "pull.autoStash", "true"])
450            .current_dir(root)
451            .stdout(std::process::Stdio::null())
452            .stderr(std::process::Stdio::null())
453            .status();
454    }
455
456    Ok(())
457}
458
459/// Load a `FileCache` from bytes, auto-detecting the format.
460/// Checks for bitcode magic first (portable), then falls back to rkyv.
461fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
462    if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
463        FileCache::from_portable_bytes(bytes)
464    } else {
465        FileCache::from_bytes(bytes)
466    }
467}
468
469/// Load all cached chunks and embeddings from the object store.
470///
471/// Skips any manifest entry whose object is missing or corrupt, and prunes
472/// those entries from the manifest in place. This makes incremental indexing
473/// self-healing: an interrupted previous run or manually deleted cache file
474/// is treated as "file needs re-embedding" rather than a fatal error.
475fn load_all_from_store(
476    store: &ObjectStore,
477    manifest: &mut Manifest,
478) -> (Vec<CodeChunk>, Vec<Vec<f32>>) {
479    let mut all_chunks = Vec::new();
480    let mut all_embeddings = Vec::new();
481    let mut dangling: Vec<String> = Vec::new();
482
483    for (path, entry) in &manifest.files {
484        let bytes = match store.read(&entry.content_hash) {
485            Ok(b) => b,
486            Err(e) => {
487                tracing::warn!(
488                    path = %path,
489                    hash = %entry.content_hash,
490                    error = %e,
491                    "cache object missing or unreadable — will re-embed"
492                );
493                dangling.push(path.clone());
494                continue;
495            }
496        };
497        let fc = match load_file_cache(&bytes) {
498            Ok(fc) => fc,
499            Err(e) => {
500                tracing::warn!(
501                    path = %path,
502                    hash = %entry.content_hash,
503                    error = %e,
504                    "cache object corrupt — will re-embed"
505                );
506                dangling.push(path.clone());
507                continue;
508            }
509        };
510        let dim = fc.hidden_dim;
511
512        for (i, chunk) in fc.chunks.into_iter().enumerate() {
513            let start = i * dim;
514            let end = start + dim;
515            if end <= fc.embeddings.len() {
516                all_embeddings.push(fc.embeddings[start..end].to_vec());
517                all_chunks.push(chunk);
518            }
519        }
520    }
521
522    // Prune dangling manifest entries so the next diff pass treats these
523    // files as new and re-embeds them.
524    for path in &dangling {
525        manifest.files.remove(path);
526    }
527    if !dangling.is_empty() {
528        tracing::warn!(
529            count = dangling.len(),
530            "pruned dangling manifest entries; these files will be re-embedded on next run"
531        );
532    }
533
534    (all_chunks, all_embeddings)
535}
536
537/// Load a pre-built index from the disk cache without re-embedding.
538///
539/// This is the lightweight read path for processes that don't own the index
540/// (e.g., the LSP process reading caches built by the MCP process).
541/// Returns `None` if no compatible cache exists for this root.
542///
543/// Uses an advisory file lock on `manifest.lock` to avoid reading
544/// a half-written cache.
545#[must_use]
546pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
547    let cache_dir = resolve_cache_dir(root, model_repo, None);
548    let manifest_path = cache_dir.join("manifest.json");
549    let objects_dir = cache_dir.join("objects");
550    let lock_path = cache_dir.join("manifest.lock");
551
552    // Ensure cache dir exists (it might not if no index has been built)
553    if !manifest_path.exists() {
554        return None;
555    }
556
557    // Acquire a shared (read) lock — blocks if a writer holds the exclusive lock
558    let lock_file = std::fs::OpenOptions::new()
559        .create(true)
560        .truncate(false)
561        .write(true)
562        .read(true)
563        .open(&lock_path)
564        .ok()?;
565    let lock = fd_lock::RwLock::new(lock_file);
566    let _guard = lock.read().ok()?;
567
568    let mut manifest = Manifest::load(&manifest_path)
569        .ok()
570        .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo))?;
571    if !manifest.is_compatible(model_repo) {
572        return None;
573    }
574
575    let store = ObjectStore::new(&objects_dir);
576    let (chunks, embeddings) = load_all_from_store(&store, &mut manifest);
577    HybridIndex::new(chunks, &embeddings, None).ok()
578}
579
580/// Resolve the cache directory for a project + model combination.
581///
582/// Resolution priority:
583/// 1. `override_dir` parameter (highest)
584/// 2. `.ripvec/config.toml` in directory tree (repo-local)
585/// 3. `RIPVEC_CACHE` environment variable
586/// 4. XDG cache dir (`~/.cache/ripvec/`)
587///
588/// For repo-local, the cache lives at `.ripvec/cache/` directly (no project hash
589/// or version subdirectory — the config.toml pins the model and version).
590///
591/// For user-level cache, layout is `<base>/<project_hash>/v<VERSION>-<model_slug>/`.
592#[must_use]
593pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
594    // Priority 1: explicit override
595    if let Some(dir) = override_dir {
596        let project_hash = hash_project_root(root);
597        let version_dir = format_version_dir(model_repo);
598        return dir.join(&project_hash).join(version_dir);
599    }
600
601    // Priority 2: repo-local .ripvec/config.toml (with model validation)
602    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
603        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
604    {
605        if config.cache.model == model_repo {
606            return ripvec_dir.join("cache");
607        }
608        eprintln!(
609            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
610            config.cache.model, model_repo
611        );
612    }
613
614    // Priority 3+4: env var or XDG
615    let project_hash = hash_project_root(root);
616    let version_dir = format_version_dir(model_repo);
617
618    let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
619        PathBuf::from(env_dir).join(&project_hash)
620    } else {
621        dirs::cache_dir()
622            .unwrap_or_else(|| PathBuf::from("/tmp"))
623            .join("ripvec")
624            .join(&project_hash)
625    };
626
627    base.join(version_dir)
628}
629
630/// Blake3 hash of the canonical project root path.
631fn hash_project_root(root: &Path) -> String {
632    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
633    blake3::hash(canonical.to_string_lossy().as_bytes())
634        .to_hex()
635        .to_string()
636}
637
638/// Format the version subdirectory name from model repo.
639fn format_version_dir(model_repo: &str) -> String {
640    let model_slug = model_repo
641        .rsplit('/')
642        .next()
643        .unwrap_or(model_repo)
644        .to_lowercase();
645    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
646}
647
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model id shared by the cache-dir resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// A manifest entry with the right hash but a bogus mtime is corrected
    /// to the file's actual mtime.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let tmp = TempDir::new().unwrap();
        let source_path = tmp.path().join("test.rs");
        let body = "fn main() {}";
        std::fs::write(&source_path, body).unwrap();

        // Correct content hash, deliberately wrong mtime.
        let hash = blake3::hash(body.as_bytes()).to_hex().to_string();
        let wrong_mtime = 9_999_999;
        let mut manifest = Manifest::new("test-model");
        manifest.add_file("test.rs", wrong_mtime, body.len() as u64, &hash, 1);

        heal_manifest_mtimes(tmp.path(), &mut manifest);

        // The recorded mtime now mirrors the filesystem.
        let expected = diff::mtime_secs(&source_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, expected);
    }

    /// A `.ripvec/config.toml` pinning the requested model selects the
    /// repo-local cache directory.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let project = TempDir::new().unwrap();
        let ripvec_dir = project.path().join(".ripvec");
        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&ripvec_dir)
            .unwrap();

        let result = resolve_cache_dir(project.path(), MODEL, None);
        let expected_prefix = ripvec_dir.join("cache");
        assert!(
            result.starts_with(expected_prefix),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// With no repo config the resolved path must not mention `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let project = TempDir::new().unwrap();
        let result = resolve_cache_dir(project.path(), MODEL, None);
        let uses_repo_local = result.to_string_lossy().contains(".ripvec");
        assert!(
            !uses_repo_local,
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override beats an existing repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let project = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        let ripvec_dir = project.path().join(".ripvec");

        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&ripvec_dir)
            .unwrap();

        let result = resolve_cache_dir(project.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(ripvec_dir),
            "override should win over repo-local, got: {result:?}"
        );
    }
}
726
727/// Rebuild a manifest by scanning the object store and deserializing each object.
728///
729/// Used when `manifest.json` is gitignored and only the objects directory is
730/// committed. Scans every object, extracts the file path from the chunks,
731/// stats the source file for mtime/size, and constructs a valid manifest.
732///
733/// Returns `None` if the objects directory doesn't exist or is empty.
734#[must_use]
735pub fn rebuild_manifest_from_objects(
736    cache_dir: &std::path::Path,
737    root: &std::path::Path,
738    model_repo: &str,
739) -> Option<super::manifest::Manifest> {
740    use super::file_cache::FileCache;
741    use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
742    use super::store::ObjectStore;
743    use std::collections::BTreeMap;
744
745    let store = ObjectStore::new(&cache_dir.join("objects"));
746    let hashes = store.list_hashes();
747    if hashes.is_empty() {
748        return None;
749    }
750
751    tracing::info!(
752        objects = hashes.len(),
753        "rebuilding manifest from object store"
754    );
755
756    let mut files = BTreeMap::new();
757
758    for hash in &hashes {
759        let Ok(bytes) = store.read(hash) else {
760            continue;
761        };
762        let Ok(fc) =
763            FileCache::from_portable_bytes(&bytes).or_else(|_| FileCache::from_bytes(&bytes))
764        else {
765            continue;
766        };
767        let Some(first_chunk) = fc.chunks.first() else {
768            continue;
769        };
770
771        // The chunk's file_path may be absolute or relative.
772        // Try to make it relative to root for the manifest key.
773        let chunk_path = std::path::Path::new(&first_chunk.file_path);
774        let rel_path = chunk_path
775            .strip_prefix(root)
776            .unwrap_or(chunk_path)
777            .to_string_lossy()
778            .to_string();
779
780        // Stat the actual file for mtime/size.
781        let abs_path = root.join(&rel_path);
782        let (mtime_secs, size) = if let Ok(meta) = std::fs::metadata(&abs_path) {
783            let mtime = meta
784                .modified()
785                .ok()
786                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
787                .map_or(0, |d| d.as_secs());
788            (mtime, meta.len())
789        } else {
790            (0, 0) // file may not exist on this machine yet
791        };
792
793        files.insert(
794            rel_path,
795            FileEntry {
796                mtime_secs,
797                size,
798                content_hash: hash.clone(),
799                chunk_count: fc.chunks.len(),
800            },
801        );
802    }
803
804    if files.is_empty() {
805        return None;
806    }
807
808    let manifest = Manifest {
809        version: MANIFEST_VERSION,
810        model_repo: model_repo.to_string(),
811        root_hash: String::new(), // will be recomputed on next incremental_index
812        directories: BTreeMap::new(), // will be recomputed on next incremental_index
813        files,
814    };
815
816    tracing::info!(
817        files = manifest.files.len(),
818        "manifest rebuilt from objects"
819    );
820
821    // Write the rebuilt manifest to disk so subsequent runs use it.
822    let manifest_path = cache_dir.join("manifest.json");
823    if let Ok(json) = serde_json::to_string_pretty(&manifest) {
824        let _ = std::fs::write(&manifest_path, json);
825    }
826
827    Some(manifest)
828}