// ripvec_core/cache/reindex.rs
1//! Incremental reindex orchestrator.
2//!
3//! Ties together the manifest, object store, diff, and embedding pipeline
4//! to provide a single `incremental_index` function that loads cached
5//! embeddings and only re-embeds changed files.
6
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Statistics from an incremental reindex operation.
///
/// Returned alongside the rebuilt [`HybridIndex`] by `incremental_index`.
#[derive(Debug)]
pub struct ReindexStats {
    /// Total chunks in the final index (cached + re-embedded).
    pub chunks_total: usize,
    /// Chunks that were re-embedded (from dirty files).
    /// On a cold full index this equals `chunks_total`.
    pub chunks_reembedded: usize,
    /// Files unchanged (loaded from cache).
    pub files_unchanged: usize,
    /// Files that were new or modified.
    pub files_changed: usize,
    /// Files removed since last index.
    pub files_deleted: usize,
    /// Wall-clock duration of the reindex, including diff, embedding,
    /// and index rebuild.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55    repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57    let start = Instant::now();
58
59    if backends.is_empty() {
60        return Err(crate::Error::Other(anyhow::anyhow!(
61            "no embedding backends provided"
62        )));
63    }
64
65    // When repo_level is requested, ensure .ripvec/config.toml exists
66    // so that resolve_cache_dir will find it and use the repo-local path.
67    if repo_level {
68        let ripvec_dir = root.join(".ripvec");
69        let config_path = ripvec_dir.join("config.toml");
70        if !config_path.exists() {
71            let config = crate::cache::config::RepoConfig::new(
72                model_repo,
73                crate::cache::manifest::MANIFEST_VERSION.to_string(),
74            );
75            config.save(&ripvec_dir)?;
76        }
77        // Gitignore the manifest — it's rebuilt from objects on first use.
78        // Objects are content-addressed and never cause merge conflicts.
79        let gitignore_path = ripvec_dir.join(".gitignore");
80        if !gitignore_path.exists() {
81            let _ = std::fs::write(&gitignore_path, "cache/manifest.json\n");
82        }
83    }
84
85    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
86    let portable = is_repo_local(&cache_dir);
87    let manifest_path = cache_dir.join("manifest.json");
88    let objects_dir = cache_dir.join("objects");
89    let store = ObjectStore::new(&objects_dir);
90
91    // Try loading existing manifest, or rebuild from objects if missing.
92    let existing_manifest = Manifest::load(&manifest_path)
93        .ok()
94        .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
95
96    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
97        // Incremental path: diff → re-embed dirty → merge
98        incremental_path(
99            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
100            start, portable,
101        )
102    } else {
103        // Cold path: full embed
104        full_index_path(
105            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
106            portable,
107        )
108    }
109}
110
111/// Incremental reindex: diff, re-embed dirty files, merge with cached.
112#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
113#[expect(
114    clippy::cast_possible_truncation,
115    reason = "duration in ms won't exceed u64"
116)]
117fn incremental_path(
118    root: &Path,
119    backends: &[&dyn EmbedBackend],
120    tokenizer: &tokenizers::Tokenizer,
121    cfg: &SearchConfig,
122    profiler: &Profiler,
123    _model_repo: &str,
124    cache_dir: &Path,
125    store: &ObjectStore,
126    mut manifest: Manifest,
127    start: Instant,
128    portable: bool,
129) -> crate::Result<(HybridIndex, ReindexStats)> {
130    let diff_result = diff::compute_diff(root, &manifest)?;
131
132    let files_changed = diff_result.dirty.len();
133    let files_deleted = diff_result.deleted.len();
134    let files_unchanged = diff_result.unchanged;
135
136    // Remove deleted files from manifest
137    for deleted in &diff_result.deleted {
138        manifest.remove_file(deleted);
139    }
140
141    // Re-embed dirty files
142    let mut new_chunks_count = 0;
143    for dirty_path in &diff_result.dirty {
144        let relative = dirty_path
145            .strip_prefix(root)
146            .unwrap_or(dirty_path)
147            .to_string_lossy()
148            .to_string();
149
150        // Remove old entry if it exists
151        manifest.remove_file(&relative);
152
153        // Chunk this file
154        let Some(source) = crate::embed::read_source(dirty_path) else {
155            continue;
156        };
157
158        let ext = dirty_path
159            .extension()
160            .and_then(|e| e.to_str())
161            .unwrap_or("");
162        let chunks = if cfg.text_mode {
163            crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
164        } else {
165            match crate::languages::config_for_extension(ext) {
166                Some(lang_config) => {
167                    crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
168                }
169                None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
170            }
171        };
172
173        if chunks.is_empty() {
174            continue;
175        }
176
177        // Tokenize
178        let model_max = backends[0].max_tokens();
179        let encodings: Vec<Option<crate::backend::Encoding>> = chunks
180            .iter()
181            .map(|chunk| {
182                crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
183            })
184            .collect();
185
186        // Embed
187        let embeddings =
188            crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
189
190        // Filter out failed tokenizations
191        let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
192            .into_iter()
193            .zip(embeddings.into_iter())
194            .filter(|(_, emb)| !emb.is_empty())
195            .unzip();
196
197        let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
198
199        // Save to object store
200        let content_hash = diff::hash_file(dirty_path)?;
201        let file_cache = FileCache {
202            chunks: good_chunks.clone(),
203            embeddings: good_embeddings.iter().flatten().copied().collect(),
204            hidden_dim,
205        };
206        let bytes = if portable {
207            file_cache.to_portable_bytes()
208        } else {
209            file_cache.to_bytes()
210        };
211        store.write(&content_hash, &bytes)?;
212
213        // Update manifest
214        let mtime = diff::mtime_secs(dirty_path);
215        let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
216        manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
217        new_chunks_count += good_chunks.len();
218    }
219
220    // Heal stale mtimes (e.g., after git clone where all mtimes are wrong
221    // but content hashes match). This ensures the fast-path mtime check
222    // works on subsequent runs.
223    heal_manifest_mtimes(root, &mut manifest);
224
225    // Recompute Merkle hashes
226    manifest.recompute_hashes();
227
228    // GC unreferenced objects
229    let referenced = manifest.referenced_hashes();
230    store.gc(&referenced)?;
231
232    // Save manifest
233    manifest.save(&cache_dir.join("manifest.json"))?;
234
235    // Rebuild HybridIndex (semantic + BM25) from all cached objects
236    let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
237    let chunks_total = all_chunks.len();
238    let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
239
240    Ok((
241        hybrid,
242        ReindexStats {
243            chunks_total,
244            chunks_reembedded: new_chunks_count,
245            files_unchanged,
246            files_changed,
247            files_deleted,
248            duration_ms: start.elapsed().as_millis() as u64,
249        },
250    ))
251}
252
253/// Full index from scratch: embed everything, save to cache.
254#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
255#[expect(
256    clippy::cast_possible_truncation,
257    reason = "duration in ms won't exceed u64"
258)]
259fn full_index_path(
260    root: &Path,
261    backends: &[&dyn EmbedBackend],
262    tokenizer: &tokenizers::Tokenizer,
263    cfg: &SearchConfig,
264    profiler: &Profiler,
265    model_repo: &str,
266    cache_dir: &Path,
267    store: &ObjectStore,
268    start: Instant,
269    portable: bool,
270) -> crate::Result<(HybridIndex, ReindexStats)> {
271    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
272
273    let hidden_dim = embeddings.first().map_or(384, Vec::len);
274
275    // Group chunks and embeddings by file, save to store
276    let mut manifest = Manifest::new(model_repo);
277    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
278        std::collections::BTreeMap::new();
279
280    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
281        file_groups
282            .entry(chunk.file_path.clone())
283            .or_default()
284            .0
285            .push(chunk.clone());
286        file_groups
287            .entry(chunk.file_path.clone())
288            .or_default()
289            .1
290            .push(emb.clone());
291    }
292
293    for (file_path, (file_chunks, file_embeddings)) in &file_groups {
294        // file_path from CodeChunk is already an absolute or cwd-relative path
295        let file_path_buf = PathBuf::from(file_path);
296
297        let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
298            // File might not exist (e.g., generated content) — use chunk content hash
299            blake3::hash(file_chunks[0].content.as_bytes())
300                .to_hex()
301                .to_string()
302        });
303
304        let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
305        let fc = FileCache {
306            chunks: file_chunks.clone(),
307            embeddings: flat_emb,
308            hidden_dim,
309        };
310        let bytes = if portable {
311            fc.to_portable_bytes()
312        } else {
313            fc.to_bytes()
314        };
315        store.write(&content_hash, &bytes)?;
316
317        let relative = file_path_buf
318            .strip_prefix(root)
319            .unwrap_or(&file_path_buf)
320            .to_string_lossy()
321            .to_string();
322        let mtime = diff::mtime_secs(&file_path_buf);
323        let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
324        manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
325    }
326
327    manifest.recompute_hashes();
328    manifest.save(&cache_dir.join("manifest.json"))?;
329
330    let chunks_total = chunks.len();
331    let files_changed = file_groups.len();
332    let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
333
334    Ok((
335        hybrid,
336        ReindexStats {
337            chunks_total,
338            chunks_reembedded: chunks_total,
339            files_unchanged: 0,
340            files_changed,
341            files_deleted: 0,
342            duration_ms: start.elapsed().as_millis() as u64,
343        },
344    ))
345}
346
/// Check if the resolved cache directory is inside a `.ripvec/` directory.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    cache_dir
        .components()
        .map(|component| component.as_os_str())
        .any(|name| name == ".ripvec")
}
352
353/// Update manifest file mtimes to match the current filesystem.
354///
355/// After a git clone, all file mtimes are set to clone time, making the
356/// fast-path mtime check miss on every file. This function updates the
357/// manifest mtimes so subsequent diffs use the fast path.
358pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
359    for (relative, entry) in &mut manifest.files {
360        let file_path = root.join(relative);
361        let mtime = diff::mtime_secs(&file_path);
362        if mtime != entry.mtime_secs {
363            entry.mtime_secs = mtime;
364        }
365    }
366}
367
368/// Check whether `pull.autoStash` needs to be configured for a repo-local cache.
369///
370/// Returns `Some(message)` with a human-readable prompt if the setting has not
371/// been configured yet. Returns `None` if already configured (in git config or
372/// `.ripvec/config.toml`) or if the cache is not repo-local.
373#[must_use]
374pub fn check_auto_stash(root: &Path) -> Option<String> {
375    use std::process::Command;
376
377    let ripvec_dir = root.join(".ripvec");
378    let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
379    if !config.cache.local {
380        return None;
381    }
382
383    // Already decided via config.toml
384    if config.cache.auto_stash.is_some() {
385        return None;
386    }
387
388    // Already set in git config (by user or previous run)
389    let git_check = Command::new("git")
390        .args(["config", "--local", "pull.autoStash"])
391        .current_dir(root)
392        .stdout(std::process::Stdio::piped())
393        .stderr(std::process::Stdio::null())
394        .output()
395        .ok()?;
396    if git_check.status.success() {
397        // Sync the existing git setting into config.toml so we don't check again
398        let val = String::from_utf8_lossy(&git_check.stdout)
399            .trim()
400            .eq_ignore_ascii_case("true");
401        let _ = apply_auto_stash(root, val);
402        return None;
403    }
404
405    Some(
406        "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
407         Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
408            .to_string(),
409    )
410}
411
412/// Apply the user's `auto_stash` choice: set git config and save to `config.toml`.
413///
414/// When `enable` is true, runs `git config --local pull.autoStash true`.
415/// The choice is persisted to `.ripvec/config.toml` so the prompt is not repeated.
416///
417/// # Errors
418///
419/// Returns an error if `config.toml` cannot be read or written.
420pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
421    use std::process::Command;
422
423    let ripvec_dir = root.join(".ripvec");
424    let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
425    config.cache.auto_stash = Some(enable);
426    config.save(&ripvec_dir)?;
427
428    if enable {
429        let _ = Command::new("git")
430            .args(["config", "--local", "pull.autoStash", "true"])
431            .current_dir(root)
432            .stdout(std::process::Stdio::null())
433            .stderr(std::process::Stdio::null())
434            .status();
435    }
436
437    Ok(())
438}
439
440/// Load a `FileCache` from bytes, auto-detecting the format.
441/// Checks for bitcode magic first (portable), then falls back to rkyv.
442fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
443    if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
444        FileCache::from_portable_bytes(bytes)
445    } else {
446        FileCache::from_bytes(bytes)
447    }
448}
449
450/// Load all cached chunks and embeddings from the object store.
451fn load_all_from_store(
452    store: &ObjectStore,
453    manifest: &Manifest,
454) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
455    let mut all_chunks = Vec::new();
456    let mut all_embeddings = Vec::new();
457
458    for entry in manifest.files.values() {
459        let bytes = store.read(&entry.content_hash)?;
460        let fc = load_file_cache(&bytes)?;
461        let dim = fc.hidden_dim;
462
463        for (i, chunk) in fc.chunks.into_iter().enumerate() {
464            let start = i * dim;
465            let end = start + dim;
466            if end <= fc.embeddings.len() {
467                all_embeddings.push(fc.embeddings[start..end].to_vec());
468                all_chunks.push(chunk);
469            }
470        }
471    }
472
473    Ok((all_chunks, all_embeddings))
474}
475
/// Load a pre-built index from the disk cache without re-embedding.
///
/// This is the lightweight read path for processes that don't own the index
/// (e.g., the LSP process reading caches built by the MCP process).
/// Returns `None` if no compatible cache exists for this root.
///
/// Uses an advisory file lock on `manifest.lock` to avoid reading
/// a half-written cache.
#[must_use]
pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
    let cache_dir = resolve_cache_dir(root, model_repo, None);
    let manifest_path = cache_dir.join("manifest.json");
    let objects_dir = cache_dir.join("objects");
    let lock_path = cache_dir.join("manifest.lock");

    // Ensure cache dir exists (it might not if no index has been built)
    if !manifest_path.exists() {
        return None;
    }

    // Acquire a shared (read) lock — blocks if a writer holds the exclusive lock.
    // The lock file is opened read+write because fd_lock needs a file handle;
    // `truncate(false)` preserves any existing (empty) lock file.
    // NOTE(review): assumes the writer process takes the exclusive (`write`)
    // lock on this same `manifest.lock` path — confirm against the writer side.
    let lock_file = std::fs::OpenOptions::new()
        .create(true)
        .truncate(false)
        .write(true)
        .read(true)
        .open(&lock_path)
        .ok()?;
    let lock = fd_lock::RwLock::new(lock_file);
    // Guard is held until this function returns, covering both the manifest
    // read and the object-store reads below.
    let _guard = lock.read().ok()?;

    // Fall back to reconstructing the manifest from committed objects when
    // manifest.json is absent or unparsable (it may be gitignored).
    let manifest = Manifest::load(&manifest_path)
        .ok()
        .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo))?;
    if !manifest.is_compatible(model_repo) {
        return None;
    }

    let store = ObjectStore::new(&objects_dir);
    let (chunks, embeddings) = load_all_from_store(&store, &manifest).ok()?;
    HybridIndex::new(chunks, &embeddings, None).ok()
}
518
519/// Resolve the cache directory for a project + model combination.
520///
521/// Resolution priority:
522/// 1. `override_dir` parameter (highest)
523/// 2. `.ripvec/config.toml` in directory tree (repo-local)
524/// 3. `RIPVEC_CACHE` environment variable
525/// 4. XDG cache dir (`~/.cache/ripvec/`)
526///
527/// For repo-local, the cache lives at `.ripvec/cache/` directly (no project hash
528/// or version subdirectory — the config.toml pins the model and version).
529///
530/// For user-level cache, layout is `<base>/<project_hash>/v<VERSION>-<model_slug>/`.
531#[must_use]
532pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
533    // Priority 1: explicit override
534    if let Some(dir) = override_dir {
535        let project_hash = hash_project_root(root);
536        let version_dir = format_version_dir(model_repo);
537        return dir.join(&project_hash).join(version_dir);
538    }
539
540    // Priority 2: repo-local .ripvec/config.toml (with model validation)
541    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
542        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
543    {
544        if config.cache.model == model_repo {
545            return ripvec_dir.join("cache");
546        }
547        eprintln!(
548            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
549            config.cache.model, model_repo
550        );
551    }
552
553    // Priority 3+4: env var or XDG
554    let project_hash = hash_project_root(root);
555    let version_dir = format_version_dir(model_repo);
556
557    let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
558        PathBuf::from(env_dir).join(&project_hash)
559    } else {
560        dirs::cache_dir()
561            .unwrap_or_else(|| PathBuf::from("/tmp"))
562            .join("ripvec")
563            .join(&project_hash)
564    };
565
566    base.join(version_dir)
567}
568
569/// Blake3 hash of the canonical project root path.
570fn hash_project_root(root: &Path) -> String {
571    let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
572    blake3::hash(canonical.to_string_lossy().as_bytes())
573        .to_hex()
574        .to_string()
575}
576
577/// Format the version subdirectory name from model repo.
578fn format_version_dir(model_repo: &str) -> String {
579    let model_slug = model_repo
580        .rsplit('/')
581        .next()
582        .unwrap_or(model_repo)
583        .to_lowercase();
584    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
585}
586
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// `heal_manifest_mtimes` must overwrite a stale manifest mtime with the
    /// filesystem's current mtime.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;
        use std::io::Write;

        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.rs");
        let content = "fn main() {}";
        {
            let mut f = std::fs::File::create(&file_path).unwrap();
            f.write_all(content.as_bytes()).unwrap();
        }

        // Create manifest with correct content hash but wrong mtime
        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            9_999_999, // deliberately wrong mtime
            content.len() as u64,
            &content_hash,
            1,
        );

        // After heal, the manifest mtime should match the filesystem
        heal_manifest_mtimes(dir.path(), &mut manifest);
        let actual_mtime = diff::mtime_secs(&file_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, actual_mtime);
    }

    /// Priority 2: a matching `.ripvec/config.toml` makes resolution repo-local.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let dir = TempDir::new().unwrap();
        let cfg = crate::cache::config::RepoConfig::new("nomic-ai/modernbert-embed-base", "3");
        cfg.save(&dir.path().join(".ripvec")).unwrap();

        let result = resolve_cache_dir(dir.path(), "nomic-ai/modernbert-embed-base", None);
        assert!(
            result.starts_with(dir.path().join(".ripvec").join("cache")),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Priority 3/4: without a repo config, resolution must not be repo-local.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let dir = TempDir::new().unwrap();
        let result = resolve_cache_dir(dir.path(), "nomic-ai/modernbert-embed-base", None);
        assert!(
            !result.to_string_lossy().contains(".ripvec"),
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// Priority 1: an explicit override beats an existing repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let dir = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();

        let cfg = crate::cache::config::RepoConfig::new("nomic-ai/modernbert-embed-base", "3");
        cfg.save(&dir.path().join(".ripvec")).unwrap();

        let result = resolve_cache_dir(
            dir.path(),
            "nomic-ai/modernbert-embed-base",
            Some(override_dir.path()),
        );
        assert!(
            !result.starts_with(dir.path().join(".ripvec")),
            "override should win over repo-local, got: {result:?}"
        );
    }
}
665
/// Rebuild a manifest by scanning the object store and deserializing each object.
///
/// Used when `manifest.json` is gitignored and only the objects directory is
/// committed. Scans every object, extracts the file path from the chunks,
/// stats the source file for mtime/size, and constructs a valid manifest.
///
/// Returns `None` if the objects directory doesn't exist or is empty.
#[must_use]
pub fn rebuild_manifest_from_objects(
    cache_dir: &std::path::Path,
    root: &std::path::Path,
    model_repo: &str,
) -> Option<super::manifest::Manifest> {
    use super::file_cache::FileCache;
    use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
    use super::store::ObjectStore;
    use std::collections::BTreeMap;

    let store = ObjectStore::new(&cache_dir.join("objects"));
    let hashes = store.list_hashes();
    if hashes.is_empty() {
        return None;
    }

    let mut files = BTreeMap::new();

    for hash in &hashes {
        // Unreadable or undecodable objects are skipped, not fatal — a
        // partial manifest is still useful.
        let Ok(bytes) = store.read(hash) else {
            continue;
        };
        // Try portable (bitcode) first, then the native rkyv encoding.
        let Ok(fc) =
            FileCache::from_portable_bytes(&bytes).or_else(|_| FileCache::from_bytes(&bytes))
        else {
            continue;
        };
        let Some(first_chunk) = fc.chunks.first() else {
            continue;
        };

        // The chunk's file_path may be absolute or relative.
        // Try to make it relative to root for the manifest key.
        let chunk_path = std::path::Path::new(&first_chunk.file_path);
        let rel_path = chunk_path
            .strip_prefix(root)
            .unwrap_or(chunk_path)
            .to_string_lossy()
            .to_string();

        // Stat the actual file for mtime/size.
        let abs_path = root.join(&rel_path);
        let (mtime_secs, size) = if let Ok(meta) = std::fs::metadata(&abs_path) {
            let mtime = meta
                .modified()
                .ok()
                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                .map_or(0, |d| d.as_secs());
            (mtime, meta.len())
        } else {
            (0, 0) // file may not exist on this machine yet
        };

        // NOTE(review): if two objects resolve to the same rel_path (e.g. a
        // stale object from an older version of the file), the last one
        // scanned wins — confirm list_hashes ordering makes this acceptable.
        files.insert(
            rel_path,
            FileEntry {
                mtime_secs,
                size,
                content_hash: hash.clone(),
                chunk_count: fc.chunks.len(),
            },
        );
    }

    if files.is_empty() {
        return None;
    }

    let manifest = Manifest {
        version: MANIFEST_VERSION,
        model_repo: model_repo.to_string(),
        root_hash: String::new(), // will be recomputed on next incremental_index
        directories: BTreeMap::new(), // will be recomputed on next incremental_index
        files,
    };

    // Write the rebuilt manifest to disk so subsequent runs use it.
    // Best-effort: a write failure still returns the in-memory manifest.
    let manifest_path = cache_dir.join("manifest.json");
    if let Ok(json) = serde_json::to_string_pretty(&manifest) {
        let _ = std::fs::write(&manifest_path, json);
    }

    Some(manifest)
}