// ripvec_core/cache/reindex.rs
//! Incremental reindex orchestrator.
//!
//! Ties together the manifest, object store, diff, and embedding pipeline
//! to provide a single `incremental_index` function that loads cached
//! embeddings and only re-embeds changed files.

use std::path::{Path, PathBuf};
use std::time::Instant;

use crate::backend::EmbedBackend;
use crate::cache::diff;
use crate::cache::file_cache::FileCache;
use crate::cache::manifest::Manifest;
use crate::cache::store::ObjectStore;
use crate::chunk::CodeChunk;
use crate::embed::SearchConfig;
use crate::hybrid::HybridIndex;
use crate::profile::Profiler;
/// Statistics from an incremental reindex operation.
///
/// All fields are plain counters, so the type is `Copy` and cheap to pass
/// around; `PartialEq`/`Eq` let tests and callers compare runs directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ReindexStats {
    /// Total chunks in the final index.
    pub chunks_total: usize,
    /// Chunks that were re-embedded (from dirty files).
    pub chunks_reembedded: usize,
    /// Files unchanged (loaded from cache).
    pub files_unchanged: usize,
    /// Files that were new or modified.
    pub files_changed: usize,
    /// Files removed since last index.
    pub files_deleted: usize,
    /// Wall-clock duration of the reindex.
    pub duration_ms: u64,
}
36
37/// Load or incrementally update a persistent index.
38///
39/// 1. Resolve cache directory
40/// 2. If manifest exists and model matches: Merkle diff, re-embed dirty files
41/// 3. If no manifest: full embed from scratch
42/// 4. Rebuild `SearchIndex` from all cached objects
43///
44/// # Errors
45///
46/// Returns an error if embedding fails or the cache directory is inaccessible.
47pub fn incremental_index(
48    root: &Path,
49    backends: &[&dyn EmbedBackend],
50    tokenizer: &tokenizers::Tokenizer,
51    cfg: &SearchConfig,
52    profiler: &Profiler,
53    model_repo: &str,
54    cache_dir_override: Option<&Path>,
55) -> crate::Result<(HybridIndex, ReindexStats)> {
56    let start = Instant::now();
57
58    if backends.is_empty() {
59        return Err(crate::Error::Other(anyhow::anyhow!(
60            "no embedding backends provided"
61        )));
62    }
63
64    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
65    let manifest_path = cache_dir.join("manifest.json");
66    let objects_dir = cache_dir.join("objects");
67    let store = ObjectStore::new(&objects_dir);
68
69    // Try loading existing manifest
70    let existing_manifest = Manifest::load(&manifest_path).ok();
71
72    if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
73        // Incremental path: diff → re-embed dirty → merge
74        incremental_path(
75            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
76            start,
77        )
78    } else {
79        // Cold path: full embed
80        full_index_path(
81            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
82        )
83    }
84}
85
86/// Incremental reindex: diff, re-embed dirty files, merge with cached.
87#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
88#[expect(
89    clippy::cast_possible_truncation,
90    reason = "duration in ms won't exceed u64"
91)]
92fn incremental_path(
93    root: &Path,
94    backends: &[&dyn EmbedBackend],
95    tokenizer: &tokenizers::Tokenizer,
96    cfg: &SearchConfig,
97    profiler: &Profiler,
98    _model_repo: &str,
99    cache_dir: &Path,
100    store: &ObjectStore,
101    mut manifest: Manifest,
102    start: Instant,
103) -> crate::Result<(HybridIndex, ReindexStats)> {
104    let diff_result = diff::compute_diff(root, &manifest)?;
105
106    let files_changed = diff_result.dirty.len();
107    let files_deleted = diff_result.deleted.len();
108    let files_unchanged = diff_result.unchanged;
109
110    // Remove deleted files from manifest
111    for deleted in &diff_result.deleted {
112        manifest.remove_file(deleted);
113    }
114
115    // Re-embed dirty files
116    let mut new_chunks_count = 0;
117    for dirty_path in &diff_result.dirty {
118        let relative = dirty_path
119            .strip_prefix(root)
120            .unwrap_or(dirty_path)
121            .to_string_lossy()
122            .to_string();
123
124        // Remove old entry if it exists
125        manifest.remove_file(&relative);
126
127        // Chunk this file
128        let Some(source) = crate::embed::read_source(dirty_path) else {
129            continue;
130        };
131
132        let ext = dirty_path
133            .extension()
134            .and_then(|e| e.to_str())
135            .unwrap_or("");
136        let chunks = if cfg.text_mode {
137            crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
138        } else {
139            match crate::languages::config_for_extension(ext) {
140                Some(lang_config) => {
141                    crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
142                }
143                None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
144            }
145        };
146
147        if chunks.is_empty() {
148            continue;
149        }
150
151        // Tokenize
152        let model_max = backends[0].max_tokens();
153        let encodings: Vec<Option<crate::backend::Encoding>> = chunks
154            .iter()
155            .map(|chunk| {
156                crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
157            })
158            .collect();
159
160        // Embed
161        let embeddings =
162            crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
163
164        // Filter out failed tokenizations
165        let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
166            .into_iter()
167            .zip(embeddings.into_iter())
168            .filter(|(_, emb)| !emb.is_empty())
169            .unzip();
170
171        let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
172
173        // Save to object store
174        let content_hash = diff::hash_file(dirty_path)?;
175        let file_cache = FileCache {
176            chunks: good_chunks.clone(),
177            embeddings: good_embeddings.iter().flatten().copied().collect(),
178            hidden_dim,
179        };
180        store.write(&content_hash, &file_cache.to_bytes())?;
181
182        // Update manifest
183        let mtime = diff::mtime_secs(dirty_path);
184        let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
185        manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
186        new_chunks_count += good_chunks.len();
187    }
188
189    // Recompute Merkle hashes
190    manifest.recompute_hashes();
191
192    // GC unreferenced objects
193    let referenced = manifest.referenced_hashes();
194    store.gc(&referenced)?;
195
196    // Save manifest
197    manifest.save(&cache_dir.join("manifest.json"))?;
198
199    // Rebuild HybridIndex (semantic + BM25) from all cached objects
200    let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
201    let chunks_total = all_chunks.len();
202    let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
203
204    Ok((
205        hybrid,
206        ReindexStats {
207            chunks_total,
208            chunks_reembedded: new_chunks_count,
209            files_unchanged,
210            files_changed,
211            files_deleted,
212            duration_ms: start.elapsed().as_millis() as u64,
213        },
214    ))
215}
216
217/// Full index from scratch: embed everything, save to cache.
218#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
219#[expect(
220    clippy::cast_possible_truncation,
221    reason = "duration in ms won't exceed u64"
222)]
223fn full_index_path(
224    root: &Path,
225    backends: &[&dyn EmbedBackend],
226    tokenizer: &tokenizers::Tokenizer,
227    cfg: &SearchConfig,
228    profiler: &Profiler,
229    model_repo: &str,
230    cache_dir: &Path,
231    store: &ObjectStore,
232    start: Instant,
233) -> crate::Result<(HybridIndex, ReindexStats)> {
234    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
235
236    let hidden_dim = embeddings.first().map_or(384, Vec::len);
237
238    // Group chunks and embeddings by file, save to store
239    let mut manifest = Manifest::new(model_repo);
240    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
241        std::collections::BTreeMap::new();
242
243    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
244        file_groups
245            .entry(chunk.file_path.clone())
246            .or_default()
247            .0
248            .push(chunk.clone());
249        file_groups
250            .entry(chunk.file_path.clone())
251            .or_default()
252            .1
253            .push(emb.clone());
254    }
255
256    for (file_path, (file_chunks, file_embeddings)) in &file_groups {
257        // file_path from CodeChunk is already an absolute or cwd-relative path
258        let file_path_buf = PathBuf::from(file_path);
259
260        let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
261            // File might not exist (e.g., generated content) — use chunk content hash
262            blake3::hash(file_chunks[0].content.as_bytes())
263                .to_hex()
264                .to_string()
265        });
266
267        let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
268        let fc = FileCache {
269            chunks: file_chunks.clone(),
270            embeddings: flat_emb,
271            hidden_dim,
272        };
273        store.write(&content_hash, &fc.to_bytes())?;
274
275        let relative = file_path_buf
276            .strip_prefix(root)
277            .unwrap_or(&file_path_buf)
278            .to_string_lossy()
279            .to_string();
280        let mtime = diff::mtime_secs(&file_path_buf);
281        let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
282        manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
283    }
284
285    manifest.recompute_hashes();
286    manifest.save(&cache_dir.join("manifest.json"))?;
287
288    let chunks_total = chunks.len();
289    let files_changed = file_groups.len();
290    let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
291
292    Ok((
293        hybrid,
294        ReindexStats {
295            chunks_total,
296            chunks_reembedded: chunks_total,
297            files_unchanged: 0,
298            files_changed,
299            files_deleted: 0,
300            duration_ms: start.elapsed().as_millis() as u64,
301        },
302    ))
303}
304
305/// Load all cached chunks and embeddings from the object store.
306fn load_all_from_store(
307    store: &ObjectStore,
308    manifest: &Manifest,
309) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
310    let mut all_chunks = Vec::new();
311    let mut all_embeddings = Vec::new();
312
313    for entry in manifest.files.values() {
314        let bytes = store.read(&entry.content_hash)?;
315        let fc = FileCache::from_bytes(&bytes)?;
316        let dim = fc.hidden_dim;
317
318        for (i, chunk) in fc.chunks.into_iter().enumerate() {
319            let start = i * dim;
320            let end = start + dim;
321            if end <= fc.embeddings.len() {
322                all_embeddings.push(fc.embeddings[start..end].to_vec());
323                all_chunks.push(chunk);
324            }
325        }
326    }
327
328    Ok((all_chunks, all_embeddings))
329}
330
331/// Resolve the cache directory for a project + model combination.
332///
333/// Layout: `<base>/<project_hash>/v<VERSION>-<model_slug>/`
334///
335/// Encoding the version and model into the directory name means switching
336/// models creates a new cache dir (no migration needed) and bumping
337/// [`MANIFEST_VERSION`](super::manifest::MANIFEST_VERSION) auto-invalidates
338/// old caches (they're just orphaned directories).
339///
340/// Priority: override > `RIPVEC_CACHE` env > XDG cache dir.
341#[must_use]
342pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
343    let project_hash = {
344        let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
345        blake3::hash(canonical.to_string_lossy().as_bytes())
346            .to_hex()
347            .to_string()
348    };
349
350    // e.g. "nomic-ai/modernbert-embed-base" → "modernbert-embed-base"
351    let model_slug = model_repo
352        .rsplit('/')
353        .next()
354        .unwrap_or(model_repo)
355        .to_lowercase();
356    let version_dir = format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION);
357
358    let base = if let Some(dir) = override_dir {
359        dir.join(&project_hash)
360    } else if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
361        PathBuf::from(env_dir).join(&project_hash)
362    } else {
363        dirs::cache_dir()
364            .unwrap_or_else(|| PathBuf::from("/tmp"))
365            .join("ripvec")
366            .join(&project_hash)
367    };
368
369    base.join(version_dir)
370}