1use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Summary of one indexing run, returned together with the built index.
///
/// Besides `Debug`, the type derives `Clone`, `PartialEq`, `Eq` and `Default`
/// so callers can duplicate, compare and zero-initialise stats (for example
/// in tests or when aggregating several runs).
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ReindexStats {
    /// Number of chunks contained in the resulting index.
    pub chunks_total: usize,
    /// Number of chunks that were embedded during this run.
    pub chunks_reembedded: usize,
    /// Number of files the diff reported as unchanged.
    pub files_unchanged: usize,
    /// Number of files that were (re)processed during this run.
    pub files_changed: usize,
    /// Number of files reported as deleted and dropped from the manifest.
    pub files_deleted: usize,
    /// Wall-clock duration of the run in milliseconds.
    pub duration_ms: u64,
}
36
37pub fn incremental_index(
48 root: &Path,
49 backends: &[&dyn EmbedBackend],
50 tokenizer: &tokenizers::Tokenizer,
51 cfg: &SearchConfig,
52 profiler: &Profiler,
53 model_repo: &str,
54 cache_dir_override: Option<&Path>,
55 repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57 let start = Instant::now();
58
59 if backends.is_empty() {
60 return Err(crate::Error::Other(anyhow::anyhow!(
61 "no embedding backends provided"
62 )));
63 }
64
65 if repo_level {
68 let ripvec_dir = root.join(".ripvec");
69 let config_path = ripvec_dir.join("config.toml");
70 if !config_path.exists() {
71 let config = crate::cache::config::RepoConfig::new(
72 model_repo,
73 crate::cache::manifest::MANIFEST_VERSION.to_string(),
74 );
75 config.save(&ripvec_dir)?;
76 }
77 }
78
79 let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
80 let portable = is_repo_local(&cache_dir);
81 let manifest_path = cache_dir.join("manifest.json");
82 let objects_dir = cache_dir.join("objects");
83 let store = ObjectStore::new(&objects_dir);
84
85 let existing_manifest = Manifest::load(&manifest_path).ok();
87
88 if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
89 incremental_path(
91 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
92 start, portable,
93 )
94 } else {
95 full_index_path(
97 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
98 portable,
99 )
100 }
101}
102
103#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
105#[expect(
106 clippy::cast_possible_truncation,
107 reason = "duration in ms won't exceed u64"
108)]
109fn incremental_path(
110 root: &Path,
111 backends: &[&dyn EmbedBackend],
112 tokenizer: &tokenizers::Tokenizer,
113 cfg: &SearchConfig,
114 profiler: &Profiler,
115 _model_repo: &str,
116 cache_dir: &Path,
117 store: &ObjectStore,
118 mut manifest: Manifest,
119 start: Instant,
120 portable: bool,
121) -> crate::Result<(HybridIndex, ReindexStats)> {
122 let diff_result = diff::compute_diff(root, &manifest)?;
123
124 let files_changed = diff_result.dirty.len();
125 let files_deleted = diff_result.deleted.len();
126 let files_unchanged = diff_result.unchanged;
127
128 for deleted in &diff_result.deleted {
130 manifest.remove_file(deleted);
131 }
132
133 let mut new_chunks_count = 0;
135 for dirty_path in &diff_result.dirty {
136 let relative = dirty_path
137 .strip_prefix(root)
138 .unwrap_or(dirty_path)
139 .to_string_lossy()
140 .to_string();
141
142 manifest.remove_file(&relative);
144
145 let Some(source) = crate::embed::read_source(dirty_path) else {
147 continue;
148 };
149
150 let ext = dirty_path
151 .extension()
152 .and_then(|e| e.to_str())
153 .unwrap_or("");
154 let chunks = if cfg.text_mode {
155 crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
156 } else {
157 match crate::languages::config_for_extension(ext) {
158 Some(lang_config) => {
159 crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
160 }
161 None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
162 }
163 };
164
165 if chunks.is_empty() {
166 continue;
167 }
168
169 let model_max = backends[0].max_tokens();
171 let encodings: Vec<Option<crate::backend::Encoding>> = chunks
172 .iter()
173 .map(|chunk| {
174 crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
175 })
176 .collect();
177
178 let embeddings =
180 crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
181
182 let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
184 .into_iter()
185 .zip(embeddings.into_iter())
186 .filter(|(_, emb)| !emb.is_empty())
187 .unzip();
188
189 let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
190
191 let content_hash = diff::hash_file(dirty_path)?;
193 let file_cache = FileCache {
194 chunks: good_chunks.clone(),
195 embeddings: good_embeddings.iter().flatten().copied().collect(),
196 hidden_dim,
197 };
198 let bytes = if portable {
199 file_cache.to_portable_bytes()
200 } else {
201 file_cache.to_bytes()
202 };
203 store.write(&content_hash, &bytes)?;
204
205 let mtime = diff::mtime_secs(dirty_path);
207 let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
208 manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
209 new_chunks_count += good_chunks.len();
210 }
211
212 heal_manifest_mtimes(root, &mut manifest);
216
217 manifest.recompute_hashes();
219
220 let referenced = manifest.referenced_hashes();
222 store.gc(&referenced)?;
223
224 manifest.save(&cache_dir.join("manifest.json"))?;
226
227 let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
229 let chunks_total = all_chunks.len();
230 let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
231
232 Ok((
233 hybrid,
234 ReindexStats {
235 chunks_total,
236 chunks_reembedded: new_chunks_count,
237 files_unchanged,
238 files_changed,
239 files_deleted,
240 duration_ms: start.elapsed().as_millis() as u64,
241 },
242 ))
243}
244
245#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
247#[expect(
248 clippy::cast_possible_truncation,
249 reason = "duration in ms won't exceed u64"
250)]
251fn full_index_path(
252 root: &Path,
253 backends: &[&dyn EmbedBackend],
254 tokenizer: &tokenizers::Tokenizer,
255 cfg: &SearchConfig,
256 profiler: &Profiler,
257 model_repo: &str,
258 cache_dir: &Path,
259 store: &ObjectStore,
260 start: Instant,
261 portable: bool,
262) -> crate::Result<(HybridIndex, ReindexStats)> {
263 let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
264
265 let hidden_dim = embeddings.first().map_or(384, Vec::len);
266
267 let mut manifest = Manifest::new(model_repo);
269 let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
270 std::collections::BTreeMap::new();
271
272 for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
273 file_groups
274 .entry(chunk.file_path.clone())
275 .or_default()
276 .0
277 .push(chunk.clone());
278 file_groups
279 .entry(chunk.file_path.clone())
280 .or_default()
281 .1
282 .push(emb.clone());
283 }
284
285 for (file_path, (file_chunks, file_embeddings)) in &file_groups {
286 let file_path_buf = PathBuf::from(file_path);
288
289 let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
290 blake3::hash(file_chunks[0].content.as_bytes())
292 .to_hex()
293 .to_string()
294 });
295
296 let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
297 let fc = FileCache {
298 chunks: file_chunks.clone(),
299 embeddings: flat_emb,
300 hidden_dim,
301 };
302 let bytes = if portable {
303 fc.to_portable_bytes()
304 } else {
305 fc.to_bytes()
306 };
307 store.write(&content_hash, &bytes)?;
308
309 let relative = file_path_buf
310 .strip_prefix(root)
311 .unwrap_or(&file_path_buf)
312 .to_string_lossy()
313 .to_string();
314 let mtime = diff::mtime_secs(&file_path_buf);
315 let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
316 manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
317 }
318
319 manifest.recompute_hashes();
320 manifest.save(&cache_dir.join("manifest.json"))?;
321
322 let chunks_total = chunks.len();
323 let files_changed = file_groups.len();
324 let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
325
326 Ok((
327 hybrid,
328 ReindexStats {
329 chunks_total,
330 chunks_reembedded: chunks_total,
331 files_unchanged: 0,
332 files_changed,
333 files_deleted: 0,
334 duration_ms: start.elapsed().as_millis() as u64,
335 },
336 ))
337}
338
/// Returns `true` when any component of `cache_dir` is exactly `.ripvec`.
///
/// Only whole path components are compared; a component such as
/// `.ripvec-old` does not match.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    cache_dir.iter().any(|segment| segment == ".ripvec")
}
344
345pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
351 for (relative, entry) in &mut manifest.files {
352 let file_path = root.join(relative);
353 let mtime = diff::mtime_secs(&file_path);
354 if mtime != entry.mtime_secs {
355 entry.mtime_secs = mtime;
356 }
357 }
358}
359
360fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
363 if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
364 FileCache::from_portable_bytes(bytes)
365 } else {
366 FileCache::from_bytes(bytes)
367 }
368}
369
370fn load_all_from_store(
372 store: &ObjectStore,
373 manifest: &Manifest,
374) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
375 let mut all_chunks = Vec::new();
376 let mut all_embeddings = Vec::new();
377
378 for entry in manifest.files.values() {
379 let bytes = store.read(&entry.content_hash)?;
380 let fc = load_file_cache(&bytes)?;
381 let dim = fc.hidden_dim;
382
383 for (i, chunk) in fc.chunks.into_iter().enumerate() {
384 let start = i * dim;
385 let end = start + dim;
386 if end <= fc.embeddings.len() {
387 all_embeddings.push(fc.embeddings[start..end].to_vec());
388 all_chunks.push(chunk);
389 }
390 }
391 }
392
393 Ok((all_chunks, all_embeddings))
394}
395
396#[must_use]
405pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
406 let cache_dir = resolve_cache_dir(root, model_repo, None);
407 let manifest_path = cache_dir.join("manifest.json");
408 let objects_dir = cache_dir.join("objects");
409 let lock_path = cache_dir.join("manifest.lock");
410
411 if !manifest_path.exists() {
413 return None;
414 }
415
416 let lock_file = std::fs::OpenOptions::new()
418 .create(true)
419 .truncate(false)
420 .write(true)
421 .read(true)
422 .open(&lock_path)
423 .ok()?;
424 let lock = fd_lock::RwLock::new(lock_file);
425 let _guard = lock.read().ok()?;
426
427 let manifest = Manifest::load(&manifest_path).ok()?;
428 if !manifest.is_compatible(model_repo) {
429 return None;
430 }
431
432 let store = ObjectStore::new(&objects_dir);
433 let (chunks, embeddings) = load_all_from_store(&store, &manifest).ok()?;
434 HybridIndex::new(chunks, &embeddings, None).ok()
435}
436
437#[must_use]
450pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
451 if let Some(dir) = override_dir {
453 let project_hash = hash_project_root(root);
454 let version_dir = format_version_dir(model_repo);
455 return dir.join(&project_hash).join(version_dir);
456 }
457
458 if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
460 && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
461 {
462 if config.cache.model == model_repo {
463 return ripvec_dir.join("cache");
464 }
465 eprintln!(
466 "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
467 config.cache.model, model_repo
468 );
469 }
470
471 let project_hash = hash_project_root(root);
473 let version_dir = format_version_dir(model_repo);
474
475 let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
476 PathBuf::from(env_dir).join(&project_hash)
477 } else {
478 dirs::cache_dir()
479 .unwrap_or_else(|| PathBuf::from("/tmp"))
480 .join("ripvec")
481 .join(&project_hash)
482 };
483
484 base.join(version_dir)
485}
486
487fn hash_project_root(root: &Path) -> String {
489 let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
490 blake3::hash(canonical.to_string_lossy().as_bytes())
491 .to_hex()
492 .to_string()
493}
494
495fn format_version_dir(model_repo: &str) -> String {
497 let model_slug = model_repo
498 .rsplit('/')
499 .next()
500 .unwrap_or(model_repo)
501 .to_lowercase();
502 format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
503}
504
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model identifier shared by the cache-resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// Writes a repo-local config for [`MODEL`] into `<repo_root>/.ripvec`.
    fn write_repo_config(repo_root: &Path) {
        let cfg = crate::cache::config::RepoConfig::new(MODEL, "3");
        cfg.save(&repo_root.join(".ripvec")).unwrap();
    }

    /// A manifest entry with a bogus mtime is corrected to the file's real mtime.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let workspace = TempDir::new().unwrap();
        let source_path = workspace.path().join("test.rs");
        let content = "fn main() {}";
        std::fs::write(&source_path, content).unwrap();

        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        // Deliberately wrong so that healing has something to fix.
        let stale_mtime = 9_999_999;
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            stale_mtime,
            content.len() as u64,
            &content_hash,
            1,
        );

        heal_manifest_mtimes(workspace.path(), &mut manifest);

        let expected = diff::mtime_secs(&source_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, expected);
    }

    /// A matching repo-local config makes the cache resolve inside `.ripvec/cache`.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let repo = TempDir::new().unwrap();
        write_repo_config(repo.path());

        let result = resolve_cache_dir(repo.path(), MODEL, None);
        let repo_cache = repo.path().join(".ripvec").join("cache");
        assert!(
            result.starts_with(repo_cache),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Without a repo-local config the resolved path never mentions `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let repo = TempDir::new().unwrap();

        let result = resolve_cache_dir(repo.path(), MODEL, None);
        let uses_repo_dir = result.to_string_lossy().contains(".ripvec");
        assert!(
            !uses_repo_dir,
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override directory beats an existing repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let repo = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        write_repo_config(repo.path());

        let result = resolve_cache_dir(repo.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(repo.path().join(".ripvec")),
            "override should win over repo-local, got: {result:?}"
        );
    }
}