1use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Counters describing what a single indexing run did.
///
/// Returned next to the built index by the indexing entry points in this file.
#[derive(Debug)]
pub struct ReindexStats {
    /// Number of chunks in the index that was built.
    pub chunks_total: usize,
    /// Number of chunks embedded during this run (equals `chunks_total` for a full index).
    pub chunks_reembedded: usize,
    /// Number of files the diff reported as unchanged (always 0 for a full index).
    pub files_unchanged: usize,
    /// Number of files the diff reported as dirty, or the number of indexed files for a full index.
    pub files_changed: usize,
    /// Number of files the diff reported as deleted (always 0 for a full index).
    pub files_deleted: usize,
    /// Wall-clock time of the run in milliseconds.
    pub duration_ms: u64,
}
36
37pub fn incremental_index(
48 root: &Path,
49 backends: &[&dyn EmbedBackend],
50 tokenizer: &tokenizers::Tokenizer,
51 cfg: &SearchConfig,
52 profiler: &Profiler,
53 model_repo: &str,
54 cache_dir_override: Option<&Path>,
55 repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57 let start = Instant::now();
58 tracing::info!(root = %root.display(), model = model_repo, "incremental_index starting");
59
60 if backends.is_empty() {
61 return Err(crate::Error::Other(anyhow::anyhow!(
62 "no embedding backends provided"
63 )));
64 }
65
66 if repo_level {
69 let ripvec_dir = root.join(".ripvec");
70 let config_path = ripvec_dir.join("config.toml");
71 if !config_path.exists() {
72 let config = crate::cache::config::RepoConfig::new(
73 model_repo,
74 crate::cache::manifest::MANIFEST_VERSION.to_string(),
75 );
76 config.save(&ripvec_dir)?;
77 }
78 let gitignore_path = ripvec_dir.join(".gitignore");
81 if !gitignore_path.exists() {
82 let _ = std::fs::write(&gitignore_path, "cache/manifest.json\n");
83 }
84 }
85
86 let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
87 let portable = is_repo_local(&cache_dir);
88 let manifest_path = cache_dir.join("manifest.json");
89 let objects_dir = cache_dir.join("objects");
90 let store = ObjectStore::new(&objects_dir);
91
92 let existing_manifest = Manifest::load(&manifest_path)
94 .ok()
95 .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
96
97 if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
98 tracing::info!(
99 files = manifest.files.len(),
100 "manifest loaded, running incremental diff"
101 );
102 incremental_path(
104 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
105 start, portable,
106 )
107 } else {
108 full_index_path(
110 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
111 portable,
112 )
113 }
114}
115
116#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
118#[expect(
119 clippy::cast_possible_truncation,
120 reason = "duration in ms won't exceed u64"
121)]
122fn incremental_path(
123 root: &Path,
124 backends: &[&dyn EmbedBackend],
125 tokenizer: &tokenizers::Tokenizer,
126 cfg: &SearchConfig,
127 profiler: &Profiler,
128 _model_repo: &str,
129 cache_dir: &Path,
130 store: &ObjectStore,
131 mut manifest: Manifest,
132 start: Instant,
133 portable: bool,
134) -> crate::Result<(HybridIndex, ReindexStats)> {
135 let diff_result = diff::compute_diff(root, &manifest)?;
136
137 let files_changed = diff_result.dirty.len();
138 let files_deleted = diff_result.deleted.len();
139 let files_unchanged = diff_result.unchanged;
140
141 tracing::info!(
142 changed = files_changed,
143 deleted = files_deleted,
144 unchanged = files_unchanged,
145 "diff complete"
146 );
147
148 for deleted in &diff_result.deleted {
150 manifest.remove_file(deleted);
151 }
152
153 let mut new_chunks_count = 0;
155 for dirty_path in &diff_result.dirty {
156 let relative = dirty_path
157 .strip_prefix(root)
158 .unwrap_or(dirty_path)
159 .to_string_lossy()
160 .to_string();
161
162 manifest.remove_file(&relative);
164
165 let Some(source) = crate::embed::read_source(dirty_path) else {
167 continue;
168 };
169
170 let ext = dirty_path
171 .extension()
172 .and_then(|e| e.to_str())
173 .unwrap_or("");
174 let chunks = if cfg.text_mode {
175 crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
176 } else {
177 match crate::languages::config_for_extension(ext) {
178 Some(lang_config) => {
179 crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
180 }
181 None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
182 }
183 };
184
185 if chunks.is_empty() {
186 continue;
187 }
188
189 let model_max = backends[0].max_tokens();
191 let encodings: Vec<Option<crate::backend::Encoding>> = chunks
192 .iter()
193 .map(|chunk| {
194 crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
195 })
196 .collect();
197
198 let embeddings =
200 crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
201
202 let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
204 .into_iter()
205 .zip(embeddings)
206 .filter(|(_, emb)| !emb.is_empty())
207 .unzip();
208
209 let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
210
211 let content_hash = diff::hash_file(dirty_path)?;
213 let file_cache = FileCache {
214 chunks: good_chunks.clone(),
215 embeddings: good_embeddings.iter().flatten().copied().collect(),
216 hidden_dim,
217 };
218 let bytes = if portable {
219 file_cache.to_portable_bytes()
220 } else {
221 file_cache.to_bytes()
222 };
223 store.write(&content_hash, &bytes)?;
224
225 let mtime = diff::mtime_secs(dirty_path);
227 let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
228 manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
229 new_chunks_count += good_chunks.len();
230 }
231
232 heal_manifest_mtimes(root, &mut manifest);
236
237 manifest.recompute_hashes();
239
240 tracing::info!("loading cached objects from store");
243 let (all_chunks, all_embeddings) = load_all_from_store(store, &mut manifest);
244
245 let referenced = manifest.referenced_hashes();
247 store.gc(&referenced)?;
248
249 manifest.save(&cache_dir.join("manifest.json"))?;
251 let chunks_total = all_chunks.len();
252 tracing::info!(
253 chunks = chunks_total,
254 "building HybridIndex (BM25 + PolarQuant)"
255 );
256 let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
257 tracing::info!("HybridIndex ready");
258
259 Ok((
260 hybrid,
261 ReindexStats {
262 chunks_total,
263 chunks_reembedded: new_chunks_count,
264 files_unchanged,
265 files_changed,
266 files_deleted,
267 duration_ms: start.elapsed().as_millis() as u64,
268 },
269 ))
270}
271
272#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
274#[expect(
275 clippy::cast_possible_truncation,
276 reason = "duration in ms won't exceed u64"
277)]
278fn full_index_path(
279 root: &Path,
280 backends: &[&dyn EmbedBackend],
281 tokenizer: &tokenizers::Tokenizer,
282 cfg: &SearchConfig,
283 profiler: &Profiler,
284 model_repo: &str,
285 cache_dir: &Path,
286 store: &ObjectStore,
287 start: Instant,
288 portable: bool,
289) -> crate::Result<(HybridIndex, ReindexStats)> {
290 let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
291
292 let hidden_dim = embeddings.first().map_or(384, Vec::len);
293
294 let mut manifest = Manifest::new(model_repo);
296 let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
297 std::collections::BTreeMap::new();
298
299 for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
300 file_groups
301 .entry(chunk.file_path.clone())
302 .or_default()
303 .0
304 .push(chunk.clone());
305 file_groups
306 .entry(chunk.file_path.clone())
307 .or_default()
308 .1
309 .push(emb.clone());
310 }
311
312 for (file_path, (file_chunks, file_embeddings)) in &file_groups {
313 let file_path_buf = PathBuf::from(file_path);
315
316 let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
317 blake3::hash(file_chunks[0].content.as_bytes())
319 .to_hex()
320 .to_string()
321 });
322
323 let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
324 let fc = FileCache {
325 chunks: file_chunks.clone(),
326 embeddings: flat_emb,
327 hidden_dim,
328 };
329 let bytes = if portable {
330 fc.to_portable_bytes()
331 } else {
332 fc.to_bytes()
333 };
334 store.write(&content_hash, &bytes)?;
335
336 let relative = file_path_buf
337 .strip_prefix(root)
338 .unwrap_or(&file_path_buf)
339 .to_string_lossy()
340 .to_string();
341 let mtime = diff::mtime_secs(&file_path_buf);
342 let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
343 manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
344 }
345
346 manifest.recompute_hashes();
347 manifest.save(&cache_dir.join("manifest.json"))?;
348
349 let chunks_total = chunks.len();
350 let files_changed = file_groups.len();
351 let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
352
353 Ok((
354 hybrid,
355 ReindexStats {
356 chunks_total,
357 chunks_reembedded: chunks_total,
358 files_unchanged: 0,
359 files_changed,
360 files_deleted: 0,
361 duration_ms: start.elapsed().as_millis() as u64,
362 },
363 ))
364}
365
/// Report whether `cache_dir` lives inside a `.ripvec` directory.
///
/// True when any path component is exactly `.ripvec`; a component that merely
/// contains that text (for example `.ripvec-old`) does not count.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    for component in cache_dir.components() {
        if component.as_os_str() == ".ripvec" {
            return true;
        }
    }
    false
}
371
372pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
378 for (relative, entry) in &mut manifest.files {
379 let file_path = root.join(relative);
380 let mtime = diff::mtime_secs(&file_path);
381 if mtime != entry.mtime_secs {
382 entry.mtime_secs = mtime;
383 }
384 }
385}
386
387#[must_use]
393pub fn check_auto_stash(root: &Path) -> Option<String> {
394 use std::process::Command;
395
396 let ripvec_dir = root.join(".ripvec");
397 let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
398 if !config.cache.local {
399 return None;
400 }
401
402 if config.cache.auto_stash.is_some() {
404 return None;
405 }
406
407 let git_check = Command::new("git")
409 .args(["config", "--local", "pull.autoStash"])
410 .current_dir(root)
411 .stdout(std::process::Stdio::piped())
412 .stderr(std::process::Stdio::null())
413 .output()
414 .ok()?;
415 if git_check.status.success() {
416 let val = String::from_utf8_lossy(&git_check.stdout)
418 .trim()
419 .eq_ignore_ascii_case("true");
420 let _ = apply_auto_stash(root, val);
421 return None;
422 }
423
424 Some(
425 "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
426 Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
427 .to_string(),
428 )
429}
430
431pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
440 use std::process::Command;
441
442 let ripvec_dir = root.join(".ripvec");
443 let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
444 config.cache.auto_stash = Some(enable);
445 config.save(&ripvec_dir)?;
446
447 if enable {
448 let _ = Command::new("git")
449 .args(["config", "--local", "pull.autoStash", "true"])
450 .current_dir(root)
451 .stdout(std::process::Stdio::null())
452 .stderr(std::process::Stdio::null())
453 .status();
454 }
455
456 Ok(())
457}
458
459fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
462 if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
463 FileCache::from_portable_bytes(bytes)
464 } else {
465 FileCache::from_bytes(bytes)
466 }
467}
468
469fn load_all_from_store(
476 store: &ObjectStore,
477 manifest: &mut Manifest,
478) -> (Vec<CodeChunk>, Vec<Vec<f32>>) {
479 let mut all_chunks = Vec::new();
480 let mut all_embeddings = Vec::new();
481 let mut dangling: Vec<String> = Vec::new();
482
483 for (path, entry) in &manifest.files {
484 let bytes = match store.read(&entry.content_hash) {
485 Ok(b) => b,
486 Err(e) => {
487 tracing::warn!(
488 path = %path,
489 hash = %entry.content_hash,
490 error = %e,
491 "cache object missing or unreadable — will re-embed"
492 );
493 dangling.push(path.clone());
494 continue;
495 }
496 };
497 let fc = match load_file_cache(&bytes) {
498 Ok(fc) => fc,
499 Err(e) => {
500 tracing::warn!(
501 path = %path,
502 hash = %entry.content_hash,
503 error = %e,
504 "cache object corrupt — will re-embed"
505 );
506 dangling.push(path.clone());
507 continue;
508 }
509 };
510 let dim = fc.hidden_dim;
511
512 for (i, chunk) in fc.chunks.into_iter().enumerate() {
513 let start = i * dim;
514 let end = start + dim;
515 if end <= fc.embeddings.len() {
516 all_embeddings.push(fc.embeddings[start..end].to_vec());
517 all_chunks.push(chunk);
518 }
519 }
520 }
521
522 for path in &dangling {
525 manifest.files.remove(path);
526 }
527 if !dangling.is_empty() {
528 tracing::warn!(
529 count = dangling.len(),
530 "pruned dangling manifest entries; these files will be re-embedded on next run"
531 );
532 }
533
534 (all_chunks, all_embeddings)
535}
536
537#[must_use]
546pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
547 let cache_dir = resolve_cache_dir(root, model_repo, None);
548 let manifest_path = cache_dir.join("manifest.json");
549 let objects_dir = cache_dir.join("objects");
550 let lock_path = cache_dir.join("manifest.lock");
551
552 if !manifest_path.exists() {
554 return None;
555 }
556
557 let lock_file = std::fs::OpenOptions::new()
559 .create(true)
560 .truncate(false)
561 .write(true)
562 .read(true)
563 .open(&lock_path)
564 .ok()?;
565 let lock = fd_lock::RwLock::new(lock_file);
566 let _guard = lock.read().ok()?;
567
568 let mut manifest = Manifest::load(&manifest_path)
569 .ok()
570 .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo))?;
571 if !manifest.is_compatible(model_repo) {
572 return None;
573 }
574
575 let store = ObjectStore::new(&objects_dir);
576 let (chunks, embeddings) = load_all_from_store(&store, &mut manifest);
577 HybridIndex::new(chunks, &embeddings, None).ok()
578}
579
580#[must_use]
593pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
594 if let Some(dir) = override_dir {
596 let project_hash = hash_project_root(root);
597 let version_dir = format_version_dir(model_repo);
598 return dir.join(&project_hash).join(version_dir);
599 }
600
601 if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
603 && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
604 {
605 if config.cache.model == model_repo {
606 return ripvec_dir.join("cache");
607 }
608 eprintln!(
609 "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
610 config.cache.model, model_repo
611 );
612 }
613
614 let project_hash = hash_project_root(root);
616 let version_dir = format_version_dir(model_repo);
617
618 let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
619 PathBuf::from(env_dir).join(&project_hash)
620 } else {
621 dirs::cache_dir()
622 .unwrap_or_else(|| PathBuf::from("/tmp"))
623 .join("ripvec")
624 .join(&project_hash)
625 };
626
627 base.join(version_dir)
628}
629
630fn hash_project_root(root: &Path) -> String {
632 let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
633 blake3::hash(canonical.to_string_lossy().as_bytes())
634 .to_hex()
635 .to_string()
636}
637
638fn format_version_dir(model_repo: &str) -> String {
640 let model_slug = model_repo
641 .rsplit('/')
642 .next()
643 .unwrap_or(model_repo)
644 .to_lowercase();
645 format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
646}
647
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model identifier shared by the cache-resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// Save a repo-local config for [`MODEL`] under `<repo>/.ripvec`.
    fn write_repo_config(repo: &Path) {
        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&repo.join(".ripvec"))
            .unwrap();
    }

    /// A manifest entry with a wrong mtime is corrected to the file's real one.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let workspace = TempDir::new().unwrap();
        let source_file = workspace.path().join("test.rs");
        let content = "fn main() {}";
        std::fs::write(&source_file, content).unwrap();

        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let bogus_mtime = 9_999_999;
        let mut manifest = Manifest::new("test-model");
        manifest.add_file(
            "test.rs",
            bogus_mtime,
            content.len() as u64,
            &content_hash,
            1,
        );

        heal_manifest_mtimes(workspace.path(), &mut manifest);

        let on_disk = diff::mtime_secs(&source_file);
        assert_eq!(manifest.files["test.rs"].mtime_secs, on_disk);
    }

    /// A matching repo-local config makes `.ripvec/cache` the cache dir.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let repo = TempDir::new().unwrap();
        write_repo_config(repo.path());

        let result = resolve_cache_dir(repo.path(), MODEL, None);
        let expected_prefix = repo.path().join(".ripvec").join("cache");
        assert!(
            result.starts_with(expected_prefix),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Without a repo config the resolved path must not be repo-local.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let repo = TempDir::new().unwrap();

        let result = resolve_cache_dir(repo.path(), MODEL, None);
        assert!(
            !result.to_string_lossy().contains(".ripvec"),
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override directory beats an existing repo-local config.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let repo = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        write_repo_config(repo.path());

        let result = resolve_cache_dir(repo.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(repo.path().join(".ripvec")),
            "override should win over repo-local, got: {result:?}"
        );
    }
}
726
727#[must_use]
735pub fn rebuild_manifest_from_objects(
736 cache_dir: &std::path::Path,
737 root: &std::path::Path,
738 model_repo: &str,
739) -> Option<super::manifest::Manifest> {
740 use super::file_cache::FileCache;
741 use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
742 use super::store::ObjectStore;
743 use std::collections::BTreeMap;
744
745 let store = ObjectStore::new(&cache_dir.join("objects"));
746 let hashes = store.list_hashes();
747 if hashes.is_empty() {
748 return None;
749 }
750
751 tracing::info!(
752 objects = hashes.len(),
753 "rebuilding manifest from object store"
754 );
755
756 let mut files = BTreeMap::new();
757
758 for hash in &hashes {
759 let Ok(bytes) = store.read(hash) else {
760 continue;
761 };
762 let Ok(fc) =
763 FileCache::from_portable_bytes(&bytes).or_else(|_| FileCache::from_bytes(&bytes))
764 else {
765 continue;
766 };
767 let Some(first_chunk) = fc.chunks.first() else {
768 continue;
769 };
770
771 let chunk_path = std::path::Path::new(&first_chunk.file_path);
774 let rel_path = chunk_path
775 .strip_prefix(root)
776 .unwrap_or(chunk_path)
777 .to_string_lossy()
778 .to_string();
779
780 let abs_path = root.join(&rel_path);
782 let (mtime_secs, size) = if let Ok(meta) = std::fs::metadata(&abs_path) {
783 let mtime = meta
784 .modified()
785 .ok()
786 .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
787 .map_or(0, |d| d.as_secs());
788 (mtime, meta.len())
789 } else {
790 (0, 0) };
792
793 files.insert(
794 rel_path,
795 FileEntry {
796 mtime_secs,
797 size,
798 content_hash: hash.clone(),
799 chunk_count: fc.chunks.len(),
800 },
801 );
802 }
803
804 if files.is_empty() {
805 return None;
806 }
807
808 let manifest = Manifest {
809 version: MANIFEST_VERSION,
810 model_repo: model_repo.to_string(),
811 root_hash: String::new(), directories: BTreeMap::new(), files,
814 };
815
816 tracing::info!(
817 files = manifest.files.len(),
818 "manifest rebuilt from objects"
819 );
820
821 let manifest_path = cache_dir.join("manifest.json");
823 if let Ok(json) = serde_json::to_string_pretty(&manifest) {
824 let _ = std::fs::write(&manifest_path, json);
825 }
826
827 Some(manifest)
828}