1use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Summary of what a single index build or refresh did.
///
/// Returned next to the [`HybridIndex`] by [`incremental_index`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Number of chunks contained in the index that was returned.
    pub chunks_total: usize,
    /// Number of chunks that were embedded during this run (as opposed to
    /// being loaded from the object store).
    pub chunks_reembedded: usize,
    /// Number of files the diff reported as unchanged (always `0` for a full
    /// index build).
    pub files_unchanged: usize,
    /// Number of files that were (re)processed during this run.
    pub files_changed: usize,
    /// Number of files the diff reported as deleted (always `0` for a full
    /// index build).
    pub files_deleted: usize,
    /// Wall-clock duration of the run in milliseconds.
    pub duration_ms: u64,
}
36
37pub fn incremental_index(
48 root: &Path,
49 backends: &[&dyn EmbedBackend],
50 tokenizer: &tokenizers::Tokenizer,
51 cfg: &SearchConfig,
52 profiler: &Profiler,
53 model_repo: &str,
54 cache_dir_override: Option<&Path>,
55 repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57 let start = Instant::now();
58
59 if backends.is_empty() {
60 return Err(crate::Error::Other(anyhow::anyhow!(
61 "no embedding backends provided"
62 )));
63 }
64
65 if repo_level {
68 let ripvec_dir = root.join(".ripvec");
69 let config_path = ripvec_dir.join("config.toml");
70 if !config_path.exists() {
71 let config = crate::cache::config::RepoConfig::new(
72 model_repo,
73 crate::cache::manifest::MANIFEST_VERSION.to_string(),
74 );
75 config.save(&ripvec_dir)?;
76 }
77 let gitignore_path = ripvec_dir.join(".gitignore");
80 if !gitignore_path.exists() {
81 let _ = std::fs::write(&gitignore_path, "cache/manifest.json\n");
82 }
83 }
84
85 let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
86 let portable = is_repo_local(&cache_dir);
87 let manifest_path = cache_dir.join("manifest.json");
88 let objects_dir = cache_dir.join("objects");
89 let store = ObjectStore::new(&objects_dir);
90
91 let existing_manifest = Manifest::load(&manifest_path)
93 .ok()
94 .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
95
96 if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
97 incremental_path(
99 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
100 start, portable,
101 )
102 } else {
103 full_index_path(
105 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
106 portable,
107 )
108 }
109}
110
111#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
113#[expect(
114 clippy::cast_possible_truncation,
115 reason = "duration in ms won't exceed u64"
116)]
117fn incremental_path(
118 root: &Path,
119 backends: &[&dyn EmbedBackend],
120 tokenizer: &tokenizers::Tokenizer,
121 cfg: &SearchConfig,
122 profiler: &Profiler,
123 _model_repo: &str,
124 cache_dir: &Path,
125 store: &ObjectStore,
126 mut manifest: Manifest,
127 start: Instant,
128 portable: bool,
129) -> crate::Result<(HybridIndex, ReindexStats)> {
130 let diff_result = diff::compute_diff(root, &manifest)?;
131
132 let files_changed = diff_result.dirty.len();
133 let files_deleted = diff_result.deleted.len();
134 let files_unchanged = diff_result.unchanged;
135
136 for deleted in &diff_result.deleted {
138 manifest.remove_file(deleted);
139 }
140
141 let mut new_chunks_count = 0;
143 for dirty_path in &diff_result.dirty {
144 let relative = dirty_path
145 .strip_prefix(root)
146 .unwrap_or(dirty_path)
147 .to_string_lossy()
148 .to_string();
149
150 manifest.remove_file(&relative);
152
153 let Some(source) = crate::embed::read_source(dirty_path) else {
155 continue;
156 };
157
158 let ext = dirty_path
159 .extension()
160 .and_then(|e| e.to_str())
161 .unwrap_or("");
162 let chunks = if cfg.text_mode {
163 crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
164 } else {
165 match crate::languages::config_for_extension(ext) {
166 Some(lang_config) => {
167 crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
168 }
169 None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
170 }
171 };
172
173 if chunks.is_empty() {
174 continue;
175 }
176
177 let model_max = backends[0].max_tokens();
179 let encodings: Vec<Option<crate::backend::Encoding>> = chunks
180 .iter()
181 .map(|chunk| {
182 crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
183 })
184 .collect();
185
186 let embeddings =
188 crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
189
190 let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
192 .into_iter()
193 .zip(embeddings.into_iter())
194 .filter(|(_, emb)| !emb.is_empty())
195 .unzip();
196
197 let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
198
199 let content_hash = diff::hash_file(dirty_path)?;
201 let file_cache = FileCache {
202 chunks: good_chunks.clone(),
203 embeddings: good_embeddings.iter().flatten().copied().collect(),
204 hidden_dim,
205 };
206 let bytes = if portable {
207 file_cache.to_portable_bytes()
208 } else {
209 file_cache.to_bytes()
210 };
211 store.write(&content_hash, &bytes)?;
212
213 let mtime = diff::mtime_secs(dirty_path);
215 let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
216 manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
217 new_chunks_count += good_chunks.len();
218 }
219
220 heal_manifest_mtimes(root, &mut manifest);
224
225 manifest.recompute_hashes();
227
228 let referenced = manifest.referenced_hashes();
230 store.gc(&referenced)?;
231
232 manifest.save(&cache_dir.join("manifest.json"))?;
234
235 let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
237 let chunks_total = all_chunks.len();
238 let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
239
240 Ok((
241 hybrid,
242 ReindexStats {
243 chunks_total,
244 chunks_reembedded: new_chunks_count,
245 files_unchanged,
246 files_changed,
247 files_deleted,
248 duration_ms: start.elapsed().as_millis() as u64,
249 },
250 ))
251}
252
253#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
255#[expect(
256 clippy::cast_possible_truncation,
257 reason = "duration in ms won't exceed u64"
258)]
259fn full_index_path(
260 root: &Path,
261 backends: &[&dyn EmbedBackend],
262 tokenizer: &tokenizers::Tokenizer,
263 cfg: &SearchConfig,
264 profiler: &Profiler,
265 model_repo: &str,
266 cache_dir: &Path,
267 store: &ObjectStore,
268 start: Instant,
269 portable: bool,
270) -> crate::Result<(HybridIndex, ReindexStats)> {
271 let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
272
273 let hidden_dim = embeddings.first().map_or(384, Vec::len);
274
275 let mut manifest = Manifest::new(model_repo);
277 let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
278 std::collections::BTreeMap::new();
279
280 for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
281 file_groups
282 .entry(chunk.file_path.clone())
283 .or_default()
284 .0
285 .push(chunk.clone());
286 file_groups
287 .entry(chunk.file_path.clone())
288 .or_default()
289 .1
290 .push(emb.clone());
291 }
292
293 for (file_path, (file_chunks, file_embeddings)) in &file_groups {
294 let file_path_buf = PathBuf::from(file_path);
296
297 let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
298 blake3::hash(file_chunks[0].content.as_bytes())
300 .to_hex()
301 .to_string()
302 });
303
304 let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
305 let fc = FileCache {
306 chunks: file_chunks.clone(),
307 embeddings: flat_emb,
308 hidden_dim,
309 };
310 let bytes = if portable {
311 fc.to_portable_bytes()
312 } else {
313 fc.to_bytes()
314 };
315 store.write(&content_hash, &bytes)?;
316
317 let relative = file_path_buf
318 .strip_prefix(root)
319 .unwrap_or(&file_path_buf)
320 .to_string_lossy()
321 .to_string();
322 let mtime = diff::mtime_secs(&file_path_buf);
323 let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
324 manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
325 }
326
327 manifest.recompute_hashes();
328 manifest.save(&cache_dir.join("manifest.json"))?;
329
330 let chunks_total = chunks.len();
331 let files_changed = file_groups.len();
332 let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
333
334 Ok((
335 hybrid,
336 ReindexStats {
337 chunks_total,
338 chunks_reembedded: chunks_total,
339 files_unchanged: 0,
340 files_changed,
341 files_deleted: 0,
342 duration_ms: start.elapsed().as_millis() as u64,
343 },
344 ))
345}
346
/// Report whether `cache_dir` has a path component named exactly `.ripvec`.
///
/// Only whole components are compared, so a directory such as `.ripvec-old`
/// does not count.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    let marker = std::ffi::OsStr::new(".ripvec");
    cache_dir.iter().any(|part| part == marker)
}
352
353pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
359 for (relative, entry) in &mut manifest.files {
360 let file_path = root.join(relative);
361 let mtime = diff::mtime_secs(&file_path);
362 if mtime != entry.mtime_secs {
363 entry.mtime_secs = mtime;
364 }
365 }
366}
367
368#[must_use]
374pub fn check_auto_stash(root: &Path) -> Option<String> {
375 use std::process::Command;
376
377 let ripvec_dir = root.join(".ripvec");
378 let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
379 if !config.cache.local {
380 return None;
381 }
382
383 if config.cache.auto_stash.is_some() {
385 return None;
386 }
387
388 let git_check = Command::new("git")
390 .args(["config", "--local", "pull.autoStash"])
391 .current_dir(root)
392 .stdout(std::process::Stdio::piped())
393 .stderr(std::process::Stdio::null())
394 .output()
395 .ok()?;
396 if git_check.status.success() {
397 let val = String::from_utf8_lossy(&git_check.stdout)
399 .trim()
400 .eq_ignore_ascii_case("true");
401 let _ = apply_auto_stash(root, val);
402 return None;
403 }
404
405 Some(
406 "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
407 Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
408 .to_string(),
409 )
410}
411
412pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
421 use std::process::Command;
422
423 let ripvec_dir = root.join(".ripvec");
424 let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
425 config.cache.auto_stash = Some(enable);
426 config.save(&ripvec_dir)?;
427
428 if enable {
429 let _ = Command::new("git")
430 .args(["config", "--local", "pull.autoStash", "true"])
431 .current_dir(root)
432 .stdout(std::process::Stdio::null())
433 .stderr(std::process::Stdio::null())
434 .status();
435 }
436
437 Ok(())
438}
439
440fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
443 if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
444 FileCache::from_portable_bytes(bytes)
445 } else {
446 FileCache::from_bytes(bytes)
447 }
448}
449
450fn load_all_from_store(
452 store: &ObjectStore,
453 manifest: &Manifest,
454) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
455 let mut all_chunks = Vec::new();
456 let mut all_embeddings = Vec::new();
457
458 for entry in manifest.files.values() {
459 let bytes = store.read(&entry.content_hash)?;
460 let fc = load_file_cache(&bytes)?;
461 let dim = fc.hidden_dim;
462
463 for (i, chunk) in fc.chunks.into_iter().enumerate() {
464 let start = i * dim;
465 let end = start + dim;
466 if end <= fc.embeddings.len() {
467 all_embeddings.push(fc.embeddings[start..end].to_vec());
468 all_chunks.push(chunk);
469 }
470 }
471 }
472
473 Ok((all_chunks, all_embeddings))
474}
475
476#[must_use]
485pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
486 let cache_dir = resolve_cache_dir(root, model_repo, None);
487 let manifest_path = cache_dir.join("manifest.json");
488 let objects_dir = cache_dir.join("objects");
489 let lock_path = cache_dir.join("manifest.lock");
490
491 if !manifest_path.exists() {
493 return None;
494 }
495
496 let lock_file = std::fs::OpenOptions::new()
498 .create(true)
499 .truncate(false)
500 .write(true)
501 .read(true)
502 .open(&lock_path)
503 .ok()?;
504 let lock = fd_lock::RwLock::new(lock_file);
505 let _guard = lock.read().ok()?;
506
507 let manifest = Manifest::load(&manifest_path)
508 .ok()
509 .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo))?;
510 if !manifest.is_compatible(model_repo) {
511 return None;
512 }
513
514 let store = ObjectStore::new(&objects_dir);
515 let (chunks, embeddings) = load_all_from_store(&store, &manifest).ok()?;
516 HybridIndex::new(chunks, &embeddings, None).ok()
517}
518
519#[must_use]
532pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
533 if let Some(dir) = override_dir {
535 let project_hash = hash_project_root(root);
536 let version_dir = format_version_dir(model_repo);
537 return dir.join(&project_hash).join(version_dir);
538 }
539
540 if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
542 && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
543 {
544 if config.cache.model == model_repo {
545 return ripvec_dir.join("cache");
546 }
547 eprintln!(
548 "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
549 config.cache.model, model_repo
550 );
551 }
552
553 let project_hash = hash_project_root(root);
555 let version_dir = format_version_dir(model_repo);
556
557 let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
558 PathBuf::from(env_dir).join(&project_hash)
559 } else {
560 dirs::cache_dir()
561 .unwrap_or_else(|| PathBuf::from("/tmp"))
562 .join("ripvec")
563 .join(&project_hash)
564 };
565
566 base.join(version_dir)
567}
568
569fn hash_project_root(root: &Path) -> String {
571 let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
572 blake3::hash(canonical.to_string_lossy().as_bytes())
573 .to_hex()
574 .to_string()
575}
576
577fn format_version_dir(model_repo: &str) -> String {
579 let model_slug = model_repo
580 .rsplit('/')
581 .next()
582 .unwrap_or(model_repo)
583 .to_lowercase();
584 format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
585}
586
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cache::config::RepoConfig;
    use crate::cache::diff;
    use crate::cache::manifest::Manifest;
    use tempfile::TempDir;

    /// Model identifier shared by the cache-directory resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// A manifest entry with a bogus mtime is corrected to the file's real mtime.
    #[test]
    fn heal_stale_mtimes() {
        let tmp = TempDir::new().unwrap();
        let source_path = tmp.path().join("test.rs");
        let body = "fn main() {}";
        std::fs::write(&source_path, body).unwrap();

        let digest = blake3::hash(body.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        // 9_999_999 is a deliberately wrong mtime for the file just created.
        manifest.add_file("test.rs", 9_999_999, body.len() as u64, &digest, 1);

        heal_manifest_mtimes(tmp.path(), &mut manifest);

        let expected = diff::mtime_secs(&source_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, expected);
    }

    /// With a matching repo config, the cache lives under `.ripvec/cache`.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let tmp = TempDir::new().unwrap();
        RepoConfig::new(MODEL, "3")
            .save(&tmp.path().join(".ripvec"))
            .unwrap();

        let result = resolve_cache_dir(tmp.path(), MODEL, None);
        let repo_cache = tmp.path().join(".ripvec").join("cache");
        assert!(
            result.starts_with(repo_cache),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Without a repo config, the resolved path does not mention `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let tmp = TempDir::new().unwrap();

        let result = resolve_cache_dir(tmp.path(), MODEL, None);
        let mentions_repo_dir = result.to_string_lossy().contains(".ripvec");
        assert!(
            !mentions_repo_dir,
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override directory wins even when a repo config exists.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let tmp = TempDir::new().unwrap();
        let override_tmp = TempDir::new().unwrap();

        RepoConfig::new(MODEL, "3")
            .save(&tmp.path().join(".ripvec"))
            .unwrap();

        let result = resolve_cache_dir(tmp.path(), MODEL, Some(override_tmp.path()));
        let repo_dir = tmp.path().join(".ripvec");
        assert!(
            !result.starts_with(repo_dir),
            "override should win over repo-local, got: {result:?}"
        );
    }
}
665
666#[must_use]
674pub fn rebuild_manifest_from_objects(
675 cache_dir: &std::path::Path,
676 root: &std::path::Path,
677 model_repo: &str,
678) -> Option<super::manifest::Manifest> {
679 use super::file_cache::FileCache;
680 use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
681 use super::store::ObjectStore;
682 use std::collections::BTreeMap;
683
684 let store = ObjectStore::new(&cache_dir.join("objects"));
685 let hashes = store.list_hashes();
686 if hashes.is_empty() {
687 return None;
688 }
689
690 let mut files = BTreeMap::new();
691
692 for hash in &hashes {
693 let Ok(bytes) = store.read(hash) else {
694 continue;
695 };
696 let Ok(fc) =
697 FileCache::from_portable_bytes(&bytes).or_else(|_| FileCache::from_bytes(&bytes))
698 else {
699 continue;
700 };
701 let Some(first_chunk) = fc.chunks.first() else {
702 continue;
703 };
704
705 let chunk_path = std::path::Path::new(&first_chunk.file_path);
708 let rel_path = chunk_path
709 .strip_prefix(root)
710 .unwrap_or(chunk_path)
711 .to_string_lossy()
712 .to_string();
713
714 let abs_path = root.join(&rel_path);
716 let (mtime_secs, size) = if let Ok(meta) = std::fs::metadata(&abs_path) {
717 let mtime = meta
718 .modified()
719 .ok()
720 .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
721 .map_or(0, |d| d.as_secs());
722 (mtime, meta.len())
723 } else {
724 (0, 0) };
726
727 files.insert(
728 rel_path,
729 FileEntry {
730 mtime_secs,
731 size,
732 content_hash: hash.clone(),
733 chunk_count: fc.chunks.len(),
734 },
735 );
736 }
737
738 if files.is_empty() {
739 return None;
740 }
741
742 let manifest = Manifest {
743 version: MANIFEST_VERSION,
744 model_repo: model_repo.to_string(),
745 root_hash: String::new(), directories: BTreeMap::new(), files,
748 };
749
750 let manifest_path = cache_dir.join("manifest.json");
752 if let Ok(json) = serde_json::to_string_pretty(&manifest) {
753 let _ = std::fs::write(&manifest_path, json);
754 }
755
756 Some(manifest)
757}