1use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Summary counters describing the outcome of a single indexing run.
///
/// Produced by both the incremental and the full indexing paths and returned
/// to the caller together with the built index.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Number of chunks contained in the index that was returned.
    pub chunks_total: usize,
    /// Number of chunks that were embedded during this run.
    pub chunks_reembedded: usize,
    /// Number of files the diff reported as unchanged (always 0 for a full index).
    pub files_unchanged: usize,
    /// Number of files that were processed as new or modified.
    pub files_changed: usize,
    /// Number of files the diff reported as deleted (always 0 for a full index).
    pub files_deleted: usize,
    /// Wall-clock duration of the run, in milliseconds.
    pub duration_ms: u64,
}
36
37pub fn incremental_index(
48 root: &Path,
49 backends: &[&dyn EmbedBackend],
50 tokenizer: &tokenizers::Tokenizer,
51 cfg: &SearchConfig,
52 profiler: &Profiler,
53 model_repo: &str,
54 cache_dir_override: Option<&Path>,
55 repo_level: bool,
56) -> crate::Result<(HybridIndex, ReindexStats)> {
57 let start = Instant::now();
58
59 if backends.is_empty() {
60 return Err(crate::Error::Other(anyhow::anyhow!(
61 "no embedding backends provided"
62 )));
63 }
64
65 if repo_level {
68 let ripvec_dir = root.join(".ripvec");
69 let config_path = ripvec_dir.join("config.toml");
70 if !config_path.exists() {
71 let config = crate::cache::config::RepoConfig::new(
72 model_repo,
73 crate::cache::manifest::MANIFEST_VERSION.to_string(),
74 );
75 config.save(&ripvec_dir)?;
76 }
77 let gitattributes_path = ripvec_dir.join(".gitattributes");
80 if !gitattributes_path.exists() {
81 let _ = std::fs::write(
82 &gitattributes_path,
83 "cache/manifest.json merge=ours\ncache/objects/** binary\n",
84 );
85 }
86 }
87
88 let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
89 let portable = is_repo_local(&cache_dir);
90 let manifest_path = cache_dir.join("manifest.json");
91 let objects_dir = cache_dir.join("objects");
92 let store = ObjectStore::new(&objects_dir);
93
94 let existing_manifest = Manifest::load(&manifest_path).ok();
96
97 if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
98 incremental_path(
100 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
101 start, portable,
102 )
103 } else {
104 full_index_path(
106 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
107 portable,
108 )
109 }
110}
111
112#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
114#[expect(
115 clippy::cast_possible_truncation,
116 reason = "duration in ms won't exceed u64"
117)]
118fn incremental_path(
119 root: &Path,
120 backends: &[&dyn EmbedBackend],
121 tokenizer: &tokenizers::Tokenizer,
122 cfg: &SearchConfig,
123 profiler: &Profiler,
124 _model_repo: &str,
125 cache_dir: &Path,
126 store: &ObjectStore,
127 mut manifest: Manifest,
128 start: Instant,
129 portable: bool,
130) -> crate::Result<(HybridIndex, ReindexStats)> {
131 let diff_result = diff::compute_diff(root, &manifest)?;
132
133 let files_changed = diff_result.dirty.len();
134 let files_deleted = diff_result.deleted.len();
135 let files_unchanged = diff_result.unchanged;
136
137 for deleted in &diff_result.deleted {
139 manifest.remove_file(deleted);
140 }
141
142 let mut new_chunks_count = 0;
144 for dirty_path in &diff_result.dirty {
145 let relative = dirty_path
146 .strip_prefix(root)
147 .unwrap_or(dirty_path)
148 .to_string_lossy()
149 .to_string();
150
151 manifest.remove_file(&relative);
153
154 let Some(source) = crate::embed::read_source(dirty_path) else {
156 continue;
157 };
158
159 let ext = dirty_path
160 .extension()
161 .and_then(|e| e.to_str())
162 .unwrap_or("");
163 let chunks = if cfg.text_mode {
164 crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
165 } else {
166 match crate::languages::config_for_extension(ext) {
167 Some(lang_config) => {
168 crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
169 }
170 None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
171 }
172 };
173
174 if chunks.is_empty() {
175 continue;
176 }
177
178 let model_max = backends[0].max_tokens();
180 let encodings: Vec<Option<crate::backend::Encoding>> = chunks
181 .iter()
182 .map(|chunk| {
183 crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
184 })
185 .collect();
186
187 let embeddings =
189 crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
190
191 let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
193 .into_iter()
194 .zip(embeddings.into_iter())
195 .filter(|(_, emb)| !emb.is_empty())
196 .unzip();
197
198 let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
199
200 let content_hash = diff::hash_file(dirty_path)?;
202 let file_cache = FileCache {
203 chunks: good_chunks.clone(),
204 embeddings: good_embeddings.iter().flatten().copied().collect(),
205 hidden_dim,
206 };
207 let bytes = if portable {
208 file_cache.to_portable_bytes()
209 } else {
210 file_cache.to_bytes()
211 };
212 store.write(&content_hash, &bytes)?;
213
214 let mtime = diff::mtime_secs(dirty_path);
216 let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
217 manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
218 new_chunks_count += good_chunks.len();
219 }
220
221 heal_manifest_mtimes(root, &mut manifest);
225
226 manifest.recompute_hashes();
228
229 let referenced = manifest.referenced_hashes();
231 store.gc(&referenced)?;
232
233 manifest.save(&cache_dir.join("manifest.json"))?;
235
236 let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
238 let chunks_total = all_chunks.len();
239 let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
240
241 Ok((
242 hybrid,
243 ReindexStats {
244 chunks_total,
245 chunks_reembedded: new_chunks_count,
246 files_unchanged,
247 files_changed,
248 files_deleted,
249 duration_ms: start.elapsed().as_millis() as u64,
250 },
251 ))
252}
253
254#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
256#[expect(
257 clippy::cast_possible_truncation,
258 reason = "duration in ms won't exceed u64"
259)]
260fn full_index_path(
261 root: &Path,
262 backends: &[&dyn EmbedBackend],
263 tokenizer: &tokenizers::Tokenizer,
264 cfg: &SearchConfig,
265 profiler: &Profiler,
266 model_repo: &str,
267 cache_dir: &Path,
268 store: &ObjectStore,
269 start: Instant,
270 portable: bool,
271) -> crate::Result<(HybridIndex, ReindexStats)> {
272 let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
273
274 let hidden_dim = embeddings.first().map_or(384, Vec::len);
275
276 let mut manifest = Manifest::new(model_repo);
278 let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
279 std::collections::BTreeMap::new();
280
281 for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
282 file_groups
283 .entry(chunk.file_path.clone())
284 .or_default()
285 .0
286 .push(chunk.clone());
287 file_groups
288 .entry(chunk.file_path.clone())
289 .or_default()
290 .1
291 .push(emb.clone());
292 }
293
294 for (file_path, (file_chunks, file_embeddings)) in &file_groups {
295 let file_path_buf = PathBuf::from(file_path);
297
298 let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
299 blake3::hash(file_chunks[0].content.as_bytes())
301 .to_hex()
302 .to_string()
303 });
304
305 let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
306 let fc = FileCache {
307 chunks: file_chunks.clone(),
308 embeddings: flat_emb,
309 hidden_dim,
310 };
311 let bytes = if portable {
312 fc.to_portable_bytes()
313 } else {
314 fc.to_bytes()
315 };
316 store.write(&content_hash, &bytes)?;
317
318 let relative = file_path_buf
319 .strip_prefix(root)
320 .unwrap_or(&file_path_buf)
321 .to_string_lossy()
322 .to_string();
323 let mtime = diff::mtime_secs(&file_path_buf);
324 let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
325 manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
326 }
327
328 manifest.recompute_hashes();
329 manifest.save(&cache_dir.join("manifest.json"))?;
330
331 let chunks_total = chunks.len();
332 let files_changed = file_groups.len();
333 let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
334
335 Ok((
336 hybrid,
337 ReindexStats {
338 chunks_total,
339 chunks_reembedded: chunks_total,
340 files_unchanged: 0,
341 files_changed,
342 files_deleted: 0,
343 duration_ms: start.elapsed().as_millis() as u64,
344 },
345 ))
346}
347
/// Report whether `cache_dir` contains a path component named exactly
/// `.ripvec`.
///
/// The comparison is per component, so names that merely contain `.ripvec`
/// as a substring do not match.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    for component in cache_dir.components() {
        if component.as_os_str() == ".ripvec" {
            return true;
        }
    }
    false
}
353
354pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
360 for (relative, entry) in &mut manifest.files {
361 let file_path = root.join(relative);
362 let mtime = diff::mtime_secs(&file_path);
363 if mtime != entry.mtime_secs {
364 entry.mtime_secs = mtime;
365 }
366 }
367}
368
369#[must_use]
375pub fn check_auto_stash(root: &Path) -> Option<String> {
376 use std::process::Command;
377
378 let ripvec_dir = root.join(".ripvec");
379 let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
380 if !config.cache.local {
381 return None;
382 }
383
384 if config.cache.auto_stash.is_some() {
386 return None;
387 }
388
389 let git_check = Command::new("git")
391 .args(["config", "--local", "pull.autoStash"])
392 .current_dir(root)
393 .stdout(std::process::Stdio::piped())
394 .stderr(std::process::Stdio::null())
395 .output()
396 .ok()?;
397 if git_check.status.success() {
398 let val = String::from_utf8_lossy(&git_check.stdout)
400 .trim()
401 .eq_ignore_ascii_case("true");
402 let _ = apply_auto_stash(root, val);
403 return None;
404 }
405
406 Some(
407 "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
408 Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
409 .to_string(),
410 )
411}
412
413pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
422 use std::process::Command;
423
424 let ripvec_dir = root.join(".ripvec");
425 let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
426 config.cache.auto_stash = Some(enable);
427 config.save(&ripvec_dir)?;
428
429 if enable {
430 let _ = Command::new("git")
431 .args(["config", "--local", "pull.autoStash", "true"])
432 .current_dir(root)
433 .stdout(std::process::Stdio::null())
434 .stderr(std::process::Stdio::null())
435 .status();
436 }
437
438 Ok(())
439}
440
441fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
444 if bytes.len() >= 2 && bytes[..2] == [0x42, 0x43] {
445 FileCache::from_portable_bytes(bytes)
446 } else {
447 FileCache::from_bytes(bytes)
448 }
449}
450
451fn load_all_from_store(
453 store: &ObjectStore,
454 manifest: &Manifest,
455) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
456 let mut all_chunks = Vec::new();
457 let mut all_embeddings = Vec::new();
458
459 for entry in manifest.files.values() {
460 let bytes = store.read(&entry.content_hash)?;
461 let fc = load_file_cache(&bytes)?;
462 let dim = fc.hidden_dim;
463
464 for (i, chunk) in fc.chunks.into_iter().enumerate() {
465 let start = i * dim;
466 let end = start + dim;
467 if end <= fc.embeddings.len() {
468 all_embeddings.push(fc.embeddings[start..end].to_vec());
469 all_chunks.push(chunk);
470 }
471 }
472 }
473
474 Ok((all_chunks, all_embeddings))
475}
476
477#[must_use]
486pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
487 let cache_dir = resolve_cache_dir(root, model_repo, None);
488 let manifest_path = cache_dir.join("manifest.json");
489 let objects_dir = cache_dir.join("objects");
490 let lock_path = cache_dir.join("manifest.lock");
491
492 if !manifest_path.exists() {
494 return None;
495 }
496
497 let lock_file = std::fs::OpenOptions::new()
499 .create(true)
500 .truncate(false)
501 .write(true)
502 .read(true)
503 .open(&lock_path)
504 .ok()?;
505 let lock = fd_lock::RwLock::new(lock_file);
506 let _guard = lock.read().ok()?;
507
508 let manifest = Manifest::load(&manifest_path).ok()?;
509 if !manifest.is_compatible(model_repo) {
510 return None;
511 }
512
513 let store = ObjectStore::new(&objects_dir);
514 let (chunks, embeddings) = load_all_from_store(&store, &manifest).ok()?;
515 HybridIndex::new(chunks, &embeddings, None).ok()
516}
517
518#[must_use]
531pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
532 if let Some(dir) = override_dir {
534 let project_hash = hash_project_root(root);
535 let version_dir = format_version_dir(model_repo);
536 return dir.join(&project_hash).join(version_dir);
537 }
538
539 if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
541 && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
542 {
543 if config.cache.model == model_repo {
544 return ripvec_dir.join("cache");
545 }
546 eprintln!(
547 "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
548 config.cache.model, model_repo
549 );
550 }
551
552 let project_hash = hash_project_root(root);
554 let version_dir = format_version_dir(model_repo);
555
556 let base = if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
557 PathBuf::from(env_dir).join(&project_hash)
558 } else {
559 dirs::cache_dir()
560 .unwrap_or_else(|| PathBuf::from("/tmp"))
561 .join("ripvec")
562 .join(&project_hash)
563 };
564
565 base.join(version_dir)
566}
567
568fn hash_project_root(root: &Path) -> String {
570 let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
571 blake3::hash(canonical.to_string_lossy().as_bytes())
572 .to_hex()
573 .to_string()
574}
575
576fn format_version_dir(model_repo: &str) -> String {
578 let model_slug = model_repo
579 .rsplit('/')
580 .next()
581 .unwrap_or(model_repo)
582 .to_lowercase();
583 format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
584}
585
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Model identifier shared by the cache-resolution tests.
    const MODEL: &str = "nomic-ai/modernbert-embed-base";

    /// A manifest entry with a bogus mtime is corrected to the file's real mtime.
    #[test]
    fn heal_stale_mtimes() {
        use crate::cache::diff;
        use crate::cache::manifest::Manifest;

        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.rs");
        let content = "fn main() {}";
        std::fs::write(&file_path, content).unwrap();

        let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        // Register the file with a deliberately wrong mtime.
        manifest.add_file(
            "test.rs",
            9_999_999,
            content.len() as u64,
            &content_hash,
            1,
        );

        heal_manifest_mtimes(dir.path(), &mut manifest);

        let expected = diff::mtime_secs(&file_path);
        assert_eq!(manifest.files["test.rs"].mtime_secs, expected);
    }

    /// With a matching repo config present, the repo-local cache dir is chosen.
    #[test]
    fn resolve_uses_repo_local_when_present() {
        let dir = TempDir::new().unwrap();
        let ripvec_dir = dir.path().join(".ripvec");
        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&ripvec_dir)
            .unwrap();

        let result = resolve_cache_dir(dir.path(), MODEL, None);
        assert!(
            result.starts_with(ripvec_dir.join("cache")),
            "expected repo-local cache dir, got: {result:?}"
        );
    }

    /// Without a repo config, resolution does not land inside `.ripvec`.
    #[test]
    fn resolve_falls_back_to_user_cache_when_no_config() {
        let dir = TempDir::new().unwrap();

        let result = resolve_cache_dir(dir.path(), MODEL, None);
        assert!(
            !result.to_string_lossy().contains(".ripvec"),
            "should not use repo-local without config, got: {result:?}"
        );
    }

    /// An explicit override directory wins even when a repo config exists.
    #[test]
    fn resolve_override_takes_priority_over_repo_local() {
        let dir = TempDir::new().unwrap();
        let override_dir = TempDir::new().unwrap();
        let ripvec_dir = dir.path().join(".ripvec");

        crate::cache::config::RepoConfig::new(MODEL, "3")
            .save(&ripvec_dir)
            .unwrap();

        let result = resolve_cache_dir(dir.path(), MODEL, Some(override_dir.path()));
        assert!(
            !result.starts_with(&ripvec_dir),
            "override should win over repo-local, got: {result:?}"
        );
    }
}