use std::path::{Path, PathBuf};
use std::time::Instant;
use crate::backend::EmbedBackend;
use crate::cache::diff;
use crate::cache::file_cache::FileCache;
use crate::cache::manifest::Manifest;
use crate::cache::store::ObjectStore;
use crate::chunk::CodeChunk;
use crate::embed::SearchConfig;
use crate::hybrid::HybridIndex;
use crate::profile::Profiler;
/// Counters describing one indexing run, returned together with the built index.
///
/// Derives `Clone`, `PartialEq`, `Eq` and `Default` so callers and tests can copy,
/// compare and zero-initialise the stats without hand-written boilerplate.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ReindexStats {
    /// Number of chunks handed to the index that was built.
    pub chunks_total: usize,
    /// Number of chunks embedded during this run (equals `chunks_total` on a full rebuild).
    pub chunks_reembedded: usize,
    /// Files the diff reported as unchanged (0 on a full rebuild).
    pub files_unchanged: usize,
    /// Files the diff reported as dirty, or the number of files that produced chunks on a
    /// full rebuild.
    pub files_changed: usize,
    /// Files the diff reported as deleted (0 on a full rebuild).
    pub files_deleted: usize,
    /// Elapsed wall-clock time of the run, in milliseconds.
    pub duration_ms: u64,
}
/// Build or refresh the hybrid index for `root`, reusing cached embeddings when possible.
///
/// Steps, in order:
/// 1. With `repo_level` set, make sure `<root>/.ripvec/config.toml` and
///    `<root>/.ripvec/.gitignore` exist. This runs before the cache directory is resolved.
/// 2. Resolve the cache directory with [`resolve_cache_dir`] and load `manifest.json` from
///    it, falling back to [`rebuild_manifest_from_objects`] when loading fails.
/// 3. If a manifest compatible with `model_repo` is available, run the incremental path;
///    otherwise embed everything from scratch.
///
/// # Errors
///
/// Fails when `backends` is empty, when the repo-level config cannot be saved, or when the
/// selected indexing path returns an error.
pub fn incremental_index(
    root: &Path,
    backends: &[&dyn EmbedBackend],
    tokenizer: &tokenizers::Tokenizer,
    cfg: &SearchConfig,
    profiler: &Profiler,
    model_repo: &str,
    cache_dir_override: Option<&Path>,
    repo_level: bool,
) -> crate::Result<(HybridIndex, ReindexStats)> {
    let start = Instant::now();
    tracing::info!(root = %root.display(), model = model_repo, "incremental_index starting");

    if backends.is_empty() {
        return Err(crate::Error::Other(anyhow::anyhow!(
            "no embedding backends provided"
        )));
    }

    {
        let guard = profiler.phase("cache_prepare");
        if repo_level {
            let ripvec_dir = root.join(".ripvec");
            if !ripvec_dir.join("config.toml").exists() {
                let config = crate::cache::config::RepoConfig::new(
                    model_repo,
                    crate::cache::manifest::MANIFEST_VERSION.to_string(),
                );
                config.save(&ripvec_dir)?;
            }

            let gitignore_path = ripvec_dir.join(".gitignore");
            if !gitignore_path.exists() {
                // Best effort: indexing still works without the ignore file, but a failure
                // should be visible in the logs instead of vanishing.
                if let Err(err) = std::fs::write(&gitignore_path, "cache/manifest.json\n") {
                    tracing::warn!(
                        path = %gitignore_path.display(),
                        error = %err,
                        "could not create .ripvec/.gitignore"
                    );
                }
            }
        }
        guard.set_detail(format!("repo_level={repo_level}"));
    }

    let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
    let portable = is_repo_local(&cache_dir);
    let manifest_path = cache_dir.join("manifest.json");
    let objects_dir = cache_dir.join("objects");
    let store = ObjectStore::new(&objects_dir);
    tracing::info!(
        cache_dir = %cache_dir.display(),
        portable,
        manifest = %manifest_path.display(),
        "cache resolved"
    );

    let existing_manifest = {
        let guard = profiler.phase("cache_manifest");
        // An unreadable manifest is not fatal: try to reconstruct one from stored objects.
        let manifest = Manifest::load(&manifest_path)
            .ok()
            .or_else(|| rebuild_manifest_from_objects(&cache_dir, root, model_repo));
        guard.set_detail(match &manifest {
            Some(m) => format!("{} files", m.files.len()),
            None => "none".to_string(),
        });
        manifest
    };

    match existing_manifest.filter(|m| m.is_compatible(model_repo)) {
        Some(manifest) => {
            tracing::info!(
                files = manifest.files.len(),
                "manifest loaded, running incremental diff"
            );
            incremental_path(
                root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store,
                manifest, start, portable,
            )
        }
        None => full_index_path(
            root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
            portable,
        ),
    }
}
/// Refresh an existing cache: re-embed only the files the diff reports as dirty, then build
/// the index from every object the manifest references.
///
/// Phases reported to `profiler`: diff against the manifest, re-embed dirty files, load all
/// cached objects, garbage-collect the object store, save the manifest, build the index.
///
/// `backends` must be non-empty; the first backend supplies the token limit used when
/// tokenizing chunks.
///
/// # Errors
///
/// Propagates failures from computing the diff, embedding, hashing a dirty file, writing to
/// the object store, garbage collection, saving the manifest and building the index.
#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
#[expect(
    clippy::too_many_lines,
    reason = "incremental cache pipeline orchestration with diagnostic phase boundaries"
)]
#[expect(
    clippy::cast_possible_truncation,
    reason = "duration in ms won't exceed u64"
)]
fn incremental_path(
    root: &Path,
    backends: &[&dyn EmbedBackend],
    tokenizer: &tokenizers::Tokenizer,
    cfg: &SearchConfig,
    profiler: &Profiler,
    _model_repo: &str,
    cache_dir: &Path,
    store: &ObjectStore,
    mut manifest: Manifest,
    start: Instant,
    portable: bool,
) -> crate::Result<(HybridIndex, ReindexStats)> {
    let diff_result = {
        let guard = profiler.phase("cache_diff");
        let diff_result = diff::compute_diff(root, &manifest)?;
        guard.set_detail(format!(
            "{} changed, {} deleted, {} unchanged",
            diff_result.dirty.len(),
            diff_result.deleted.len(),
            diff_result.unchanged,
        ));
        diff_result
    };
    let files_changed = diff_result.dirty.len();
    let files_deleted = diff_result.deleted.len();
    let files_unchanged = diff_result.unchanged;
    tracing::info!(
        changed = files_changed,
        deleted = files_deleted,
        unchanged = files_unchanged,
        "diff complete"
    );

    for deleted in &diff_result.deleted {
        manifest.remove_file(deleted);
    }

    let mut new_chunks_count = 0;
    {
        let guard = profiler.phase("reembed_dirty_files");
        tracing::info!(files = files_changed, "re-embedding changed files");
        for dirty_path in &diff_result.dirty {
            let relative = dirty_path
                .strip_prefix(root)
                .unwrap_or(dirty_path)
                .to_string_lossy()
                .to_string();
            // Drop the stale entry first: if the file can no longer be read or yields no
            // chunks it simply stays out of the manifest.
            manifest.remove_file(&relative);

            let Some(source) = crate::embed::read_source(dirty_path) else {
                continue;
            };
            let chunks =
                crate::chunk::chunk_source_for_path(dirty_path, &source, cfg.text_mode, &cfg.chunk);
            profiler.chunk_thread_report(chunks.len());
            profiler.chunk_batch(&chunks);
            if chunks.is_empty() {
                tracing::debug!(file = %relative, "dirty file produced no chunks");
                continue;
            }
            tracing::debug!(file = %relative, chunks = chunks.len(), "embedding dirty file");

            let model_max = backends[0].max_tokens();
            // A chunk that fails to tokenize becomes `None` so positions stay aligned with
            // `chunks`.
            let encodings: Vec<Option<crate::backend::Encoding>> = chunks
                .iter()
                .map(|chunk| {
                    crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max)
                        .ok()
                })
                .collect();
            let embeddings =
                crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;

            // Keep only chunks that actually received an embedding.
            let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
                .into_iter()
                .zip(embeddings)
                .filter(|(_, emb)| !emb.is_empty())
                .unzip();

            // 384 is only a placeholder for the case where no chunk was embedded; the
            // stored embedding buffer is empty then.
            let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
            let chunk_count = good_chunks.len();
            let content_hash = diff::hash_file(dirty_path)?;
            // Both vectors are moved into the cache entry; only the count is needed later.
            let file_cache = FileCache {
                chunks: good_chunks,
                embeddings: good_embeddings.into_iter().flatten().collect(),
                hidden_dim,
            };
            let bytes = if portable {
                file_cache.to_portable_bytes()
            } else {
                file_cache.to_bytes()
            };
            store.write(&content_hash, &bytes)?;

            let mtime = diff::mtime_secs(dirty_path);
            let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
            manifest.add_file(&relative, mtime, size, &content_hash, chunk_count);
            new_chunks_count += chunk_count;
        }
        guard.set_detail(format!("{files_changed} files, {new_chunks_count} chunks"));
    }

    heal_manifest_mtimes(root, &mut manifest);
    manifest.recompute_hashes();

    tracing::info!("loading cached objects from store");
    let (all_chunks, all_embeddings) = {
        let guard = profiler.phase("cache_load_objects");
        let files_before = manifest.files.len();
        let result = load_all_from_store(store, &mut manifest);
        if manifest.files.len() != files_before {
            // Loading pruned dangling entries after the hashes were computed; refresh them
            // so the manifest saved below is consistent with its file list.
            // NOTE(review): assumes `recompute_hashes` derives everything from
            // `manifest.files` — confirm.
            manifest.recompute_hashes();
        }
        guard.set_detail(format!("{} chunks", result.0.len()));
        result
    };

    {
        let guard = profiler.phase("cache_gc");
        let referenced = manifest.referenced_hashes();
        store.gc(&referenced)?;
        guard.set_detail(format!("{} referenced objects", referenced.len()));
    }

    {
        let guard = profiler.phase("cache_manifest_save");
        manifest.save(&cache_dir.join("manifest.json"))?;
        guard.set_detail(format!("{} files", manifest.files.len()));
    }

    let chunks_total = all_chunks.len();
    tracing::info!(
        chunks = chunks_total,
        "building HybridIndex (BM25 + PolarQuant)"
    );
    let hybrid = {
        let guard = profiler.phase("build_hybrid_index");
        let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
        guard.set_detail(format!("{chunks_total} chunks"));
        hybrid
    };
    tracing::info!("HybridIndex ready");

    Ok((
        hybrid,
        ReindexStats {
            chunks_total,
            chunks_reembedded: new_chunks_count,
            files_unchanged,
            files_changed,
            files_deleted,
            duration_ms: start.elapsed().as_millis() as u64,
        },
    ))
}
/// Build the index from scratch and populate the cache with the result.
///
/// Chunks and embeddings come from `crate::embed::embed_all`. They are grouped by
/// `CodeChunk::file_path`, written to `store` as one object per file, and recorded in a
/// fresh manifest saved to `<cache_dir>/manifest.json`.
///
/// # Errors
///
/// Propagates failures from embedding, writing objects, saving the manifest and building
/// the index.
#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
#[expect(
    clippy::cast_possible_truncation,
    reason = "duration in ms won't exceed u64"
)]
fn full_index_path(
    root: &Path,
    backends: &[&dyn EmbedBackend],
    tokenizer: &tokenizers::Tokenizer,
    cfg: &SearchConfig,
    profiler: &Profiler,
    model_repo: &str,
    cache_dir: &Path,
    store: &ObjectStore,
    start: Instant,
    portable: bool,
) -> crate::Result<(HybridIndex, ReindexStats)> {
    tracing::info!("no compatible manifest; building full index from source");
    let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
    // 384 is only a placeholder for the case where nothing was embedded.
    let hidden_dim = embeddings.first().map_or(384, Vec::len);
    let mut manifest = Manifest::new(model_repo);

    // Group chunk/embedding pairs per file. `chunks` and `embeddings` are still needed for
    // the index below, so the per-file copies are cloned exactly once here.
    let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
        std::collections::BTreeMap::new();
    for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
        let group = file_groups.entry(chunk.file_path.clone()).or_default();
        group.0.push(chunk.clone());
        group.1.push(emb.clone());
    }
    let files_changed = file_groups.len();

    {
        let guard = profiler.phase("cache_write_objects");
        // Consume the groups so each file's chunks move straight into its cache entry.
        for (file_path, (file_chunks, file_embeddings)) in file_groups {
            let file_path_buf = PathBuf::from(file_path);
            // If the file cannot be hashed, fall back to hashing the first chunk's content.
            // A group is only created by pushing into it, so index 0 always exists.
            let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
                blake3::hash(file_chunks[0].content.as_bytes())
                    .to_hex()
                    .to_string()
            });
            let chunk_count = file_chunks.len();
            let fc = FileCache {
                chunks: file_chunks,
                embeddings: file_embeddings.into_iter().flatten().collect(),
                hidden_dim,
            };
            let bytes = if portable {
                fc.to_portable_bytes()
            } else {
                fc.to_bytes()
            };
            store.write(&content_hash, &bytes)?;

            let relative = file_path_buf
                .strip_prefix(root)
                .unwrap_or(&file_path_buf)
                .to_string_lossy()
                .to_string();
            let mtime = diff::mtime_secs(&file_path_buf);
            let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
            manifest.add_file(&relative, mtime, size, &content_hash, chunk_count);
        }
        guard.set_detail(format!("{files_changed} files"));
    }

    {
        let guard = profiler.phase("cache_manifest_save");
        manifest.recompute_hashes();
        manifest.save(&cache_dir.join("manifest.json"))?;
        guard.set_detail(format!("{} files", manifest.files.len()));
    }

    let chunks_total = chunks.len();
    let hybrid = {
        let guard = profiler.phase("build_hybrid_index");
        let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
        guard.set_detail(format!("{chunks_total} chunks"));
        hybrid
    };

    Ok((
        hybrid,
        ReindexStats {
            chunks_total,
            chunks_reembedded: chunks_total,
            files_unchanged: 0,
            files_changed,
            files_deleted: 0,
            duration_ms: start.elapsed().as_millis() as u64,
        },
    ))
}
/// Report whether `cache_dir` contains a path component named exactly `.ripvec`.
#[must_use]
pub fn is_repo_local(cache_dir: &Path) -> bool {
    // Iterating a `Path` yields its components as `OsStr` slices.
    for part in cache_dir {
        if part == ".ripvec" {
            return true;
        }
    }
    false
}
/// Bring every manifest entry's recorded mtime in line with what `diff::mtime_secs`
/// currently reports for `root.join(<relative path>)`.
pub fn heal_manifest_mtimes(root: &Path, manifest: &mut Manifest) {
    manifest.files.iter_mut().for_each(|(relative, entry)| {
        let on_disk = diff::mtime_secs(&root.join(relative));
        if entry.mtime_secs != on_disk {
            entry.mtime_secs = on_disk;
        }
    });
}
/// Decide whether the user should be asked about enabling `pull.autoStash`.
///
/// Returns `Some(prompt)` only when the repo config under `<root>/.ripvec` loads, has
/// `cache.local` set, has no recorded `cache.auto_stash` decision yet, and
/// `git config --local pull.autoStash` exits unsuccessfully.
///
/// Returns `None` in every other case. When git already has a value for the key, that value
/// is recorded through [`apply_auto_stash`] (its result is ignored) before returning.
///
/// NOTE(review): git's documented auto-stash keys are `rebase.autoStash` and
/// `merge.autoStash`; confirm that `pull.autoStash` is actually honoured by `git pull`.
#[must_use]
pub fn check_auto_stash(root: &Path) -> Option<String> {
    use std::process::{Command, Stdio};

    let ripvec_dir = root.join(".ripvec");
    let config = crate::cache::config::RepoConfig::load(&ripvec_dir).ok()?;
    if !config.cache.local || config.cache.auto_stash.is_some() {
        return None;
    }

    let probe = Command::new("git")
        .args(["config", "--local", "pull.autoStash"])
        .current_dir(root)
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .output()
        .ok()?;

    if !probe.status.success() {
        // The lookup did not succeed, so there is no decision yet: ask the user.
        return Some(
            "ripvec: Repo-local cache can dirty the worktree and block `git pull`.\n\
             Enable `pull.autoStash` for this repo? (git stashes dirty files before pull, pops after)"
                .to_string(),
        );
    }

    // Git already has a value: mirror it into the repo config without prompting.
    let enabled = String::from_utf8_lossy(&probe.stdout)
        .trim()
        .eq_ignore_ascii_case("true");
    let _ = apply_auto_stash(root, enabled);
    None
}
/// Record the auto-stash decision in the repo config and, when enabling, in git as well.
///
/// `cache.auto_stash` in `<root>/.ripvec` is set to `Some(enable)` and saved. If `enable`
/// is true, `git config --local pull.autoStash true` is then run in `root`; the outcome of
/// that command is ignored.
///
/// # Errors
///
/// Fails when the repo config cannot be loaded or saved.
pub fn apply_auto_stash(root: &Path, enable: bool) -> crate::Result<()> {
    use std::process::{Command, Stdio};

    let ripvec_dir = root.join(".ripvec");
    let mut config = crate::cache::config::RepoConfig::load(&ripvec_dir)?;
    config.cache.auto_stash = Some(enable);
    config.save(&ripvec_dir)?;

    if !enable {
        return Ok(());
    }

    let _ = Command::new("git")
        .args(["config", "--local", "pull.autoStash", "true"])
        .current_dir(root)
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .status();
    Ok(())
}
/// Decode a stored cache object, choosing the decoder from its first two bytes.
///
/// Input starting with `0x42 0x43` is passed to `FileCache::from_portable_bytes`;
/// anything else (including input shorter than two bytes) goes to `FileCache::from_bytes`.
fn load_file_cache(bytes: &[u8]) -> crate::Result<FileCache> {
    const PORTABLE_PREFIX: [u8; 2] = [0x42, 0x43];

    if bytes.starts_with(&PORTABLE_PREFIX) {
        return FileCache::from_portable_bytes(bytes);
    }
    FileCache::from_bytes(bytes)
}
/// Read every object the manifest references and return its chunks with their embeddings.
///
/// The two returned vectors are index-aligned. A chunk is kept only if the object's flat
/// embedding buffer holds a full `hidden_dim`-sized slice at that chunk's position.
///
/// Manifest entries whose object cannot be read or decoded are logged and removed from
/// `manifest.files`; the manifest is not saved here.
fn load_all_from_store(
    store: &ObjectStore,
    manifest: &mut Manifest,
) -> (Vec<CodeChunk>, Vec<Vec<f32>>) {
    let total = manifest.files.len();
    tracing::info!(objects = total, "reading cached objects");

    let mut chunks_out = Vec::new();
    let mut vectors_out = Vec::new();
    let mut dangling: Vec<String> = Vec::new();

    for (position, (path, entry)) in (1..).zip(manifest.files.iter()) {
        // Progress is logged for the first, every 1000th and the last object.
        if position == 1 || position % 1000 == 0 || position == total {
            tracing::debug!(current = position, total, path = %path, "reading cached object");
        }

        let fc = match store.read(&entry.content_hash) {
            Err(e) => {
                tracing::warn!(
                    path = %path,
                    hash = %entry.content_hash,
                    error = %e,
                    "cache object missing or unreadable — will re-embed"
                );
                dangling.push(path.clone());
                continue;
            }
            Ok(bytes) => match load_file_cache(&bytes) {
                Ok(decoded) => decoded,
                Err(e) => {
                    tracing::warn!(
                        path = %path,
                        hash = %entry.content_hash,
                        error = %e,
                        "cache object corrupt — will re-embed"
                    );
                    dangling.push(path.clone());
                    continue;
                }
            },
        };

        let width = fc.hidden_dim;
        let flat = fc.embeddings;
        for (slot, chunk) in fc.chunks.into_iter().enumerate() {
            let lo = slot * width;
            // `get` yields `None` when the slice would run past the buffer.
            if let Some(vector) = flat.get(lo..lo + width) {
                vectors_out.push(vector.to_vec());
                chunks_out.push(chunk);
            }
        }
    }

    if !dangling.is_empty() {
        for path in &dangling {
            manifest.files.remove(path);
        }
        tracing::warn!(
            count = dangling.len(),
            "pruned dangling manifest entries; these files will be re-embedded on next run"
        );
    }

    (chunks_out, vectors_out)
}
/// Build an index purely from the cache for `root`, without embedding anything.
///
/// Returns `None` when `manifest.json` is absent, the lock file cannot be opened or
/// read-locked, no manifest can be loaded or rebuilt, the manifest is not compatible with
/// `model_repo`, or the index cannot be constructed.
///
/// A read lock on `<cache_dir>/manifest.lock` is held until the function returns.
#[must_use]
pub fn load_cached_index(root: &Path, model_repo: &str) -> Option<HybridIndex> {
    let cache_dir = resolve_cache_dir(root, model_repo, None);

    let manifest_path = cache_dir.join("manifest.json");
    if !manifest_path.exists() {
        return None;
    }

    let mut open_opts = std::fs::OpenOptions::new();
    open_opts.read(true).write(true).create(true).truncate(false);
    let lock_file = open_opts.open(cache_dir.join("manifest.lock")).ok()?;
    let lock = fd_lock::RwLock::new(lock_file);
    let _guard = lock.read().ok()?;

    let mut manifest = match Manifest::load(&manifest_path) {
        Ok(found) => found,
        Err(_) => rebuild_manifest_from_objects(&cache_dir, root, model_repo)?,
    };
    if !manifest.is_compatible(model_repo) {
        return None;
    }

    let store = ObjectStore::new(&cache_dir.join("objects"));
    let (chunks, embeddings) = load_all_from_store(&store, &mut manifest);
    HybridIndex::new(chunks, &embeddings, None).ok()
}
/// Work out which directory holds the cache for `root` and `model_repo`.
///
/// Resolution order:
/// 1. `override_dir`, if given: `<override>/<project hash>/<version dir>`.
/// 2. A repo config found by `find_repo_config(root)` whose `cache.model` equals
///    `model_repo`: `<ripvec dir>/cache`. On a model mismatch a notice is printed to stderr
///    and resolution continues.
/// 3. `$RIPVEC_CACHE/<project hash>/<version dir>` when the variable is set and non-empty.
/// 4. `<user cache dir>/ripvec/<project hash>/<version dir>`, using the system temporary
///    directory when no user cache directory is known.
#[must_use]
pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
    if let Some(dir) = override_dir {
        return dir
            .join(hash_project_root(root))
            .join(format_version_dir(model_repo));
    }

    if let Some(ripvec_dir) = crate::cache::config::find_repo_config(root)
        && let Ok(config) = crate::cache::config::RepoConfig::load(&ripvec_dir)
    {
        if config.cache.model == model_repo {
            return ripvec_dir.join("cache");
        }
        eprintln!(
            "[ripvec] repo-local index model mismatch: config has '{}', runtime wants '{}' — falling back to user cache",
            config.cache.model, model_repo
        );
    }

    // `var_os` keeps non-UTF-8 paths usable; an empty value is treated as unset so the
    // cache never lands in a path relative to the current directory by accident.
    let env_base = std::env::var_os("RIPVEC_CACHE").filter(|value| !value.is_empty());
    let base = match env_base {
        Some(env_dir) => PathBuf::from(env_dir),
        None => dirs::cache_dir()
            // Platform temp dir instead of a hard-coded `/tmp`, which does not exist
            // everywhere.
            .unwrap_or_else(std::env::temp_dir)
            .join("ripvec"),
    };
    base.join(hash_project_root(root))
        .join(format_version_dir(model_repo))
}
/// Hex-encoded BLAKE3 hash of the project root's path.
///
/// The path is canonicalized first; if that fails, `root` is hashed as given. The path is
/// converted to text lossily before hashing.
fn hash_project_root(root: &Path) -> String {
    let resolved = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
    let path_text = resolved.to_string_lossy();
    let digest = blake3::hash(path_text.as_bytes());
    digest.to_hex().to_string()
}
/// Name of the per-version cache sub-directory: `v<MANIFEST_VERSION>-<model slug>`.
///
/// The slug is the lower-cased text after the last `/` in `model_repo`, or the whole
/// string when it contains no `/`.
fn format_version_dir(model_repo: &str) -> String {
    let tail = model_repo
        .rsplit_once('/')
        .map_or(model_repo, |(_, name)| name);
    let model_slug = tail.to_lowercase();
    format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION)
}
/// Reconstruct a manifest by scanning `<cache_dir>/objects`.
///
/// Each object that can be read and decoded (portable format first, then the other format)
/// and that holds at least one chunk contributes one file entry. The entry is keyed by the
/// first chunk's `file_path` made relative to `root`, and carries the object's hash, its
/// chunk count, and the file's current mtime and size (both 0 if the file's metadata cannot
/// be read). When several objects map to the same path, the one visited last wins.
///
/// The rebuilt manifest has an empty `root_hash` and no `directories`. Writing it to
/// `<cache_dir>/manifest.json` is attempted but failures are ignored.
///
/// Returns `None` when the store lists no hashes or no object yields an entry.
#[must_use]
pub fn rebuild_manifest_from_objects(
    cache_dir: &std::path::Path,
    root: &std::path::Path,
    model_repo: &str,
) -> Option<super::manifest::Manifest> {
    use super::file_cache::FileCache;
    use super::manifest::{FileEntry, MANIFEST_VERSION, Manifest};
    use super::store::ObjectStore;
    use std::collections::BTreeMap;

    let store = ObjectStore::new(&cache_dir.join("objects"));
    let hashes = store.list_hashes();
    if hashes.is_empty() {
        return None;
    }
    tracing::info!(
        objects = hashes.len(),
        "rebuilding manifest from object store"
    );

    let mut files = BTreeMap::new();
    for hash in &hashes {
        // Unreadable or undecodable objects are skipped.
        let decoded = store.read(hash).ok().and_then(|bytes| {
            FileCache::from_portable_bytes(&bytes)
                .or_else(|_| FileCache::from_bytes(&bytes))
                .ok()
        });
        let Some(fc) = decoded else {
            continue;
        };
        let Some(first_chunk) = fc.chunks.first() else {
            continue;
        };

        let chunk_path = std::path::Path::new(&first_chunk.file_path);
        let rel_path = chunk_path
            .strip_prefix(root)
            .unwrap_or(chunk_path)
            .to_string_lossy()
            .to_string();

        let (mtime_secs, size) = match std::fs::metadata(root.join(&rel_path)) {
            Ok(meta) => {
                let since_epoch = meta
                    .modified()
                    .ok()
                    .and_then(|stamp| stamp.duration_since(std::time::UNIX_EPOCH).ok());
                let secs = match since_epoch {
                    Some(elapsed) => elapsed.as_secs(),
                    None => 0,
                };
                (secs, meta.len())
            }
            Err(_) => (0, 0),
        };

        let entry = FileEntry {
            mtime_secs,
            size,
            content_hash: hash.clone(),
            chunk_count: fc.chunks.len(),
        };
        files.insert(rel_path, entry);
    }

    if files.is_empty() {
        return None;
    }

    let manifest = Manifest {
        version: MANIFEST_VERSION,
        model_repo: model_repo.to_string(),
        root_hash: String::new(),
        directories: BTreeMap::new(),
        files,
    };
    tracing::info!(
        files = manifest.files.len(),
        "manifest rebuilt from objects"
    );

    // Persisting is best effort; the in-memory manifest is returned either way.
    if let Ok(json) = serde_json::to_string_pretty(&manifest) {
        let _ = std::fs::write(cache_dir.join("manifest.json"), json);
    }
    Some(manifest)
}
// Unit tests for mtime healing and cache-directory resolution.
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
// A manifest entry recorded with a bogus mtime must end up with the mtime that
// `diff::mtime_secs` reports for the file on disk.
#[test]
fn heal_stale_mtimes() {
use crate::cache::diff;
use crate::cache::manifest::Manifest;
use std::io::Write;
let dir = TempDir::new().unwrap();
let file_path = dir.path().join("test.rs");
let content = "fn main() {}";
// Inner scope so the file handle is dropped before the mtime is read.
{
let mut f = std::fs::File::create(&file_path).unwrap();
f.write_all(content.as_bytes()).unwrap();
}
let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string();
let mut manifest = Manifest::new("test-model");
// 9_999_999 is a deliberately wrong mtime for the healing step to replace.
manifest.add_file(
"test.rs",
9_999_999, content.len() as u64,
&content_hash,
1,
);
heal_manifest_mtimes(dir.path(), &mut manifest);
let actual_mtime = diff::mtime_secs(&file_path);
assert_eq!(manifest.files["test.rs"].mtime_secs, actual_mtime);
}
// With a repo config saved under `.ripvec` for the same model, the resolved directory
// must live under `<root>/.ripvec/cache`.
#[test]
fn resolve_uses_repo_local_when_present() {
let dir = TempDir::new().unwrap();
let cfg = crate::cache::config::RepoConfig::new("nomic-ai/modernbert-embed-base", "3");
cfg.save(&dir.path().join(".ripvec")).unwrap();
let result = resolve_cache_dir(dir.path(), "nomic-ai/modernbert-embed-base", None);
assert!(
result.starts_with(dir.path().join(".ripvec").join("cache")),
"expected repo-local cache dir, got: {result:?}"
);
}
// Without any repo config the resolved path must not mention `.ripvec`.
#[test]
fn resolve_falls_back_to_user_cache_when_no_config() {
let dir = TempDir::new().unwrap();
let result = resolve_cache_dir(dir.path(), "nomic-ai/modernbert-embed-base", None);
assert!(
!result.to_string_lossy().contains(".ripvec"),
"should not use repo-local without config, got: {result:?}"
);
}
// An explicit override directory wins even when a matching repo config exists.
#[test]
fn resolve_override_takes_priority_over_repo_local() {
let dir = TempDir::new().unwrap();
let override_dir = TempDir::new().unwrap();
let cfg = crate::cache::config::RepoConfig::new("nomic-ai/modernbert-embed-base", "3");
cfg.save(&dir.path().join(".ripvec")).unwrap();
let result = resolve_cache_dir(
dir.path(),
"nomic-ai/modernbert-embed-base",
Some(override_dir.path()),
);
assert!(
!result.starts_with(dir.path().join(".ripvec")),
"override should win over repo-local, got: {result:?}"
);
}
}