1use std::path::{Path, PathBuf};
8use std::time::Instant;
9
10use crate::backend::EmbedBackend;
11use crate::cache::diff;
12use crate::cache::file_cache::FileCache;
13use crate::cache::manifest::Manifest;
14use crate::cache::store::ObjectStore;
15use crate::chunk::CodeChunk;
16use crate::embed::SearchConfig;
17use crate::hybrid::HybridIndex;
18use crate::profile::Profiler;
19
/// Summary of one indexing run, returned alongside the built index.
///
/// All counters are plain integers, so the type is cheap to clone and can be
/// compared directly in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Number of chunks in the index that was returned.
    pub chunks_total: usize,
    /// Number of chunks that were embedded during this run (as opposed to
    /// being loaded from the cache). Equals `chunks_total` on a full index.
    pub chunks_reembedded: usize,
    /// Number of files the diff reported as unchanged (`0` on a full index).
    pub files_unchanged: usize,
    /// Number of files that were (re)processed in this run.
    pub files_changed: usize,
    /// Number of files the diff reported as deleted (`0` on a full index).
    pub files_deleted: usize,
    /// Wall-clock duration of the run, in milliseconds.
    pub duration_ms: u64,
}
36
37pub fn incremental_index(
48 root: &Path,
49 backends: &[&dyn EmbedBackend],
50 tokenizer: &tokenizers::Tokenizer,
51 cfg: &SearchConfig,
52 profiler: &Profiler,
53 model_repo: &str,
54 cache_dir_override: Option<&Path>,
55) -> crate::Result<(HybridIndex, ReindexStats)> {
56 let start = Instant::now();
57
58 if backends.is_empty() {
59 return Err(crate::Error::Other(anyhow::anyhow!(
60 "no embedding backends provided"
61 )));
62 }
63
64 let cache_dir = resolve_cache_dir(root, model_repo, cache_dir_override);
65 let manifest_path = cache_dir.join("manifest.json");
66 let objects_dir = cache_dir.join("objects");
67 let store = ObjectStore::new(&objects_dir);
68
69 let existing_manifest = Manifest::load(&manifest_path).ok();
71
72 if let Some(manifest) = existing_manifest.filter(|m| m.is_compatible(model_repo)) {
73 incremental_path(
75 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, manifest,
76 start,
77 )
78 } else {
79 full_index_path(
81 root, backends, tokenizer, cfg, profiler, model_repo, &cache_dir, &store, start,
82 )
83 }
84}
85
86#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
88#[expect(
89 clippy::cast_possible_truncation,
90 reason = "duration in ms won't exceed u64"
91)]
92fn incremental_path(
93 root: &Path,
94 backends: &[&dyn EmbedBackend],
95 tokenizer: &tokenizers::Tokenizer,
96 cfg: &SearchConfig,
97 profiler: &Profiler,
98 _model_repo: &str,
99 cache_dir: &Path,
100 store: &ObjectStore,
101 mut manifest: Manifest,
102 start: Instant,
103) -> crate::Result<(HybridIndex, ReindexStats)> {
104 let diff_result = diff::compute_diff(root, &manifest)?;
105
106 let files_changed = diff_result.dirty.len();
107 let files_deleted = diff_result.deleted.len();
108 let files_unchanged = diff_result.unchanged;
109
110 for deleted in &diff_result.deleted {
112 manifest.remove_file(deleted);
113 }
114
115 let mut new_chunks_count = 0;
117 for dirty_path in &diff_result.dirty {
118 let relative = dirty_path
119 .strip_prefix(root)
120 .unwrap_or(dirty_path)
121 .to_string_lossy()
122 .to_string();
123
124 manifest.remove_file(&relative);
126
127 let Some(source) = crate::embed::read_source(dirty_path) else {
129 continue;
130 };
131
132 let ext = dirty_path
133 .extension()
134 .and_then(|e| e.to_str())
135 .unwrap_or("");
136 let chunks = if cfg.text_mode {
137 crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk)
138 } else {
139 match crate::languages::config_for_extension(ext) {
140 Some(lang_config) => {
141 crate::chunk::chunk_file(dirty_path, &source, &lang_config, &cfg.chunk)
142 }
143 None => crate::chunk::chunk_text(dirty_path, &source, &cfg.chunk),
144 }
145 };
146
147 if chunks.is_empty() {
148 continue;
149 }
150
151 let model_max = backends[0].max_tokens();
153 let encodings: Vec<Option<crate::backend::Encoding>> = chunks
154 .iter()
155 .map(|chunk| {
156 crate::tokenize::tokenize_query(&chunk.enriched_content, tokenizer, model_max).ok()
157 })
158 .collect();
159
160 let embeddings =
162 crate::embed::embed_distributed(&encodings, backends, cfg.batch_size, profiler)?;
163
164 let (good_chunks, good_embeddings): (Vec<_>, Vec<_>) = chunks
166 .into_iter()
167 .zip(embeddings.into_iter())
168 .filter(|(_, emb)| !emb.is_empty())
169 .unzip();
170
171 let hidden_dim = good_embeddings.first().map_or(384, Vec::len);
172
173 let content_hash = diff::hash_file(dirty_path)?;
175 let file_cache = FileCache {
176 chunks: good_chunks.clone(),
177 embeddings: good_embeddings.iter().flatten().copied().collect(),
178 hidden_dim,
179 };
180 store.write(&content_hash, &file_cache.to_bytes())?;
181
182 let mtime = diff::mtime_secs(dirty_path);
184 let size = std::fs::metadata(dirty_path).map_or(0, |m| m.len());
185 manifest.add_file(&relative, mtime, size, &content_hash, good_chunks.len());
186 new_chunks_count += good_chunks.len();
187 }
188
189 manifest.recompute_hashes();
191
192 let referenced = manifest.referenced_hashes();
194 store.gc(&referenced)?;
195
196 manifest.save(&cache_dir.join("manifest.json"))?;
198
199 let (all_chunks, all_embeddings) = load_all_from_store(store, &manifest)?;
201 let chunks_total = all_chunks.len();
202 let hybrid = HybridIndex::new(all_chunks, &all_embeddings, None)?;
203
204 Ok((
205 hybrid,
206 ReindexStats {
207 chunks_total,
208 chunks_reembedded: new_chunks_count,
209 files_unchanged,
210 files_changed,
211 files_deleted,
212 duration_ms: start.elapsed().as_millis() as u64,
213 },
214 ))
215}
216
217#[expect(clippy::too_many_arguments, reason = "pipeline state passed through")]
219#[expect(
220 clippy::cast_possible_truncation,
221 reason = "duration in ms won't exceed u64"
222)]
223fn full_index_path(
224 root: &Path,
225 backends: &[&dyn EmbedBackend],
226 tokenizer: &tokenizers::Tokenizer,
227 cfg: &SearchConfig,
228 profiler: &Profiler,
229 model_repo: &str,
230 cache_dir: &Path,
231 store: &ObjectStore,
232 start: Instant,
233) -> crate::Result<(HybridIndex, ReindexStats)> {
234 let (chunks, embeddings) = crate::embed::embed_all(root, backends, tokenizer, cfg, profiler)?;
235
236 let hidden_dim = embeddings.first().map_or(384, Vec::len);
237
238 let mut manifest = Manifest::new(model_repo);
240 let mut file_groups: std::collections::BTreeMap<String, (Vec<CodeChunk>, Vec<Vec<f32>>)> =
241 std::collections::BTreeMap::new();
242
243 for (chunk, emb) in chunks.iter().zip(embeddings.iter()) {
244 file_groups
245 .entry(chunk.file_path.clone())
246 .or_default()
247 .0
248 .push(chunk.clone());
249 file_groups
250 .entry(chunk.file_path.clone())
251 .or_default()
252 .1
253 .push(emb.clone());
254 }
255
256 for (file_path, (file_chunks, file_embeddings)) in &file_groups {
257 let file_path_buf = PathBuf::from(file_path);
259
260 let content_hash = diff::hash_file(&file_path_buf).unwrap_or_else(|_| {
261 blake3::hash(file_chunks[0].content.as_bytes())
263 .to_hex()
264 .to_string()
265 });
266
267 let flat_emb: Vec<f32> = file_embeddings.iter().flatten().copied().collect();
268 let fc = FileCache {
269 chunks: file_chunks.clone(),
270 embeddings: flat_emb,
271 hidden_dim,
272 };
273 store.write(&content_hash, &fc.to_bytes())?;
274
275 let relative = file_path_buf
276 .strip_prefix(root)
277 .unwrap_or(&file_path_buf)
278 .to_string_lossy()
279 .to_string();
280 let mtime = diff::mtime_secs(&file_path_buf);
281 let size = std::fs::metadata(&file_path_buf).map_or(0, |m| m.len());
282 manifest.add_file(&relative, mtime, size, &content_hash, file_chunks.len());
283 }
284
285 manifest.recompute_hashes();
286 manifest.save(&cache_dir.join("manifest.json"))?;
287
288 let chunks_total = chunks.len();
289 let files_changed = file_groups.len();
290 let hybrid = HybridIndex::new(chunks, &embeddings, None)?;
291
292 Ok((
293 hybrid,
294 ReindexStats {
295 chunks_total,
296 chunks_reembedded: chunks_total,
297 files_unchanged: 0,
298 files_changed,
299 files_deleted: 0,
300 duration_ms: start.elapsed().as_millis() as u64,
301 },
302 ))
303}
304
305fn load_all_from_store(
307 store: &ObjectStore,
308 manifest: &Manifest,
309) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
310 let mut all_chunks = Vec::new();
311 let mut all_embeddings = Vec::new();
312
313 for entry in manifest.files.values() {
314 let bytes = store.read(&entry.content_hash)?;
315 let fc = FileCache::from_bytes(&bytes)?;
316 let dim = fc.hidden_dim;
317
318 for (i, chunk) in fc.chunks.into_iter().enumerate() {
319 let start = i * dim;
320 let end = start + dim;
321 if end <= fc.embeddings.len() {
322 all_embeddings.push(fc.embeddings[start..end].to_vec());
323 all_chunks.push(chunk);
324 }
325 }
326 }
327
328 Ok((all_chunks, all_embeddings))
329}
330
331#[must_use]
342pub fn resolve_cache_dir(root: &Path, model_repo: &str, override_dir: Option<&Path>) -> PathBuf {
343 let project_hash = {
344 let canonical = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
345 blake3::hash(canonical.to_string_lossy().as_bytes())
346 .to_hex()
347 .to_string()
348 };
349
350 let model_slug = model_repo
352 .rsplit('/')
353 .next()
354 .unwrap_or(model_repo)
355 .to_lowercase();
356 let version_dir = format!("v{}-{model_slug}", crate::cache::manifest::MANIFEST_VERSION);
357
358 let base = if let Some(dir) = override_dir {
359 dir.join(&project_hash)
360 } else if let Ok(env_dir) = std::env::var("RIPVEC_CACHE") {
361 PathBuf::from(env_dir).join(&project_hash)
362 } else {
363 dirs::cache_dir()
364 .unwrap_or_else(|| PathBuf::from("/tmp"))
365 .join("ripvec")
366 .join(&project_hash)
367 };
368
369 base.join(version_dir)
370}