1use std::collections::{BTreeMap, HashMap, HashSet};
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::time::{SystemTime, UNIX_EPOCH};
21
22use anyhow::{Context, Result, bail};
23use serde::{Deserialize, Serialize};
24
25use crate::index::dense::DenseIndex;
26use crate::index::sparse::Bm25Index;
27use crate::symbols::Symbol;
28use crate::types::Chunk;
29use crate::walker;
30
/// Name of the index directory created at the repository root.
pub const INDEX_DIR_NAME: &str = ".veles";

/// On-disk format version; `load` rejects indexes with any other version.
pub const FORMAT_VERSION: u32 = 2;

// Artifact file names inside the index directory.
const MANIFEST_FILE: &str = "manifest.json";
const CHUNKS_FILE: &str = "chunks.bin";
const BM25_FILE: &str = "bm25.bin";
const DENSE_FILE: &str = "dense.bin";
const SYMBOLS_FILE: &str = "symbols.bin";
43
/// Per-file change-detection record stored in the [`Manifest`].
///
/// `classify_disk` compares these fields against the current on-disk state
/// to decide whether a file needs re-indexing.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FileFingerprint {
    // File size in bytes at index time.
    pub size: u64,
    // Modification time as seconds since the Unix epoch (0 if unavailable).
    pub mtime_secs: i64,
    // Number of chunks this file contributed to the index.
    pub chunk_count: usize,
    // BLAKE3 hex digest of the file contents. Optional so manifests written
    // before hashing was introduced still deserialize (see the legacy test).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub content_hash: Option<String>,
}
65
66impl FileFingerprint {
67 pub fn from_path(path: &Path, chunk_count: usize) -> Result<Self> {
70 let meta = fs::metadata(path).with_context(|| format!("stat {}", path.display()))?;
71 let mtime = meta.modified().unwrap_or(UNIX_EPOCH);
72 let mtime_secs = mtime
73 .duration_since(UNIX_EPOCH)
74 .map(|d| d.as_secs() as i64)
75 .unwrap_or(0);
76 let content_hash = Some(content_hash(path)?);
77 Ok(Self {
78 size: meta.len(),
79 mtime_secs,
80 chunk_count,
81 content_hash,
82 })
83 }
84}
85
86pub fn content_hash(path: &Path) -> Result<String> {
90 let bytes = fs::read(path).with_context(|| format!("read {}", path.display()))?;
91 Ok(blake3::hash(&bytes).to_hex().to_string())
92}
93
/// Top-level index metadata, serialized as `manifest.json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Manifest {
    // Version of the veles binary that wrote the index (CARGO_PKG_VERSION).
    pub veles_version: String,
    // On-disk format version; must equal FORMAT_VERSION to load.
    pub format_version: u32,
    // Embedding model the dense index was built with.
    pub model_name: String,
    // Dimensionality of the dense embedding vectors.
    pub embedding_dim: usize,
    // Whether plain-text (non-code) files were included in the index.
    pub include_text_files: bool,
    // Unix timestamp of the last (re-)index, refreshed by `touch`.
    pub indexed_at: i64,
    // Fingerprints keyed by repo-relative path. BTreeMap keeps the JSON
    // output deterministically ordered.
    pub files: BTreeMap<String, FileFingerprint>,
    // Total chunk count across all files.
    pub total_chunks: usize,
}
120
121impl Manifest {
122 pub fn new(model_name: &str, embedding_dim: usize, include_text_files: bool) -> Self {
123 Self {
124 veles_version: env!("CARGO_PKG_VERSION").to_string(),
125 format_version: FORMAT_VERSION,
126 model_name: model_name.to_string(),
127 embedding_dim,
128 include_text_files,
129 indexed_at: now_secs(),
130 files: BTreeMap::new(),
131 total_chunks: 0,
132 }
133 }
134
135 pub fn touch(&mut self) {
136 self.indexed_at = now_secs();
137 }
138}
139
/// Current wall-clock time as whole seconds since the Unix epoch.
/// A clock set before the epoch yields 0 instead of an error.
fn now_secs() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(d) => d.as_secs() as i64,
        Err(_) => 0,
    }
}
146
147pub fn index_dir_for(repo_root: &Path) -> PathBuf {
149 repo_root.join(INDEX_DIR_NAME)
150}
151
152pub fn index_exists(repo_root: &Path) -> bool {
154 let dir = index_dir_for(repo_root);
155 dir.join(MANIFEST_FILE).is_file()
156 && dir.join(CHUNKS_FILE).is_file()
157 && dir.join(BM25_FILE).is_file()
158 && dir.join(DENSE_FILE).is_file()
159}
160
/// Fully loaded index: manifest plus all deserialized artifacts,
/// as returned by [`load`].
pub struct PersistedIndex {
    pub manifest: Manifest,
    pub chunks: Vec<Chunk>,
    pub bm25: Bm25Index,
    pub dense: DenseIndex,
    pub symbols: Vec<Symbol>,
}
170
171pub fn save(
184 repo_root: &Path,
185 manifest: &Manifest,
186 chunks: &[Chunk],
187 bm25: &Bm25Index,
188 dense: &DenseIndex,
189 symbols: &[Symbol],
190) -> Result<()> {
191 let dir = index_dir_for(repo_root);
192 fs::create_dir_all(&dir).with_context(|| format!("create index dir {}", dir.display()))?;
193
194 write_json(&dir.join(MANIFEST_FILE), manifest)?;
195
196 let chunks_path = dir.join(CHUNKS_FILE);
197 let bm25_path = dir.join(BM25_FILE);
198 let dense_path = dir.join(DENSE_FILE);
199 let symbols_path = dir.join(SYMBOLS_FILE);
200
201 let ((r1, r2), (r3, r4)) = rayon::join(
202 || {
203 rayon::join(
204 || write_bincode(&chunks_path, &chunks),
205 || write_bincode(&bm25_path, bm25),
206 )
207 },
208 || {
209 rayon::join(
210 || write_bincode(&dense_path, dense),
211 || write_bincode(&symbols_path, &symbols),
212 )
213 },
214 );
215 r1?;
216 r2?;
217 r3?;
218 r4?;
219 Ok(())
220}
221
222pub fn load(repo_root: &Path) -> Result<PersistedIndex> {
224 let dir = index_dir_for(repo_root);
225 if !dir.is_dir() {
226 bail!("No index found at {}", dir.display());
227 }
228
229 let manifest: Manifest = read_json(&dir.join(MANIFEST_FILE))?;
230 if manifest.format_version != FORMAT_VERSION {
231 bail!(
232 "Index format version {} is incompatible (expected {}). Run `veles index --force` to rebuild.",
233 manifest.format_version,
234 FORMAT_VERSION
235 );
236 }
237 let chunks: Vec<Chunk> = read_bincode(&dir.join(CHUNKS_FILE))?;
238 let bm25: Bm25Index = read_bincode(&dir.join(BM25_FILE))?;
239 let dense: DenseIndex = read_bincode(&dir.join(DENSE_FILE))?;
240 let symbols: Vec<Symbol> = if dir.join(SYMBOLS_FILE).is_file() {
242 read_bincode(&dir.join(SYMBOLS_FILE))?
243 } else {
244 Vec::new()
245 };
246
247 Ok(PersistedIndex {
248 manifest,
249 chunks,
250 bm25,
251 dense,
252 symbols,
253 })
254}
255
256pub fn load_manifest(repo_root: &Path) -> Result<Manifest> {
258 let dir = index_dir_for(repo_root);
259 read_json(&dir.join(MANIFEST_FILE))
260}
261
262pub fn clean(repo_root: &Path) -> Result<bool> {
264 let dir = index_dir_for(repo_root);
265 if dir.is_dir() {
266 fs::remove_dir_all(&dir).with_context(|| format!("remove {}", dir.display()))?;
267 return Ok(true);
268 }
269 Ok(false)
270}
271
272fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<()> {
273 let f = fs::File::create(path).with_context(|| format!("create {}", path.display()))?;
274 serde_json::to_writer_pretty(f, value).with_context(|| format!("write {}", path.display()))?;
275 Ok(())
276}
277
278fn read_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
279 let f = fs::File::open(path).with_context(|| format!("open {}", path.display()))?;
280 let value = serde_json::from_reader(std::io::BufReader::new(f))
281 .with_context(|| format!("parse {}", path.display()))?;
282 Ok(value)
283}
284
285fn write_bincode<T: Serialize>(path: &Path, value: &T) -> Result<()> {
286 let f = fs::File::create(path).with_context(|| format!("create {}", path.display()))?;
287 let mut w = std::io::BufWriter::new(f);
288 bincode::serialize_into(&mut w, value).with_context(|| format!("encode {}", path.display()))?;
289 Ok(())
290}
291
292fn read_bincode<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
293 let f = fs::File::open(path).with_context(|| format!("open {}", path.display()))?;
294 let r = std::io::BufReader::new(f);
295 let value =
296 bincode::deserialize_from(r).with_context(|| format!("decode {}", path.display()))?;
297 Ok(value)
298}
299
/// A file currently present on disk, as observed by [`classify_disk`].
#[derive(Debug, Clone)]
pub struct DiskEntry {
    // Absolute path, suitable for reading the file.
    pub abs_path: PathBuf,
    // Current size in bytes.
    pub size: u64,
    // Current mtime in seconds since the Unix epoch (0 if unavailable).
    pub mtime_secs: i64,
}
307
/// Change status of one on-disk file relative to the manifest.
#[derive(Debug, Clone)]
pub enum Classification {
    // Size and mtime both match the fingerprint; no hashing performed.
    Unchanged,
    // Mtime differs but size and content hash match — only the fingerprint
    // needs refreshing; `hash` is the freshly computed digest.
    MtimeOnly { hash: String },
    // Content changed (or could not be verified); `hash` carries the new
    // digest when it was computed, `None` when hashing failed or was skipped.
    Modified { hash: Option<String> },
    // Present on disk but absent from the manifest.
    Added,
}
330
/// Result of comparing the manifest against the current working tree.
#[derive(Debug)]
pub struct DiskState {
    // Every indexable file found on disk, keyed by repo-relative path.
    pub on_disk: HashMap<String, DiskEntry>,
    // Change status per on-disk file (same keys as `on_disk`).
    pub classification: HashMap<String, Classification>,
    // Manifest entries whose files no longer exist on disk.
    pub removed: Vec<String>,
}
341
342impl DiskState {
343 pub fn seen_now(&self) -> usize {
345 self.on_disk.len()
346 }
347 pub fn count_added(&self) -> usize {
349 self.classification
350 .values()
351 .filter(|c| matches!(c, Classification::Added))
352 .count()
353 }
354 pub fn count_modified(&self) -> usize {
355 self.classification
356 .values()
357 .filter(|c| matches!(c, Classification::Modified { .. }))
358 .count()
359 }
360 pub fn count_mtime_only(&self) -> usize {
361 self.classification
362 .values()
363 .filter(|c| matches!(c, Classification::MtimeOnly { .. }))
364 .count()
365 }
366 pub fn count_unchanged(&self) -> usize {
367 self.classification
368 .values()
369 .filter(|c| matches!(c, Classification::Unchanged))
370 .count()
371 }
372 pub fn count_removed(&self) -> usize {
373 self.removed.len()
374 }
375 pub fn is_clean(&self) -> bool {
379 self.removed.is_empty()
380 && self
381 .classification
382 .values()
383 .all(|c| matches!(c, Classification::Unchanged))
384 }
385}
386
/// Walk the repository and classify every indexable file against `manifest`.
///
/// Classification is cheap-first: a file whose size AND mtime match its
/// fingerprint is declared `Unchanged` without hashing; only size-matching
/// files with a stored hash are re-hashed to distinguish a pure mtime bump
/// from a real edit. NOTE(review): the order of the match arms below is
/// load-bearing — the fast path must come before the hash check.
pub fn classify_disk(
    repo_root: &Path,
    manifest: &Manifest,
    extensions: &HashSet<String>,
) -> DiskState {
    // Snapshot of everything currently on disk, keyed by repo-relative path.
    let mut on_disk: HashMap<String, DiskEntry> = HashMap::new();
    for abs in walker::walk_files(repo_root, extensions) {
        // Skip anything the walker yields outside the repo root.
        let Ok(rel_path) = abs.strip_prefix(repo_root) else {
            continue;
        };
        let rel = rel_path.to_string_lossy().into_owned();
        // A file that vanished (or is unreadable) between walk and stat is skipped.
        let Ok(meta) = fs::metadata(&abs) else {
            continue;
        };
        // Unreadable/pre-epoch mtimes fall back to 0, matching
        // FileFingerprint::from_path.
        let mtime_secs = meta
            .modified()
            .ok()
            .and_then(|m| m.duration_since(UNIX_EPOCH).ok())
            .map(|d| d.as_secs() as i64)
            .unwrap_or(0);
        on_disk.insert(
            rel,
            DiskEntry {
                abs_path: abs,
                size: meta.len(),
                mtime_secs,
            },
        );
    }

    // Classify each on-disk file against its manifest fingerprint.
    let mut classification: HashMap<String, Classification> = HashMap::new();
    for (rel, entry) in &on_disk {
        let cls = match manifest.files.get(rel) {
            // Fast path: size + mtime match, assume unchanged (no hashing).
            Some(prev) if prev.size == entry.size && prev.mtime_secs == entry.mtime_secs => {
                Classification::Unchanged
            }
            // Same size, different mtime, and we have a stored hash:
            // re-hash to tell "touched" apart from "edited".
            Some(prev) if prev.size == entry.size && prev.content_hash.is_some() => {
                match content_hash(&entry.abs_path) {
                    Ok(h) if Some(&h) == prev.content_hash.as_ref() => {
                        Classification::MtimeOnly { hash: h }
                    }
                    Ok(h) => Classification::Modified { hash: Some(h) },
                    // Hashing failed: conservatively treat as modified.
                    Err(_) => Classification::Modified { hash: None },
                }
            }
            // Size changed, or no stored hash to compare against (legacy
            // manifest): treat as modified without hashing here.
            Some(_) => Classification::Modified { hash: None },
            None => Classification::Added,
        };
        classification.insert(rel.clone(), cls);
    }

    // Manifest entries with no corresponding file on disk were removed.
    let removed: Vec<String> = manifest
        .files
        .keys()
        .filter(|k| !on_disk.contains_key(*k))
        .cloned()
        .collect();

    DiskState {
        on_disk,
        classification,
        removed,
    }
}
458
/// Summary of an incremental index update, for reporting to the user.
#[derive(Debug, Default, Clone)]
pub struct UpdateReport {
    // Files newly added to the index.
    pub added_files: usize,
    // Files re-indexed because their content changed.
    pub modified_files: usize,
    // Files dropped from the index.
    pub removed_files: usize,
    // Files whose fingerprint was refreshed (mtime-only change, no re-index).
    pub mtime_refreshed_files: usize,
    // Chunks carried over unchanged from the previous index.
    pub kept_chunks: usize,
    // Chunks produced by this update.
    pub new_chunks: usize,
    // Total chunks in the index after the update.
    pub total_chunks: usize,
}
481
482impl UpdateReport {
483 pub fn is_noop(&self) -> bool {
486 self.added_files == 0
487 && self.modified_files == 0
488 && self.removed_files == 0
489 && self.mtime_refreshed_files == 0
490 }
491}
492
#[cfg(test)]
mod tests {
    use super::*;

    // A populated manifest survives a JSON serialize/deserialize round trip
    // with all fields intact.
    #[test]
    fn manifest_roundtrip_via_json() {
        let mut m = Manifest::new("test-model", 64, false);
        m.files.insert(
            "src/lib.rs".to_string(),
            FileFingerprint {
                size: 100,
                mtime_secs: 1_000_000,
                chunk_count: 2,
                content_hash: Some("deadbeef".to_string()),
            },
        );
        m.total_chunks = 2;

        let s = serde_json::to_string(&m).unwrap();
        let m2: Manifest = serde_json::from_str(&s).unwrap();
        assert_eq!(m2.model_name, "test-model");
        assert_eq!(m2.embedding_dim, 64);
        assert_eq!(m2.files.len(), 1);
        assert_eq!(m2.files["src/lib.rs"].size, 100);
        assert_eq!(
            m2.files["src/lib.rs"].content_hash.as_deref(),
            Some("deadbeef")
        );
    }

    // Manifests written before `content_hash` existed (no such key in the
    // fingerprint JSON) must still deserialize, with the hash as None —
    // this is what the #[serde(default)] on the field guarantees.
    #[test]
    fn legacy_manifest_without_content_hash_loads() {
        let json = r#"{
            "veles_version": "0.2.3",
            "format_version": 2,
            "model_name": "test-model",
            "embedding_dim": 64,
            "include_text_files": false,
            "indexed_at": 0,
            "files": {
                "src/lib.rs": {
                    "size": 100,
                    "mtime_secs": 1000000,
                    "chunk_count": 2
                }
            },
            "total_chunks": 2
        }"#;
        let m: Manifest = serde_json::from_str(json).unwrap();
        assert_eq!(m.files["src/lib.rs"].size, 100);
        assert!(m.files["src/lib.rs"].content_hash.is_none());
    }

    // content_hash must be a pure function of the file bytes: identical
    // bytes hash identically, different bytes hash differently.
    #[test]
    fn content_hash_is_deterministic_and_discriminates() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("a.txt");

        std::fs::write(&p, b"hello").unwrap();
        let h1 = content_hash(&p).unwrap();
        let h2 = content_hash(&p).unwrap();
        assert_eq!(h1, h2, "same bytes must hash the same");

        std::fs::write(&p, b"hello world").unwrap();
        let h3 = content_hash(&p).unwrap();
        assert_ne!(h1, h3, "different bytes must hash differently");
    }
}