Skip to main content

veles_core/
persist.rs

//! Persistent on-disk index format.
//!
//! Layout under `<repo>/.veles/`:
//!
//! ```text
//! .veles/
//!   manifest.json   - format version, model, per-file fingerprints
//!   chunks.bin      - bincode Vec<Chunk>
//!   bm25.bin        - bincode Bm25Index
//!   dense.bin       - bincode DenseIndex
//!   symbols.bin     - bincode Vec<Symbol>
//! ```
//!
//! The manifest records a (size, mtime, chunk_count, content_hash)
//! fingerprint per file so `update` can detect added / removed / modified
//! files without re-reading everything.
16
17use std::collections::{BTreeMap, HashMap, HashSet};
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::time::{SystemTime, UNIX_EPOCH};
21
22use anyhow::{Context, Result, bail};
23use serde::{Deserialize, Serialize};
24
25use crate::index::dense::DenseIndex;
26use crate::index::sparse::Bm25Index;
27use crate::symbols::Symbol;
28use crate::types::Chunk;
29use crate::walker;
30
/// Directory name used under the indexed repo to store the on-disk index.
pub const INDEX_DIR_NAME: &str = ".veles";

/// Bumped whenever the on-disk format changes incompatibly. Bumped to 2
/// when symbols.bin was added — older indexes lack tree-sitter symbols.
///
/// `load` compares this against the manifest's `format_version` and
/// refuses to load on a mismatch.
pub const FORMAT_VERSION: u32 = 2;

// File names of the individual artefacts inside the index directory.

/// JSON manifest (human-readable sidecar).
const MANIFEST_FILE: &str = "manifest.json";
/// Bincode-encoded chunk list.
const CHUNKS_FILE: &str = "chunks.bin";
/// Bincode-encoded BM25 (sparse) index.
const BM25_FILE: &str = "bm25.bin";
/// Bincode-encoded dense index.
const DENSE_FILE: &str = "dense.bin";
/// Bincode-encoded symbol list; `load` tolerates this file being absent.
const SYMBOLS_FILE: &str = "symbols.bin";
43
44/// Cheap fingerprint for change detection.
45///
46/// `(size, mtime)` is fast to compute and covers almost all real edits;
47/// `content_hash` (BLAKE3 of the file bytes) is the fallback used by
48/// incremental update when mtime drifts but the bytes haven't changed
49/// (`touch`, `git checkout` of an identical version, no-op formatter
50/// runs).
51#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
52pub struct FileFingerprint {
53    /// File size in bytes.
54    pub size: u64,
55    /// Modification time as Unix epoch seconds.
56    pub mtime_secs: i64,
57    /// Number of chunks this file produced.
58    pub chunk_count: usize,
59    /// BLAKE3 hex digest of the file bytes. `None` for fingerprints
60    /// loaded from a pre-content-hash manifest; new fingerprints
61    /// always populate it.
62    #[serde(default, skip_serializing_if = "Option::is_none")]
63    pub content_hash: Option<String>,
64}
65
66impl FileFingerprint {
67    /// Compute the fingerprint for a path on disk. `chunk_count` is provided
68    /// by the caller after chunking.
69    pub fn from_path(path: &Path, chunk_count: usize) -> Result<Self> {
70        let meta = fs::metadata(path).with_context(|| format!("stat {}", path.display()))?;
71        let mtime = meta.modified().unwrap_or(UNIX_EPOCH);
72        let mtime_secs = mtime
73            .duration_since(UNIX_EPOCH)
74            .map(|d| d.as_secs() as i64)
75            .unwrap_or(0);
76        let content_hash = Some(content_hash(path)?);
77        Ok(Self {
78            size: meta.len(),
79            mtime_secs,
80            chunk_count,
81            content_hash,
82        })
83    }
84}
85
86/// BLAKE3 hex digest of `path`'s bytes. Used both at index-build time
87/// (to populate the manifest) and at update time (to verify whether a
88/// touched file's content actually changed).
89pub fn content_hash(path: &Path) -> Result<String> {
90    let bytes = fs::read(path).with_context(|| format!("read {}", path.display()))?;
91    Ok(blake3::hash(&bytes).to_hex().to_string())
92}
93
94/// Small JSON sidecar describing a persisted index.
95///
96/// Human-readable on purpose so users can `cat .veles/manifest.json` to
97/// debug staleness or model mismatches.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct Manifest {
100    /// Version of `veles` that wrote this index (from `CARGO_PKG_VERSION`).
101    pub veles_version: String,
102    /// On-disk format version. Bumped on incompatible layout changes; a
103    /// mismatch on `load` forces a `veles index --force`.
104    pub format_version: u32,
105    /// Embedding model used at build time (e.g. `"minishlab/potion-code-16M"`).
106    /// Loading with a different model is rejected.
107    pub model_name: String,
108    /// Dimensionality of the dense vectors.
109    pub embedding_dim: usize,
110    /// Whether text/document files (markdown, yaml, ...) were indexed
111    /// alongside source code.
112    pub include_text_files: bool,
113    /// Unix epoch seconds when the index was last written.
114    pub indexed_at: i64,
115    /// Per-file fingerprints used by incremental update.
116    pub files: BTreeMap<String, FileFingerprint>,
117    /// Total chunks across all files.
118    pub total_chunks: usize,
119}
120
121impl Manifest {
122    pub fn new(model_name: &str, embedding_dim: usize, include_text_files: bool) -> Self {
123        Self {
124            veles_version: env!("CARGO_PKG_VERSION").to_string(),
125            format_version: FORMAT_VERSION,
126            model_name: model_name.to_string(),
127            embedding_dim,
128            include_text_files,
129            indexed_at: now_secs(),
130            files: BTreeMap::new(),
131            total_chunks: 0,
132        }
133    }
134
135    pub fn touch(&mut self) {
136        self.indexed_at = now_secs();
137    }
138}
139
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns `0` if the system clock reads earlier than the epoch.
fn now_secs() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
146
147/// Path of the `.veles/` directory under a given repo root.
148pub fn index_dir_for(repo_root: &Path) -> PathBuf {
149    repo_root.join(INDEX_DIR_NAME)
150}
151
152/// Returns true if a saved index appears to exist at the given path.
153pub fn index_exists(repo_root: &Path) -> bool {
154    let dir = index_dir_for(repo_root);
155    dir.join(MANIFEST_FILE).is_file()
156        && dir.join(CHUNKS_FILE).is_file()
157        && dir.join(BM25_FILE).is_file()
158        && dir.join(DENSE_FILE).is_file()
159}
160
/// Components of a loaded index — the model is provided separately at load
/// time so the heavy weights aren't serialised.
///
/// This is what `load` returns; each field corresponds to one artefact in
/// the index directory.
pub struct PersistedIndex {
    /// Parsed `manifest.json`.
    pub manifest: Manifest,
    /// Chunks decoded from `chunks.bin`.
    pub chunks: Vec<Chunk>,
    /// BM25 index decoded from `bm25.bin`.
    pub bm25: Bm25Index,
    /// Dense index decoded from `dense.bin`.
    pub dense: DenseIndex,
    /// Symbols decoded from `symbols.bin`; empty when `load` found no
    /// such file.
    pub symbols: Vec<Symbol>,
}
170
171/// Write all index artefacts to `<repo_root>/.veles/`.
172///
173/// Manifest goes first synchronously so a partial-failure left-over
174/// has the freshest pointer to "what was meant to be saved". The four
175/// bincode artefacts are then written in parallel via `rayon::join`
176/// nested 2×2 (§5.7 of the perf plan) — modern filesystems handle
177/// concurrent same-dir creates fine, and the per-file `BufWriter` +
178/// bincode encode work in CPU dominate I/O for large indexes.
179///
180/// Also drops the previous `chunks.to_vec()` / `symbols.to_vec()`
181/// temporaries: slices implement `Serialize`, so we feed them in
182/// directly and skip the per-save full copy.
183pub fn save(
184    repo_root: &Path,
185    manifest: &Manifest,
186    chunks: &[Chunk],
187    bm25: &Bm25Index,
188    dense: &DenseIndex,
189    symbols: &[Symbol],
190) -> Result<()> {
191    let dir = index_dir_for(repo_root);
192    fs::create_dir_all(&dir).with_context(|| format!("create index dir {}", dir.display()))?;
193
194    write_json(&dir.join(MANIFEST_FILE), manifest)?;
195
196    let chunks_path = dir.join(CHUNKS_FILE);
197    let bm25_path = dir.join(BM25_FILE);
198    let dense_path = dir.join(DENSE_FILE);
199    let symbols_path = dir.join(SYMBOLS_FILE);
200
201    let ((r1, r2), (r3, r4)) = rayon::join(
202        || {
203            rayon::join(
204                || write_bincode(&chunks_path, &chunks),
205                || write_bincode(&bm25_path, bm25),
206            )
207        },
208        || {
209            rayon::join(
210                || write_bincode(&dense_path, dense),
211                || write_bincode(&symbols_path, &symbols),
212            )
213        },
214    );
215    r1?;
216    r2?;
217    r3?;
218    r4?;
219    Ok(())
220}
221
222/// Load all index artefacts from `<repo_root>/.veles/`.
223pub fn load(repo_root: &Path) -> Result<PersistedIndex> {
224    let dir = index_dir_for(repo_root);
225    if !dir.is_dir() {
226        bail!("No index found at {}", dir.display());
227    }
228
229    let manifest: Manifest = read_json(&dir.join(MANIFEST_FILE))?;
230    if manifest.format_version != FORMAT_VERSION {
231        bail!(
232            "Index format version {} is incompatible (expected {}). Run `veles index --force` to rebuild.",
233            manifest.format_version,
234            FORMAT_VERSION
235        );
236    }
237    let chunks: Vec<Chunk> = read_bincode(&dir.join(CHUNKS_FILE))?;
238    let bm25: Bm25Index = read_bincode(&dir.join(BM25_FILE))?;
239    let dense: DenseIndex = read_bincode(&dir.join(DENSE_FILE))?;
240    // Symbols file may be missing on a partially-written index; treat as empty.
241    let symbols: Vec<Symbol> = if dir.join(SYMBOLS_FILE).is_file() {
242        read_bincode(&dir.join(SYMBOLS_FILE))?
243    } else {
244        Vec::new()
245    };
246
247    Ok(PersistedIndex {
248        manifest,
249        chunks,
250        bm25,
251        dense,
252        symbols,
253    })
254}
255
256/// Read just the manifest (cheap — used by `status` and to check compatibility).
257pub fn load_manifest(repo_root: &Path) -> Result<Manifest> {
258    let dir = index_dir_for(repo_root);
259    read_json(&dir.join(MANIFEST_FILE))
260}
261
262/// Remove the on-disk index directory if it exists.
263pub fn clean(repo_root: &Path) -> Result<bool> {
264    let dir = index_dir_for(repo_root);
265    if dir.is_dir() {
266        fs::remove_dir_all(&dir).with_context(|| format!("remove {}", dir.display()))?;
267        return Ok(true);
268    }
269    Ok(false)
270}
271
272fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<()> {
273    let f = fs::File::create(path).with_context(|| format!("create {}", path.display()))?;
274    serde_json::to_writer_pretty(f, value).with_context(|| format!("write {}", path.display()))?;
275    Ok(())
276}
277
278fn read_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
279    let f = fs::File::open(path).with_context(|| format!("open {}", path.display()))?;
280    let value = serde_json::from_reader(std::io::BufReader::new(f))
281        .with_context(|| format!("parse {}", path.display()))?;
282    Ok(value)
283}
284
285fn write_bincode<T: Serialize>(path: &Path, value: &T) -> Result<()> {
286    let f = fs::File::create(path).with_context(|| format!("create {}", path.display()))?;
287    let mut w = std::io::BufWriter::new(f);
288    bincode::serialize_into(&mut w, value).with_context(|| format!("encode {}", path.display()))?;
289    Ok(())
290}
291
292fn read_bincode<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
293    let f = fs::File::open(path).with_context(|| format!("open {}", path.display()))?;
294    let r = std::io::BufReader::new(f);
295    let value =
296        bincode::deserialize_from(r).with_context(|| format!("decode {}", path.display()))?;
297    Ok(value)
298}
299
/// Per-file disk metadata used during classification.
///
/// `classify_disk` builds one of these for every file the walker yields.
#[derive(Debug, Clone)]
pub struct DiskEntry {
    /// Path of the file as produced by the walker (the un-stripped path,
    /// not the repo-relative key).
    pub abs_path: PathBuf,
    /// File size in bytes, from the file's metadata.
    pub size: u64,
    /// Modification time in whole seconds since the Unix epoch; `0` when
    /// the mtime is unavailable or predates the epoch.
    pub mtime_secs: i64,
}
307
/// Per-file change classification against a `Manifest`.
///
/// Distinguishes the four cases that incremental `update` cares about:
/// no content read (`Unchanged`), mtime drift but bytes still match
/// (`MtimeOnly`), bytes actually changed (`Modified`), or new since
/// last save (`Added`). Files removed since last save live in
/// `DiskState::removed`, not here.
#[derive(Debug, Clone)]
pub enum Classification {
    /// `(size, mtime)` matched the manifest exactly — no content read.
    Unchanged,
    /// `mtime` drifted but the BLAKE3 content hash still matches.
    /// Carries the hash we computed so callers don't re-read the file.
    MtimeOnly { hash: String },
    /// File was in the manifest but is treated as changed.
    /// `hash` is `Some` when we computed one during classification
    /// (only happens when size matched and the manifest had a hash),
    /// otherwise `None` — which covers a size mismatch, a manifest entry
    /// without a hash, and a failure to read the file for hashing.
    Modified { hash: Option<String> },
    /// File is new — not in the manifest.
    Added,
}
330
331/// Result of walking the repo and classifying each file against a manifest.
332#[derive(Debug)]
333pub struct DiskState {
334    /// For each file currently on disk: its metadata.
335    pub on_disk: HashMap<String, DiskEntry>,
336    /// Per-file classification — keys mirror `on_disk`.
337    pub classification: HashMap<String, Classification>,
338    /// Paths that were in the manifest but are not on disk now.
339    pub removed: Vec<String>,
340}
341
342impl DiskState {
343    /// Files seen now (on disk).
344    pub fn seen_now(&self) -> usize {
345        self.on_disk.len()
346    }
347    /// Count of files in each classification bucket.
348    pub fn count_added(&self) -> usize {
349        self.classification
350            .values()
351            .filter(|c| matches!(c, Classification::Added))
352            .count()
353    }
354    pub fn count_modified(&self) -> usize {
355        self.classification
356            .values()
357            .filter(|c| matches!(c, Classification::Modified { .. }))
358            .count()
359    }
360    pub fn count_mtime_only(&self) -> usize {
361        self.classification
362            .values()
363            .filter(|c| matches!(c, Classification::MtimeOnly { .. }))
364            .count()
365    }
366    pub fn count_unchanged(&self) -> usize {
367        self.classification
368            .values()
369            .filter(|c| matches!(c, Classification::Unchanged))
370            .count()
371    }
372    pub fn count_removed(&self) -> usize {
373        self.removed.len()
374    }
375    /// True iff nothing changed at all (no chunk edits, no mtime
376    /// drift, no adds/removes). Matches `UpdateReport::is_noop` after
377    /// `update_from_path` has consumed the state.
378    pub fn is_clean(&self) -> bool {
379        self.removed.is_empty()
380            && self
381                .classification
382                .values()
383                .all(|c| matches!(c, Classification::Unchanged))
384    }
385}
386
387/// Walk `repo_root` filtered by `extensions` and classify each file
388/// against `manifest`. Single place where the "mtime fast path then
389/// BLAKE3 fallback" decision lives — both `VelesIndex::update_from_path`
390/// and the MCP `status` handler call this (§3.3 of the perf plan).
391pub fn classify_disk(
392    repo_root: &Path,
393    manifest: &Manifest,
394    extensions: &HashSet<String>,
395) -> DiskState {
396    // 1. Walk on-disk files.
397    let mut on_disk: HashMap<String, DiskEntry> = HashMap::new();
398    for abs in walker::walk_files(repo_root, extensions) {
399        let Ok(rel_path) = abs.strip_prefix(repo_root) else {
400            continue;
401        };
402        let rel = rel_path.to_string_lossy().into_owned();
403        let Ok(meta) = fs::metadata(&abs) else {
404            continue;
405        };
406        let mtime_secs = meta
407            .modified()
408            .ok()
409            .and_then(|m| m.duration_since(UNIX_EPOCH).ok())
410            .map(|d| d.as_secs() as i64)
411            .unwrap_or(0);
412        on_disk.insert(
413            rel,
414            DiskEntry {
415                abs_path: abs,
416                size: meta.len(),
417                mtime_secs,
418            },
419        );
420    }
421
422    // 2. Classify each on-disk file.
423    let mut classification: HashMap<String, Classification> = HashMap::new();
424    for (rel, entry) in &on_disk {
425        let cls = match manifest.files.get(rel) {
426            Some(prev) if prev.size == entry.size && prev.mtime_secs == entry.mtime_secs => {
427                Classification::Unchanged
428            }
429            Some(prev) if prev.size == entry.size && prev.content_hash.is_some() => {
430                match content_hash(&entry.abs_path) {
431                    Ok(h) if Some(&h) == prev.content_hash.as_ref() => {
432                        Classification::MtimeOnly { hash: h }
433                    }
434                    Ok(h) => Classification::Modified { hash: Some(h) },
435                    Err(_) => Classification::Modified { hash: None },
436                }
437            }
438            Some(_) => Classification::Modified { hash: None },
439            None => Classification::Added,
440        };
441        classification.insert(rel.clone(), cls);
442    }
443
444    // 3. Files in the manifest that disappeared.
445    let removed: Vec<String> = manifest
446        .files
447        .keys()
448        .filter(|k| !on_disk.contains_key(*k))
449        .cloned()
450        .collect();
451
452    DiskState {
453        on_disk,
454        classification,
455        removed,
456    }
457}
458
/// Summary of what an incremental update did — the value returned by
/// [`crate::VelesIndex::update_from_path`].
#[derive(Debug, Default, Clone)]
pub struct UpdateReport {
    /// Files found on disk that the previous manifest did not list.
    pub added_files: usize,
    /// Files whose `(size, mtime)` fingerprint changed and whose content
    /// (when checked via `content_hash`) actually differed.
    pub modified_files: usize,
    /// Files listed in the previous manifest that are gone from disk.
    pub removed_files: usize,
    /// Files whose `mtime` drifted while their `content_hash` still
    /// matched — no re-embedding needed, but the manifest's fingerprint
    /// was refreshed so later `status` / `update` calls skip the hash
    /// recompute.
    pub mtime_refreshed_files: usize,
    /// Chunks carried over from the previous index without re-embedding.
    pub kept_chunks: usize,
    /// Chunks freshly embedded for added/modified files.
    pub new_chunks: usize,
    /// Total chunks in the updated index (`kept + new`).
    pub total_chunks: usize,
}

impl UpdateReport {
    /// True when nothing changed — no added, modified or removed files
    /// and no fingerprint refreshes pending. The chunk counters play no
    /// part. Callers use this to skip persistence.
    pub fn is_noop(&self) -> bool {
        let file_level_changes = [
            self.added_files,
            self.modified_files,
            self.removed_files,
            self.mtime_refreshed_files,
        ];
        file_level_changes.iter().all(|&count| count == 0)
    }
}
492
#[cfg(test)]
mod tests {
    use super::*;

    /// A manifest holding one fingerprinted file survives a JSON round-trip.
    #[test]
    fn manifest_roundtrip_via_json() {
        let fingerprint = FileFingerprint {
            size: 100,
            mtime_secs: 1_000_000,
            chunk_count: 2,
            content_hash: Some("deadbeef".to_string()),
        };
        let mut original = Manifest::new("test-model", 64, false);
        original
            .files
            .insert("src/lib.rs".to_string(), fingerprint);
        original.total_chunks = 2;

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: Manifest = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.model_name, "test-model");
        assert_eq!(decoded.embedding_dim, 64);
        assert_eq!(decoded.files.len(), 1);

        let restored = &decoded.files["src/lib.rs"];
        assert_eq!(restored.size, 100);
        assert_eq!(restored.content_hash.as_deref(), Some("deadbeef"));
    }

    /// Manifests written before content hashes existed leave the field
    /// out altogether; serde has to fill in `None` rather than fail.
    #[test]
    fn legacy_manifest_without_content_hash_loads() {
        let legacy_json = r#"{
            "veles_version": "0.2.3",
            "format_version": 2,
            "model_name": "test-model",
            "embedding_dim": 64,
            "include_text_files": false,
            "indexed_at": 0,
            "files": {
                "src/lib.rs": {
                    "size": 100,
                    "mtime_secs": 1000000,
                    "chunk_count": 2
                }
            },
            "total_chunks": 2
        }"#;

        let manifest: Manifest = serde_json::from_str(legacy_json).unwrap();
        let entry = &manifest.files["src/lib.rs"];
        assert_eq!(entry.size, 100);
        assert!(entry.content_hash.is_none());
    }

    /// Hashing is stable for identical bytes and changes when the bytes do.
    #[test]
    fn content_hash_is_deterministic_and_discriminates() {
        let scratch = tempfile::tempdir().unwrap();
        let file_path = scratch.path().join("a.txt");

        std::fs::write(&file_path, b"hello").unwrap();
        let first = content_hash(&file_path).unwrap();
        let repeat = content_hash(&file_path).unwrap();
        assert_eq!(first, repeat, "same bytes must hash the same");

        std::fs::write(&file_path, b"hello world").unwrap();
        let after_edit = content_hash(&file_path).unwrap();
        assert_ne!(first, after_edit, "different bytes must hash differently");
    }
}