Skip to main content

veles_core/
persist.rs

//! Persistent on-disk index format.
//!
//! Layout under `<repo>/.veles/`:
//!
//! ```text
//! .veles/
//!   manifest.json   - format version, model, per-file fingerprints
//!   chunks.bin      - bincode Vec<Chunk>
//!   bm25.bin        - bincode Bm25Index
//!   dense.bin       - bincode DenseIndex
//!   symbols.bin     - bincode Vec<Symbol>
//! ```
//!
//! The manifest records a (size, mtime, chunk_count) fingerprint per file so
//! `update` can detect added / removed / modified files without re-reading
//! everything.
16
17use std::collections::BTreeMap;
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::time::{SystemTime, UNIX_EPOCH};
21
22use anyhow::{Context, Result, bail};
23use serde::{Deserialize, Serialize};
24
25use crate::index::dense::DenseIndex;
26use crate::index::sparse::Bm25Index;
27use crate::symbols::Symbol;
28use crate::types::Chunk;
29
/// Name of the directory, created directly under the indexed repository
/// root, that holds every persisted index artefact.
pub const INDEX_DIR_NAME: &str = ".veles";
32
/// On-disk format version; incremented on every incompatible layout change.
///
/// Version 2 introduced `symbols.bin` — indexes written before that lack
/// tree-sitter symbols.
pub const FORMAT_VERSION: u32 = 2;
36
// File names of the individual artefacts stored inside the index directory.
const MANIFEST_FILE: &str = "manifest.json";
const CHUNKS_FILE: &str = "chunks.bin";
const BM25_FILE: &str = "bm25.bin";
const DENSE_FILE: &str = "dense.bin";
const SYMBOLS_FILE: &str = "symbols.bin";
42
43/// Cheap fingerprint for change detection.
44///
45/// `(size, mtime)` is fast to compute and covers almost all real edits;
46/// content hashing can be layered on later if needed.
47#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
48pub struct FileFingerprint {
49    /// File size in bytes.
50    pub size: u64,
51    /// Modification time as Unix epoch seconds.
52    pub mtime_secs: i64,
53    /// Number of chunks this file produced.
54    pub chunk_count: usize,
55}
56
57impl FileFingerprint {
58    /// Compute the fingerprint for a path on disk. `chunk_count` is provided
59    /// by the caller after chunking.
60    pub fn from_path(path: &Path, chunk_count: usize) -> Result<Self> {
61        let meta = fs::metadata(path)
62            .with_context(|| format!("stat {}", path.display()))?;
63        let mtime = meta.modified().unwrap_or(UNIX_EPOCH);
64        let mtime_secs = mtime
65            .duration_since(UNIX_EPOCH)
66            .map(|d| d.as_secs() as i64)
67            .unwrap_or(0);
68        Ok(Self {
69            size: meta.len(),
70            mtime_secs,
71            chunk_count,
72        })
73    }
74}
75
76/// Small JSON sidecar describing a persisted index.
77///
78/// Human-readable on purpose so users can `cat .veles/manifest.json` to
79/// debug staleness or model mismatches.
80#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct Manifest {
82    /// Version of `veles` that wrote this index (from `CARGO_PKG_VERSION`).
83    pub veles_version: String,
84    /// On-disk format version. Bumped on incompatible layout changes; a
85    /// mismatch on `load` forces a `veles index --force`.
86    pub format_version: u32,
87    /// Embedding model used at build time (e.g. `"minishlab/potion-code-16M"`).
88    /// Loading with a different model is rejected.
89    pub model_name: String,
90    /// Dimensionality of the dense vectors.
91    pub embedding_dim: usize,
92    /// Whether text/document files (markdown, yaml, ...) were indexed
93    /// alongside source code.
94    pub include_text_files: bool,
95    /// Unix epoch seconds when the index was last written.
96    pub indexed_at: i64,
97    /// Per-file fingerprints used by incremental update.
98    pub files: BTreeMap<String, FileFingerprint>,
99    /// Total chunks across all files.
100    pub total_chunks: usize,
101}
102
103impl Manifest {
104    pub fn new(
105        model_name: &str,
106        embedding_dim: usize,
107        include_text_files: bool,
108    ) -> Self {
109        Self {
110            veles_version: env!("CARGO_PKG_VERSION").to_string(),
111            format_version: FORMAT_VERSION,
112            model_name: model_name.to_string(),
113            embedding_dim,
114            include_text_files,
115            indexed_at: now_secs(),
116            files: BTreeMap::new(),
117            total_chunks: 0,
118        }
119    }
120
121    pub fn touch(&mut self) {
122        self.indexed_at = now_secs();
123    }
124}
125
/// Current wall-clock time in whole seconds since the Unix epoch, or 0 when
/// the system clock reports a time before the epoch.
fn now_secs() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    }
}
132
133/// Path of the `.veles/` directory under a given repo root.
134pub fn index_dir_for(repo_root: &Path) -> PathBuf {
135    repo_root.join(INDEX_DIR_NAME)
136}
137
138/// Returns true if a saved index appears to exist at the given path.
139pub fn index_exists(repo_root: &Path) -> bool {
140    let dir = index_dir_for(repo_root);
141    dir.join(MANIFEST_FILE).is_file()
142        && dir.join(CHUNKS_FILE).is_file()
143        && dir.join(BM25_FILE).is_file()
144        && dir.join(DENSE_FILE).is_file()
145}
146
147/// Components of a loaded index — the model is provided separately at load
148/// time so the heavy weights aren't serialised.
149pub struct PersistedIndex {
150    pub manifest: Manifest,
151    pub chunks: Vec<Chunk>,
152    pub bm25: Bm25Index,
153    pub dense: DenseIndex,
154    pub symbols: Vec<Symbol>,
155}
156
157/// Write all index artefacts to `<repo_root>/.veles/`.
158pub fn save(
159    repo_root: &Path,
160    manifest: &Manifest,
161    chunks: &[Chunk],
162    bm25: &Bm25Index,
163    dense: &DenseIndex,
164    symbols: &[Symbol],
165) -> Result<()> {
166    let dir = index_dir_for(repo_root);
167    fs::create_dir_all(&dir)
168        .with_context(|| format!("create index dir {}", dir.display()))?;
169
170    write_json(&dir.join(MANIFEST_FILE), manifest)?;
171    write_bincode(&dir.join(CHUNKS_FILE), &chunks.to_vec())?;
172    write_bincode(&dir.join(BM25_FILE), bm25)?;
173    write_bincode(&dir.join(DENSE_FILE), dense)?;
174    write_bincode(&dir.join(SYMBOLS_FILE), &symbols.to_vec())?;
175    Ok(())
176}
177
178/// Load all index artefacts from `<repo_root>/.veles/`.
179pub fn load(repo_root: &Path) -> Result<PersistedIndex> {
180    let dir = index_dir_for(repo_root);
181    if !dir.is_dir() {
182        bail!("No index found at {}", dir.display());
183    }
184
185    let manifest: Manifest = read_json(&dir.join(MANIFEST_FILE))?;
186    if manifest.format_version != FORMAT_VERSION {
187        bail!(
188            "Index format version {} is incompatible (expected {}). Run `veles index --force` to rebuild.",
189            manifest.format_version,
190            FORMAT_VERSION
191        );
192    }
193    let chunks: Vec<Chunk> = read_bincode(&dir.join(CHUNKS_FILE))?;
194    let bm25: Bm25Index = read_bincode(&dir.join(BM25_FILE))?;
195    let dense: DenseIndex = read_bincode(&dir.join(DENSE_FILE))?;
196    // Symbols file may be missing on a partially-written index; treat as empty.
197    let symbols: Vec<Symbol> = if dir.join(SYMBOLS_FILE).is_file() {
198        read_bincode(&dir.join(SYMBOLS_FILE))?
199    } else {
200        Vec::new()
201    };
202
203    Ok(PersistedIndex {
204        manifest,
205        chunks,
206        bm25,
207        dense,
208        symbols,
209    })
210}
211
212/// Read just the manifest (cheap — used by `status` and to check compatibility).
213pub fn load_manifest(repo_root: &Path) -> Result<Manifest> {
214    let dir = index_dir_for(repo_root);
215    read_json(&dir.join(MANIFEST_FILE))
216}
217
218/// Remove the on-disk index directory if it exists.
219pub fn clean(repo_root: &Path) -> Result<bool> {
220    let dir = index_dir_for(repo_root);
221    if dir.is_dir() {
222        fs::remove_dir_all(&dir)
223            .with_context(|| format!("remove {}", dir.display()))?;
224        return Ok(true);
225    }
226    Ok(false)
227}
228
229fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<()> {
230    let f = fs::File::create(path)
231        .with_context(|| format!("create {}", path.display()))?;
232    serde_json::to_writer_pretty(f, value)
233        .with_context(|| format!("write {}", path.display()))?;
234    Ok(())
235}
236
237fn read_json<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
238    let f = fs::File::open(path)
239        .with_context(|| format!("open {}", path.display()))?;
240    let value = serde_json::from_reader(std::io::BufReader::new(f))
241        .with_context(|| format!("parse {}", path.display()))?;
242    Ok(value)
243}
244
245fn write_bincode<T: Serialize>(path: &Path, value: &T) -> Result<()> {
246    let f = fs::File::create(path)
247        .with_context(|| format!("create {}", path.display()))?;
248    let mut w = std::io::BufWriter::new(f);
249    bincode::serialize_into(&mut w, value)
250        .with_context(|| format!("encode {}", path.display()))?;
251    Ok(())
252}
253
254fn read_bincode<T: for<'de> Deserialize<'de>>(path: &Path) -> Result<T> {
255    let f = fs::File::open(path)
256        .with_context(|| format!("open {}", path.display()))?;
257    let r = std::io::BufReader::new(f);
258    let value = bincode::deserialize_from(r)
259        .with_context(|| format!("decode {}", path.display()))?;
260    Ok(value)
261}
262
/// Summary of what an incremental update did — the value returned by
/// [`crate::VelesIndex::update_from_path`].
#[derive(Debug, Default, Clone)]
pub struct UpdateReport {
    /// Number of files found on disk that the previous manifest did not list.
    pub added_files: usize,
    /// Number of files whose `(size, mtime)` fingerprint differs from before.
    pub modified_files: usize,
    /// Number of files listed in the previous manifest that are gone from disk.
    pub removed_files: usize,
    /// Chunks carried over from the previous index without re-embedding.
    pub kept_chunks: usize,
    /// Chunks newly embedded for added or modified files.
    pub new_chunks: usize,
    /// Chunk count of the updated index (`kept + new`).
    pub total_chunks: usize,
}

impl UpdateReport {
    /// Whether the update left the file set untouched: nothing added,
    /// nothing modified, nothing removed. Chunk counters are not considered.
    pub fn is_noop(&self) -> bool {
        matches!(
            (self.added_files, self.modified_files, self.removed_files),
            (0, 0, 0)
        )
    }
}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// A manifest with one file entry survives a JSON encode/decode cycle.
    #[test]
    fn manifest_roundtrip_via_json() {
        let fingerprint = FileFingerprint {
            size: 100,
            mtime_secs: 1_000_000,
            chunk_count: 2,
        };
        let mut original = Manifest::new("test-model", 64, false);
        original
            .files
            .insert(String::from("src/lib.rs"), fingerprint);
        original.total_chunks = 2;

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: Manifest = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.model_name, "test-model");
        assert_eq!(decoded.embedding_dim, 64);
        assert_eq!(decoded.files.len(), 1);
        assert_eq!(decoded.files["src/lib.rs"].size, 100);
    }
}