Skip to main content

ripvec_core/encoder/ripvec/
manifest.rs

1//! In-memory manifest tracking indexed files for online reconciliation.
2//!
3//! Each entry stores cheap stat data — `(mtime, size, inode)` on Unix
4//! (`inode = 0` on Windows / unavailable filesystems) — plus a blake3
5//! content hash. Reconciliation runs on every search via
6//! [`RipvecIndex::diff_against`](super::index::RipvecIndex::diff_against):
7//!
8//!   1. Walk the corpus with the same [`WalkOptions`] used at index
9//!      construction.
10//!   2. For each walked file: compare the stat tuple to the manifest
11//!      entry. Match → guaranteed-unchanged, skip.
12//!   3. For mismatches: read the file, blake3-hash, compare against the
13//!      stored hash. Match → metadata-only change (vim save-no-edit,
14//!      build-tool touch), update the manifest's stat tuple in place to
15//!      short-circuit future diffs. Mismatch → record as `dirty`.
16//!   4. Manifest entries not seen during the walk → `deleted`.
17//!   5. Walked paths not in the manifest → `new`.
18//!
19//! If the resulting [`Diff`] is empty, the existing index is up-to-date
20//! and no work is needed. Otherwise the caller rebuilds.
21//!
22//! # Why blake3 + the stat tuple
23//!
24//! The stat tuple is the cheap pre-filter: warm `stat()` is ~1 µs per
25//! file, so the whole tuple check on a 200-file repo is sub-millisecond.
26//! Most files won't have a stat change between queries; the cheap path
27//! skips them entirely.
28//!
29//! When the stat tuple *does* mismatch, the question is whether content
30//! actually changed. Reading + blake3'ing a typical 1-30 KB source file
31//! costs ~1-20 µs warm — two orders of magnitude cheaper than the
32//! ~1-5 ms cost of re-chunking and re-embedding it. The break-even is
33//! "blake3 is worth it when more than 0.7% of stat changes are touches
34//! rather than real edits"; real-world workflows have 5-50% touch rates
35//! (vim `:w` with no edits, autoformatters that hash-equal their input,
36//! build tools that touch source for dependency tracking).
37//!
38//! # Inode as a third dimension
39//!
40//! `(mtime, size)` alone has a rare blind spot: same-byte-count
41//! content swaps. Atomic-rename saves (the modern editor default) bump
42//! the inode, so adding `inode` to the tuple catches those without a
43//! blake3 round-trip. Inode is best-effort: 0 on Windows, where we
44//! fall back to `(mtime, size)`. The blake3 verification path still
45//! guarantees correctness even when the inode signal is unavailable.
46
47use std::collections::{HashMap, HashSet};
48use std::path::{Path, PathBuf};
49use std::time::SystemTime;
50
/// One file's tracked state in the manifest.
///
/// Constructed via [`FileEntry::from_bytes`] when the caller already has
/// the file bytes in hand (avoids a redundant read), or via
/// [`FileEntry::from_path`] when only the path is known.
///
/// Equality is field-wise: two entries are equal only when the whole
/// stat tuple `(mtime, size, ino)` *and* the content hash agree. An
/// entry for a file that was merely touched therefore compares unequal
/// to its predecessor even though `blake3` is identical; compare the
/// `blake3` fields directly to ask "same content?".
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileEntry {
    /// Last modification time, or `UNIX_EPOCH` if the platform doesn't
    /// expose it. Used as the first part of the cheap stat-tuple check.
    pub mtime: SystemTime,
    /// File size in bytes, second part of the stat tuple.
    pub size: u64,
    /// File inode number on Unix (`0` on Windows / unavailable). Third
    /// part of the stat tuple; catches atomic-rename saves where mtime
    /// and size could coincide with the previous entry.
    pub ino: u64,
    /// Blake3 content hash. Authoritative — when the stat tuple changes,
    /// this confirms whether content actually changed vs. a touch.
    pub blake3: [u8; 32],
}
71
72impl FileEntry {
73    /// Build an entry from filesystem metadata and the file's bytes.
74    ///
75    /// Use this when the caller has already read the file (e.g., during
76    /// chunking) to avoid the redundant read for blake3 hashing.
77    #[must_use]
78    pub fn from_bytes(metadata: &std::fs::Metadata, bytes: &[u8]) -> Self {
79        Self {
80            mtime: metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH),
81            size: metadata.len(),
82            ino: inode(metadata),
83            blake3: *blake3::hash(bytes).as_bytes(),
84        }
85    }
86
87    /// Build an entry by reading the file from disk.
88    ///
89    /// Reads the file once for blake3. Use [`Self::from_bytes`] if the
90    /// caller already has the bytes.
91    ///
92    /// # Errors
93    ///
94    /// Returns the I/O error if stat or read fails.
95    pub fn from_path(path: &Path) -> std::io::Result<Self> {
96        let metadata = std::fs::metadata(path)?;
97        let bytes = std::fs::read(path)?;
98        Ok(Self::from_bytes(&metadata, &bytes))
99    }
100}
101
/// Per-root manifest of indexed files.
///
/// Keys are absolute, canonical paths (matching the paths returned by
/// [`crate::walk::collect_files_with_options`]).
///
/// The derived `Default` produces a manifest with no tracked files.
#[derive(Debug, Clone, Default)]
pub struct Manifest {
    /// Tracked files keyed by path; each value holds the stat tuple and
    /// blake3 content hash recorded for that file.
    pub files: HashMap<PathBuf, FileEntry>,
}
110
111impl Manifest {
112    /// Construct an empty manifest.
113    #[must_use]
114    pub fn new() -> Self {
115        Self {
116            files: HashMap::new(),
117        }
118    }
119
120    /// Number of tracked files.
121    #[must_use]
122    pub fn len(&self) -> usize {
123        self.files.len()
124    }
125
126    /// Whether the manifest tracks zero files.
127    #[must_use]
128    pub fn is_empty(&self) -> bool {
129        self.files.is_empty()
130    }
131
132    /// Insert or replace an entry.
133    pub fn insert(&mut self, path: PathBuf, entry: FileEntry) {
134        self.files.insert(path, entry);
135    }
136
137    /// Look up an entry by path.
138    #[must_use]
139    pub fn get(&self, path: &Path) -> Option<&FileEntry> {
140        self.files.get(path)
141    }
142}
143
/// Categorized filesystem changes detected by [`diff_against_walk`].
///
/// All three vectors hold absolute paths (matching the walk's output).
/// A [`Diff`] is "empty" only when every list is empty; the
/// [`Self::is_empty`] helper exists to make this the canonical
/// "no-work-needed" check.
///
/// Equality is order-sensitive: two diffs are equal when each of the
/// three lists holds the same paths in the same order.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Diff {
    /// Files present in both manifest and walk whose content changed.
    pub dirty: Vec<PathBuf>,
    /// Files present in the walk but not in the manifest.
    pub new: Vec<PathBuf>,
    /// Files present in the manifest but not in the walk.
    pub deleted: Vec<PathBuf>,
}

impl Diff {
    /// Whether all change lists are empty, i.e. nothing was detected.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.dirty.is_empty() && self.new.is_empty() && self.deleted.is_empty()
    }

    /// Total number of changed files across all categories
    /// (`dirty + new + deleted`).
    #[must_use]
    pub fn total(&self) -> usize {
        self.dirty.len() + self.new.len() + self.deleted.len()
    }
}
173
174/// Compare the manifest to the current filesystem state and produce a
175/// [`Diff`].
176///
177/// The walked file set is supplied by the caller (typically via
178/// [`crate::walk::collect_files_with_options`]) so this function does no
179/// I/O for path discovery — only per-file stat and (on stat mismatch)
180/// content read for blake3 verification.
181///
182/// # Mutation of the manifest
183///
184/// When a file's stat tuple changes but its blake3 hash still matches
185/// the manifest entry (the touch-without-content-change case), this
186/// function updates the entry's `(mtime, size, ino)` in place. This is
187/// not a correctness step — the diff is the same with or without the
188/// update — but it short-circuits future diffs on the same touched
189/// file: the next call sees the new stat tuple, hits the cheap-path
190/// match, and skips the blake3 read.
191///
192/// # Robustness
193///
194/// Files that vanish between the walk and the per-file stat (rare race)
195/// are silently skipped; they will appear in `deleted` on the next
196/// diff. Permission errors are treated similarly. The function never
197/// fails — every call returns a [`Diff`].
198pub fn diff_against_walk(manifest: &mut Manifest, current_files: &[PathBuf]) -> Diff {
199    let mut diff = Diff::default();
200    let mut seen: HashSet<&Path> = HashSet::with_capacity(current_files.len());
201
202    for path in current_files {
203        seen.insert(path.as_path());
204        let Ok(metadata) = std::fs::metadata(path) else {
205            // Vanished between walk and stat; let the next diff catch
206            // it via the deleted-files pass.
207            continue;
208        };
209        let mtime = metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH);
210        let size = metadata.len();
211        let ino = inode(&metadata);
212
213        match manifest.files.get(path) {
214            None => {
215                diff.new.push(path.clone());
216            }
217            Some(entry) => {
218                if entry.mtime == mtime && entry.size == size && entry.ino == ino {
219                    // Stat tuple unchanged → content guaranteed
220                    // unchanged. The cheap path.
221                    continue;
222                }
223                // Stat changed; blake3 to distinguish real edits from
224                // metadata-only touches.
225                let Ok(bytes) = std::fs::read(path) else {
226                    // Treat permission/read errors conservatively as
227                    // dirty so the rebuild path notices.
228                    diff.dirty.push(path.clone());
229                    continue;
230                };
231                let new_hash = *blake3::hash(&bytes).as_bytes();
232                if new_hash == entry.blake3 {
233                    // Touch without content change. Refresh the stat
234                    // tuple so the next diff hits the cheap path.
235                    if let Some(entry_mut) = manifest.files.get_mut(path) {
236                        entry_mut.mtime = mtime;
237                        entry_mut.size = size;
238                        entry_mut.ino = ino;
239                    }
240                } else {
241                    diff.dirty.push(path.clone());
242                }
243            }
244        }
245    }
246
247    // Manifest entries we didn't visit during the walk → deleted (or
248    // filtered out of the walk by changed `WalkOptions`, which the
249    // caller treats identically: drop the chunks).
250    for path in manifest.files.keys() {
251        if !seen.contains(path.as_path()) {
252            diff.deleted.push(path.clone());
253        }
254    }
255
256    diff
257}
258
/// Inode number recorded in `metadata` (Unix targets).
#[cfg(unix)]
fn inode(metadata: &std::fs::Metadata) -> u64 {
    std::os::unix::fs::MetadataExt::ino(metadata)
}

/// Non-Unix fallback: no inode is available, so every file reports `0`
/// and the inode part of the stat tuple never distinguishes two files.
#[cfg(not(unix))]
fn inode(_metadata: &std::fs::Metadata) -> u64 {
    0
}
269
#[cfg(test)]
mod tests {
    //! Filesystem-backed tests for the manifest diff. Each test works in
    //! its own `TempDir` and drives [`diff_against_walk`] with a
    //! hand-built walk list instead of a real directory walk.

    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Create `name` inside `dir` via `File::create` (truncating any
    /// existing file at that path), write `content`, and return the
    /// joined path.
    fn write_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
        let path = dir.join(name);
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(content).unwrap();
        path
    }

    /// Build a single-entry manifest for `path`, pairing the file's
    /// current on-disk metadata with a hash of the caller-supplied
    /// `content` (the file itself is not re-read).
    fn manifest_with(path: PathBuf, content: &[u8]) -> Manifest {
        let metadata = std::fs::metadata(&path).unwrap();
        let entry = FileEntry::from_bytes(&metadata, content);
        let mut m = Manifest::new();
        m.insert(path, entry);
        m
    }

    /// Empty manifest + empty walk → empty diff.
    #[test]
    fn empty_diff_against_empty_walk() {
        let mut m = Manifest::new();
        let diff = diff_against_walk(&mut m, &[]);
        assert!(diff.is_empty());
        assert_eq!(diff.total(), 0);
    }

    /// A walked path that the manifest does not track is reported as
    /// `new` and nothing else.
    #[test]
    fn detects_new_file() {
        let dir = TempDir::new().unwrap();
        let p1 = write_file(dir.path(), "a.txt", b"hello");
        let mut m = Manifest::new();
        let diff = diff_against_walk(&mut m, std::slice::from_ref(&p1));
        assert_eq!(diff.new, vec![p1]);
        assert!(diff.dirty.is_empty());
        assert!(diff.deleted.is_empty());
    }

    /// A tracked path that is absent from the walk list is reported as
    /// `deleted`.
    #[test]
    fn detects_deleted_file_via_missing_from_walk() {
        let dir = TempDir::new().unwrap();
        let p1 = write_file(dir.path(), "gone.txt", b"hello");
        let mut m = manifest_with(p1.clone(), b"hello");
        std::fs::remove_file(&p1).unwrap();
        // Caller walked the dir — empty since gone.txt is gone
        let diff = diff_against_walk(&mut m, &[]);
        assert_eq!(diff.deleted, vec![p1]);
        assert!(diff.dirty.is_empty());
        assert!(diff.new.is_empty());
    }

    /// An untouched tracked file produces an empty diff. Note this only
    /// observes the result; it does not itself verify that the hash
    /// read was skipped.
    #[test]
    fn unchanged_file_skipped_via_stat_tuple() {
        let dir = TempDir::new().unwrap();
        let p1 = write_file(dir.path(), "stable.txt", b"hello");
        let mut m = manifest_with(p1.clone(), b"hello");
        let diff = diff_against_walk(&mut m, &[p1]);
        assert!(diff.is_empty(), "stat tuple match must skip blake3");
    }

    /// Rewriting a tracked file with longer content is reported as
    /// `dirty`.
    #[test]
    fn detects_content_change_when_size_changes() {
        let dir = TempDir::new().unwrap();
        let p1 = write_file(dir.path(), "edit.txt", b"hello");
        let mut m = manifest_with(p1.clone(), b"hello");
        // Pause so the rewrite can get a later mtime.
        // NOTE(review): 20 ms presumes fine-grained mtime — confirm on
        // filesystems with coarse timestamps.
        std::thread::sleep(std::time::Duration::from_millis(20));
        write_file(dir.path(), "edit.txt", b"hello world"); // size change
        let diff = diff_against_walk(&mut m, std::slice::from_ref(&p1));
        assert_eq!(diff.dirty, vec![p1]);
    }

    /// Rewriting a tracked file with different content of the same
    /// length is still reported as `dirty`.
    #[test]
    fn detects_content_change_when_size_unchanged() {
        let dir = TempDir::new().unwrap();
        // Same byte count, different content
        let p1 = write_file(dir.path(), "rename-vars.rs", b"let foo = 1;");
        let mut m = manifest_with(p1.clone(), b"let foo = 1;");
        // With size equal, detection needs another part of the stat
        // tuple to differ so the hash comparison runs.
        // NOTE(review): presumably relies on the mtime advancing during
        // this sleep — confirm on coarse-timestamp filesystems.
        std::thread::sleep(std::time::Duration::from_millis(20));
        write_file(dir.path(), "rename-vars.rs", b"let bar = 1;"); // same size
        let diff = diff_against_walk(&mut m, std::slice::from_ref(&p1));
        assert_eq!(diff.dirty, vec![p1], "blake3 must catch same-size change");
    }

    /// Rewriting identical bytes changes the mtime but must yield an
    /// empty diff, and the manifest's stored mtime must be refreshed.
    #[test]
    fn touched_but_unchanged_does_not_appear_in_diff() {
        let dir = TempDir::new().unwrap();
        let p1 = write_file(dir.path(), "touched.txt", b"identical");
        let mut m = manifest_with(p1.clone(), b"identical");
        let original_mtime = m.get(&p1).unwrap().mtime;
        std::thread::sleep(std::time::Duration::from_millis(20));
        // Rewrite same content → mtime updates, blake3 same
        write_file(dir.path(), "touched.txt", b"identical");
        let new_mtime_on_disk = std::fs::metadata(&p1).unwrap().modified().unwrap();
        // Guard the premise: without a distinct mtime the assertions
        // below would pass trivially.
        assert_ne!(
            original_mtime, new_mtime_on_disk,
            "setup: mtime must differ for this test to mean anything"
        );

        let diff = diff_against_walk(&mut m, std::slice::from_ref(&p1));
        assert!(
            diff.is_empty(),
            "touch-without-content-change must not appear in diff"
        );

        // Manifest's mtime must be refreshed so the next diff hits the
        // cheap stat-tuple path instead of re-blake3'ing.
        let refreshed = m.get(&p1).unwrap();
        assert_eq!(
            refreshed.mtime, new_mtime_on_disk,
            "manifest mtime must be refreshed on touch-without-change"
        );
    }

    /// Two passes over the same manifest: a content-preserving rewrite
    /// first, then a real edit.
    #[test]
    fn touched_unchanged_then_real_change_still_detected() {
        // Regression guard: the manifest update on touch-without-change
        // must not mask a subsequent real edit.
        let dir = TempDir::new().unwrap();
        let p1 = write_file(dir.path(), "twice.txt", b"original");
        let mut m = manifest_with(p1.clone(), b"original");

        std::thread::sleep(std::time::Duration::from_millis(20));
        write_file(dir.path(), "twice.txt", b"original"); // touch only
        let diff1 = diff_against_walk(&mut m, std::slice::from_ref(&p1));
        assert!(diff1.is_empty(), "first pass: touch only");

        // "modified" has the same length as "original", so size alone
        // cannot reveal this edit.
        std::thread::sleep(std::time::Duration::from_millis(20));
        write_file(dir.path(), "twice.txt", b"modified"); // real change
        let diff2 = diff_against_walk(&mut m, std::slice::from_ref(&p1));
        assert_eq!(diff2.dirty, vec![p1], "second pass: real edit detected");
    }

    /// One diff call must report an edited, a removed, and an added
    /// file in their respective lists while leaving the untouched file
    /// out.
    #[test]
    fn new_plus_deleted_plus_dirty_simultaneously() {
        let dir = TempDir::new().unwrap();
        let keep = write_file(dir.path(), "keep.txt", b"keep");
        let edit = write_file(dir.path(), "edit.txt", b"orig");
        let gone = write_file(dir.path(), "gone.txt", b"gone");
        let added_path = dir.path().join("added.txt"); // new file we'll write below

        // Track the three files that exist so far.
        let mut m = Manifest::new();
        let keep_meta = std::fs::metadata(&keep).unwrap();
        let edit_meta = std::fs::metadata(&edit).unwrap();
        let gone_meta = std::fs::metadata(&gone).unwrap();
        m.insert(keep.clone(), FileEntry::from_bytes(&keep_meta, b"keep"));
        m.insert(edit.clone(), FileEntry::from_bytes(&edit_meta, b"orig"));
        m.insert(gone.clone(), FileEntry::from_bytes(&gone_meta, b"gone"));

        // Mutate the directory: one edit, one removal, one addition.
        std::thread::sleep(std::time::Duration::from_millis(20));
        write_file(dir.path(), "edit.txt", b"changed");
        std::fs::remove_file(&gone).unwrap();
        write_file(dir.path(), "added.txt", b"added");

        // Simulated walk result: everything that exists now.
        let walk = vec![keep.clone(), edit.clone(), added_path.clone()];
        let diff = diff_against_walk(&mut m, &walk);
        assert_eq!(diff.dirty, vec![edit]);
        assert_eq!(diff.new, vec![added_path]);
        assert_eq!(diff.deleted, vec![gone]);
        assert!(!diff.is_empty());
        assert_eq!(diff.total(), 3);
    }

    /// `from_path` and `from_bytes` must agree on hash and size for the
    /// same file.
    #[test]
    fn file_entry_from_path_round_trips_from_bytes() {
        let dir = TempDir::new().unwrap();
        let p = write_file(dir.path(), "x.txt", b"some content");
        let from_path = FileEntry::from_path(&p).unwrap();
        let metadata = std::fs::metadata(&p).unwrap();
        let from_bytes = FileEntry::from_bytes(&metadata, b"some content");
        assert_eq!(from_path.blake3, from_bytes.blake3);
        assert_eq!(from_path.size, from_bytes.size);
        // mtime and ino are deliberately not compared here; size + hash
        // are the load-bearing invariants.
    }

    /// `Manifest::default()` tracks no files.
    #[test]
    fn manifest_default_is_empty() {
        let m = Manifest::default();
        assert!(m.is_empty());
        assert_eq!(m.len(), 0);
    }

    /// On Unix an entry built from a real file carries a non-zero inode.
    #[cfg(unix)]
    #[test]
    fn inode_is_non_zero_on_unix() {
        let dir = TempDir::new().unwrap();
        let p = write_file(dir.path(), "x", b"data");
        let entry = FileEntry::from_path(&p).unwrap();
        assert!(entry.ino > 0, "Unix metadata must produce a non-zero inode");
    }
}