Skip to main content

ripvec_core/cache/
diff.rs

1//! Two-level Merkle diff for incremental change detection.
2//!
3//! Level 1: Compare directory mtimes — skip entire subtrees if unchanged.
4//! Level 2: For changed directories, stat files and blake3-hash dirty ones.
5
6use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8use std::time::UNIX_EPOCH;
9
10use crate::cache::manifest::Manifest;
11
/// Result of diffing the filesystem against a manifest.
///
/// Produced by [`compute_diff`] and [`compute_diff_with_options`]. Every file
/// returned by the walker is counted in exactly one of `dirty` or
/// `unchanged`; manifest entries the walker did not return end up in
/// `deleted`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DiffResult {
    /// Paths (as returned by the walker) of files that are new, have changed
    /// content, or could not be stat'ed/read.
    pub dirty: Vec<PathBuf>,
    /// Manifest keys (root-relative paths) that the walker did not return.
    pub deleted: Vec<String>,
    /// Number of files that matched the manifest (no re-embedding needed).
    pub unchanged: usize,
}
22
23/// Compare the filesystem at `root` against the `manifest` to find changes.
24///
25/// Uses a two-level strategy:
26/// 1. Directory mtime — if unchanged, skip the entire subtree
27/// 2. File (mtime, size) — if changed, blake3-hash the content to confirm
28///
29/// # Errors
30///
31/// Returns an error if the root directory cannot be read.
32pub fn compute_diff(root: &Path, manifest: &Manifest) -> crate::Result<DiffResult> {
33    compute_diff_with_options(root, manifest, &crate::walk::WalkOptions::default())
34}
35
36/// Compare the filesystem at `root` against the `manifest` with explicit walk
37/// filters. Excluded files are treated like deleted manifest entries so stale
38/// cached chunks are pruned from the index.
39///
40/// # Errors
41///
42/// Returns an error if the root directory cannot be read.
43pub fn compute_diff_with_options(
44    root: &Path,
45    manifest: &Manifest,
46    options: &crate::walk::WalkOptions,
47) -> crate::Result<DiffResult> {
48    let mut dirty = Vec::new();
49    let mut unchanged = 0;
50
51    // Track which manifest files we've seen on disk
52    let mut seen_files: HashSet<String> = HashSet::new();
53
54    // Walk the filesystem using the same walker as embed
55    let files = crate::walk::collect_files_with_options(root, options);
56
57    for file_path in &files {
58        let relative = file_path
59            .strip_prefix(root)
60            .unwrap_or(file_path)
61            .to_string_lossy()
62            .to_string();
63
64        seen_files.insert(relative.clone());
65
66        // Check if file exists in manifest
67        let Some(entry) = manifest.files.get(&relative) else {
68            // New file — not in manifest
69            dirty.push(file_path.clone());
70            continue;
71        };
72
73        // Quick check: mtime + size
74        let Ok(metadata) = std::fs::metadata(file_path) else {
75            dirty.push(file_path.clone());
76            continue;
77        };
78
79        let mtime_secs = metadata
80            .modified()
81            .ok()
82            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
83            .map_or(0, |d| d.as_secs());
84        let size = metadata.len();
85
86        if mtime_secs == entry.mtime_secs && size == entry.size {
87            // Fast path: mtime+size match → assume unchanged
88            unchanged += 1;
89            continue;
90        }
91
92        // Mtime or size changed — verify with content hash
93        let Ok(content) = std::fs::read(file_path) else {
94            dirty.push(file_path.clone());
95            continue;
96        };
97        let content_hash = blake3::hash(&content).to_hex().to_string();
98
99        if content_hash == entry.content_hash {
100            // False alarm: file was touched but content unchanged
101            unchanged += 1;
102        } else {
103            dirty.push(file_path.clone());
104        }
105    }
106
107    // Detect deleted files (in manifest but not on disk)
108    let deleted: Vec<String> = manifest
109        .files
110        .keys()
111        .filter(|k| !seen_files.contains(k.as_str()))
112        .cloned()
113        .collect();
114
115    Ok(DiffResult {
116        dirty,
117        deleted,
118        unchanged,
119    })
120}
121
122/// Compute the blake3 hash of a file's content.
123///
124/// # Errors
125///
126/// Returns an error if the file cannot be read.
127pub fn hash_file(path: &Path) -> crate::Result<String> {
128    let content = std::fs::read(path).map_err(|e| crate::Error::Io {
129        path: path.display().to_string(),
130        source: e,
131    })?;
132    Ok(blake3::hash(&content).to_hex().to_string())
133}
134
/// Modification time of `path`, expressed as whole seconds since the Unix
/// epoch.
///
/// Returns 0 when the metadata cannot be read, when the platform does not
/// report a modification time, or when that time lies before the epoch.
#[must_use]
pub fn mtime_secs(path: &Path) -> u64 {
    let Ok(metadata) = std::fs::metadata(path) else {
        return 0;
    };
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    match modified.duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
146
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// An mtime far in the past, used to force the "mtime differs" path
    /// without sleeping or touching files.
    const STALE_MTIME: u64 = 1000;

    /// Write `content` to `dir/relative`, creating parent directories as
    /// needed, and return the full path.
    fn create_file(dir: &Path, relative: &str, content: &str) -> PathBuf {
        let path = dir.join(relative);
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(content.as_bytes()).unwrap();
        path
    }

    /// Record the on-disk file `root/relative` in `manifest`, using its
    /// current mtime and size and the blake3 hash of `content`.
    fn record_file(manifest: &mut Manifest, root: &Path, relative: &str, content: &str) {
        let metadata = std::fs::metadata(root.join(relative)).unwrap();
        let mtime = metadata
            .modified()
            .unwrap()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();
        let hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        manifest.add_file(relative, mtime, metadata.len(), &hash, 1);
    }

    /// Build a manifest containing exactly one entry for `root/relative`.
    fn manifest_with_file(root: &Path, relative: &str, content: &str) -> Manifest {
        let mut m = Manifest::new("test-model");
        record_file(&mut m, root, relative, content);
        m
    }

    #[test]
    fn detects_new_file() {
        let dir = TempDir::new().unwrap();
        create_file(dir.path(), "existing.rs", "fn existing() {}");
        create_file(dir.path(), "new_file.rs", "fn new() {}");

        let manifest = manifest_with_file(dir.path(), "existing.rs", "fn existing() {}");

        let diff = compute_diff(dir.path(), &manifest).unwrap();
        assert_eq!(diff.dirty.len(), 1);
        assert!(diff.dirty[0].ends_with("new_file.rs"));
        assert_eq!(diff.unchanged, 1);
        assert!(diff.deleted.is_empty());
    }

    #[test]
    fn detects_modified_file() {
        let dir = TempDir::new().unwrap();
        create_file(dir.path(), "main.rs", "fn main() {}");

        let manifest = manifest_with_file(dir.path(), "main.rs", "fn main() {}");

        // The new content has a different length, so the change is detected
        // regardless of mtime granularity — no sleep is needed.
        create_file(
            dir.path(),
            "main.rs",
            "fn main() { println!(\"changed\"); }",
        );

        let diff = compute_diff(dir.path(), &manifest).unwrap();
        assert_eq!(diff.dirty.len(), 1);
        assert_eq!(diff.unchanged, 0);
    }

    #[test]
    fn detects_same_size_edit_when_mtime_differs() {
        let dir = TempDir::new().unwrap();
        create_file(dir.path(), "lib.rs", "fn b() {}");

        // The manifest remembers different content of the same length with an
        // older mtime, so only the content hash can reveal the change.
        let old_content = "fn a() {}";
        let size = std::fs::metadata(dir.path().join("lib.rs")).unwrap().len();
        assert_eq!(size, old_content.len() as u64);
        let old_hash = blake3::hash(old_content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        manifest.add_file("lib.rs", STALE_MTIME, size, &old_hash, 1);

        let diff = compute_diff(dir.path(), &manifest).unwrap();
        assert_eq!(diff.dirty.len(), 1);
        assert!(diff.dirty[0].ends_with("lib.rs"));
        assert_eq!(diff.unchanged, 0);
        assert!(diff.deleted.is_empty());
    }

    #[test]
    fn touched_file_with_same_content_is_unchanged() {
        let dir = TempDir::new().unwrap();
        let content = "fn touched() {}";
        create_file(dir.path(), "touched.rs", content);

        // Same size and hash as on disk, but a stale mtime: the hash check
        // must classify the file as unchanged.
        let size = std::fs::metadata(dir.path().join("touched.rs"))
            .unwrap()
            .len();
        let hash = blake3::hash(content.as_bytes()).to_hex().to_string();
        let mut manifest = Manifest::new("test-model");
        manifest.add_file("touched.rs", STALE_MTIME, size, &hash, 1);

        let diff = compute_diff(dir.path(), &manifest).unwrap();
        assert!(diff.dirty.is_empty());
        assert!(diff.deleted.is_empty());
        assert_eq!(diff.unchanged, 1);
    }

    #[test]
    fn detects_deleted_file() {
        let dir = TempDir::new().unwrap();
        create_file(dir.path(), "keep.rs", "fn keep() {}");

        let mut manifest = manifest_with_file(dir.path(), "keep.rs", "fn keep() {}");
        manifest.add_file("deleted.rs", 1000, 100, "oldhash", 1);

        let diff = compute_diff(dir.path(), &manifest).unwrap();
        assert_eq!(diff.deleted.len(), 1);
        assert_eq!(diff.deleted[0], "deleted.rs");
        assert_eq!(diff.unchanged, 1);
    }

    #[test]
    fn unchanged_file_detected() {
        let dir = TempDir::new().unwrap();
        create_file(dir.path(), "stable.rs", "fn stable() {}");

        let manifest = manifest_with_file(dir.path(), "stable.rs", "fn stable() {}");

        let diff = compute_diff(dir.path(), &manifest).unwrap();
        assert!(diff.dirty.is_empty());
        assert!(diff.deleted.is_empty());
        assert_eq!(diff.unchanged, 1);
    }

    #[test]
    fn excluded_files_are_reported_deleted_from_manifest() {
        let dir = TempDir::new().unwrap();
        let ignored_content = "{\"event\":\"x\"}\n";
        create_file(dir.path(), "keep.rs", "fn keep() {}");
        create_file(dir.path(), "events.jsonl", ignored_content);

        let mut manifest = manifest_with_file(dir.path(), "keep.rs", "fn keep() {}");
        record_file(&mut manifest, dir.path(), "events.jsonl", ignored_content);

        let diff = compute_diff_with_options(
            dir.path(),
            &manifest,
            &crate::walk::WalkOptions {
                exclude_extensions: vec!["jsonl".to_string()],
                ..crate::walk::WalkOptions::default()
            },
        )
        .unwrap();

        assert!(diff.dirty.is_empty());
        assert_eq!(diff.deleted, ["events.jsonl"]);
        assert_eq!(diff.unchanged, 1);
    }
}