Skip to main content

openjd_snapshots/ops/
diff.rs

1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// Copyright by contributors to this project.
3// SPDX-License-Identifier: (Apache-2.0 OR MIT)
4
5use crate::manifest::{Diff, DirEntry, FileEntry, Full, Manifest};
6use std::collections::HashMap;
7
8/// Options controlling the DIFF operation ([`diff_snapshots`]).
9#[derive(Default)]
10pub struct DiffOptions {
11    /// Hash of the parent manifest to record in the diff's
12    /// `parent_manifest_hash` field. The caller computes this from the
13    /// parent's serialized form.
14    pub parent_manifest_hash: Option<String>,
15
16    /// If `true`, ignore `hash`/`chunk_hashes` when comparing entries and
17    /// skip the hash-state compatibility check. Useful for fast diffs where
18    /// only metadata (size, mtime, runnable) is compared.
19    pub ignore_hashes: bool,
20
21    /// If `true`, the `runnable` field is treated specially when a file appears
22    /// in both snapshots:
23    ///
24    /// 1. **Comparison**: `runnable` is ignored when deciding whether the file
25    ///    changed (so a Windows collector that lost the POSIX execute bit does
26    ///    not cause a spurious "modified" entry).
27    /// 2. **Preservation**: if the file is otherwise modified (size, mtime, or
28    ///    hash differs), the parent entry's `runnable` value is copied into
29    ///    the diff entry instead of the current entry's value.
30    ///
31    /// The comparison half is delegated to [`entries_differ`] via its
32    /// `ignore_runnable` parameter; the preservation half is applied by
33    /// [`diff_snapshots`] itself when it emits the diff entry.
34    pub preserve_runnable: bool,
35}
36
37/// Compares two file entries to determine if they differ.
38///
39/// Checks entry type transitions (regular vs symlink), content hashes (unless
40/// `ignore_hashes`), and metadata (size, mtime, runnable).
41///
42/// # Parameters
43///
44/// - `parent`: The parent-side entry (older snapshot).
45/// - `current`: The current-side entry (newer snapshot).
46/// - `ignore_hashes`: When `true`, skip the `hash`/`chunk_hashes` comparison.
47///   Used for fast diff mode where only metadata is compared.
48/// - `ignore_runnable`: When `true`, skip the `runnable` comparison. Callers
49///   that want the top-level [`DiffOptions::preserve_runnable`] behaviour pass
50///   the value of that field here — this function only controls whether
51///   `runnable` participates in the comparison, not how the diff entry is
52///   constructed. The "preserve" half of `preserve_runnable` (copying the
53///   parent's `runnable` into the diff entry) is handled by [`diff_snapshots`]
54///   itself. This matches the Python reference's `_entries_differ(...,
55///   ignore_runnable=...)`.
56pub fn entries_differ(
57    parent: &FileEntry,
58    current: &FileEntry,
59    ignore_hashes: bool,
60    ignore_runnable: bool,
61) -> bool {
62    let parent_is_symlink = parent.symlink_target.is_some();
63    let current_is_symlink = current.symlink_target.is_some();
64
65    // Type transition always differs
66    if parent_is_symlink != current_is_symlink {
67        return true;
68    }
69
70    // Symlinks: compare target only
71    if current_is_symlink {
72        return parent.symlink_target != current.symlink_target;
73    }
74
75    // Regular files
76    if parent.size != current.size || parent.mtime != current.mtime {
77        return true;
78    }
79    if !ignore_hashes
80        && (parent.hash != current.hash || parent.chunk_hashes != current.chunk_hashes)
81    {
82        return true;
83    }
84    if !ignore_runnable && parent.runnable != current.runnable {
85        return true;
86    }
87    false
88}
89
90/// Returns `None` if no regular files, `Some(true)` if any have hashes, `Some(false)` if none do.
91fn has_hashed_files<P, K>(manifest: &Manifest<P, K>) -> Option<bool> {
92    let regular_files: Vec<_> = manifest
93        .files
94        .iter()
95        .filter(|f| f.symlink_target.is_none() && !f.deleted)
96        .collect();
97    if regular_files.is_empty() {
98        return None;
99    }
100    Some(
101        regular_files
102            .iter()
103            .any(|f| f.hash.is_some() || f.chunk_hashes.is_some()),
104    )
105}
106
107/// Computes the difference between two snapshot manifests.
108///
109/// Returns a diff manifest containing new/modified entries and deletion
110/// markers. Both manifests must have the same path style. When a directory
111/// is deleted, all its contents receive explicit deletion markers.
112pub fn diff_snapshots<P: Clone>(
113    parent: &Manifest<P, Full>,
114    current: &Manifest<P, Full>,
115    options: &DiffOptions,
116) -> crate::Result<Manifest<P, Diff>> {
117    if !options.ignore_hashes {
118        let parent_hashed = has_hashed_files(parent);
119        let current_hashed = has_hashed_files(current);
120        if let (Some(ph), Some(ch)) = (parent_hashed, current_hashed) {
121            if ph && !ch {
122                return Err(crate::SnapshotError::Validation(
123                    "cannot diff hashed parent manifest against unhashed current manifest when ignore_hashes=false".into(),
124                ));
125            }
126            if !ph && ch {
127                return Err(crate::SnapshotError::Validation(
128                    "cannot diff unhashed parent manifest against hashed current manifest when ignore_hashes=false".into(),
129                ));
130            }
131        }
132    }
133
134    let parent_files: HashMap<&str, &FileEntry> =
135        parent.files.iter().map(|f| (f.path.as_str(), f)).collect();
136    let parent_dirs: HashMap<&str, &DirEntry> =
137        parent.dirs.iter().map(|d| (d.path.as_str(), d)).collect();
138    let current_files: HashMap<&str, &FileEntry> =
139        current.files.iter().map(|f| (f.path.as_str(), f)).collect();
140    let current_dirs: HashMap<&str, &DirEntry> =
141        current.dirs.iter().map(|d| (d.path.as_str(), d)).collect();
142
143    let mut files = Vec::new();
144
145    // New and modified files
146    for cf in &current.files {
147        match parent_files.get(cf.path.as_str()) {
148            None => files.push(cf.clone()),
149            Some(pf) => {
150                // `preserve_runnable` at the options layer means two things:
151                //   1. Ignore the `runnable` field when deciding if files differ
152                //      (this is the `ignore_runnable` arg to `entries_differ`).
153                //   2. If the file is otherwise modified, copy the parent's
154                //      `runnable` into the diff entry (handled below).
155                if entries_differ(
156                    pf,
157                    cf,
158                    options.ignore_hashes,
159                    /* ignore_runnable = */ options.preserve_runnable,
160                ) {
161                    let mut entry = cf.clone();
162                    if options.preserve_runnable && cf.symlink_target.is_none() {
163                        entry.runnable = pf.runnable;
164                    }
165                    files.push(entry);
166                }
167            }
168        }
169    }
170
171    // Deleted files
172    let mut deleted_file_paths: std::collections::HashSet<&str> = std::collections::HashSet::new();
173    for pf in &parent.files {
174        if !current_files.contains_key(pf.path.as_str()) {
175            deleted_file_paths.insert(&pf.path);
176        }
177    }
178
179    let mut dirs = Vec::new();
180
181    // New dirs
182    for cd in &current.dirs {
183        if !parent_dirs.contains_key(cd.path.as_str()) {
184            dirs.push(cd.clone());
185        }
186    }
187
188    // Deleted dirs
189    let mut deleted_dir_paths: std::collections::HashSet<&str> = std::collections::HashSet::new();
190    for pd in &parent.dirs {
191        if !current_dirs.contains_key(pd.path.as_str()) {
192            deleted_dir_paths.insert(&pd.path);
193        }
194    }
195
196    // Ensure deleted directories have all their contents deleted too.
197    for deleted_dir in deleted_dir_paths.iter().copied().collect::<Vec<_>>() {
198        let dir_prefix = format!("{}/", deleted_dir);
199        for pf in &parent.files {
200            if pf.path.starts_with(&dir_prefix) && !current_files.contains_key(pf.path.as_str()) {
201                deleted_file_paths.insert(&pf.path);
202            }
203        }
204        for pd in &parent.dirs {
205            if pd.path.starts_with(&dir_prefix) && !current_dirs.contains_key(pd.path.as_str()) {
206                deleted_dir_paths.insert(&pd.path);
207            }
208        }
209    }
210
211    // Add file deletion markers
212    for path in &deleted_file_paths {
213        files.push(FileEntry::deleted(*path));
214    }
215
216    // Add dir deletion markers
217    let sorted_deleted_dirs: Vec<&str> = deleted_dir_paths.into_iter().collect();
218    for path in sorted_deleted_dirs {
219        dirs.push(DirEntry::deleted(path));
220    }
221
222    // Sort: non-deleted files first (by path), then deleted files (by path)
223    files.sort_by(|a, b| match (a.deleted, b.deleted) {
224        (false, true) => std::cmp::Ordering::Less,
225        (true, false) => std::cmp::Ordering::Greater,
226        _ => a.path.cmp(&b.path),
227    });
228
229    // Sort: non-deleted dirs first (by path), then deleted dirs (deepest-first, then by path)
230    dirs.sort_by(|a, b| match (a.deleted, b.deleted) {
231        (false, true) => std::cmp::Ordering::Less,
232        (true, false) => std::cmp::Ordering::Greater,
233        (false, false) => a.path.cmp(&b.path),
234        (true, true) => b.path.len().cmp(&a.path.len()).then(a.path.cmp(&b.path)),
235    });
236
237    let mut result = Manifest::new(parent.hash_alg, parent.file_chunk_size_bytes);
238    result.files = files;
239    result.dirs = dirs;
240    result.parent_manifest_hash = options.parent_manifest_hash.clone();
241    result.recompute_total_size();
242    Ok(result)
243}
244
245#[cfg(test)]
246mod tests {
247    use super::*;
248    use crate::hash::HashAlgorithm;
249    use crate::manifest::{Full, Rel};
250    use crate::{DirEntry, FileEntry, Manifest, DEFAULT_FILE_CHUNK_SIZE};
251
252    type RelSnapshot = Manifest<Rel, Full>;
253
254    fn make(files: Vec<FileEntry>, dirs: Vec<DirEntry>) -> RelSnapshot {
255        Manifest::new(HashAlgorithm::Xxh128, DEFAULT_FILE_CHUNK_SIZE)
256            .with_files(files)
257            .with_dirs(dirs)
258    }
259
260    fn default_opts() -> DiffOptions {
261        DiffOptions::default()
262    }
263
264    #[test]
265    fn no_changes_empty_diff() {
266        let m = make(vec![FileEntry::file("a.txt", 100, 1000)], vec![]);
267        let diff = diff_snapshots(&m, &m, &default_opts()).unwrap();
268        assert!(diff.files.is_empty());
269        assert!(diff.dirs.is_empty());
270    }
271
272    #[test]
273    fn new_file_detected() {
274        let parent = make(vec![], vec![]);
275        let current = make(vec![FileEntry::file("new.txt", 50, 1)], vec![]);
276        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
277        assert_eq!(diff.files.len(), 1);
278        assert_eq!(diff.files[0].path, "new.txt");
279        assert!(!diff.files[0].deleted);
280    }
281
282    #[test]
283    fn modified_file_by_mtime() {
284        let parent = make(vec![FileEntry::file("a.txt", 100, 1000)], vec![]);
285        let current = make(vec![FileEntry::file("a.txt", 100, 2000)], vec![]);
286        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
287        assert_eq!(diff.files.len(), 1);
288        assert_eq!(diff.files[0].mtime, Some(2000));
289    }
290
291    #[test]
292    fn modified_file_by_hash() {
293        let mut pf = FileEntry::file("a.txt", 100, 1000);
294        pf.hash = Some("aaa".into());
295        let mut cf = FileEntry::file("a.txt", 100, 1000);
296        cf.hash = Some("bbb".into());
297        let parent = make(vec![pf], vec![]);
298        let current = make(vec![cf], vec![]);
299        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
300        assert_eq!(diff.files.len(), 1);
301        assert_eq!(diff.files[0].hash.as_deref(), Some("bbb"));
302    }
303
304    #[test]
305    fn deleted_file_marker() {
306        let parent = make(vec![FileEntry::file("gone.txt", 100, 1)], vec![]);
307        let current = make(vec![], vec![]);
308        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
309        assert_eq!(diff.files.len(), 1);
310        assert!(diff.files[0].deleted);
311        assert_eq!(diff.files[0].path, "gone.txt");
312    }
313
314    #[test]
315    fn ignore_hashes_mode() {
316        let mut pf = FileEntry::file("a.txt", 100, 1000);
317        pf.hash = Some("aaa".into());
318        let mut cf = FileEntry::file("a.txt", 100, 1000);
319        cf.hash = Some("bbb".into());
320        let parent = make(vec![pf], vec![]);
321        let current = make(vec![cf], vec![]);
322        let opts = DiffOptions {
323            ignore_hashes: true,
324            ..default_opts()
325        };
326        let diff = diff_snapshots(&parent, &current, &opts).unwrap();
327        assert!(diff.files.is_empty(), "hash-only change should be ignored");
328    }
329
330    #[test]
331    fn preserve_runnable_copies_from_parent() {
332        let mut pf = FileEntry::file("script.sh", 100, 1000);
333        pf.runnable = true;
334        // Current has different mtime (modified) but runnable=false (Windows)
335        let cf = FileEntry::file("script.sh", 100, 2000);
336        let parent = make(vec![pf], vec![]);
337        let current = make(vec![cf], vec![]);
338        let opts = DiffOptions {
339            preserve_runnable: true,
340            ..default_opts()
341        };
342        let diff = diff_snapshots(&parent, &current, &opts).unwrap();
343        assert_eq!(diff.files.len(), 1);
344        assert!(
345            diff.files[0].runnable,
346            "runnable should be copied from parent"
347        );
348    }
349
350    #[test]
351    fn symlink_change_detected() {
352        let parent = make(vec![FileEntry::symlink("link", "target_a")], vec![]);
353        let current = make(vec![FileEntry::symlink("link", "target_b")], vec![]);
354        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
355        assert_eq!(diff.files.len(), 1);
356        assert_eq!(diff.files[0].symlink_target.as_deref(), Some("target_b"));
357    }
358
359    #[test]
360    fn dir_additions_and_deletions() {
361        let parent = make(vec![], vec![DirEntry::new("old_dir")]);
362        let current = make(vec![], vec![DirEntry::new("new_dir")]);
363        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
364        assert_eq!(diff.dirs.len(), 2);
365        let new = diff.dirs.iter().find(|d| d.path == "new_dir").unwrap();
366        assert!(!new.deleted);
367        let old = diff.dirs.iter().find(|d| d.path == "old_dir").unwrap();
368        assert!(old.deleted);
369    }
370
371    #[test]
372    fn hash_state_mismatch_hashed_parent_unhashed_current() {
373        let mut pf = FileEntry::file("a.txt", 100, 1000);
374        pf.hash = Some("aaa".into());
375        let parent = make(vec![pf], vec![]);
376        let current = make(vec![FileEntry::file("a.txt", 100, 1000)], vec![]);
377        let result = diff_snapshots(&parent, &current, &default_opts());
378        assert!(result.is_err());
379        let msg = result.unwrap_err().to_string();
380        assert!(msg.contains("hashed parent"));
381    }
382
383    #[test]
384    fn hash_state_mismatch_unhashed_parent_hashed_current() {
385        let parent = make(vec![FileEntry::file("a.txt", 100, 1000)], vec![]);
386        let mut cf = FileEntry::file("a.txt", 100, 1000);
387        cf.hash = Some("bbb".into());
388        let current = make(vec![cf], vec![]);
389        let result = diff_snapshots(&parent, &current, &default_opts());
390        assert!(result.is_err());
391        let msg = result.unwrap_err().to_string();
392        assert!(msg.contains("unhashed parent"));
393    }
394
395    #[test]
396    fn hash_state_mismatch_allowed_with_ignore_hashes() {
397        let mut pf = FileEntry::file("a.txt", 100, 1000);
398        pf.hash = Some("aaa".into());
399        let parent = make(vec![pf], vec![]);
400        let current = make(vec![FileEntry::file("a.txt", 100, 1000)], vec![]);
401        let opts = DiffOptions {
402            ignore_hashes: true,
403            ..default_opts()
404        };
405        assert!(diff_snapshots(&parent, &current, &opts).is_ok());
406    }
407
408    #[test]
409    fn hash_state_empty_manifests_compatible() {
410        let parent = make(vec![], vec![]);
411        let mut cf = FileEntry::file("a.txt", 100, 1000);
412        cf.hash = Some("aaa".into());
413        let current = make(vec![cf], vec![]);
414        // Empty parent has no regular files -> None, so no mismatch
415        assert!(diff_snapshots(&parent, &current, &default_opts()).is_ok());
416    }
417
418    #[test]
419    fn hash_state_symlink_only_compatible() {
420        let parent = make(vec![FileEntry::symlink("link", "target")], vec![]);
421        let mut cf = FileEntry::file("a.txt", 100, 1000);
422        cf.hash = Some("aaa".into());
423        let current = make(vec![cf], vec![]);
424        // Parent has only symlinks -> None for has_hashed_files
425        assert!(diff_snapshots(&parent, &current, &default_opts()).is_ok());
426    }
427
428    #[test]
429    fn deleted_dir_cascades_to_contents() {
430        let parent = make(
431            vec![
432                FileEntry::file("dir/a.txt", 10, 1),
433                FileEntry::file("dir/sub/b.txt", 20, 2),
434            ],
435            vec![DirEntry::new("dir"), DirEntry::new("dir/sub")],
436        );
437        let current = make(vec![], vec![]);
438        let diff = diff_snapshots(&parent, &current, &default_opts()).unwrap();
439        // All files under deleted dirs should have deletion markers
440        let deleted_files: Vec<&str> = diff
441            .files
442            .iter()
443            .filter(|f| f.deleted)
444            .map(|f| f.path.as_str())
445            .collect();
446        assert!(deleted_files.contains(&"dir/a.txt"));
447        assert!(deleted_files.contains(&"dir/sub/b.txt"));
448        // Deleted dirs should also be present
449        let deleted_dirs: Vec<&str> = diff
450            .dirs
451            .iter()
452            .filter(|d| d.deleted)
453            .map(|d| d.path.as_str())
454            .collect();
455        assert!(deleted_dirs.contains(&"dir"));
456        assert!(deleted_dirs.contains(&"dir/sub"));
457    }
458
459    #[test]
460    fn parent_manifest_hash_set() {
461        let m = make(vec![], vec![]);
462        let opts = DiffOptions {
463            parent_manifest_hash: Some("hash123".into()),
464            ..default_opts()
465        };
466        let diff = diff_snapshots(&m, &m, &opts).unwrap();
467        assert_eq!(diff.parent_manifest_hash.as_deref(), Some("hash123"));
468    }
469}