Skip to main content

objects/object/
tree_diff.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Shared tree-to-tree diffing implementation.
3//!
4//! This module provides a generic tree diffing algorithm that can be used
5//! by both `repo` and `semantic` crates.
6
7use std::ops::ControlFlow;
8
9use super::FileChangeSet;
10use crate::{
11    object::{DiffKind, EntryType, FileChange, Tree, TreeEntry},
12    store::ObjectStore,
13};
14
15/// Collect all file changes between two trees.
16///
17/// This is the materializing variant: it walks the trees via
18/// [`diff_trees_visit`] and collects every [`FileChange`] into a
19/// [`FileChangeSet`]. Streaming or early-exit consumers should prefer
20/// [`diff_trees_visit`], which avoids allocating the full change list.
21pub fn diff_trees<S: ObjectStore + ?Sized>(
22    store: &S,
23    from: &crate::object::ContentHash,
24    to: &crate::object::ContentHash,
25) -> Result<FileChangeSet, anyhow::Error> {
26    let mut changes = FileChangeSet::new();
27    // The visitor never short-circuits here, so the `ControlFlow` result is
28    // always `Continue(())`; we ignore it and return the collected set.
29    let _ = diff_trees_visit(store, from, to, |change| {
30        changes.push(change);
31        ControlFlow::<()>::Continue(())
32    })?;
33    Ok(changes)
34}
35
36/// Diff two trees with internal iteration, invoking `visitor` for each
37/// [`FileChange`] in traversal order.
38///
39/// This is the streaming counterpart to [`diff_trees`]. The visitor returns a
40/// [`ControlFlow`]: `Continue(())` keeps walking, while `Break(value)` stops
41/// the traversal immediately — no further subtrees are loaded and no further
42/// changes are produced. Early-exit consumers (e.g. "does anything under path
43/// X differ?", first-N, quick-status checks) use this to avoid materializing
44/// the entire change list.
45///
46/// On early exit the carried `B` is returned as `Ok(ControlFlow::Break(b))`;
47/// on full completion it returns `Ok(ControlFlow::Continue(()))`. Changes are
48/// emitted in exactly the same order as [`diff_trees`] collects them, so the
49/// two paths are behavior-identical.
50pub fn diff_trees_visit<S, V, B>(
51    store: &S,
52    from: &crate::object::ContentHash,
53    to: &crate::object::ContentHash,
54    mut visitor: V,
55) -> Result<ControlFlow<B>, anyhow::Error>
56where
57    S: ObjectStore + ?Sized,
58    V: FnMut(FileChange) -> ControlFlow<B>,
59{
60    if from == to {
61        return Ok(ControlFlow::Continue(()));
62    }
63    let from_tree = store.get_tree(from)?;
64    let to_tree = store.get_tree(to)?;
65    diff_trees_recursive(store, &from_tree, &to_tree, "", &mut visitor)
66}
67
68/// Recursively diff two trees, invoking `visitor` for each change.
69///
70/// Returns `Ok(ControlFlow::Break(b))` as soon as the visitor breaks, so the
71/// caller can propagate the short-circuit up the recursion without walking the
72/// remaining entries or loading further subtrees.
73fn diff_trees_recursive<S, V, B>(
74    store: &S,
75    from: &Option<Tree>,
76    to: &Option<Tree>,
77    prefix: &str,
78    visitor: &mut V,
79) -> Result<ControlFlow<B>, anyhow::Error>
80where
81    S: ObjectStore + ?Sized,
82    V: FnMut(FileChange) -> ControlFlow<B>,
83{
84    let from_entries = from.as_ref().map_or(&[][..], Tree::entries);
85    let to_entries = to.as_ref().map_or(&[][..], Tree::entries);
86
87    let mut from_index = 0;
88    let mut to_index = 0;
89
90    while let (Some(from_entry), Some(to_entry)) =
91        (from_entries.get(from_index), to_entries.get(to_index))
92    {
93        match from_entry.name.cmp(&to_entry.name) {
94            std::cmp::Ordering::Less => {
95                if let ControlFlow::Break(b) =
96                    visit_deleted_entry(store, prefix, from_entry, visitor)?
97                {
98                    return Ok(ControlFlow::Break(b));
99                }
100                from_index += 1;
101            }
102            std::cmp::Ordering::Greater => {
103                if let ControlFlow::Break(b) = visit_added_entry(store, prefix, to_entry, visitor)?
104                {
105                    return Ok(ControlFlow::Break(b));
106                }
107                to_index += 1;
108            }
109            std::cmp::Ordering::Equal => {
110                if from_entry.hash != to_entry.hash {
111                    if from_entry.entry_type == EntryType::Tree
112                        && to_entry.entry_type == EntryType::Tree
113                    {
114                        let from_subtree = store.get_tree(&from_entry.hash)?;
115                        let to_subtree = store.get_tree(&to_entry.hash)?;
116                        let path = child_path(prefix, &to_entry.name);
117                        if let ControlFlow::Break(b) =
118                            diff_trees_recursive(store, &from_subtree, &to_subtree, &path, visitor)?
119                        {
120                            return Ok(ControlFlow::Break(b));
121                        }
122                    } else {
123                        let path = child_path(prefix, &to_entry.name);
124                        if let ControlFlow::Break(b) =
125                            visitor(FileChange::new(path, DiffKind::Modified))
126                        {
127                            return Ok(ControlFlow::Break(b));
128                        }
129                    }
130                }
131                from_index += 1;
132                to_index += 1;
133            }
134        }
135    }
136
137    for from_entry in &from_entries[from_index..] {
138        if let ControlFlow::Break(b) = visit_deleted_entry(store, prefix, from_entry, visitor)? {
139            return Ok(ControlFlow::Break(b));
140        }
141    }
142
143    for to_entry in &to_entries[to_index..] {
144        if let ControlFlow::Break(b) = visit_added_entry(store, prefix, to_entry, visitor)? {
145            return Ok(ControlFlow::Break(b));
146        }
147    }
148
149    Ok(ControlFlow::Continue(()))
150}
151
152fn visit_added_entry<S, V, B>(
153    store: &S,
154    prefix: &str,
155    to_entry: &TreeEntry,
156    visitor: &mut V,
157) -> Result<ControlFlow<B>, anyhow::Error>
158where
159    S: ObjectStore + ?Sized,
160    V: FnMut(FileChange) -> ControlFlow<B>,
161{
162    // Symmetric with the delete branch below: if the added entry is itself a
163    // directory, recurse into it so callers see per-leaf `added` entries.
164    let path = child_path(prefix, &to_entry.name);
165    if to_entry.entry_type == EntryType::Tree {
166        let to_subtree = store.get_tree(&to_entry.hash)?;
167        diff_trees_recursive(store, &None, &to_subtree, &path, visitor)
168    } else {
169        Ok(visitor(FileChange::new(path, DiffKind::Added)))
170    }
171}
172
173fn visit_deleted_entry<S, V, B>(
174    store: &S,
175    prefix: &str,
176    from_entry: &TreeEntry,
177    visitor: &mut V,
178) -> Result<ControlFlow<B>, anyhow::Error>
179where
180    S: ObjectStore + ?Sized,
181    V: FnMut(FileChange) -> ControlFlow<B>,
182{
183    let path = child_path(prefix, &from_entry.name);
184    if from_entry.entry_type == EntryType::Tree {
185        let from_subtree = store.get_tree(&from_entry.hash)?;
186        diff_trees_recursive(store, &from_subtree, &None, &path, visitor)
187    } else {
188        Ok(visitor(FileChange::new(path, DiffKind::Deleted)))
189    }
190}
191
192fn child_path(prefix: &str, name: &str) -> String {
193    if prefix.is_empty() {
194        name.to_owned()
195    } else {
196        let mut path = String::with_capacity(prefix.len() + 1 + name.len());
197        path.push_str(prefix);
198        path.push('/');
199        path.push_str(name);
200        path
201    }
202}
203
204#[cfg(test)]
205mod tests {
206    use super::*;
207    use crate::{
208        object::{Blob, ContentHash, FileMode, Tree, TreeEntry},
209        store::InMemoryStore,
210    };
211
212    fn create_blob(store: &InMemoryStore, content: &str) -> ContentHash {
213        let blob = Blob::from_slice(content.as_bytes());
214        store.put_blob(&blob).unwrap()
215    }
216
217    fn create_tree(
218        store: &InMemoryStore,
219        entries: Vec<(&str, ContentHash, EntryType)>,
220    ) -> ContentHash {
221        let tree_entries: Vec<TreeEntry> = entries
222            .into_iter()
223            .map(|(name, hash, entry_type)| TreeEntry {
224                name: name.to_string(),
225                mode: FileMode::Normal,
226                hash,
227                entry_type,
228            })
229            .collect();
230        let tree = Tree::from_entries(tree_entries);
231        store.put_tree(&tree).unwrap()
232    }
233
234    #[test]
235    fn test_diff_identical_trees() {
236        let store = InMemoryStore::new();
237        let hash = create_tree(
238            &store,
239            vec![("a.txt", create_blob(&store, "content"), EntryType::Blob)],
240        );
241        let changes = diff_trees(&store, &hash, &hash).unwrap();
242        assert!(changes.is_empty());
243    }
244
245    #[test]
246    fn test_diff_added_file() {
247        let store = InMemoryStore::new();
248        let from_hash = create_tree(&store, vec![]);
249        let to_hash = create_tree(
250            &store,
251            vec![("a.txt", create_blob(&store, "content"), EntryType::Blob)],
252        );
253        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
254
255        assert_eq!(changes.len(), 1);
256        assert_eq!(changes.added_count(), 1);
257
258        let added: Vec<_> = changes.added().collect();
259        assert_eq!(added[0].path, "a.txt");
260    }
261
262    #[test]
263    fn test_diff_deleted_file() {
264        let store = InMemoryStore::new();
265        let blob_hash = create_blob(&store, "content");
266        let from_hash = create_tree(&store, vec![("a.txt", blob_hash, EntryType::Blob)]);
267        let to_hash = create_tree(&store, vec![]);
268        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
269
270        assert_eq!(changes.len(), 1);
271        assert_eq!(changes.deleted_count(), 1);
272
273        let deleted: Vec<_> = changes.deleted().collect();
274        assert_eq!(deleted[0].path, "a.txt");
275    }
276
277    #[test]
278    fn test_diff_modified_file() {
279        let store = InMemoryStore::new();
280        let blob1_hash = create_blob(&store, "original");
281        let blob2_hash = create_blob(&store, "modified");
282        let from_hash = create_tree(&store, vec![("a.txt", blob1_hash, EntryType::Blob)]);
283        let to_hash = create_tree(&store, vec![("a.txt", blob2_hash, EntryType::Blob)]);
284        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
285
286        assert_eq!(changes.len(), 1);
287        assert_eq!(changes.modified_count(), 1);
288
289        let modified: Vec<_> = changes.modified().collect();
290        assert_eq!(modified[0].path, "a.txt");
291    }
292
293    #[test]
294    fn test_diff_nested_directories() {
295        let store = InMemoryStore::new();
296        let sub_blob = create_blob(&store, "sub content");
297        let sub_tree = Tree::from_entries(vec![TreeEntry {
298            name: "nested.txt".to_string(),
299            mode: FileMode::Normal,
300            hash: sub_blob,
301            entry_type: EntryType::Blob,
302        }]);
303        let sub_hash = store.put_tree(&sub_tree).unwrap();
304
305        let from_hash = create_tree(&store, vec![("subdir", sub_hash, EntryType::Tree)]);
306        let to_hash = create_tree(&store, vec![]);
307        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
308
309        assert_eq!(changes.len(), 1);
310        assert_eq!(changes.deleted_count(), 1);
311
312        let deleted: Vec<_> = changes.deleted().collect();
313        assert_eq!(deleted[0].path, "subdir/nested.txt");
314    }
315
316    #[test]
317    fn test_diff_added_directory_recurses() {
318        // Mirror of `test_diff_nested_directories` for the add side.
319        // An added subdirectory should surface each leaf file it
320        // contains — not just the directory name. Previously the add
321        // branch was asymmetric with the delete branch and returned a
322        // single `"subdir"` entry; the root-commit case (empty →
323        // full) hit this every time and broke downstream code that
324        // expected leaf paths.
325        let store = InMemoryStore::new();
326        let sub_blob = create_blob(&store, "sub content");
327        let sub_tree = Tree::from_entries(vec![TreeEntry {
328            name: "nested.txt".to_string(),
329            mode: FileMode::Normal,
330            hash: sub_blob,
331            entry_type: EntryType::Blob,
332        }]);
333        let sub_hash = store.put_tree(&sub_tree).unwrap();
334
335        let from_hash = create_tree(&store, vec![]);
336        let to_hash = create_tree(&store, vec![("subdir", sub_hash, EntryType::Tree)]);
337        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
338
339        assert_eq!(changes.len(), 1);
340        assert_eq!(changes.added_count(), 1);
341
342        let added: Vec<_> = changes.added().collect();
343        assert_eq!(added[0].path, "subdir/nested.txt");
344    }
345
346    #[test]
347    fn test_diff_added_directory_deep_nesting() {
348        // `a/b/c.txt` added to an empty tree should produce one `added`
349        // entry with the full slash-joined path. Exercises multi-level
350        // recursion on the add side.
351        let store = InMemoryStore::new();
352        let leaf_blob = create_blob(&store, "leaf");
353        let c_tree = Tree::from_entries(vec![TreeEntry {
354            name: "c.txt".to_string(),
355            mode: FileMode::Normal,
356            hash: leaf_blob,
357            entry_type: EntryType::Blob,
358        }]);
359        let c_hash = store.put_tree(&c_tree).unwrap();
360        let b_tree = Tree::from_entries(vec![TreeEntry {
361            name: "b".to_string(),
362            mode: FileMode::Normal,
363            hash: c_hash,
364            entry_type: EntryType::Tree,
365        }]);
366        let b_hash = store.put_tree(&b_tree).unwrap();
367        let from_hash = create_tree(&store, vec![]);
368        let to_hash = create_tree(&store, vec![("a", b_hash, EntryType::Tree)]);
369
370        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
371        assert_eq!(changes.added_count(), 1);
372        let added: Vec<_> = changes.added().collect();
373        assert_eq!(added[0].path, "a/b/c.txt");
374    }
375
376    #[test]
377    fn test_diff_changes_follow_sorted_tree_entry_order() {
378        let store = InMemoryStore::new();
379        let from_sub_blob = create_blob(&store, "old nested");
380        let from_sub_tree = Tree::from_entries(vec![TreeEntry {
381            name: "c.txt".to_string(),
382            mode: FileMode::Normal,
383            hash: from_sub_blob,
384            entry_type: EntryType::Blob,
385        }]);
386        let from_sub_hash = store.put_tree(&from_sub_tree).unwrap();
387        let to_sub_blob = create_blob(&store, "new nested");
388        let to_sub_tree = Tree::from_entries(vec![TreeEntry {
389            name: "b.txt".to_string(),
390            mode: FileMode::Normal,
391            hash: to_sub_blob,
392            entry_type: EntryType::Blob,
393        }]);
394        let to_sub_hash = store.put_tree(&to_sub_tree).unwrap();
395
396        let from_hash = create_tree(
397            &store,
398            vec![
399                ("z.txt", create_blob(&store, "old z"), EntryType::Blob),
400                ("dir", from_sub_hash, EntryType::Tree),
401                ("m.txt", create_blob(&store, "same"), EntryType::Blob),
402                ("a.txt", create_blob(&store, "old a"), EntryType::Blob),
403            ],
404        );
405        let to_hash = create_tree(
406            &store,
407            vec![
408                ("b.txt", create_blob(&store, "new b"), EntryType::Blob),
409                ("dir", to_sub_hash, EntryType::Tree),
410                ("m.txt", create_blob(&store, "same"), EntryType::Blob),
411                ("z.txt", create_blob(&store, "new z"), EntryType::Blob),
412            ],
413        );
414
415        let changes: Vec<_> = diff_trees(&store, &from_hash, &to_hash)
416            .unwrap()
417            .into_iter()
418            .map(|change| (change.path, change.kind))
419            .collect();
420
421        assert_eq!(
422            changes,
423            vec![
424                ("a.txt".to_string(), crate::object::DiffKind::Deleted),
425                ("b.txt".to_string(), crate::object::DiffKind::Added),
426                ("dir/b.txt".to_string(), crate::object::DiffKind::Added),
427                ("dir/c.txt".to_string(), crate::object::DiffKind::Deleted),
428                ("z.txt".to_string(), crate::object::DiffKind::Modified),
429            ]
430        );
431    }
432
433    /// The visitor variant must emit changes in exactly the same order as
434    /// `diff_trees` collects them. This is the byte-identical guarantee that
435    /// lets `diff_trees` delegate to `diff_trees_visit` without changing
436    /// observable output.
437    #[test]
438    fn test_visit_matches_collect_order() {
439        let store = InMemoryStore::new();
440        let from_sub_blob = create_blob(&store, "old nested");
441        let from_sub_tree = Tree::from_entries(vec![TreeEntry {
442            name: "c.txt".to_string(),
443            mode: FileMode::Normal,
444            hash: from_sub_blob,
445            entry_type: EntryType::Blob,
446        }]);
447        let from_sub_hash = store.put_tree(&from_sub_tree).unwrap();
448        let to_sub_blob = create_blob(&store, "new nested");
449        let to_sub_tree = Tree::from_entries(vec![TreeEntry {
450            name: "b.txt".to_string(),
451            mode: FileMode::Normal,
452            hash: to_sub_blob,
453            entry_type: EntryType::Blob,
454        }]);
455        let to_sub_hash = store.put_tree(&to_sub_tree).unwrap();
456
457        let from_hash = create_tree(
458            &store,
459            vec![
460                ("z.txt", create_blob(&store, "old z"), EntryType::Blob),
461                ("dir", from_sub_hash, EntryType::Tree),
462                ("m.txt", create_blob(&store, "same"), EntryType::Blob),
463                ("a.txt", create_blob(&store, "old a"), EntryType::Blob),
464            ],
465        );
466        let to_hash = create_tree(
467            &store,
468            vec![
469                ("b.txt", create_blob(&store, "new b"), EntryType::Blob),
470                ("dir", to_sub_hash, EntryType::Tree),
471                ("m.txt", create_blob(&store, "same"), EntryType::Blob),
472                ("z.txt", create_blob(&store, "new z"), EntryType::Blob),
473            ],
474        );
475
476        let collected: Vec<_> = diff_trees(&store, &from_hash, &to_hash)
477            .unwrap()
478            .into_iter()
479            .map(FileChange::into_tuple)
480            .collect();
481
482        let mut visited = Vec::new();
483        let flow = diff_trees_visit(&store, &from_hash, &to_hash, |change| {
484            visited.push(change.into_tuple());
485            ControlFlow::<()>::Continue(())
486        })
487        .unwrap();
488
489        assert!(flow.is_continue());
490        assert_eq!(visited, collected);
491    }
492
493    #[test]
494    fn test_visit_identical_trees_never_calls_visitor() {
495        let store = InMemoryStore::new();
496        let hash = create_tree(
497            &store,
498            vec![("a.txt", create_blob(&store, "content"), EntryType::Blob)],
499        );
500        let mut count = 0usize;
501        let flow = diff_trees_visit(&store, &hash, &hash, |_change| {
502            count += 1;
503            ControlFlow::<()>::Continue(())
504        })
505        .unwrap();
506        assert!(flow.is_continue());
507        assert_eq!(count, 0);
508    }
509
510    /// Early-exit: breaking from the visitor stops the walk and stops loading
511    /// further subtrees. We assert both the carried `Break` payload and that
512    /// the visitor saw strictly fewer changes than the full diff.
513    #[test]
514    fn test_visit_early_exit_stops_walk() {
515        let store = InMemoryStore::new();
516        // Five distinct top-level files all added → five `added` changes in
517        // sorted order: a, b, c, d, e.
518        let from_hash = create_tree(&store, vec![]);
519        let to_hash = create_tree(
520            &store,
521            vec![
522                ("a.txt", create_blob(&store, "a"), EntryType::Blob),
523                ("b.txt", create_blob(&store, "b"), EntryType::Blob),
524                ("c.txt", create_blob(&store, "c"), EntryType::Blob),
525                ("d.txt", create_blob(&store, "d"), EntryType::Blob),
526                ("e.txt", create_blob(&store, "e"), EntryType::Blob),
527            ],
528        );
529
530        let mut seen = Vec::new();
531        let flow = diff_trees_visit(&store, &from_hash, &to_hash, |change| {
532            seen.push(change.path.clone());
533            if change.path == "c.txt" {
534                ControlFlow::Break("found c")
535            } else {
536                ControlFlow::Continue(())
537            }
538        })
539        .unwrap();
540
541        assert_eq!(flow, ControlFlow::Break("found c"));
542        // Stopped at c.txt — never visited d.txt or e.txt.
543        assert_eq!(seen, vec!["a.txt", "b.txt", "c.txt"]);
544    }
545
546    /// Early-exit must also short-circuit out of nested-subtree recursion, not
547    /// just the top level.
548    #[test]
549    fn test_visit_early_exit_inside_subtree() {
550        let store = InMemoryStore::new();
551        let sub_tree = Tree::from_entries(vec![
552            TreeEntry {
553                name: "x.txt".to_string(),
554                mode: FileMode::Normal,
555                hash: create_blob(&store, "x"),
556                entry_type: EntryType::Blob,
557            },
558            TreeEntry {
559                name: "y.txt".to_string(),
560                mode: FileMode::Normal,
561                hash: create_blob(&store, "y"),
562                entry_type: EntryType::Blob,
563            },
564        ]);
565        let sub_hash = store.put_tree(&sub_tree).unwrap();
566        let from_hash = create_tree(&store, vec![]);
567        let to_hash = create_tree(
568            &store,
569            vec![
570                ("dir", sub_hash, EntryType::Tree),
571                ("z.txt", create_blob(&store, "z"), EntryType::Blob),
572            ],
573        );
574
575        let mut seen = Vec::new();
576        let flow = diff_trees_visit(&store, &from_hash, &to_hash, |change| {
577            seen.push(change.path.clone());
578            ControlFlow::Break(())
579        })
580        .unwrap();
581
582        assert_eq!(flow, ControlFlow::Break(()));
583        // Broke on the very first leaf inside `dir`; `dir/y.txt` and `z.txt`
584        // were never visited.
585        assert_eq!(seen, vec!["dir/x.txt"]);
586    }
587}