Skip to main content

objects/object/
tree_diff.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Shared tree-to-tree diffing implementation.
3//!
4//! This module provides a generic tree diffing algorithm that can be used
5//! by both `repo` and `semantic` crates.
6
7use std::collections::HashMap;
8
9use super::FileChangeSet;
10use crate::{
11    object::{EntryType, Tree},
12    store::ObjectStore,
13};
14
15/// Collect all file changes between two trees.
16pub fn diff_trees<S: ObjectStore + ?Sized>(
17    store: &S,
18    from: &crate::object::ContentHash,
19    to: &crate::object::ContentHash,
20) -> Result<FileChangeSet, anyhow::Error> {
21    if from == to {
22        return Ok(FileChangeSet::new());
23    }
24    let from_tree = store.get_tree(from)?;
25    let to_tree = store.get_tree(to)?;
26    let mut changes = FileChangeSet::new();
27    diff_trees_recursive(store, &from_tree, &to_tree, "", &mut changes)?;
28    Ok(changes)
29}
30
31/// Recursively diff two trees, collecting file changes.
32fn diff_trees_recursive<S: ObjectStore + ?Sized>(
33    store: &S,
34    from: &Option<Tree>,
35    to: &Option<Tree>,
36    prefix: &str,
37    changes: &mut FileChangeSet,
38) -> Result<(), anyhow::Error> {
39    let from_entries: HashMap<_, _> = from
40        .as_ref()
41        .map(|tree| {
42            tree.entries()
43                .iter()
44                .map(|entry| (entry.name.clone(), entry))
45                .collect()
46        })
47        .unwrap_or_default();
48
49    let to_entries: HashMap<_, _> = to
50        .as_ref()
51        .map(|tree| {
52            tree.entries()
53                .iter()
54                .map(|entry| (entry.name.clone(), entry))
55                .collect()
56        })
57        .unwrap_or_default();
58
59    for (name, to_entry) in &to_entries {
60        let path = if prefix.is_empty() {
61            name.clone()
62        } else {
63            format!("{}/{}", prefix, name)
64        };
65
66        match from_entries.get(name) {
67            None => {
68                // Symmetric with the delete branch below: if the added
69                // entry is itself a directory, recurse into it so
70                // callers see per-leaf `added` entries
71                // (e.g. `"subdir/file.rs"`) rather than a single
72                // synthetic `"subdir"` directory entry. This matters
73                // for anything that keys on file paths — the ingest
74                // pipeline's transcript matcher, for instance, uses
75                // file-level overlap to score candidate sessions, and
76                // a root commit's `src/` directory would otherwise
77                // collapse to one entry and miss the actual files.
78                if to_entry.entry_type == EntryType::Tree {
79                    let to_subtree = store.get_tree(&to_entry.hash)?;
80                    diff_trees_recursive(store, &None, &to_subtree, &path, changes)?;
81                } else {
82                    changes.push_added(&path);
83                }
84            }
85            Some(from_entry) => {
86                if from_entry.hash != to_entry.hash {
87                    if from_entry.entry_type == EntryType::Tree
88                        && to_entry.entry_type == EntryType::Tree
89                    {
90                        let from_subtree = store.get_tree(&from_entry.hash)?;
91                        let to_subtree = store.get_tree(&to_entry.hash)?;
92                        diff_trees_recursive(store, &from_subtree, &to_subtree, &path, changes)?;
93                    } else {
94                        changes.push_modified(&path);
95                    }
96                }
97            }
98        }
99    }
100
101    for (name, from_entry) in &from_entries {
102        if !to_entries.contains_key(name) {
103            let path = if prefix.is_empty() {
104                name.clone()
105            } else {
106                format!("{}/{}", prefix, name)
107            };
108            // Recursively handle deleted entries - if it's a directory, descend into it
109            if from_entry.entry_type == EntryType::Tree {
110                let from_subtree = store.get_tree(&from_entry.hash)?;
111                diff_trees_recursive(store, &from_subtree, &None, &path, changes)?;
112            } else {
113                changes.push_deleted(&path);
114            }
115        }
116    }
117
118    Ok(())
119}
120
121#[cfg(test)]
122mod tests {
123    use super::*;
124    use crate::{
125        object::{Blob, ContentHash, FileMode, Tree, TreeEntry},
126        store::InMemoryStore,
127    };
128
129    fn create_blob(store: &InMemoryStore, content: &str) -> ContentHash {
130        let blob = Blob::from_slice(content.as_bytes());
131        store.put_blob(&blob).unwrap()
132    }
133
134    fn create_tree(
135        store: &InMemoryStore,
136        entries: Vec<(&str, ContentHash, EntryType)>,
137    ) -> ContentHash {
138        let tree_entries: Vec<TreeEntry> = entries
139            .into_iter()
140            .map(|(name, hash, entry_type)| TreeEntry {
141                name: name.to_string(),
142                mode: FileMode::Normal,
143                hash,
144                entry_type,
145            })
146            .collect();
147        let tree = Tree::from_entries(tree_entries);
148        store.put_tree(&tree).unwrap()
149    }
150
151    #[test]
152    fn test_diff_identical_trees() {
153        let store = InMemoryStore::new();
154        let hash = create_tree(
155            &store,
156            vec![("a.txt", create_blob(&store, "content"), EntryType::Blob)],
157        );
158        let changes = diff_trees(&store, &hash, &hash).unwrap();
159        assert!(changes.is_empty());
160    }
161
162    #[test]
163    fn test_diff_added_file() {
164        let store = InMemoryStore::new();
165        let from_hash = create_tree(&store, vec![]);
166        let to_hash = create_tree(
167            &store,
168            vec![("a.txt", create_blob(&store, "content"), EntryType::Blob)],
169        );
170        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
171
172        assert_eq!(changes.len(), 1);
173        assert_eq!(changes.added_count(), 1);
174
175        let added: Vec<_> = changes.added().collect();
176        assert_eq!(added[0].path, "a.txt");
177    }
178
179    #[test]
180    fn test_diff_deleted_file() {
181        let store = InMemoryStore::new();
182        let blob_hash = create_blob(&store, "content");
183        let from_hash = create_tree(&store, vec![("a.txt", blob_hash, EntryType::Blob)]);
184        let to_hash = create_tree(&store, vec![]);
185        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
186
187        assert_eq!(changes.len(), 1);
188        assert_eq!(changes.deleted_count(), 1);
189
190        let deleted: Vec<_> = changes.deleted().collect();
191        assert_eq!(deleted[0].path, "a.txt");
192    }
193
194    #[test]
195    fn test_diff_modified_file() {
196        let store = InMemoryStore::new();
197        let blob1_hash = create_blob(&store, "original");
198        let blob2_hash = create_blob(&store, "modified");
199        let from_hash = create_tree(&store, vec![("a.txt", blob1_hash, EntryType::Blob)]);
200        let to_hash = create_tree(&store, vec![("a.txt", blob2_hash, EntryType::Blob)]);
201        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
202
203        assert_eq!(changes.len(), 1);
204        assert_eq!(changes.modified_count(), 1);
205
206        let modified: Vec<_> = changes.modified().collect();
207        assert_eq!(modified[0].path, "a.txt");
208    }
209
210    #[test]
211    fn test_diff_nested_directories() {
212        let store = InMemoryStore::new();
213        let sub_blob = create_blob(&store, "sub content");
214        let sub_tree = Tree::from_entries(vec![TreeEntry {
215            name: "nested.txt".to_string(),
216            mode: FileMode::Normal,
217            hash: sub_blob,
218            entry_type: EntryType::Blob,
219        }]);
220        let sub_hash = store.put_tree(&sub_tree).unwrap();
221
222        let from_hash = create_tree(&store, vec![("subdir", sub_hash, EntryType::Tree)]);
223        let to_hash = create_tree(&store, vec![]);
224        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
225
226        assert_eq!(changes.len(), 1);
227        assert_eq!(changes.deleted_count(), 1);
228
229        let deleted: Vec<_> = changes.deleted().collect();
230        assert_eq!(deleted[0].path, "subdir/nested.txt");
231    }
232
233    #[test]
234    fn test_diff_added_directory_recurses() {
235        // Mirror of `test_diff_nested_directories` for the add side.
236        // An added subdirectory should surface each leaf file it
237        // contains — not just the directory name. Previously the add
238        // branch was asymmetric with the delete branch and returned a
239        // single `"subdir"` entry; the root-commit case (empty →
240        // full) hit this every time and broke downstream code that
241        // expected leaf paths.
242        let store = InMemoryStore::new();
243        let sub_blob = create_blob(&store, "sub content");
244        let sub_tree = Tree::from_entries(vec![TreeEntry {
245            name: "nested.txt".to_string(),
246            mode: FileMode::Normal,
247            hash: sub_blob,
248            entry_type: EntryType::Blob,
249        }]);
250        let sub_hash = store.put_tree(&sub_tree).unwrap();
251
252        let from_hash = create_tree(&store, vec![]);
253        let to_hash = create_tree(&store, vec![("subdir", sub_hash, EntryType::Tree)]);
254        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
255
256        assert_eq!(changes.len(), 1);
257        assert_eq!(changes.added_count(), 1);
258
259        let added: Vec<_> = changes.added().collect();
260        assert_eq!(added[0].path, "subdir/nested.txt");
261    }
262
263    #[test]
264    fn test_diff_added_directory_deep_nesting() {
265        // `a/b/c.txt` added to an empty tree should produce one `added`
266        // entry with the full slash-joined path. Exercises multi-level
267        // recursion on the add side.
268        let store = InMemoryStore::new();
269        let leaf_blob = create_blob(&store, "leaf");
270        let c_tree = Tree::from_entries(vec![TreeEntry {
271            name: "c.txt".to_string(),
272            mode: FileMode::Normal,
273            hash: leaf_blob,
274            entry_type: EntryType::Blob,
275        }]);
276        let c_hash = store.put_tree(&c_tree).unwrap();
277        let b_tree = Tree::from_entries(vec![TreeEntry {
278            name: "b".to_string(),
279            mode: FileMode::Normal,
280            hash: c_hash,
281            entry_type: EntryType::Tree,
282        }]);
283        let b_hash = store.put_tree(&b_tree).unwrap();
284        let from_hash = create_tree(&store, vec![]);
285        let to_hash = create_tree(&store, vec![("a", b_hash, EntryType::Tree)]);
286
287        let changes = diff_trees(&store, &from_hash, &to_hash).unwrap();
288        assert_eq!(changes.added_count(), 1);
289        let added: Vec<_> = changes.added().collect();
290        assert_eq!(added[0].path, "a/b/c.txt");
291    }
292}