Skip to main content

clayers_repo/
diff.rs

1//! Structural tree diff operating on the Merkle DAG.
2//!
3//! Compares two content-addressed trees by exploiting the Merkle property:
4//! if hashes are equal, the subtrees are identical (short-circuit).
5
6use std::collections::HashSet;
7
8use clayers_xml::ContentHash;
9
10use crate::error::{Error, Result};
11use crate::object::{Object, TreeObject};
12use crate::store::ObjectStore;
13
14/// A structural diff between two trees.
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct TreeDiff {
17    /// The individual node-level changes.
18    pub changes: Vec<NodeChange>,
19}
20
21impl TreeDiff {
22    /// True if there are no changes.
23    #[must_use]
24    pub fn is_empty(&self) -> bool {
25        self.changes.is_empty()
26    }
27}
28
29/// A single change in the tree diff.
30#[derive(Debug, Clone, PartialEq, Eq)]
31pub enum NodeChange {
32    /// A node was added to a parent at a given position.
33    Added {
34        /// Parent element hash.
35        parent: ContentHash,
36        /// Position among siblings.
37        position: usize,
38        /// Hash of the added node.
39        node: ContentHash,
40    },
41    /// A node was removed from a parent at a given position.
42    Removed {
43        /// Parent element hash.
44        parent: ContentHash,
45        /// Position among siblings.
46        position: usize,
47        /// Hash of the removed node.
48        node: ContentHash,
49    },
50    /// A node was modified (hash changed, subtree differs).
51    Modified {
52        /// Hash before the change.
53        hash_before: ContentHash,
54        /// Hash after the change.
55        hash_after: ContentHash,
56        /// Recursive diff of the subtree.
57        inner: Box<TreeDiff>,
58    },
59    /// An attribute on an element was changed.
60    AttributeChanged {
61        /// The element's hash (after change).
62        element: ContentHash,
63        /// The attribute name.
64        attr: String,
65        /// The old value (None if attribute was added).
66        old: Option<String>,
67        /// The new value (None if attribute was removed).
68        new: Option<String>,
69    },
70    /// A text node's content changed.
71    TextChanged {
72        /// The old text object hash.
73        old: ContentHash,
74        /// The new text object hash.
75        new: ContentHash,
76    },
77}
78
79/// Compute a structural diff between two content-addressed trees.
80///
81/// Uses the Merkle property to short-circuit comparison of unchanged subtrees.
82///
83/// # Errors
84///
85/// Returns an error if referenced objects cannot be loaded from the store.
86pub async fn diff(
87    store: &dyn ObjectStore,
88    a: ContentHash,
89    b: ContentHash,
90) -> Result<TreeDiff> {
91    // Short-circuit: identical hashes mean identical content.
92    if a == b {
93        return Ok(TreeDiff {
94            changes: Vec::new(),
95        });
96    }
97
98    let obj_a = store.get(&a).await?.ok_or(Error::NotFound(a))?;
99    let obj_b = store.get(&b).await?.ok_or(Error::NotFound(b))?;
100
101    let mut changes = Vec::new();
102
103    match (&obj_a, &obj_b) {
104        (Object::Text(_), Object::Text(_)) => {
105            changes.push(NodeChange::TextChanged { old: a, new: b });
106        }
107        (Object::Element(el_a), Object::Element(el_b)) => {
108            // Diff attributes.
109            diff_attributes(&mut changes, b, el_a, el_b);
110
111            // Diff children using position-based matching.
112            diff_children(store, &mut changes, a, &el_a.children, &el_b.children).await?;
113        }
114        _ => {
115            // Different node types: treat as complete replacement.
116            changes.push(NodeChange::Removed {
117                parent: a,
118                position: 0,
119                node: a,
120            });
121            changes.push(NodeChange::Added {
122                parent: b,
123                position: 0,
124                node: b,
125            });
126        }
127    }
128
129    Ok(TreeDiff { changes })
130}
131
132/// Diff attributes between two elements.
133fn diff_attributes(
134    changes: &mut Vec<NodeChange>,
135    element_hash: ContentHash,
136    el_a: &crate::object::ElementObject,
137    el_b: &crate::object::ElementObject,
138) {
139    // Check for removed or changed attributes.
140    for attr_a in &el_a.attributes {
141        let matching = el_b.attributes.iter().find(|ab| {
142            ab.local_name == attr_a.local_name && ab.namespace_uri == attr_a.namespace_uri
143        });
144        match matching {
145            Some(attr_b) if attr_b.value != attr_a.value => {
146                changes.push(NodeChange::AttributeChanged {
147                    element: element_hash,
148                    attr: attr_a.local_name.clone(),
149                    old: Some(attr_a.value.clone()),
150                    new: Some(attr_b.value.clone()),
151                });
152            }
153            None => {
154                changes.push(NodeChange::AttributeChanged {
155                    element: element_hash,
156                    attr: attr_a.local_name.clone(),
157                    old: Some(attr_a.value.clone()),
158                    new: None,
159                });
160            }
161            _ => {}
162        }
163    }
164
165    // Check for added attributes.
166    for attr_b in &el_b.attributes {
167        let exists_in_a = el_a.attributes.iter().any(|aa| {
168            aa.local_name == attr_b.local_name && aa.namespace_uri == attr_b.namespace_uri
169        });
170        if !exists_in_a {
171            changes.push(NodeChange::AttributeChanged {
172                element: element_hash,
173                attr: attr_b.local_name.clone(),
174                old: None,
175                new: Some(attr_b.value.clone()),
176            });
177        }
178    }
179}
180
181/// Diff ordered child lists using position-based matching with hash short-circuit.
182async fn diff_children(
183    store: &dyn ObjectStore,
184    changes: &mut Vec<NodeChange>,
185    parent: ContentHash,
186    children_a: &[ContentHash],
187    children_b: &[ContentHash],
188) -> Result<()> {
189    let len_a = children_a.len();
190    let len_b = children_b.len();
191    let min_len = len_a.min(len_b);
192
193    // Match by position.
194    for i in 0..min_len {
195        if children_a[i] != children_b[i] {
196            // Subtrees differ: recurse.
197            let inner = Box::pin(diff(store, children_a[i], children_b[i])).await?;
198            changes.push(NodeChange::Modified {
199                hash_before: children_a[i],
200                hash_after: children_b[i],
201                inner: Box::new(inner),
202            });
203        }
204    }
205
206    // Removed children (a has more than b).
207    for (i, child) in children_a.iter().enumerate().skip(min_len) {
208        changes.push(NodeChange::Removed {
209            parent,
210            position: i,
211            node: *child,
212        });
213    }
214
215    // Added children (b has more than a).
216    for (i, child) in children_b.iter().enumerate().skip(min_len) {
217        changes.push(NodeChange::Added {
218            parent,
219            position: i,
220            node: *child,
221        });
222    }
223
224    Ok(())
225}
226
227// ---------------------------------------------------------------------------
228// File-level (tree-to-tree) diff
229// ---------------------------------------------------------------------------
230
231/// A file-level change between two tree objects.
232#[derive(Debug, Clone, PartialEq, Eq)]
233#[cfg_attr(feature = "serde", derive(serde::Serialize))]
234#[cfg_attr(feature = "serde", serde(tag = "type", rename_all = "snake_case"))]
235pub enum FileChange {
236    /// A file was added.
237    Added {
238        /// File path in the tree.
239        path: String,
240        /// Document hash.
241        document: ContentHash,
242    },
243    /// A file was removed.
244    Removed {
245        /// File path in the tree.
246        path: String,
247        /// Document hash.
248        document: ContentHash,
249    },
250    /// A file was modified (document hash changed).
251    Modified {
252        /// File path in the tree.
253        path: String,
254        /// Old document hash.
255        old_doc: ContentHash,
256        /// New document hash.
257        new_doc: ContentHash,
258    },
259}
260
261/// Compare two tree objects and return file-level changes.
262///
263/// Identifies which files were added, removed, or modified between `tree_a`
264/// and `tree_b` by comparing tree entries by path and document hash.
265#[must_use]
266pub fn diff_trees(tree_a: &TreeObject, tree_b: &TreeObject) -> Vec<FileChange> {
267    let mut changes = Vec::new();
268
269    let paths_a: HashSet<&str> = tree_a.entries.iter().map(|e| e.path.as_str()).collect();
270
271    // Removed or modified.
272    for entry in &tree_a.entries {
273        if let Some(entry_b) = tree_b.entries.iter().find(|e| e.path == entry.path) {
274            if entry.document != entry_b.document {
275                changes.push(FileChange::Modified {
276                    path: entry.path.clone(),
277                    old_doc: entry.document,
278                    new_doc: entry_b.document,
279                });
280            }
281        } else {
282            changes.push(FileChange::Removed {
283                path: entry.path.clone(),
284                document: entry.document,
285            });
286        }
287    }
288
289    // Added.
290    for entry in &tree_b.entries {
291        if !paths_a.contains(entry.path.as_str()) {
292            changes.push(FileChange::Added {
293                path: entry.path.clone(),
294                document: entry.document,
295            });
296        }
297    }
298
299    changes
300}
301
302#[cfg(test)]
303mod tests {
304    use super::*;
305    use crate::object::{ElementObject, TextObject};
306    use crate::store::memory::MemoryStore;
307
308    async fn store_text(store: &MemoryStore, text: &str) -> ContentHash {
309        let hash = ContentHash::from_canonical(text.as_bytes());
310        let mut tx = store.transaction().await.unwrap();
311        tx.put(
312            hash,
313            Object::Text(TextObject {
314                content: text.into(),
315            }),
316        )
317        .await
318        .unwrap();
319        tx.commit().await.unwrap();
320        hash
321    }
322
323    #[tokio::test]
324    async fn identical_hashes_no_changes() {
325        let store = MemoryStore::new();
326        let h = store_text(&store, "same").await;
327        let d = diff(&store, h, h).await.unwrap();
328        assert!(d.is_empty());
329    }
330
331    #[tokio::test]
332    async fn text_content_change() {
333        let store = MemoryStore::new();
334        let h1 = store_text(&store, "old").await;
335        let h2 = store_text(&store, "new").await;
336        let d = diff(&store, h1, h2).await.unwrap();
337        assert_eq!(d.changes.len(), 1);
338        assert!(matches!(&d.changes[0], NodeChange::TextChanged { .. }));
339    }
340
341    #[tokio::test]
342    async fn attribute_change_detected() {
343        let store = MemoryStore::new();
344        let h1 = ContentHash::from_canonical(b"el1");
345        let h2 = ContentHash::from_canonical(b"el2");
346
347        let mut tx = store.transaction().await.unwrap();
348        tx.put(
349            h1,
350            Object::Element(ElementObject {
351                local_name: "div".into(),
352                namespace_uri: None,
353                namespace_prefix: None,
354                extra_namespaces: vec![],
355                attributes: vec![crate::object::Attribute {
356                    local_name: "class".into(),
357                    namespace_uri: None,
358                    namespace_prefix: None,
359                    value: "old".into(),
360                }],
361                children: vec![],
362                inclusive_hash: h1,
363            }),
364        )
365        .await
366        .unwrap();
367        tx.put(
368            h2,
369            Object::Element(ElementObject {
370                local_name: "div".into(),
371                namespace_uri: None,
372                namespace_prefix: None,
373                extra_namespaces: vec![],
374                attributes: vec![crate::object::Attribute {
375                    local_name: "class".into(),
376                    namespace_uri: None,
377                    namespace_prefix: None,
378                    value: "new".into(),
379                }],
380                children: vec![],
381                inclusive_hash: h2,
382            }),
383        )
384        .await
385        .unwrap();
386        tx.commit().await.unwrap();
387
388        let d = diff(&store, h1, h2).await.unwrap();
389        assert!(d.changes.iter().any(|c| matches!(
390            c,
391            NodeChange::AttributeChanged {
392                attr,
393                old: Some(_),
394                new: Some(_),
395                ..
396            } if attr == "class"
397        )));
398    }
399}