x_diff_rs/
diff.rs

1use std::{
2    collections::{HashMap, HashSet},
3    fmt::Display,
4};
5
6use crate::tree::{XNode, XTree};
7use md5::Digest;
8
9trait Concat {
10    fn concat(self, other: Self) -> Self;
11}
12
13impl Concat for Digest {
14    fn concat(mut self, other: Self) -> Self {
15        for i in 0..self.0.len() {
16            self.0[i] = self.0[i].wrapping_add(other.0[i]);
17        }
18        self
19    }
20}
21
22#[derive(Debug, Clone)]
23pub enum Edit<'a, 'tree1, 'tree2> {
24    Insert {
25        child_node: XNode<'a, 'tree2>,
26        to_node: XNode<'a, 'tree1>,
27    },
28    Delete(XNode<'a, 'tree1>),
29    Update {
30        old: XNode<'a, 'tree1>,
31        new: XNode<'a, 'tree2>,
32    },
33    ReplaceRoot,
34}
35
36impl Display for Edit<'_, '_, '_> {
37    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38        match self {
39            Edit::Insert {
40                child_node,
41                to_node,
42            } => {
43                write!(
44                    f,
45                    "insert node {} to node {}",
46                    child_node.id(),
47                    to_node.id()
48                )
49            }
50            Edit::Delete(node) => write!(f, "delete node {}", node.id()),
51            Edit::Update { old, new } => write!(
52                f,
53                "update node {}: {:?} -> {:?}",
54                old.id(),
55                old.value().unwrap().trim(),
56                new.value().unwrap().trim()
57            ),
58            Edit::ReplaceRoot => write!(f, "replace root node"),
59        }
60    }
61}
62
63type Diff<'a, 'tree1, 'tree2> = Vec<Edit<'a, 'tree1, 'tree2>>;
64
65/// Calculate the difference between two XML trees, represented by the minum edit operations to transform `tree1` to `tree2`.
66pub fn diff<'a, 'doc1, 'doc2>(
67    tree1: &'doc1 XTree<'doc1>,
68    tree2: &'doc2 XTree<'doc2>,
69) -> Diff<'a, 'doc1, 'doc2> {
70    fn diff_node<'a, 'doc1, 'doc2>(
71        node1: XNode<'a, 'doc1>,
72        ht1: &HashMap<String, Digest>,
73        node2: XNode<'a, 'doc2>,
74        ht2: &HashMap<String, Digest>,
75    ) -> Diff<'a, 'doc1, 'doc2> {
76        if ht1.get(&node1.id().to_string()) == ht2.get(&node2.id().to_string()) {
77            return Vec::new();
78        }
79
80        // Leaf nodes with different hashes mean different values
81        if (node1.is_attribute() && node2.is_attribute()) || (node1.is_text() && node2.is_text()) {
82            return vec![Edit::Update {
83                old: node1,
84                new: node2,
85            }];
86        }
87
88        let mut iht1: HashMap<_, _> = node1
89            .children()
90            .iter()
91            .map(|n| (*ht1.get(&n.id().to_string()).unwrap(), *n))
92            .collect();
93        let mut iht2: HashMap<_, _> = node2
94            .children()
95            .iter()
96            .map(|n| (*ht2.get(&n.id().to_string()).unwrap(), *n))
97            .collect();
98        let children_hashes1: HashSet<_> = iht1.keys().copied().collect();
99        let children_hashes2: HashSet<_> = iht2.keys().copied().collect();
100        let same_hashes: HashSet<_> = children_hashes1.intersection(&children_hashes2).collect();
101        iht1.retain(|k, _| !same_hashes.contains(&k));
102        iht2.retain(|k, _| !same_hashes.contains(&k));
103        let mut remaining_children1: HashSet<_> = iht1.into_values().collect();
104        let mut remaining_children2: HashSet<_> = iht2.into_values().collect();
105        let mut diff_pairs = Vec::new();
106        for n1 in &remaining_children1 {
107            for n2 in &remaining_children2 {
108                if n1.signature() == n2.signature() {
109                    diff_pairs.push((*n1, *n2, diff_node(*n1, ht1, *n2, ht2)));
110                }
111            }
112        }
113        diff_pairs.sort_by_key(|item| item.2.len());
114        let mut diff = Vec::new();
115        for (n1, n2, mut d) in diff_pairs {
116            if remaining_children1.contains(&n1) && remaining_children2.contains(&n2) {
117                diff.append(&mut d);
118                remaining_children1.remove(&n1);
119                remaining_children2.remove(&n2);
120            }
121        }
122        for n1 in remaining_children1 {
123            diff.push(Edit::Delete(n1));
124        }
125        for n2 in remaining_children2 {
126            diff.push(Edit::Insert {
127                child_node: n2,
128                to_node: node1,
129            });
130        }
131        diff
132    }
133    if tree1.root().signature() != tree2.root().signature() {
134        return vec![Edit::ReplaceRoot];
135    }
136    let ht1 = calculate_hash_table(tree1);
137    let ht2 = calculate_hash_table(tree2);
138    diff_node(tree1.root(), &ht1, tree2.root(), &ht2)
139}
140
141fn calculate_hash_table(tree: &XTree) -> HashMap<String, Digest> {
142    fn hash_of_node(node: XNode, ht: &mut HashMap<String, Digest>) -> Digest {
143        let hash = if node.children().is_empty() {
144            node.hash()
145        } else {
146            let mut acc = node.hash();
147            for child in node.children() {
148                acc = acc.concat(hash_of_node(child, ht));
149            }
150            acc
151        };
152        ht.insert(node.id().to_string(), hash);
153        hash
154    }
155    let mut hash_table = HashMap::new();
156    hash_of_node(tree.root(), &mut hash_table);
157    hash_table
158}
159
160#[cfg(test)]
161mod test {
162    #[cfg(feature = "print")]
163    use crate::tree::print::{PrintTreeOptions, print_tree};
164
165    use super::*;
166    use std::fs;
167
168    #[test]
169    fn test_calculate_hash_table_same_tree() {
170        let text1 = fs::read_to_string("test/file1.xml").unwrap();
171        let tree1 = XTree::parse(&text1).unwrap();
172        let ht1 = calculate_hash_table(&tree1);
173
174        let text2 = fs::read_to_string("test/file1.xml").unwrap();
175        let tree2 = XTree::parse(&text2).unwrap();
176        let ht2 = calculate_hash_table(&tree2);
177
178        assert_eq!(
179            ht1.get(&tree1.root().id().to_string()),
180            ht2.get(&tree2.root().id().to_string())
181        );
182    }
183
184    #[test]
185    fn test_calculate_hash_table_different_tree() {
186        let text1 = fs::read_to_string("test/file1.xml").unwrap();
187        let tree1 = XTree::parse(&text1).unwrap();
188        let ht1 = calculate_hash_table(&tree1);
189
190        let text2 = fs::read_to_string("test/file2.xml").unwrap();
191        let tree2 = XTree::parse(&text2).unwrap();
192        let ht2 = calculate_hash_table(&tree2);
193
194        assert_ne!(
195            ht1.get(&tree1.root().id().to_string()),
196            ht2.get(&tree2.root().id().to_string())
197        );
198    }
199
200    #[test]
201    fn test_diff() {
202        let text1 = fs::read_to_string("test/file1.xml").unwrap();
203        let tree1 = XTree::parse(&text1).unwrap();
204
205        let text2 = fs::read_to_string("test/file2.xml").unwrap();
206        let tree2 = XTree::parse(&text2).unwrap();
207
208        #[cfg(feature = "print")]
209        {
210            print_tree(&tree1, PrintTreeOptions::default().with_node_id());
211            print_tree(&tree2, PrintTreeOptions::default().with_node_id());
212        }
213
214        let diff = diff(&tree1, &tree2);
215        diff.iter().any(|d| {
216            regex::Regex::new(r#"update node \d+: \"George\" -> \"Fred\""#)
217                .unwrap()
218                .is_match(&d.to_string())
219        });
220        diff.iter().any(|d| {
221            regex::Regex::new(r#"insert node \d+[Attr] to node \d+"#)
222                .unwrap()
223                .is_match(&d.to_string())
224        });
225        diff.iter().any(|d| {
226            regex::Regex::new(r#"insert node \d+[Foo] to node \d+"#)
227                .unwrap()
228                .is_match(&d.to_string())
229        });
230        for e in diff {
231            println!("{}", e);
232        }
233    }
234}