x_diff_rs/
diff.rs

1use std::{
2    collections::{HashMap, HashSet},
3    fmt::Display,
4};
5
6use crate::tree::{XNode, XTree};
7use md5::Digest;
8
9trait Concat {
10    fn concat(self, other: Self) -> Self;
11}
12
13impl Concat for Digest {
14    fn concat(mut self, other: Self) -> Self {
15        for i in 0..self.0.len() {
16            self.0[i] = self.0[i].wrapping_add(other.0[i]);
17        }
18        self
19    }
20}
21
22#[derive(Debug, Clone)]
23pub enum Edit<'a, 'tree1, 'tree2> {
24    Insert {
25        child_node: XNode<'a, 'tree2>,
26        to_node: XNode<'a, 'tree1>,
27    },
28    Delete(XNode<'a, 'tree1>),
29    Update {
30        old: XNode<'a, 'tree1>,
31        new: XNode<'a, 'tree2>,
32    },
33    ReplaceRoot,
34}
35
36impl Display for Edit<'_, '_, '_> {
37    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38        match self {
39            Edit::Insert {
40                child_node,
41                to_node,
42            } => {
43                write!(
44                    f,
45                    "insert node {} to node {}",
46                    child_node.id(),
47                    to_node.id()
48                )
49            }
50            Edit::Delete(node) => write!(f, "delete node {}", node.id()),
51            Edit::Update { old, new } => write!(
52                f,
53                "update node {}: {:?} -> {:?}",
54                old.id(),
55                old.value().unwrap().trim(),
56                new.value().unwrap().trim()
57            ),
58            Edit::ReplaceRoot => write!(f, "replace root node"),
59        }
60    }
61}
62
63type Diff<'a, 'tree1, 'tree2> = Vec<Edit<'a, 'tree1, 'tree2>>;
64
65/// Calculate the difference between two XML trees, represented by the minum edit operations to transform `tree1` to `tree2`.
66pub fn diff<'a, 'doc1, 'doc2>(
67    tree1: &'doc1 XTree<'doc1>,
68    tree2: &'doc2 XTree<'doc2>,
69) -> Diff<'a, 'doc1, 'doc2> {
70    fn diff_node<'a, 'doc1, 'doc2>(
71        node1: XNode<'a, 'doc1>,
72        ht1: &HashMap<String, Digest>,
73        node2: XNode<'a, 'doc2>,
74        ht2: &HashMap<String, Digest>,
75    ) -> Diff<'a, 'doc1, 'doc2> {
76        if ht1.get(&node1.id().to_string()) == ht2.get(&node2.id().to_string()) {
77            return Vec::new();
78        }
79
80        // Leaf nodes with different hashes mean different values
81        if (node1.is_attribute() && node2.is_attribute()) || (node1.is_text() && node2.is_text()) {
82            return vec![Edit::Update {
83                old: node1,
84                new: node2,
85            }];
86        }
87
88        let mut iht1: HashMap<_, _> = node1
89            .children()
90            .iter()
91            .map(|n| (*ht1.get(&n.id().to_string()).unwrap(), *n))
92            .collect();
93        let mut iht2: HashMap<_, _> = node2
94            .children()
95            .iter()
96            .map(|n| (*ht2.get(&n.id().to_string()).unwrap(), *n))
97            .collect();
98        let children_hashes1: HashSet<_> = iht1.keys().copied().collect();
99        let children_hashes2: HashSet<_> = iht2.keys().copied().collect();
100        let same_hashes: HashSet<_> = children_hashes1.intersection(&children_hashes2).collect();
101        iht1.retain(|k, _| !same_hashes.contains(&k));
102        iht2.retain(|k, _| !same_hashes.contains(&k));
103        let mut remaining_children1: HashSet<_> = iht1.into_values().collect();
104        let mut remaining_children2: HashSet<_> = iht2.into_values().collect();
105        let mut diff_pairs = Vec::new();
106        for n1 in &remaining_children1 {
107            for n2 in &remaining_children2 {
108                if n1.signature() == n2.signature() {
109                    diff_pairs.push((*n1, *n2, diff_node(*n1, ht1, *n2, ht2)));
110                }
111            }
112        }
113        diff_pairs.sort_by_key(|item| item.2.len());
114        let mut diff = Vec::new();
115        for (n1, n2, mut d) in diff_pairs {
116            if remaining_children1.contains(&n1) && remaining_children2.contains(&n2) {
117                diff.append(&mut d);
118                remaining_children1.remove(&n1);
119                remaining_children2.remove(&n2);
120            }
121        }
122        for n1 in remaining_children1 {
123            diff.push(Edit::Delete(n1));
124        }
125        for n2 in remaining_children2 {
126            diff.push(Edit::Insert {
127                child_node: n2,
128                to_node: node1,
129            });
130        }
131        diff
132    }
133    if tree1.root().signature() != tree2.root().signature() {
134        return vec![Edit::ReplaceRoot];
135    }
136    let ht1 = calculate_hash_table(tree1);
137    let ht2 = calculate_hash_table(tree2);
138    diff_node(tree1.root(), &ht1, tree2.root(), &ht2)
139}
140
141fn calculate_hash_table(tree: &XTree) -> HashMap<String, Digest> {
142    fn hash_of_node(node: XNode, ht: &mut HashMap<String, Digest>) -> Digest {
143        let hash = if node.children().is_empty() {
144            node.hash()
145        } else {
146            let mut acc = node.hash();
147            for child in node.children() {
148                acc = acc.concat(hash_of_node(child, ht));
149            }
150            acc
151        };
152        ht.insert(node.id().to_string(), hash);
153        hash
154    }
155    let mut hash_table = HashMap::new();
156    hash_of_node(tree.root(), &mut hash_table);
157    hash_table
158}
159
160#[cfg(test)]
161mod test {
162    use crate::tree::print::{PrintTreeOptions, print_tree};
163
164    use super::*;
165    use std::fs;
166
167    #[test]
168    fn test_calculate_hash_table_same_tree() {
169        let text1 = fs::read_to_string("test/file1.xml").unwrap();
170        let tree1 = XTree::parse(&text1).unwrap();
171        let ht1 = calculate_hash_table(&tree1);
172
173        let text2 = fs::read_to_string("test/file1.xml").unwrap();
174        let tree2 = XTree::parse(&text2).unwrap();
175        let ht2 = calculate_hash_table(&tree2);
176
177        assert_eq!(
178            ht1.get(&tree1.root().id().to_string()),
179            ht2.get(&tree2.root().id().to_string())
180        );
181    }
182
183    #[test]
184    fn test_calculate_hash_table_different_tree() {
185        let text1 = fs::read_to_string("test/file1.xml").unwrap();
186        let tree1 = XTree::parse(&text1).unwrap();
187        let ht1 = calculate_hash_table(&tree1);
188
189        let text2 = fs::read_to_string("test/file2.xml").unwrap();
190        let tree2 = XTree::parse(&text2).unwrap();
191        let ht2 = calculate_hash_table(&tree2);
192
193        assert_ne!(
194            ht1.get(&tree1.root().id().to_string()),
195            ht2.get(&tree2.root().id().to_string())
196        );
197    }
198
199    #[test]
200    fn test_diff() {
201        let text1 = fs::read_to_string("test/file1.xml").unwrap();
202        let tree1 = XTree::parse(&text1).unwrap();
203
204        let text2 = fs::read_to_string("test/file2.xml").unwrap();
205        let tree2 = XTree::parse(&text2).unwrap();
206
207        #[cfg(feature = "print")]
208        {
209            print_tree(&tree1, PrintTreeOptions::default().with_node_id());
210            print_tree(&tree2, PrintTreeOptions::default().with_node_id());
211        }
212
213        let diff = diff(&tree1, &tree2);
214        diff.iter().any(|d| {
215            regex::Regex::new(r#"update node \d+: \"George\" -> \"Fred\""#)
216                .unwrap()
217                .is_match(&d.to_string())
218        });
219        diff.iter().any(|d| {
220            regex::Regex::new(r#"insert node \d+[Attr] to node \d+"#)
221                .unwrap()
222                .is_match(&d.to_string())
223        });
224        diff.iter().any(|d| {
225            regex::Regex::new(r#"insert node \d+[Foo] to node \d+"#)
226                .unwrap()
227                .is_match(&d.to_string())
228        });
229        for e in diff {
230            println!("{}", e);
231        }
232    }
233}