x_diff_rs/
lib.rs

1/*!
2A library to compare XML files unorderedly.
3
4This library implements the X-Diff algorithm from paper [X-Diff: An Effective Change Detection Algorithm for XML Documents](https://pages.cs.wisc.edu/~yuanwang/papers/xdiff.pdf).
5
6## Example
7
8```rust
9use x_diff_rs::{
10    diff,
11    tree::XTree,
12};
13
14let text1 = r#"
15<Profile>
16 <Customer>
17  <PersonName NameType="Default">
18   <NameTitle>Mr.</NameTitle>
19   <GivenName>George</GivenName>
20   <MiddleName>A.</MiddleName>
21   <SurName>Smith</SurName>
22  </PersonName>
23  <TelephoneInfo PhoneTech="Voice" PhoneUse="Work" >
24   <Telephone> <AreaCityCode>206</AreaCityCode>
25	<PhoneNumber>813-8698</PhoneNumber>
26   </Telephone>
27  </TelephoneInfo>
28  <PaymentForm>
29   ...
30  </PaymentForm>
31  <Address>
32   <StreetNmbr POBox="4321-01">From hell</StreetNmbr>
33   <BldgRoom>Suite 800</BldgRoom>
34   <CityName>Seattle</CityName>
35   <StateProv PostalCode="98108">WA</StateProv>
36   <CountryName>USA</CountryName>
37  </Address>
38  <Address>
39   <StreetNmbr POBox="4321-01">1200 Yakima St</StreetNmbr>
40   <BldgRoom>Suite 800</BldgRoom>
41   <CityName>Seattle</CityName>
42   <StateProv PostalCode="98108">WA</StateProv>
43   <CountryName>USA</CountryName>
44  </Address>
45 </Customer>
46</Profile>
47    "#;
48
49let text2 = r#"
50<Profile>
51 <Customer>
52  <PersonName NameType="Default">
53   <NameTitle>Mr.</NameTitle>
54   <GivenName>George</GivenName>
55   <MiddleName>A.</MiddleName>
56   <SurName>Smith</SurName>
57  </PersonName>
58  <TelephoneInfo PhoneTech="Voice" PhoneUse="Work" >
59   <Telephone> <AreaCityCode>206</AreaCityCode>
60	<PhoneNumber>813-8698</PhoneNumber>
61   </Telephone>
62  </TelephoneInfo>
63  <Address>
64   <StreetNmbr POBox="4321-01">From hell</StreetNmbr>
65   <BldgRoom>Suite 800</BldgRoom>
66   <CityName>Seattle</CityName>
67   <StateProv PostalCode="98108">WA</StateProv>
68   <CountryName>USA</CountryName>
69  </Address>
70  <Address>
71   <StreetNmbr POBox="1234-01">1200 Yakima St</StreetNmbr>
72   <BldgRoom>Suite 800</BldgRoom>
73   <CityName>Paris</CityName>
74   <StateProv PostalCode="98108">WA</StateProv>
75   <CountryName>USA</CountryName>
76  </Address>
77  <Status>Single</Status>
78 </Customer>
79</Profile>
80"#;
81let tree1 = XTree::parse(&text1).unwrap();
82let tree2 = XTree::parse(&text2).unwrap();
83let difference = diff(&tree1, &tree2);
84for d in difference {
85    println!("{d}");
86}
87```
88*/
89
90use std::{
91    collections::{HashMap, HashSet},
92    fmt::Display,
93};
94
95use md5::Digest;
96use tree::{XNode, XNodeId, XTree};
97
98/// XML parsing and tree operations.
99pub mod tree;
100
101trait Concat {
102    fn concat(self, other: Self) -> Self;
103}
104
105impl Concat for Digest {
106    fn concat(mut self, other: Self) -> Self {
107        for i in 0..self.0.len() {
108            self.0[i] = self.0[i].wrapping_add(other.0[i]);
109        }
110        self
111    }
112}
113
114#[derive(Debug, Clone)]
115pub enum Edit<'tree1, 'tree2> {
116    Insert {
117        child_node: XNodeId<'tree2>,
118        to: XNodeId<'tree1>,
119    },
120    Delete(XNodeId<'tree1>),
121    Update {
122        node_id: XNodeId<'tree1>,
123        old_value: String,
124        new_value: String,
125    },
126    ReplaceRoot,
127}
128
129impl Display for Edit<'_, '_> {
130    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
131        match self {
132            Edit::Insert { child_node, to } => {
133                write!(f, "insert node {} to node {}", child_node, to)
134            }
135            Edit::Delete(node_id) => write!(f, "delete node {}", node_id),
136            Edit::Update {
137                node_id,
138                old_value,
139                new_value,
140            } => write!(
141                f,
142                "update node {}: {:?} -> {:?}",
143                node_id, old_value, new_value
144            ),
145            Edit::ReplaceRoot => write!(f, "replace root node"),
146        }
147    }
148}
149
150type Diff<'tree1, 'tree2> = Vec<Edit<'tree1, 'tree2>>;
151
152/// Calculate the difference between two XML trees, represented by the minum edit operations to transform `tree1` to `tree2`.
153pub fn diff<'doc1, 'doc2>(
154    tree1: &'doc1 XTree<'doc1>,
155    tree2: &'doc2 XTree<'doc2>,
156) -> Diff<'doc1, 'doc2> {
157    fn diff_node<'doc1, 'doc2>(
158        node1: XNode<'_, 'doc1>,
159        ht1: &HashMap<XNodeId<'doc1>, Digest>,
160        node2: XNode<'_, 'doc2>,
161        ht2: &HashMap<XNodeId<'doc2>, Digest>,
162    ) -> Diff<'doc1, 'doc2> {
163        if ht1.get(&node1.id()) == ht2.get(&node2.id()) {
164            return Vec::new();
165        }
166
167        // Leaf nodes with different hashes mean different values
168        if (node1.is_attribute() && node2.is_attribute()) || (node1.is_text() && node2.is_text()) {
169            return vec![Edit::Update {
170                node_id: node1.id(),
171                old_value: node1.value().unwrap_or_default().trim().to_string(),
172                new_value: node2.value().unwrap_or_default().trim().to_string(),
173            }];
174        }
175
176        let mut iht1: HashMap<_, _> = node1
177            .children()
178            .iter()
179            .map(|n| (*ht1.get(&n.id()).unwrap(), *n))
180            .collect();
181        let mut iht2: HashMap<_, _> = node2
182            .children()
183            .iter()
184            .map(|n| (*ht2.get(&n.id()).unwrap(), *n))
185            .collect();
186        let children_hashes1: HashSet<_> = iht1.keys().copied().collect();
187        let children_hashes2: HashSet<_> = iht2.keys().copied().collect();
188        let same_hashes: HashSet<_> = children_hashes1.intersection(&children_hashes2).collect();
189        iht1.retain(|k, _| !same_hashes.contains(&k));
190        iht2.retain(|k, _| !same_hashes.contains(&k));
191        let mut remaining_children1: HashSet<_> = iht1.into_values().collect();
192        let mut remaining_children2: HashSet<_> = iht2.into_values().collect();
193        let mut diff_pairs = Vec::new();
194        for n1 in &remaining_children1 {
195            for n2 in &remaining_children2 {
196                if n1.signature() == n2.signature() {
197                    diff_pairs.push((*n1, *n2, diff_node(*n1, ht1, *n2, ht2)));
198                }
199            }
200        }
201        diff_pairs.sort_by_key(|item| item.2.len());
202        let mut diff = Vec::new();
203        for (n1, n2, mut d) in diff_pairs {
204            if remaining_children1.contains(&n1) && remaining_children2.contains(&n2) {
205                diff.append(&mut d);
206                remaining_children1.remove(&n1);
207                remaining_children2.remove(&n2);
208            }
209        }
210        for n1 in remaining_children1 {
211            diff.push(Edit::Delete(n1.id()));
212        }
213        for n2 in remaining_children2 {
214            diff.push(Edit::Insert {
215                child_node: n2.id(),
216                to: node1.id(),
217            });
218        }
219        diff
220    }
221    if tree1.root().signature() != tree2.root().signature() {
222        return vec![Edit::ReplaceRoot];
223    }
224    let ht1 = calculate_hash_table(tree1);
225    let ht2 = calculate_hash_table(tree2);
226    diff_node(tree1.root(), &ht1, tree2.root(), &ht2)
227}
228
229fn calculate_hash_table<'doc>(tree: &'doc XTree) -> HashMap<XNodeId<'doc>, Digest> {
230    fn hash_of_node<'doc>(
231        node: XNode<'_, 'doc>,
232        ht: &mut HashMap<XNodeId<'doc>, Digest>,
233    ) -> Digest {
234        let hash = if node.children().is_empty() {
235            node.hash()
236        } else {
237            let mut acc = node.hash();
238            for child in node.children() {
239                acc = acc.concat(hash_of_node(child, ht));
240            }
241            acc
242        };
243        ht.insert(node.id(), hash);
244        hash
245    }
246    let mut hash_table = HashMap::new();
247    hash_of_node(tree.root(), &mut hash_table);
248    hash_table
249}
250
251#[cfg(test)]
252mod test {
253    use std::fs;
254    #[cfg(feature = "print")]
255    use tree::XTreePrintOptions;
256
257    use super::*;
258
259    #[test]
260    fn test_calculate_hash_table_same_tree() {
261        let text1 = fs::read_to_string("test/file1.xml").unwrap();
262        let tree1 = XTree::parse(&text1).unwrap();
263        let ht1 = calculate_hash_table(&tree1);
264
265        let text2 = fs::read_to_string("test/file1.xml").unwrap();
266        let tree2 = XTree::parse(&text2).unwrap();
267        let ht2 = calculate_hash_table(&tree2);
268
269        assert_eq!(ht1.get(&tree1.root().id()), ht2.get(&tree2.root().id()));
270    }
271
272    #[test]
273    fn test_calculate_hash_table_different_tree() {
274        let text1 = fs::read_to_string("test/file1.xml").unwrap();
275        let tree1 = XTree::parse(&text1).unwrap();
276        let ht1 = calculate_hash_table(&tree1);
277
278        let text2 = fs::read_to_string("test/file2.xml").unwrap();
279        let tree2 = XTree::parse(&text2).unwrap();
280        let ht2 = calculate_hash_table(&tree2);
281
282        #[cfg(feature = "print")]
283        {
284            let hex_marker1 = ht1.iter().map(|(k, v)| (*k, format!("{:x}", v))).collect();
285            tree1.print(
286                XTreePrintOptions::default()
287                    .with_node_marker(&hex_marker1)
288                    .with_node_id(),
289            );
290
291            let hex_marker2 = ht2.iter().map(|(k, v)| (*k, format!("{:x}", v))).collect();
292            tree2.print(
293                XTreePrintOptions::default()
294                    .with_node_marker(&hex_marker2)
295                    .with_node_id(),
296            );
297        }
298
299        assert_ne!(ht1.get(&tree1.root().id()), ht2.get(&tree2.root().id()));
300    }
301
302    #[test]
303    fn test_diff() {
304        let text1 = fs::read_to_string("test/file1.xml").unwrap();
305        let tree1 = XTree::parse(&text1).unwrap();
306
307        let text2 = fs::read_to_string("test/file2.xml").unwrap();
308        let tree2 = XTree::parse(&text2).unwrap();
309
310        #[cfg(feature = "print")]
311        {
312            tree1.print(XTreePrintOptions::default().with_node_id());
313            tree2.print(XTreePrintOptions::default().with_node_id());
314        }
315
316        let diff = diff(&tree1, &tree2);
317        diff.iter().any(|d| {
318            regex::Regex::new(r#"update node \d+: \"George\" -> \"Fred\""#)
319                .unwrap()
320                .is_match(&d.to_string())
321        });
322        diff.iter().any(|d| {
323            regex::Regex::new(r#"insert node \d+[Attr] to node \d+"#)
324                .unwrap()
325                .is_match(&d.to_string())
326        });
327        diff.iter().any(|d| {
328            regex::Regex::new(r#"insert node \d+[Foo] to node \d+"#)
329                .unwrap()
330                .is_match(&d.to_string())
331        });
332        for e in diff {
333            println!("{}", e);
334        }
335    }
336}