natural_xml_diff/
vtree.rs

1use ahash::AHashMap;
2use std::borrow::Cow;
3use xot::{Element, NameId, Node, Value, Xot};
4
5// A signature for an element. This includes the name and the attributes.
6#[derive(Debug, Hash, PartialEq, Eq)]
7pub(crate) struct ElementSignature<'a> {
8    name: NameId,
9    attributes: Cow<'a, [(NameId, Cow<'a, str>)]>,
10}
11
12impl<'a> ElementSignature<'a> {
13    fn new(element: &'a Element) -> Self {
14        let mut attributes: Vec<(NameId, Cow<'a, str>)> = element
15            .attributes()
16            .iter()
17            .map(|(name, value)| (*name, value.into()))
18            .collect();
19        // sort the attributes for consistency
20        attributes.sort();
21        Self {
22            name: element.name(),
23            attributes: attributes.into(),
24        }
25    }
26}
27
28#[derive(Debug, Hash, PartialEq, Eq)]
29pub(crate) enum NodeSignature<'a> {
30    Element(ElementSignature<'a>),
31    Text(Cow<'a, str>),
32    Root,
33    ProcessingInstruction(Cow<'a, str>, Option<Cow<'a, str>>),
34    Comment(Cow<'a, str>),
35}
36
37pub(crate) struct Signatures<T: Eq + std::hash::Hash> {
38    map: AHashMap<T, u32>,
39    next_id: u32,
40}
41
42impl<T: Eq + std::hash::Hash> Signatures<T> {
43    pub(crate) fn new() -> Self {
44        Self {
45            map: AHashMap::new(),
46            next_id: 0,
47        }
48    }
49
50    fn get_id(&mut self, signature: T) -> u32 {
51        if let Some(id) = self.map.get(&signature) {
52            *id
53        } else {
54            let id = self.next_id;
55            self.next_id += 1;
56            self.map.insert(signature, id);
57            id
58        }
59    }
60}
61
62// A tree signature is the node signature of its node,
63// combined with the tree signatures of any of its children
64// This way we can compare subtrees efficiently
65#[derive(Debug, Hash, PartialEq, Eq)]
66pub(crate) struct TreeSignature<'a> {
67    node_signature_id: u32,
68    tree_signature_ids: Cow<'a, [u32]>,
69}
70
71/// The raw status of a node.
72///
73/// This is used to derive the edit operations.
74#[derive(Debug, PartialEq, Eq, Copy, Clone)]
75pub enum Status {
76    /// The node (including descendants) is identical to a node in the
77    /// other XML document, where the `u32` gives the index to the node in the
78    /// other document.
79    Equal(u32),
80    /// The node is sufficiently similar to a node in the other XML document
81    /// and is therefore determined to be an update.
82    Update(u32),
83    /// The node is unique to this document. If it's in document A
84    /// it needs to be deleted, if it's in document B it needs to
85    /// be inserted.
86    Different,
87}
88
89#[derive(Debug)]
90pub(crate) struct Vnode {
91    pub(crate) node: Node,
92    // parent_id, first_child and next_sibling index into the vtree itself
93    pub(crate) parent_id: Option<usize>,
94    pub(crate) first_child: Option<usize>,
95    pub(crate) next_sibling: Option<usize>,
96    pub(crate) node_signature_id: u32,
97    pub tree_signature_id: u32,
98    pub descendant_count: u32,
99    pub weight: u32,
100    pub status: Status,
101}
102
103type NodeToIndex = AHashMap<Node, usize>;
104
105impl Vnode {
106    fn new(xot: &Xot, node_to_index: &NodeToIndex, node: Node, node_signature_id: u32) -> Self {
107        Self {
108            node,
109            parent_id: xot.parent(node).map(|n| node_to_index[&n]),
110            first_child: xot.first_child(node).map(|n| node_to_index[&n]),
111            next_sibling: xot.next_sibling(node).map(|n| node_to_index[&n]),
112            node_signature_id,
113            tree_signature_id: 0,
114            descendant_count: 0,
115            weight: 0,
116            status: Status::Different,
117        }
118    }
119
120    fn get_children_tree_signature_ids(&self, vnodes: &[Vnode]) -> Vec<u32> {
121        children(vnodes, self)
122            .map(|vnode| vnode.tree_signature_id)
123            .collect::<Vec<_>>()
124    }
125}
126
127pub(crate) struct Vtree {
128    pub(crate) nodes: Vec<Vnode>,
129}
130
131impl Vtree {
132    pub(crate) fn new<'a>(
133        xot: &'a Xot,
134        root: Node,
135        node_signatures: &mut Signatures<NodeSignature<'a>>,
136        tree_signatures: &mut Signatures<TreeSignature<'a>>,
137    ) -> Self {
138        // we need a temporary reverse mapping of Xot node to index in the vtree
139        let node_to_index = xot
140            .descendants(root)
141            .enumerate()
142            .map(|(i, n)| (n, i))
143            .collect::<AHashMap<_, _>>();
144
145        // now construct the vnodes array
146        let mut vnodes = xot
147            .descendants(root)
148            .map(|node| {
149                Vnode::new(
150                    xot,
151                    &node_to_index,
152                    node,
153                    get_node_signature_id(xot, node, node_signatures),
154                )
155            })
156            .collect::<Vec<_>>();
157
158        // fix up the vnodes array with information about the tree structure
159        for i in (0..vnodes.len()).rev() {
160            // get an immutable vnode first
161            let vnode = &vnodes[i];
162
163            // calculate its tree signature id based on its node signature id
164            // and the tree signature ids of its children
165            let ids: Vec<u32> = vnode.get_children_tree_signature_ids(&vnodes);
166            let tree_signature = TreeSignature {
167                node_signature_id: vnode.node_signature_id,
168                tree_signature_ids: ids.into(),
169            };
170            let tree_signature_id = tree_signatures.get_id(tree_signature);
171
172            // determine the weight based on the weight it already has
173            // plus its node type
174            let mut weight = vnode.weight;
175            weight += match xot.value(vnode.node) {
176                // XXX is this weight assignment correctly? instead we can assign
177                // nothing for element, and a 1 + len for text
178                // but if we do this, we actually get problems, we noticed
179                // with processing instructions
180                Value::Element(_) => 1,
181                Value::Text(text) => text.get().len() as u32,
182                Value::Root => 0,
183                Value::ProcessingInstruction(_) => 1,
184                Value::Comment(_) => 1,
185            };
186            // get the descendant count and parent id
187            let descendant_count = vnode.descendant_count;
188            let parent_id = vnode.parent_id;
189            // now get vnode again to mutate
190            let vnode = &mut vnodes[i];
191            // assign what we calculated
192            vnode.tree_signature_id = tree_signature_id;
193            vnode.weight = weight;
194
195            // update the parent node with weight and descendant count
196            if let Some(parent_id) = parent_id {
197                let parent = &mut vnodes[parent_id];
198                parent.weight += weight;
199                parent.descendant_count += descendant_count + 1;
200            }
201        }
202        Self { nodes: vnodes }
203    }
204
205    pub(crate) fn get(&self, index: usize) -> &Vnode {
206        &self.nodes[index]
207    }
208
209    pub(crate) fn children(&self, parent: &Vnode) -> impl Iterator<Item = &Vnode> {
210        children(&self.nodes, parent)
211    }
212}
213
214fn children<'a>(vnodes: &'a [Vnode], parent: &Vnode) -> impl Iterator<Item = &'a Vnode> {
215    let mut current = parent.first_child;
216    std::iter::from_fn(move || {
217        if let Some(current_node) = current {
218            let node = &vnodes[current_node];
219            current = node.next_sibling;
220            Some(node)
221        } else {
222            None
223        }
224    })
225}
226
227fn get_node_signature_id<'a>(
228    xot: &'a Xot,
229    node: Node,
230    node_signatures: &mut Signatures<NodeSignature<'a>>,
231) -> u32 {
232    match xot.value(node) {
233        Value::Element(element) => {
234            node_signatures.get_id(NodeSignature::Element(ElementSignature::new(element)))
235        }
236        Value::Text(text) => node_signatures.get_id(NodeSignature::Text(text.get().into())),
237        Value::Root => node_signatures.get_id(NodeSignature::Root),
238        Value::ProcessingInstruction(pi) => node_signatures.get_id(
239            NodeSignature::ProcessingInstruction(pi.target().into(), pi.data().map(|v| v.into())),
240        ),
241        Value::Comment(comment) => {
242            node_signatures.get_id(NodeSignature::Comment(comment.get().into()))
243        }
244    }
245}
246
247#[cfg(test)]
248mod tests {
249    use super::*;
250    use crate::comparison::Comparison;
251
252    #[test]
253    fn test_root_element() {
254        let mut xot = Xot::new();
255        let doc_a = xot.parse("<container />").unwrap();
256        let doc_b = xot.parse("<container />").unwrap();
257        let comparison = Comparison::new(&xot, doc_a, doc_b);
258        let (a, b) = comparison.vtrees();
259        assert_eq!(a.nodes.len(), b.nodes.len());
260        // root
261        assert_eq!(a.nodes[0].node_signature_id, b.nodes[0].node_signature_id);
262        // container
263        assert_eq!(a.nodes[1].node_signature_id, b.nodes[1].node_signature_id);
264    }
265
266    #[test]
267    fn test_multiple_elements() {
268        let mut xot = Xot::new();
269        let doc_a = xot.parse("<container><a/><b/></container>").unwrap();
270        let doc_b = xot.parse("<container><a/><c/></container>").unwrap();
271        let comparison = Comparison::new(&xot, doc_a, doc_b);
272        let (a, b) = comparison.vtrees();
273        assert_eq!(a.nodes.len(), b.nodes.len());
274        // root
275        assert_eq!(a.nodes[0].node_signature_id, b.nodes[0].node_signature_id);
276        // container
277        assert_eq!(a.nodes[1].node_signature_id, b.nodes[1].node_signature_id);
278        // a
279        assert_eq!(a.nodes[2].node_signature_id, b.nodes[2].node_signature_id);
280        // b vs c
281        assert_ne!(a.nodes[3].node_signature_id, b.nodes[3].node_signature_id);
282    }
283
284    #[test]
285    fn test_multiple_elements_attribute_difference() {
286        let mut xot = Xot::new();
287        let doc_a = xot
288            .parse("<container><a s='A'/><b s='B'/></container>")
289            .unwrap();
290        let doc_b = xot
291            .parse("<container><a s='A'/><b s='C'/></container>")
292            .unwrap();
293        let comparison = Comparison::new(&xot, doc_a, doc_b);
294        let (a, b) = comparison.vtrees();
295        assert_eq!(a.nodes.len(), b.nodes.len());
296        // root
297        assert_eq!(a.nodes[0].node_signature_id, b.nodes[0].node_signature_id);
298        // container
299        assert_eq!(a.nodes[1].node_signature_id, b.nodes[1].node_signature_id);
300        // a
301        assert_eq!(a.nodes[2].node_signature_id, b.nodes[2].node_signature_id);
302        // b
303        assert_ne!(a.nodes[3].node_signature_id, b.nodes[3].node_signature_id);
304    }
305
306    #[test]
307    fn test_text_nodes() {
308        let mut xot = Xot::new();
309        let doc_a = xot
310            .parse("<container><a>A</a><b>B</b></container>")
311            .unwrap();
312        let doc_b = xot
313            .parse("<container><a>A</a><b>C</b></container>")
314            .unwrap();
315        let comparison = Comparison::new(&xot, doc_a, doc_b);
316        let (a, b) = comparison.vtrees();
317        assert_eq!(a.nodes.len(), b.nodes.len());
318        // root
319        assert_eq!(a.nodes[0].node_signature_id, b.nodes[0].node_signature_id);
320        // container
321        assert_eq!(a.nodes[1].node_signature_id, b.nodes[1].node_signature_id);
322        // a
323        assert_eq!(a.nodes[2].node_signature_id, b.nodes[2].node_signature_id);
324        // text node: A
325        assert_eq!(a.nodes[3].node_signature_id, b.nodes[3].node_signature_id);
326        // b
327        assert_eq!(a.nodes[4].node_signature_id, b.nodes[4].node_signature_id);
328        // text node: B vs C
329        assert_ne!(a.nodes[5].node_signature_id, b.nodes[5].node_signature_id);
330    }
331
332    #[test]
333    fn test_simple_tree_signature_ids() {
334        let mut xot = Xot::new();
335        let doc_a = xot.parse("<a />").unwrap();
336        let doc_b = xot.parse("<b />").unwrap();
337        let comparison = Comparison::new(&xot, doc_a, doc_b);
338        let (a, b) = comparison.vtrees();
339        // root
340        assert_ne!(a.nodes[0].tree_signature_id, b.nodes[0].tree_signature_id);
341        assert_eq!(a.nodes[0].node_signature_id, b.nodes[0].node_signature_id);
342        // // a vs b
343        assert_ne!(a.nodes[1].tree_signature_id, b.nodes[1].tree_signature_id);
344        assert_ne!(a.nodes[1].node_signature_id, b.nodes[1].node_signature_id);
345    }
346
347    #[test]
348    fn test_tree_signature_ids() {
349        let mut xot = Xot::new();
350        let doc_a = xot
351            .parse("<container><a>A</a><a>A</a></container>")
352            .unwrap();
353        let doc_b = xot
354            .parse("<container><a>A</a><a>Aprime</a></container>")
355            .unwrap();
356        let comparison = Comparison::new(&xot, doc_a, doc_b);
357        let (a, b) = comparison.vtrees();
358        // root
359        assert_ne!(a.nodes[0].tree_signature_id, b.nodes[0].tree_signature_id);
360        // container
361        assert_ne!(a.nodes[1].tree_signature_id, b.nodes[1].tree_signature_id);
362        // first a
363        assert_eq!(a.nodes[2].tree_signature_id, b.nodes[2].tree_signature_id);
364        // second a matches first a in doc_a
365        assert_eq!(a.nodes[2].tree_signature_id, a.nodes[4].tree_signature_id);
366        // but they don't match between documents
367        assert_ne!(a.nodes[4].tree_signature_id, b.nodes[4].tree_signature_id);
368    }
369
370    #[test]
371    fn test_element_weights() {
372        let mut xot = Xot::new();
373        let doc_a = xot.parse("<a />").unwrap();
374        let doc_b = xot.parse("<a><b/></a>").unwrap();
375        let comparison = Comparison::new(&xot, doc_a, doc_b);
376        let (a, b) = comparison.vtrees();
377        // root
378        assert_eq!(a.nodes[0].weight, 1);
379        assert_eq!(b.nodes[0].weight, 2);
380        // a
381        assert_eq!(a.nodes[1].weight, 1);
382        assert_eq!(b.nodes[1].weight, 2);
383    }
384
385    #[test]
386    fn test_weights_with_text() {
387        let mut xot = Xot::new();
388        let doc_a = xot.parse("<a>AAAA</a>").unwrap();
389        let doc_b = xot.parse("<a>BBB</a>").unwrap();
390        let comparison = Comparison::new(&xot, doc_a, doc_b);
391        let (a, b) = comparison.vtrees();
392        // root
393        assert_eq!(a.nodes[0].weight, 5);
394        assert_eq!(b.nodes[0].weight, 4);
395        // a
396        assert_eq!(a.nodes[1].weight, 5);
397        assert_eq!(b.nodes[1].weight, 4);
398        // a text
399        assert_eq!(a.nodes[2].weight, 4);
400        assert_eq!(b.nodes[2].weight, 3);
401    }
402
403    #[test]
404    fn test_descendant_count() {
405        let mut xot = Xot::new();
406        let doc_a = xot.parse("<a><b><c/></b></a>").unwrap();
407        let doc_b = xot.parse("<a><b/></a>").unwrap();
408        let comparison = Comparison::new(&xot, doc_a, doc_b);
409        let (a, b) = comparison.vtrees();
410        // root
411        assert_eq!(a.nodes[0].descendant_count, 3);
412        assert_eq!(b.nodes[0].descendant_count, 2);
413        // a
414        assert_eq!(a.nodes[1].descendant_count, 2);
415        assert_eq!(b.nodes[1].descendant_count, 1);
416    }
417
418    #[test]
419    fn test_vtree_children() {
420        let mut xot = Xot::new();
421        let doc_a = xot.parse("<a><b><d/></b><c></c></a>").unwrap();
422        let doc_b = xot.parse("<a><b/></a>").unwrap();
423
424        let comparison = Comparison::new(&xot, doc_a, doc_b);
425        let (a, _b) = comparison.vtrees();
426
427        let children_root = a.children(&a.nodes[0]).collect::<Vec<_>>();
428        assert_eq!(children_root.len(), 1);
429        let children_doc_el = a.children(children_root[0]).collect::<Vec<_>>();
430        assert_eq!(children_doc_el.len(), 2);
431        assert_eq!(a.children(children_doc_el[0]).count(), 1);
432        assert_eq!(a.children(children_doc_el[1]).count(), 0);
433    }
434
435    #[test]
436    fn test_vtree_processing_instruction() {
437        let mut xot = Xot::new();
438        let doc_a = xot.parse("<a><?pi?><?pi foo?></a>").unwrap();
439        let doc_b = xot.parse("<a><?pi?><?pi bar?></a>").unwrap();
440
441        let comparison = Comparison::new(&xot, doc_a, doc_b);
442        let (a, b) = comparison.vtrees();
443
444        assert_eq!(a.nodes[2].node_signature_id, b.nodes[2].node_signature_id);
445        assert_ne!(a.nodes[3].node_signature_id, b.nodes[3].node_signature_id);
446    }
447
448    #[test]
449    fn test_vtree_comment() {
450        let mut xot = Xot::new();
451        let doc_a = xot.parse("<a><!--foo--><!--a--></a>").unwrap();
452        let doc_b = xot.parse("<a><!--foo--><!--b--></a>").unwrap();
453
454        let comparison = Comparison::new(&xot, doc_a, doc_b);
455        let (a, b) = comparison.vtrees();
456
457        assert_eq!(a.nodes[2].node_signature_id, b.nodes[2].node_signature_id);
458        assert_ne!(a.nodes[3].node_signature_id, b.nodes[3].node_signature_id);
459    }
460}