natural-xml-diff 0.2.0

Natural diffing between XML documents
Documentation
use xot::Xot;

use crate::comparison::{Comparison, EqualPair};
use crate::element::element_similarity;
use crate::vtree::Status;

const ELEMENT_SIMILARITY_THRESHOLD: f32 = 0.4;

impl Comparison {
    pub(crate) fn propagate(&mut self, xot: &Xot, equal_pairs: &[EqualPair]) -> Vec<EqualPair> {
        let mut propagated = Vec::new();
        let equal_pairs = self.equal_pairs_sorted_by_weight(equal_pairs);
        for equal_pair in equal_pairs {
            propagated.extend(self.propagate_ancestors(xot, equal_pair));
        }
        propagated
    }

    fn propagate_ancestors(&mut self, xot: &Xot, equal_pair: EqualPair) -> Vec<EqualPair> {
        let EqualPair(node_a_id, node_b_id) = equal_pair;
        let mut parent_a = self.vtree_a.nodes[node_a_id].parent_id;
        let mut parent_b = self.vtree_b.nodes[node_b_id].parent_id;

        let mut ancestor_pairs = Vec::new();

        while let (Some(current_parent_a), Some(current_parent_b)) = (parent_a, parent_b) {
            ancestor_pairs.push((current_parent_a, current_parent_b));
            parent_a = self.vtree_a.nodes[current_parent_a].parent_id;
            parent_b = self.vtree_b.nodes[current_parent_b].parent_id;
        }

        let mut propagated = Vec::new();
        // now set ancestors as equal if they have the same node signature
        // and they're not already declared equal by an earlier invocation
        for (ancestor_a_id, ancestor_b_id) in ancestor_pairs {
            let vnode_a = &mut self.vtree_a.nodes[ancestor_a_id];
            let vnode_b = &mut self.vtree_b.nodes[ancestor_b_id];
            if vnode_a.status == Status::Different && vnode_b.status == Status::Different {
                if vnode_a.node_signature_id == vnode_b.node_signature_id {
                    // if the node signatures are equal, we can propagate the equal status
                    vnode_a.status = Status::Equal(ancestor_b_id as u32);
                    vnode_b.status = Status::Equal(ancestor_a_id as u32);
                    propagated.push(EqualPair(ancestor_a_id, ancestor_b_id));
                } else {
                    let (element_a, element_b) =
                        (xot.element(vnode_a.node), xot.element(vnode_b.node));
                    if let (Some(element_a), Some(element_b)) = (element_a, element_b) {
                        if let Some(element_similarity) = element_similarity(element_a, element_b) {
                            // if elements are similar enough, we consider them updates
                            if element_similarity > ELEMENT_SIMILARITY_THRESHOLD {
                                vnode_a.status = Status::Update(ancestor_b_id as u32);
                                vnode_b.status = Status::Update(ancestor_a_id as u32);
                                propagated.push(EqualPair(ancestor_a_id, ancestor_b_id));
                            }
                        }
                    }
                }
            } else {
                // if we already have equal nodes, we don't need to propagate further
                break;
            }
        }
        propagated
    }

    fn equal_pairs_sorted_by_weight(&self, equal_pairs: &[EqualPair]) -> Vec<EqualPair> {
        let mut result = equal_pairs.to_vec();
        result.sort_unstable_by_key(|EqualPair(id_a, _id_b)| {
            // if the nodes are equal, weight of a and b are the same as the
            // nodes are the same, so just use node_a to sort
            // if the nodes are updates, weight of a and b can be different,
            // but weight of a should still offer good guidance towards which
            // node to propagate first
            let vnode_a = &self.vtree_a.nodes[*id_a];
            vnode_a.weight
        });
        // we need the biggest weights first
        result.reverse();
        result
    }

    // it's possible for propagation to create inconsistent information about
    // parents if there is a structure change without a value change, or for
    // certain value changes. We check all pairs of equal nodes, and check
    // whether their parents are consistent, comparing tree a and tree b. If
    // there is a difference in parents, we assume that the pair isn't truly
    // equal and needs to be inserted/deleted.
    pub(crate) fn propagation_consistency(&mut self, equal_pairs: &[EqualPair]) {
        let mut inconsistent = Vec::new();
        for EqualPair(node_a_id, node_b_id) in equal_pairs {
            // take parents of propagated node
            let vnode_a = &self.vtree_a.nodes[*node_a_id];
            let vnode_b = &self.vtree_b.nodes[*node_b_id];
            let parent_a_id = vnode_a.parent_id;
            let parent_b_id = vnode_b.parent_id;
            // if both parents exist, check whether they are consistent
            if let (Some(parent_a_id), Some(parent_b_id)) = (parent_a_id, parent_b_id) {
                let parent_a = &self.vtree_a.nodes[parent_a_id];
                let parent_b = &self.vtree_b.nodes[parent_b_id];
                if let (
                    Status::Equal(equal_b_id) | Status::Update(equal_b_id),
                    Status::Equal(equal_a_id) | Status::Update(equal_a_id),
                ) = (parent_a.status, parent_b.status)
                {
                    // check whether parents are declared equal to the same node
                    if equal_a_id as usize != parent_a_id || equal_b_id as usize != parent_b_id {
                        // if not, they're inconsistent
                        inconsistent.push((node_a_id, node_b_id));
                    }
                }
            }
        }
        // now set inconsistent nodes to different
        for (node_a_id, node_b_id) in inconsistent {
            let vnode_a = &mut self.vtree_a.nodes[*node_a_id];
            let vnode_b = &mut self.vtree_b.nodes[*node_b_id];
            vnode_a.status = Status::Different;
            vnode_b.status = Status::Different;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use xot::Xot;

    use crate::partition::equal_pairs;

    #[test]
    fn test_propagate() {
        let mut xot = Xot::new();
        // example taken from the jndiff paper, page 11
        let doc_a = xot
            .parse(concat!(
                "<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
                "<chapter><title>Text 4</title><para>Text 5</para></chapter>",
                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
                "<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
            ))
            .unwrap();
        let doc_b = xot
            .parse(concat!(
                "<book><chapter><para>Text 2</para></chapter>",
                "<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
                "<chapter><para>Text 12</para></chapter></book>"
            ))
            .unwrap();

        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
        let overlaps = comparison.partition();
        let equal_pairs = equal_pairs(&overlaps);
        comparison.update_status(&equal_pairs, Status::Equal);

        comparison.propagate(&xot, &equal_pairs);

        assert_eq!(comparison.vtree_a.nodes[1].status, Status::Equal(1));
        assert_eq!(comparison.vtree_a.nodes[2].status, Status::Equal(2));
        assert_eq!(comparison.vtree_a.nodes[7].status, Status::Equal(5));
        assert_eq!(comparison.vtree_a.nodes[24].status, Status::Equal(24));
        assert_eq!(comparison.vtree_b.nodes[1].status, Status::Equal(1));
        assert_eq!(comparison.vtree_b.nodes[2].status, Status::Equal(2));
        assert_eq!(comparison.vtree_b.nodes[5].status, Status::Equal(7));
        assert_eq!(comparison.vtree_b.nodes[24].status, Status::Equal(24));
    }
}