natural-xml-diff 0.1.0

Natural diffing between XML documents
Documentation
use std::ops::Range;
use xot::{Node, Xot};

use crate::comparison::{Comparison, EqualPair};
use crate::element::element_similarity;
use crate::partition::Overlap;
use crate::text::text_similarity;

// how similar two strings need to be to be considered updates
const TEXT_SIMILARITY_THRESHOLD: f32 = 0.5;
// how similar two elements need to be to be considered updates
const ELEMENT_SIMILARITY_THRESHOLD: f32 = 0.4;

type NonOverlap = (Range<usize>, Range<usize>);

impl Comparison {
    pub(crate) fn find_updates(&mut self, xot: &Xot, overlaps: &[Overlap]) -> Vec<EqualPair> {
        let mut result = Vec::new();
        for (range_a, range_b) in
            non_overlapping(overlaps, self.vtree_a.nodes.len(), self.vtree_b.nodes.len())
        {
            result.extend(self.find_updates_in_range(xot, range_a, range_b));
        }
        result
    }

    fn find_updates_in_range(
        &self,
        xot: &Xot,
        range_a: Range<usize>,
        range_b: Range<usize>,
    ) -> Vec<EqualPair> {
        let mut ranges = vec![(range_a, range_b)];
        let mut result = Vec::new();
        while !ranges.is_empty() {
            let (range_a, range_b) = ranges.pop().unwrap();
            let similarity = self.find_most_similar_text(xot, &range_a, &range_b);
            let mut found = false;
            if let Some((pair, similarity)) = similarity {
                if similarity >= TEXT_SIMILARITY_THRESHOLD {
                    result.push(pair);
                    update_ranges(&mut ranges, pair, &range_a, &range_b);
                    found = true
                }
            }
            // if we didn't find any text updates, we look for element
            // updates. We consider text updates to weigh more strongly
            // than any element updates as they carry far more semantic weight
            if !found {
                let similarity = self.find_most_similar_element(xot, &range_a, &range_b);
                if let Some((pair, similarity)) = similarity {
                    if similarity >= ELEMENT_SIMILARITY_THRESHOLD {
                        result.push(pair);
                        update_ranges(&mut ranges, pair, &range_a, &range_b);
                    }
                }
            }
        }
        result.sort_by_key(|EqualPair(a, _)| *a);
        result
    }

    // XXX do we want to predispose this to longest similar strings outweighing
    // shorter similar strings?
    fn find_most_similar_text(
        &self,
        xot: &Xot,
        range_a: &Range<usize>,
        range_b: &Range<usize>,
    ) -> Option<(EqualPair, f32)> {
        self.find_most_similar(
            range_a,
            range_b,
            |node| xot.is_text(node),
            |node_a, node_b| {
                text_similarity(
                    xot.text(node_a).unwrap().get(),
                    xot.text(node_b).unwrap().get(),
                )
            },
        )
    }

    fn find_most_similar_element(
        &self,
        xot: &Xot,
        range_a: &Range<usize>,
        range_b: &Range<usize>,
    ) -> Option<(EqualPair, f32)> {
        // we only consider elements without children for updates
        // elements with children are considered later during propagation
        self.find_most_similar(
            range_a,
            range_b,
            |node| xot.is_element(node) && xot.first_child(node).is_none(),
            |node_a, node_b| {
                let element_a = xot.element(node_a).unwrap();
                let element_b = xot.element(node_b).unwrap();
                element_similarity(element_a, element_b)
            },
        )
    }

    fn find_most_similar<C, S>(
        &self,
        range_a: &Range<usize>,
        range_b: &Range<usize>,
        is_correct_value: C,
        similarity: S,
    ) -> Option<(EqualPair, f32)>
    where
        C: Fn(Node) -> bool,
        S: Fn(Node, Node) -> Option<f32>,
    {
        if range_a.is_empty() || range_b.is_empty() {
            return None;
        }
        let mut max = 0f32;
        let mut max_pair = None;

        // XXX could create a cache for similarity scores, though
        // finding similarity seems fast enough for now
        for a_id in range_a.clone() {
            let node_a = self.get_node_a(a_id);
            if !is_correct_value(node_a) {
                continue;
            }
            for b_id in range_b.clone() {
                let node_b = self.get_node_b(b_id);
                if !is_correct_value(node_b) {
                    continue;
                }
                if let Some(similarity) = similarity(node_a, node_b) {
                    if similarity > max {
                        max = similarity;
                        max_pair = Some(EqualPair(a_id, b_id));
                    }
                }
            }
        }
        max_pair.map(|pair| (pair, max))
    }
}

fn update_ranges(
    ranges: &mut Vec<(Range<usize>, Range<usize>)>,
    pair: EqualPair,
    range_a: &Range<usize>,
    range_b: &Range<usize>,
) {
    let EqualPair(id_a, id_b) = pair;
    let before_range_a = range_a.start..id_a;
    let after_range_a = (id_a + 1)..range_a.end;
    let before_range_b = range_b.start..id_b;
    let after_range_b = (id_b + 1)..range_b.end;
    if !before_range_a.is_empty() && !before_range_b.is_empty() {
        ranges.push((before_range_a, before_range_b));
    } else {
    }
    if !after_range_a.is_empty() && !after_range_b.is_empty() {
        ranges.push((after_range_a, after_range_b));
    }
}

fn non_overlapping(overlaps: &[Overlap], length_a: usize, length_b: usize) -> Vec<NonOverlap> {
    let mut result = Vec::new();
    let mut last_end_a = 0;
    let mut last_end_b = 0;
    for (range_a, range_b) in overlaps {
        if last_end_a < range_a.start || last_end_b < range_b.start {
            result.push((last_end_a..range_a.start, last_end_b..range_b.start));
        }
        last_end_a = range_a.end;
        last_end_b = range_b.end;
    }
    if last_end_a < length_a || last_end_b < length_b {
        result.push((last_end_a..length_a, last_end_b..length_b));
    }
    result
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_non_overlapping() {
        let overlaps = vec![(0..1, 1..2), (2..3, 3..4)];
        let result = non_overlapping(&overlaps, 4, 5);
        assert_eq!(result, vec![(0..0, 0..1), (1..2, 2..3), (3..4, 4..5)]);
    }

    #[test]
    fn test_non_overlapping_nothing() {
        let overlaps = vec![(0..4, 0..4)];
        let result = non_overlapping(&overlaps, 4, 4);
        assert_eq!(result, vec![]);
    }

    #[test]
    fn test_non_overlapping_everything() {
        let overlaps = vec![];
        let result = non_overlapping(&overlaps, 4, 5);
        assert_eq!(result, vec![(0..4, 0..5)]);
    }

    #[test]
    fn test_non_overlapping_middle() {
        let overlaps = vec![(2..4, 3..5)];
        let result = non_overlapping(&overlaps, 8, 8);
        assert_eq!(result, vec![(0..2, 0..3), (4..8, 5..8)]);
    }

    #[test]
    fn test_most_similar_xml() {
        let mut xot = Xot::new();
        let doc_a = xot
            .parse(concat!(
                "<c><p>hello</p><p>world</p><p>help</p><p>swirl</p></c>"
            ))
            .unwrap();
        let doc_b = xot
            .parse(concat!("<c><p>hell</p><p>swipe</p></c>"))
            .unwrap();
        let comparison = Comparison::new(&xot, doc_a, doc_b);
        let most_similar = comparison.find_most_similar_text(&xot, &(2..10), &(2..6));
        assert_eq!(most_similar, Some((EqualPair(3, 3), 0.8)));
        let most_similar = comparison.find_most_similar_text(&xot, &(4..10), &(2..6));
        assert_eq!(most_similar, Some((EqualPair(7, 3), 0.75)));
    }

    #[test]
    fn test_most_similar_elements_xml() {
        let mut xot = Xot::new();
        let doc_a = xot.parse(concat!("<r><a/><b/><c/></r>")).unwrap();
        let doc_b = xot.parse(concat!(r#"<r><d/><b a="A"/><f/></r>"#)).unwrap();
        let comparison = Comparison::new(&xot, doc_a, doc_b);
        let most_similar = comparison.find_most_similar_element(&xot, &(2..5), &(2..5));
        assert_eq!(most_similar, Some((EqualPair(3, 3), 0.44444445)));
    }

    #[test]
    fn test_find_overlaps_paper() {
        let mut xot = Xot::new();
        // example taken from the jndiff paper, page 11
        let doc_a = xot
            .parse(concat!(
                "<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
                "<chapter><title>Text 4</title><para>Text 5</para></chapter>",
                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
                "<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
            ))
            .unwrap();
        let doc_b = xot
            .parse(concat!(
                "
            <book><chapter><para>Text 2</para></chapter>",
                "<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
                "<chapter><para>Text 12</para></chapter></book>"
            ))
            .unwrap();

        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
        let overlaps = comparison.partition();
        let equal_pairs = comparison.find_updates(&xot, &overlaps);

        // Text 5 and Text 25
        assert_eq!(equal_pairs, [EqualPair(11, 9)]);
    }

    #[test]
    fn test_find_element_overlaps() {
        let mut xot = Xot::new();
        let doc_a = xot.parse(concat!("<r><a/><b/><c/></r>")).unwrap();
        let doc_b = xot.parse(concat!(r#"<r><d/><b a="A"/><f/></r>"#)).unwrap();
        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
        let overlaps = comparison.partition();
        let equal_pairs = comparison.find_updates(&xot, &overlaps);

        assert_eq!(equal_pairs, [EqualPair(3, 3)]);
    }
}