natural-xml-diff 0.2.0

Natural diffing between XML documents
Documentation
use ahash::HashSet;
use xot::{Attributes, Element};

// same value weight means we count same attribute values stronger
// than just the same attribute names
const IDENTICAL_VALUE_WEIGHT: f32 = 4.0;

pub(crate) fn element_similarity(element_a: &Element, element_b: &Element) -> Option<f32> {
    if element_a.name() != element_b.name() {
        return None;
    }
    let attributes_a = element_a.attributes();
    let attributes_b = element_b.attributes();
    // let depth_a = xot.ancestors(vnode_a.node_id).count();
    // let depth_b = xot.ancestors(vnode_b.node_id).count();
    Some(attributes_similarity(attributes_a, attributes_b))
}

fn attributes_similarity(attributes_a: &Attributes, attributes_b: &Attributes) -> f32 {
    // this is a sanity check: in reality these elements
    // will considered to be identical by the earlier partitioning
    // algorithm
    if attributes_a.is_empty() && attributes_b.is_empty() {
        return 1.0;
    }

    let keys_a = HashSet::from_iter(attributes_a.keys());
    let keys_b = HashSet::from_iter(attributes_b.keys());
    let identical_keys = keys_a.intersection(&keys_b).count() as f32;

    let mut identical_values: f32 = 0.0;
    for key in keys_a.intersection(&keys_b) {
        let value_a = attributes_a.get(*key).unwrap();
        let value_b = attributes_b.get(*key).unwrap();
        if value_a == value_b {
            identical_values += 1.0;
        }
    }

    let max_attributes = std::cmp::max(attributes_a.len(), attributes_b.len()) as f32;

    (identical_keys + identical_values * IDENTICAL_VALUE_WEIGHT + 4.0)
        / (max_attributes + max_attributes * IDENTICAL_VALUE_WEIGHT + 4.0)
}

#[cfg(test)]
mod tests {
    use super::*;

    use xot::Xot;

    #[test]
    fn test_both_empty() {
        let attributes_a = Attributes::new();
        let attributes_b = Attributes::new();
        assert_eq!(attributes_similarity(&attributes_a, &attributes_b), 1.0);
    }

    #[test]
    fn test_one_empty() {
        let mut xot = Xot::new();
        let name = xot.add_name("a");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name, "b".to_string());
        let attributes_b = Attributes::new();
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b),
            0.44444445
        );
    }

    #[test]
    fn test_same_key_different_value() {
        let mut xot = Xot::new();
        let name = xot.add_name("a");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name, "b".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name, "c".to_string());
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b),
            0.5555556
        );
    }

    #[test]
    fn test_same_key_same_value() {
        // note: the partitioning algorithm should have found this one already
        let mut xot = Xot::new();
        let name = xot.add_name("a");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name, "b".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name, "b".to_string());
        assert_eq!(attributes_similarity(&attributes_a, &attributes_b), 1.0);
    }

    #[test]
    fn test_2_same_keys_different_values() {
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_a, "D".to_string());
        attributes_b.insert(name_b, "E".to_string());
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b,),
            0.42857143
        );
    }

    #[test]
    fn test_2_same_keys_same_values() {
        // the partitioning algorithm should have found this one already
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_a, "A".to_string());
        attributes_b.insert(name_b, "B".to_string());
        assert_eq!(attributes_similarity(&attributes_a, &attributes_b,), 1.0);
    }

    #[test]
    fn test_2_same_keys_half_same_values() {
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_b, "B".to_string());
        // we consider this enough overlap
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b),
            0.64285713
        );
    }

    #[test]
    fn test_2_half_different_keys_half_same_values() {
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let name_c = xot.add_name("c");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_c, "C".to_string());
        attributes_b.insert(name_b, "B".to_string());
        // we consider this enough overlap
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b),
            0.64285713
        );
    }

    #[test]
    fn test_3_same_keys_different_values() {
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let name_c = xot.add_name("c");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        attributes_a.insert(name_c, "C".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_a, "D".to_string());
        attributes_b.insert(name_b, "E".to_string());
        attributes_b.insert(name_c, "F".to_string());
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b,),
            0.36842105
        );
    }

    #[test]
    fn test_3_same_keys_2_different_values() {
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let name_c = xot.add_name("c");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        attributes_a.insert(name_c, "C".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_a, "A".to_string());
        attributes_b.insert(name_b, "E".to_string());
        attributes_b.insert(name_c, "F".to_string());
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b,),
            0.57894737
        );
    }

    #[test]
    fn test_3_same_keys_1_different_values() {
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let name_c = xot.add_name("c");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        attributes_a.insert(name_c, "C".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_a, "A".to_string());
        attributes_b.insert(name_b, "B".to_string());
        attributes_b.insert(name_c, "F".to_string());
        assert_eq!(
            attributes_similarity(&attributes_a, &attributes_b,),
            0.7894737
        );
    }

    #[test]
    fn test_3_same_keys_same_values() {
        // the partitioning algorithm should have found this already
        let mut xot = Xot::new();
        let name_a = xot.add_name("a");
        let name_b = xot.add_name("b");
        let name_c = xot.add_name("c");
        let mut attributes_a = Attributes::new();
        attributes_a.insert(name_a, "A".to_string());
        attributes_a.insert(name_b, "B".to_string());
        attributes_a.insert(name_c, "C".to_string());
        let mut attributes_b = Attributes::new();
        attributes_b.insert(name_a, "A".to_string());
        attributes_b.insert(name_b, "B".to_string());
        attributes_b.insert(name_c, "C".to_string());
        assert_eq!(attributes_similarity(&attributes_a, &attributes_b,), 1.0);
    }
}