use std::ops::Range;
use xot::{Node, Xot};
use crate::comparison::{Comparison, EqualPair};
use crate::element::element_similarity;
use crate::partition::Overlap;
use crate::text::text_similarity;
const TEXT_SIMILARITY_THRESHOLD: f32 = 0.5;
const ELEMENT_SIMILARITY_THRESHOLD: f32 = 0.4;
type NonOverlap = (Range<usize>, Range<usize>);
impl Comparison {
pub(crate) fn find_updates(&mut self, xot: &Xot, overlaps: &[Overlap]) -> Vec<EqualPair> {
let mut result = Vec::new();
for (range_a, range_b) in
non_overlapping(overlaps, self.vtree_a.nodes.len(), self.vtree_b.nodes.len())
{
result.extend(self.find_updates_in_range(xot, range_a, range_b));
}
result
}
fn find_updates_in_range(
&self,
xot: &Xot,
range_a: Range<usize>,
range_b: Range<usize>,
) -> Vec<EqualPair> {
let mut ranges = vec![(range_a, range_b)];
let mut result = Vec::new();
while !ranges.is_empty() {
let (range_a, range_b) = ranges.pop().unwrap();
let similarity = self.find_most_similar_text(xot, &range_a, &range_b);
let mut found = false;
if let Some((pair, similarity)) = similarity {
if similarity >= TEXT_SIMILARITY_THRESHOLD {
result.push(pair);
update_ranges(&mut ranges, pair, &range_a, &range_b);
found = true
}
}
if !found {
let similarity = self.find_most_similar_element(xot, &range_a, &range_b);
if let Some((pair, similarity)) = similarity {
if similarity >= ELEMENT_SIMILARITY_THRESHOLD {
result.push(pair);
update_ranges(&mut ranges, pair, &range_a, &range_b);
}
}
}
}
result.sort_by_key(|EqualPair(a, _)| *a);
result
}
fn find_most_similar_text(
&self,
xot: &Xot,
range_a: &Range<usize>,
range_b: &Range<usize>,
) -> Option<(EqualPair, f32)> {
self.find_most_similar(
range_a,
range_b,
|node| xot.is_text(node),
|node_a, node_b| {
text_similarity(
xot.text(node_a).unwrap().get(),
xot.text(node_b).unwrap().get(),
)
},
)
}
fn find_most_similar_element(
&self,
xot: &Xot,
range_a: &Range<usize>,
range_b: &Range<usize>,
) -> Option<(EqualPair, f32)> {
self.find_most_similar(
range_a,
range_b,
|node| xot.is_element(node) && xot.first_child(node).is_none(),
|node_a, node_b| {
let element_a = xot.element(node_a).unwrap();
let element_b = xot.element(node_b).unwrap();
element_similarity(element_a, element_b)
},
)
}
fn find_most_similar<C, S>(
&self,
range_a: &Range<usize>,
range_b: &Range<usize>,
is_correct_value: C,
similarity: S,
) -> Option<(EqualPair, f32)>
where
C: Fn(Node) -> bool,
S: Fn(Node, Node) -> Option<f32>,
{
if range_a.is_empty() || range_b.is_empty() {
return None;
}
let mut max = 0f32;
let mut max_pair = None;
for a_id in range_a.clone() {
let node_a = self.get_node_a(a_id);
if !is_correct_value(node_a) {
continue;
}
for b_id in range_b.clone() {
let node_b = self.get_node_b(b_id);
if !is_correct_value(node_b) {
continue;
}
if let Some(similarity) = similarity(node_a, node_b) {
if similarity > max {
max = similarity;
max_pair = Some(EqualPair(a_id, b_id));
}
}
}
}
max_pair.map(|pair| (pair, max))
}
}
fn update_ranges(
ranges: &mut Vec<(Range<usize>, Range<usize>)>,
pair: EqualPair,
range_a: &Range<usize>,
range_b: &Range<usize>,
) {
let EqualPair(id_a, id_b) = pair;
let before_range_a = range_a.start..id_a;
let after_range_a = (id_a + 1)..range_a.end;
let before_range_b = range_b.start..id_b;
let after_range_b = (id_b + 1)..range_b.end;
if !before_range_a.is_empty() && !before_range_b.is_empty() {
ranges.push((before_range_a, before_range_b));
} else {
}
if !after_range_a.is_empty() && !after_range_b.is_empty() {
ranges.push((after_range_a, after_range_b));
}
}
fn non_overlapping(overlaps: &[Overlap], length_a: usize, length_b: usize) -> Vec<NonOverlap> {
let mut result = Vec::new();
let mut last_end_a = 0;
let mut last_end_b = 0;
for (range_a, range_b) in overlaps {
if last_end_a < range_a.start || last_end_b < range_b.start {
result.push((last_end_a..range_a.start, last_end_b..range_b.start));
}
last_end_a = range_a.end;
last_end_b = range_b.end;
}
if last_end_a < length_a || last_end_b < length_b {
result.push((last_end_a..length_a, last_end_b..length_b));
}
result
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_non_overlapping() {
let overlaps = vec![(0..1, 1..2), (2..3, 3..4)];
let result = non_overlapping(&overlaps, 4, 5);
assert_eq!(result, vec![(0..0, 0..1), (1..2, 2..3), (3..4, 4..5)]);
}
#[test]
fn test_non_overlapping_nothing() {
let overlaps = vec![(0..4, 0..4)];
let result = non_overlapping(&overlaps, 4, 4);
assert_eq!(result, vec![]);
}
#[test]
fn test_non_overlapping_everything() {
let overlaps = vec![];
let result = non_overlapping(&overlaps, 4, 5);
assert_eq!(result, vec![(0..4, 0..5)]);
}
#[test]
fn test_non_overlapping_middle() {
let overlaps = vec![(2..4, 3..5)];
let result = non_overlapping(&overlaps, 8, 8);
assert_eq!(result, vec![(0..2, 0..3), (4..8, 5..8)]);
}
#[test]
fn test_most_similar_xml() {
let mut xot = Xot::new();
let doc_a = xot
.parse(concat!(
"<c><p>hello</p><p>world</p><p>help</p><p>swirl</p></c>"
))
.unwrap();
let doc_b = xot
.parse(concat!("<c><p>hell</p><p>swipe</p></c>"))
.unwrap();
let comparison = Comparison::new(&xot, doc_a, doc_b);
let most_similar = comparison.find_most_similar_text(&xot, &(2..10), &(2..6));
assert_eq!(most_similar, Some((EqualPair(3, 3), 0.8)));
let most_similar = comparison.find_most_similar_text(&xot, &(4..10), &(2..6));
assert_eq!(most_similar, Some((EqualPair(7, 3), 0.75)));
}
#[test]
fn test_most_similar_elements_xml() {
let mut xot = Xot::new();
let doc_a = xot.parse(concat!("<r><a/><b/><c/></r>")).unwrap();
let doc_b = xot.parse(concat!(r#"<r><d/><b a="A"/><f/></r>"#)).unwrap();
let comparison = Comparison::new(&xot, doc_a, doc_b);
let most_similar = comparison.find_most_similar_element(&xot, &(2..5), &(2..5));
assert_eq!(most_similar, Some((EqualPair(3, 3), 0.44444445)));
}
#[test]
fn test_find_overlaps_paper() {
let mut xot = Xot::new();
let doc_a = xot
.parse(concat!(
"<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
"<chapter><title>Text 4</title><para>Text 5</para></chapter>",
"<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
"<chapter><title>Text 9</title><para>Text 10</para></chapter>",
"<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
))
.unwrap();
let doc_b = xot
.parse(concat!(
"
<book><chapter><para>Text 2</para></chapter>",
"<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
"<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
"<chapter><title>Text 9</title><para>Text 10</para></chapter>",
"<chapter><para>Text 12</para></chapter></book>"
))
.unwrap();
let mut comparison = Comparison::new(&xot, doc_a, doc_b);
let overlaps = comparison.partition();
let equal_pairs = comparison.find_updates(&xot, &overlaps);
assert_eq!(equal_pairs, [EqualPair(11, 9)]);
}
#[test]
fn test_find_element_overlaps() {
let mut xot = Xot::new();
let doc_a = xot.parse(concat!("<r><a/><b/><c/></r>")).unwrap();
let doc_b = xot.parse(concat!(r#"<r><d/><b a="A"/><f/></r>"#)).unwrap();
let mut comparison = Comparison::new(&xot, doc_a, doc_b);
let overlaps = comparison.partition();
let equal_pairs = comparison.find_updates(&xot, &overlaps);
assert_eq!(equal_pairs, [EqualPair(3, 3)]);
}
}