1use std::collections::HashSet;
8use std::rc::Rc;
9
10use crate::matching::constants::*;
11use crate::matching::{CandidateEntry, DfsTreeIterator, Matching};
12use crate::measure::Measure;
13use crate::node::{MatchArea, MatchType, NodeInner, NodeRef, XmlContent};
14
15pub struct HeuristicMatching {
25 base_root: Option<NodeRef>,
27 branch_root: Option<NodeRef>,
29 measure: Measure,
31 is_left: bool,
33 copy_threshold: i32,
36}
37
38impl Default for HeuristicMatching {
39 fn default() -> Self {
40 Self::new()
41 }
42}
43
44impl HeuristicMatching {
45 pub fn new() -> Self {
47 Self::with_copy_threshold(COPY_THRESHOLD)
48 }
49
50 pub fn with_copy_threshold(copy_threshold: i32) -> Self {
52 HeuristicMatching {
53 base_root: None,
54 branch_root: None,
55 measure: Measure::new(),
56 is_left: true,
57 copy_threshold,
58 }
59 }
60
61 pub fn with_trees(base: &NodeRef, branch: &NodeRef) -> Self {
63 let mut matching = Self::new();
64 matching.build_matching(base, branch);
65 matching
66 }
67
68 pub fn set_is_left(&mut self, is_left: bool) {
70 self.is_left = is_left;
71 }
72
73 pub fn copy_threshold(&self) -> i32 {
75 self.copy_threshold
76 }
77
78 fn match_subtrees(&mut self, base: &NodeRef, branch: &NodeRef) {
80 let candidates = self.find_candidates(base, branch);
82
83 let mut best_count = 0;
85 let mut best_candidates: Vec<CandidateEntry> = Vec::new();
86
87 for candidate in candidates {
88 let this_count = self.dfs_match_count(&candidate.candidate, branch, 0);
89 if this_count == best_count {
90 best_candidates.push(candidate);
91 } else if this_count > best_count {
92 best_count = this_count;
93 best_candidates.clear();
94 best_candidates.push(candidate);
95 }
96 }
97
98 let best = self.get_best_candidate(branch, &mut best_candidates, best_count);
100 let mut stop_nodes: Vec<NodeRef> = Vec::new();
101
102 if let Some(best_entry) = best {
103 let match_area = Rc::new(MatchArea::new(Rc::downgrade(branch)));
104 self.dfs_match_add(&best_entry.candidate, branch, &mut stop_nodes, &match_area);
105 } else {
106 let branch_borrowed = branch.borrow();
108 for child in branch_borrowed.children() {
109 stop_nodes.push(child.clone());
110 }
111 }
112
113 for stop_node in stop_nodes {
115 self.match_subtrees(base, &stop_node);
116 }
117 }
118
119 fn find_candidates(&mut self, tree: &NodeRef, key: &NodeRef) -> Vec<CandidateEntry> {
121 let mut candidates = Vec::new();
122 self.find_exact_matches(tree, key, &mut candidates);
123 if candidates.is_empty() {
124 self.find_fuzzy_matches(tree, key, &mut candidates);
125 }
126 candidates
127 }
128
129 fn find_exact_matches(
131 &self,
132 tree: &NodeRef,
133 key: &NodeRef,
134 candidates: &mut Vec<CandidateEntry>,
135 ) {
136 let key_borrowed = key.borrow();
137 let key_content = key_borrowed.content();
138
139 for cand in DfsTreeIterator::new(tree.clone()) {
140 let cand_borrowed = cand.borrow();
141 if let (Some(key_c), Some(cand_c)) = (key_content, cand_borrowed.content()) {
142 if cand_c.content_equals(key_c) {
143 drop(cand_borrowed);
144 candidates.push(CandidateEntry::new(cand, 0.0, -1.0));
145 }
146 }
147 }
148 }
149
150 fn find_fuzzy_matches(
152 &mut self,
153 tree: &NodeRef,
154 key: &NodeRef,
155 candidates: &mut Vec<CandidateEntry>,
156 ) {
157 let mut cset: Vec<CandidateEntry> = Vec::new();
158 let mut cutoff = 2.0 * MAX_FUZZY_MATCH;
159
160 for cand in DfsTreeIterator::new(tree.clone()) {
161 let lrd_dist = self
162 .get_distance_of_left(key, &cand)
163 .min(self.get_distance_of_right(key, &cand))
164 .min(self.measure.child_list_distance(key, &cand));
165
166 let content_dist = self.measure.get_distance(Some(&cand), Some(key));
167 let min_dist = lrd_dist.min(content_dist);
168
169 if min_dist < 2.0 * MAX_FUZZY_MATCH {
170 cset.push(CandidateEntry::new(cand, min_dist, lrd_dist));
171 cutoff = cutoff.min(2.0 * min_dist);
172 }
173 }
174
175 cset.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap());
177
178 for entry in cset {
180 if entry.distance > cutoff {
181 break;
182 }
183 candidates.push(entry);
184 }
185 }
186
187 fn dfs_match_count(&self, a: &NodeRef, b: &NodeRef, count: i32) -> i32 {
189 let a_borrowed = a.borrow();
190 let b_borrowed = b.borrow();
191
192 let mut children_match = a_borrowed.child_count() == b_borrowed.child_count();
193
194 if children_match {
195 for i in 0..a_borrowed.child_count() {
197 let a_child = &a_borrowed.children()[i];
198 let b_child = &b_borrowed.children()[i];
199
200 let a_child_content = a_child.borrow().content().cloned();
201 let b_child_content = b_child.borrow().content().cloned();
202
203 if let (Some(ac), Some(bc)) = (a_child_content, b_child_content) {
204 if !ac.content_equals(&bc) {
205 children_match = false;
206 break;
207 }
208 } else {
209 children_match = false;
210 break;
211 }
212 }
213 }
214
215 let mut result = count + 1;
216
217 if children_match && a_borrowed.child_count() > 0 {
218 let a_child = a_borrowed.children()[0].clone();
219 let b_child = b_borrowed.children()[0].clone();
220 drop(a_borrowed);
221 drop(b_borrowed);
222 result += self.dfs_match_count(&a_child, &b_child, 0);
223 result += self.dfs_match_count_remaining(a, b, 1);
224 }
225
226 result
227 }
228
229 fn dfs_match_count_remaining(&self, a: &NodeRef, b: &NodeRef, start: usize) -> i32 {
231 let a_borrowed = a.borrow();
232 let b_borrowed = b.borrow();
233
234 if start < a_borrowed.child_count() {
235 let a_child = a_borrowed.children()[start].clone();
236 let b_child = b_borrowed.children()[start].clone();
237 drop(a_borrowed);
238 drop(b_borrowed);
239 let result = self.dfs_match_count(&a_child, &b_child, 0);
240 result + self.dfs_match_count_remaining(a, b, start + 1)
241 } else {
242 0
243 }
244 }
245
246 fn dfs_match_add(
248 &mut self,
249 a: &NodeRef,
250 b: &NodeRef,
251 stop_nodes: &mut Vec<NodeRef>,
252 match_area: &Rc<MatchArea>,
253 ) {
254 self.add_matching(b, a);
256
257 {
259 let a_borrowed = a.borrow();
260 if let Some(content) = a_borrowed.content() {
261 match_area.add_info_bytes(content.info_size());
262 }
263 }
264
265 b.borrow_mut().set_match_area(Some(match_area.clone()));
267
268 let a_borrowed = a.borrow();
269 let b_borrowed = b.borrow();
270
271 let mut children_match = a_borrowed.child_count() == b_borrowed.child_count();
272
273 if children_match {
274 for i in 0..a_borrowed.child_count() {
275 let a_child = &a_borrowed.children()[i];
276 let b_child = &b_borrowed.children()[i];
277
278 let a_child_content = a_child.borrow().content().cloned();
279 let b_child_content = b_child.borrow().content().cloned();
280
281 if let (Some(ac), Some(bc)) = (a_child_content, b_child_content) {
282 if !ac.content_equals(&bc) {
283 children_match = false;
284 break;
285 }
286 } else {
287 children_match = false;
288 break;
289 }
290 }
291 }
292
293 if !children_match {
294 for child in b_borrowed.children() {
296 stop_nodes.push(child.clone());
297 }
298 } else {
299 match_area.add_info_bytes(a_borrowed.child_count() as i32 * EDGE_BYTES);
301
302 let children_a: Vec<NodeRef> = a_borrowed.children().to_vec();
303 let children_b: Vec<NodeRef> = b_borrowed.children().to_vec();
304 drop(a_borrowed);
305 drop(b_borrowed);
306
307 for (a_child, b_child) in children_a.iter().zip(children_b.iter()) {
308 self.dfs_match_add(a_child, b_child, stop_nodes, match_area);
309 }
310 }
311 }
312
313 fn get_best_candidate(
315 &mut self,
316 branch: &NodeRef,
317 best_candidates: &mut [CandidateEntry],
318 best_count: i32,
319 ) -> Option<CandidateEntry> {
320 if best_candidates.is_empty() {
321 return None;
322 }
323
324 if best_candidates.len() > 1 {
326 let branch_borrowed = branch.borrow();
328 if let Some(left_sibling) = NodeInner::left_sibling_of_ref(branch) {
329 let left_base_match = {
330 let left_borrowed = left_sibling.borrow();
331 left_borrowed.get_base_match().and_then(|w| w.upgrade())
332 };
333
334 if let Some(left_base) = left_base_match {
335 for candidate in best_candidates.iter() {
336 let cand_left = NodeInner::left_sibling_of_ref(&candidate.candidate);
337 if let Some(cl) = cand_left {
338 if cl.borrow().id() == left_base.borrow().id() {
339 return Some(candidate.clone());
340 }
341 }
342 }
343 }
344 }
345 drop(branch_borrowed);
346
347 for candidate in best_candidates.iter_mut() {
349 if candidate.left_right_down < 0.0 {
350 let child_dist = self
351 .measure
352 .child_list_distance(&candidate.candidate, branch);
353 let left_dist = self.get_distance_of_left(&candidate.candidate, branch);
354 let right_dist = self.get_distance_of_right(&candidate.candidate, branch);
355 candidate.left_right_down = child_dist.min(left_dist).min(right_dist);
356 }
357 }
358
359 best_candidates
361 .sort_by(|a, b| a.left_right_down.partial_cmp(&b.left_right_down).unwrap());
362 }
363
364 let best = best_candidates.first().cloned();
365
366 if let Some(ref b) = best {
368 if best_count == 1 && b.left_right_down.min(b.distance) > 0.1 {
369 return None;
370 }
371 }
372
373 best
374 }
375
376 fn match_similar_unmatched(&mut self, _base: &NodeRef, branch: &NodeRef) {
378 let children: Vec<NodeRef> = branch.borrow().children().to_vec();
380 for child in &children {
381 self.match_similar_unmatched(_base, child);
382 }
383
384 let base_match = {
385 let branch_borrowed = branch.borrow();
386 branch_borrowed.get_base_match().and_then(|w| w.upgrade())
387 };
388
389 if let Some(base_match) = base_match {
390 let base_child_count = base_match.borrow().child_count();
391 if base_child_count == 0 {
392 return;
393 }
394
395 let branch_children: Vec<NodeRef> = branch.borrow().children().to_vec();
396 let branch_child_count = branch_children.len();
397
398 for (i, n) in branch_children.iter().enumerate() {
399 if n.borrow().get_base_match().is_some() {
401 continue;
402 }
403
404 let last_base_child = base_child_count - 1;
405
406 if i == 0 {
408 let first_base_child = base_match.borrow().child(0).cloned();
409 if let Some(fbc) = first_base_child {
410 if !self.is_matched(&fbc) {
411 self.add_matching_if_same_type(n, &fbc);
412 continue;
413 }
414 }
415 } else if i == branch_child_count - 1 {
416 let last_base = base_match.borrow().child(last_base_child).cloned();
417 if let Some(lbc) = last_base {
418 if !self.is_matched(&lbc) {
419 self.add_matching_if_same_type(n, &lbc);
420 continue;
421 }
422 }
423 }
424
425 if i > 0 {
427 let pred = &branch_children[i - 1];
428 let x = pred.borrow().get_base_match().and_then(|w| w.upgrade());
429 if let Some(x) = x {
430 if x.borrow().has_right_sibling() {
431 let right_sib = NodeInner::right_sibling_of_ref(&x);
432 if let Some(rs) = right_sib {
433 if !self.is_matched(&rs) {
434 self.add_matching_if_same_type(n, &rs);
435 continue;
436 }
437 }
438 }
439 }
440 }
441
442 if i < branch_child_count - 1 {
444 let succ = &branch_children[i + 1];
445 let x = succ.borrow().get_base_match().and_then(|w| w.upgrade());
446 if let Some(x) = x {
447 if x.borrow().has_left_sibling() {
448 let left_sib = NodeInner::left_sibling_of_ref(&x);
449 if let Some(ls) = left_sib {
450 if !self.is_matched(&ls) {
451 self.add_matching_if_same_type(n, &ls);
452 continue;
453 }
454 }
455 }
456 }
457 }
458 }
459 }
460 }
461
462 fn remove_small_copies(&mut self, root: &NodeRef) {
464 let base = {
465 let root_borrowed = root.borrow();
466 root_borrowed.get_base_match().and_then(|w| w.upgrade())
467 };
468
469 if let Some(base) = base {
470 let match_count = self.get_match_count(&base);
471
472 if match_count > 1 {
473 let matches = self.get_matches(&base);
474 let mut deletia: HashSet<u64> = HashSet::new();
475
476 for copy in &matches {
478 let info_bytes = {
479 let copy_borrowed = copy.borrow();
480 copy_borrowed
481 .match_area()
482 .map(|ma| ma.info_bytes())
483 .unwrap_or(0)
484 };
485
486 if info_bytes < self.copy_threshold {
487 deletia.insert(copy.borrow().id());
488 }
489 }
490
491 if deletia.len() == matches.len() {
493 let mut max_copy_bytes = 0;
494 let mut orig_instance: Option<NodeRef> = None;
495
496 for copy in &matches {
497 let copy_bytes = self.calc_copy_context_bytes(copy);
498 if copy_bytes > max_copy_bytes {
499 max_copy_bytes = copy_bytes;
500 orig_instance = Some(copy.clone());
501 }
502 }
503
504 if let Some(orig) = orig_instance {
505 deletia.remove(&orig.borrow().id());
506 if let Some(ma) = orig.borrow().match_area() {
508 ma.add_info_bytes(self.copy_threshold + 1);
509 }
510 }
511 }
512
513 let match_areas_to_delete: Vec<Rc<MatchArea>> = deletia
515 .iter()
516 .filter_map(|id| {
517 matches
518 .iter()
519 .find(|c| c.borrow().id() == *id)
520 .and_then(|copy| copy.borrow().match_area().cloned())
521 })
522 .collect();
523
524 for ma in match_areas_to_delete {
526 self.del_match_area(&ma);
527 }
528 }
529 }
530
531 let children: Vec<NodeRef> = root.borrow().children().to_vec();
533 for child in children {
534 self.remove_small_copies(&child);
535 }
536 }
537
538 fn calc_copy_context_bytes(&self, copy: &NodeRef) -> i32 {
540 let match_area = copy.borrow().match_area().cloned();
541 let mut copy_bytes = match_area.as_ref().map(|ma| ma.info_bytes()).unwrap_or(0);
542
543 if copy_bytes >= self.copy_threshold {
544 return copy_bytes;
545 }
546
547 let copy_root = match_area.as_ref().and_then(|ma| ma.root_strong());
548 if copy_root.is_none() {
549 return copy_bytes;
550 }
551
552 let copy_root = copy_root.unwrap();
553 let copy_base = {
554 let cr_borrowed = copy_root.borrow();
555 cr_borrowed.get_base_match().and_then(|w| w.upgrade())
556 };
557
558 if copy_base.is_none() {
559 return copy_bytes;
560 }
561
562 let mut curr_copy = copy_root.clone();
564 let mut curr_base = copy_base.clone().unwrap();
565
566 while copy_bytes < self.copy_threshold {
567 let left_copy = NodeInner::left_sibling_of_ref(&curr_copy);
568 let left_base = NodeInner::left_sibling_of_ref(&curr_base);
569
570 match (left_copy, left_base) {
571 (Some(lc), Some(lb)) => {
572 let lc_base = lc.borrow().get_base_match().and_then(|w| w.upgrade());
573 if lc_base.map(|b| b.borrow().id()) == Some(lb.borrow().id()) {
574 if let Some(ma) = lc.borrow().match_area() {
575 copy_bytes += ma.info_bytes();
576 }
577 curr_copy = lc;
578 curr_base = lb;
579 } else {
580 break;
581 }
582 }
583 _ => break,
584 }
585 }
586
587 let copy_root2 = match_area.as_ref().and_then(|ma| ma.root_strong());
589 if let Some(cr2) = copy_root2 {
590 curr_copy = cr2;
591 curr_base = copy_base.unwrap();
592
593 while copy_bytes < self.copy_threshold {
594 let right_copy = NodeInner::right_sibling_of_ref(&curr_copy);
595 let right_base = NodeInner::right_sibling_of_ref(&curr_base);
596
597 match (right_copy, right_base) {
598 (Some(rc), Some(rb)) => {
599 let rc_base = rc.borrow().get_base_match().and_then(|w| w.upgrade());
600 if rc_base.map(|b| b.borrow().id()) == Some(rb.borrow().id()) {
601 if let Some(ma) = rc.borrow().match_area() {
602 copy_bytes += ma.info_bytes();
603 }
604 curr_copy = rc;
605 curr_base = rb;
606 } else {
607 break;
608 }
609 }
610 _ => break,
611 }
612 }
613 }
614
615 copy_bytes
616 }
617
618 fn set_match_types(&mut self, base: &NodeRef) {
620 let children: Vec<NodeRef> = base.borrow().children().to_vec();
622 for child in children {
623 self.set_match_types(&child);
624 }
625
626 let match_count = self.get_match_count(base);
627 if match_count > 1 {
628 let matches = self.get_matches(base);
630 let mut min_dist = i32::MAX;
631 let mut min_content_dist = f64::MAX;
632 let mut master: Option<NodeRef> = None;
633
634 for cand in &matches {
635 let dist = if self.exact_child_list_match(base, cand) {
636 0
637 } else {
638 self.measure
639 .matched_child_list_distance(base, cand, self.is_left)
640 };
641
642 if dist < min_dist {
643 min_dist = dist;
644 master = Some(cand.clone());
645 } else if dist == min_dist {
646 let c_dist = self.measure.get_distance(Some(base), Some(cand));
648 if master.is_none() || c_dist < min_content_dist {
649 min_content_dist = c_dist;
650 master = Some(cand.clone());
651 }
652 }
653 }
654
655 let mut removed_matchings: Vec<NodeRef> = Vec::new();
657
658 for cand in &matches {
659 if master.as_ref().map(|m| m.borrow().id()) == Some(cand.borrow().id()) {
660 continue;
661 }
662
663 let struct_match = self.exact_child_list_match(base, cand);
664 let cont_match = {
665 let cand_borrowed = cand.borrow();
666 let base_borrowed = base.borrow();
667 match (cand_borrowed.content(), base_borrowed.content()) {
668 (Some(cc), Some(bc)) => cc.content_equals(bc),
669 _ => false,
670 }
671 };
672
673 if !struct_match && !cont_match {
674 removed_matchings.push(cand.clone());
675 } else {
676 let mut match_type = MatchType::NONE;
677 if cont_match {
678 match_type |= MatchType::CONTENT;
679 }
680 if struct_match {
681 match_type |= MatchType::CHILDREN;
682 }
683 cand.borrow_mut().set_match_type(match_type);
684 }
685 }
686
687 for cand in removed_matchings {
689 if let Some(base_match) = cand.borrow().get_base_match().and_then(|w| w.upgrade()) {
690 self.del_matching(&cand, &base_match);
691 }
692 }
693 }
694 }
695
696 fn exact_child_list_match(&self, base: &NodeRef, branch: &NodeRef) -> bool {
698 let base_borrowed = base.borrow();
699 let branch_borrowed = branch.borrow();
700
701 if base_borrowed.child_count() != branch_borrowed.child_count() {
702 return false;
703 }
704
705 for i in 0..branch_borrowed.child_count() {
706 let branch_child = &branch_borrowed.children()[i];
707 let base_child = &base_borrowed.children()[i];
708
709 let branch_child_base = branch_child
710 .borrow()
711 .get_base_match()
712 .and_then(|w| w.upgrade());
713
714 match branch_child_base {
715 Some(bcb) if bcb.borrow().id() == base_child.borrow().id() => continue,
716 _ => return false,
717 }
718 }
719
720 true
721 }
722
723 fn get_distance_of_left(&mut self, a: &NodeRef, b: &NodeRef) -> f64 {
725 let a_borrowed = a.borrow();
726 let b_borrowed = b.borrow();
727
728 if a_borrowed.parent().upgrade().is_none() || b_borrowed.parent().upgrade().is_none() {
729 return crate::measure::MAX_DIST;
730 }
731
732 let a_pos = a_borrowed.child_pos();
733 let b_pos = b_borrowed.child_pos();
734
735 drop(a_borrowed);
736 drop(b_borrowed);
737
738 if a_pos > 0 && b_pos > 0 {
739 let a_left = NodeInner::left_sibling_of_ref(a);
740 let b_left = NodeInner::left_sibling_of_ref(b);
741
742 if let (Some(al), Some(bl)) = (a_left, b_left) {
743 return self.measure.get_distance(Some(&al), Some(&bl));
744 }
745 }
746
747 END_MATCH
748 }
749
750 fn get_distance_of_right(&mut self, a: &NodeRef, b: &NodeRef) -> f64 {
752 let a_borrowed = a.borrow();
753 let b_borrowed = b.borrow();
754
755 let a_parent = a_borrowed.parent().upgrade();
756 let b_parent = b_borrowed.parent().upgrade();
757
758 if a_parent.is_none() || b_parent.is_none() {
759 return crate::measure::MAX_DIST;
760 }
761
762 let a_pos = a_borrowed.child_pos() as usize;
763 let b_pos = b_borrowed.child_pos() as usize;
764 let a_parent_count = a_parent.as_ref().unwrap().borrow().child_count();
765 let b_parent_count = b_parent.as_ref().unwrap().borrow().child_count();
766
767 drop(a_borrowed);
768 drop(b_borrowed);
769
770 if a_pos < a_parent_count - 1 && b_pos < b_parent_count - 1 {
771 let a_right = NodeInner::right_sibling_of_ref(a);
772 let b_right = NodeInner::right_sibling_of_ref(b);
773
774 if let (Some(ar), Some(br)) = (a_right, b_right) {
775 return self.measure.get_distance(Some(&ar), Some(&br));
776 }
777 }
778
779 END_MATCH
780 }
781
782 fn add_matching_if_same_type(&mut self, branch: &NodeRef, base: &NodeRef) {
784 let branch_content = branch.borrow().content().cloned();
785 let base_content = base.borrow().content().cloned();
786
787 let same_type = matches!(
788 (&branch_content, &base_content),
789 (Some(XmlContent::Text(_)), Some(XmlContent::Text(_)))
790 | (Some(XmlContent::Element(_)), Some(XmlContent::Element(_)))
791 );
792
793 if same_type {
794 self.add_matching(branch, base);
795 }
796 }
797
798 fn add_matching(&mut self, branch: &NodeRef, base: &NodeRef) {
800 branch.borrow_mut().set_base_match(base);
801 branch.borrow_mut().set_match_type(MatchType::FULL);
802
803 let base_borrowed = base.borrow();
805 if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
806 left.borrow_mut().add_match(branch.clone());
807 }
808 }
809
810 fn del_matching(&mut self, branch: &NodeRef, base: &NodeRef) {
812 branch.borrow_mut().del_base_match();
813
814 let base_borrowed = base.borrow();
816 if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
817 left.borrow_mut().del_match(branch);
818 }
819 }
820
821 fn del_match_area(&mut self, match_area: &Rc<MatchArea>) {
823 if let Some(root) = match_area.root_strong() {
824 self.del_match_area_recursive(&root, match_area);
825 }
826 }
827
828 fn del_match_area_recursive(&mut self, node: &NodeRef, match_area: &Rc<MatchArea>) {
830 let should_process = {
833 let borrowed = node.borrow();
834 borrowed
835 .match_area()
836 .map(|ma| Rc::ptr_eq(ma, match_area))
837 .unwrap_or(false)
838 };
839
840 if should_process {
841 node.borrow_mut().set_match_area(None);
842
843 let base_match = {
845 let borrowed = node.borrow();
846 borrowed.get_base_match().and_then(|w| w.upgrade())
847 };
848 if let Some(base_match) = base_match {
849 self.del_matching(node, &base_match);
850 }
851
852 let children: Vec<NodeRef> = node.borrow().children().to_vec();
854 for child in children {
855 self.del_match_area_recursive(&child, match_area);
856 }
857 }
858 }
859
860 fn is_matched(&self, base: &NodeRef) -> bool {
862 self.get_match_count(base) > 0
863 }
864
865 fn get_match_count(&self, base: &NodeRef) -> usize {
867 let base_borrowed = base.borrow();
868 if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
869 left.borrow().match_count()
870 } else {
871 0
872 }
873 }
874
875 fn get_matches(&self, base: &NodeRef) -> Vec<NodeRef> {
877 let base_borrowed = base.borrow();
878 if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
879 left.borrow().matches().to_vec()
880 } else {
881 Vec::new()
882 }
883 }
884}
885
886impl Matching for HeuristicMatching {
887 fn build_matching(&mut self, base: &NodeRef, branch: &NodeRef) {
888 self.base_root = Some(base.clone());
889 self.branch_root = Some(branch.clone());
890
891 self.match_subtrees(base, branch);
892 self.remove_small_copies(branch);
893 self.match_similar_unmatched(base, branch);
894 self.set_match_types(base);
895
896 branch.borrow_mut().set_base_match(base);
898 branch.borrow_mut().set_match_type(MatchType::FULL);
899 }
900
901 fn get_area_stop_nodes(&self, stop_nodes: &mut Vec<NodeRef>, node: &NodeRef) {
902 let parent_area = node.borrow().match_area().cloned();
903
904 if parent_area.is_none() {
905 panic!("ASSERT FAILED: node has no match area");
906 }
907
908 let parent_area = parent_area.unwrap();
909 let mut children_in_same_area = true;
910
911 {
912 let node_borrowed = node.borrow();
913 for child in node_borrowed.children() {
914 let child_area = child.borrow().match_area().cloned();
915 if child_area.as_ref().map(|ca| Rc::ptr_eq(ca, &parent_area)) != Some(true) {
916 children_in_same_area = false;
917 break;
918 }
919 }
920
921 if node_borrowed.child_count() == 0 {
923 if let Some(base_match) = node_borrowed.get_base_match().and_then(|w| w.upgrade()) {
924 if base_match.borrow().child_count() != 0 {
925 children_in_same_area = false;
926 }
927 }
928 }
929 }
930
931 if !children_in_same_area {
932 stop_nodes.push(node.clone());
933 } else {
934 let children: Vec<NodeRef> = node.borrow().children().to_vec();
935 for child in children {
936 self.get_area_stop_nodes(stop_nodes, &child);
937 }
938 }
939 }
940
941 fn base_root(&self) -> Option<&NodeRef> {
942 self.base_root.as_ref()
943 }
944
945 fn branch_root(&self) -> Option<&NodeRef> {
946 self.branch_root.as_ref()
947 }
948}
949
950#[cfg(test)]
951mod tests {
952 use super::*;
953 use crate::node::{new_base_node, new_branch_node, XmlText};
954 use std::collections::HashMap;
955
956 fn make_element(name: &str) -> XmlContent {
957 XmlContent::Element(crate::node::XmlElement::new(
958 name.to_string(),
959 HashMap::new(),
960 ))
961 }
962
963 fn make_text(text: &str) -> XmlContent {
964 XmlContent::Text(XmlText::new(text))
965 }
966
967 #[test]
968 fn test_simple_matching() {
969 let base_root = new_base_node(Some(make_element("$ROOT$")));
972 let base_a = new_base_node(Some(make_element("root")));
973 let base_b = new_base_node(Some(make_element("a")));
974 NodeInner::add_child_to_ref(&base_root, base_a.clone());
975 NodeInner::add_child_to_ref(&base_a, base_b.clone());
976
977 let branch_root = new_branch_node(Some(make_element("$ROOT$")));
978 let branch_a = new_branch_node(Some(make_element("root")));
979 let branch_b = new_branch_node(Some(make_element("a")));
980 NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
981 NodeInner::add_child_to_ref(&branch_a, branch_b.clone());
982
983 let mut matching = HeuristicMatching::new();
984 matching.build_matching(&base_root, &branch_root);
985
986 assert!(branch_root.borrow().get_base_match().is_some());
988 assert!(branch_a.borrow().get_base_match().is_some());
989 assert!(branch_b.borrow().get_base_match().is_some());
990 }
991
992 #[test]
993 fn test_text_matching() {
994 let base_root = new_base_node(Some(make_element("$ROOT$")));
995 let base_text = new_base_node(Some(make_text("hello world")));
996 NodeInner::add_child_to_ref(&base_root, base_text.clone());
997
998 let branch_root = new_branch_node(Some(make_element("$ROOT$")));
999 let branch_text = new_branch_node(Some(make_text("hello world")));
1000 NodeInner::add_child_to_ref(&branch_root, branch_text.clone());
1001
1002 let mut matching = HeuristicMatching::new();
1003 matching.build_matching(&base_root, &branch_root);
1004
1005 assert!(branch_text.borrow().get_base_match().is_some());
1006 }
1007
1008 #[test]
1009 fn test_unmatched_node() {
1010 let base_root = new_base_node(Some(make_element("$ROOT$")));
1013 let base_a = new_base_node(Some(make_element("root")));
1014 let base_b = new_base_node(Some(make_element("a")));
1015 NodeInner::add_child_to_ref(&base_root, base_a.clone());
1016 NodeInner::add_child_to_ref(&base_a, base_b);
1017
1018 let branch_root = new_branch_node(Some(make_element("$ROOT$")));
1019 let branch_a = new_branch_node(Some(make_element("root")));
1020 let branch_b = new_branch_node(Some(make_element("different_tag")));
1021 NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
1022 NodeInner::add_child_to_ref(&branch_a, branch_b.clone());
1023
1024 let mut matching = HeuristicMatching::new();
1025 matching.build_matching(&base_root, &branch_root);
1026
1027 assert!(branch_root.borrow().get_base_match().is_some());
1029 assert!(branch_a.borrow().get_base_match().is_some());
1030 }
1032
1033 #[test]
1034 fn test_dfs_iterator() {
1035 let root = new_base_node(Some(make_element("root")));
1036 let a = new_base_node(Some(make_element("a")));
1037 let b = new_base_node(Some(make_element("b")));
1038 NodeInner::add_child_to_ref(&root, a.clone());
1039 NodeInner::add_child_to_ref(&root, b.clone());
1040
1041 let iter = DfsTreeIterator::new(root);
1042 let count = iter.count();
1043 assert_eq!(count, 3);
1044 }
1045}