Skip to main content

xml_3dm/matching/
heuristic.rs

1//! Heuristic tree matching algorithm.
2//!
3//! This module implements the heuristic matching algorithm described in the
4//! 3DM thesis. It finds correspondences between nodes in a base tree and a
5//! branch tree using content hashing, fuzzy matching, and structural analysis.
6
7use std::collections::HashSet;
8use std::rc::Rc;
9
10use crate::matching::constants::*;
11use crate::matching::{CandidateEntry, DfsTreeIterator, Matching};
12use crate::measure::Measure;
13use crate::node::{MatchArea, MatchType, NodeInner, NodeRef, XmlContent};
14
15/// Heuristic matching algorithm for building correspondences between trees.
16///
17/// This implementation follows the algorithm described in the 3DM thesis:
18/// 1. Find candidate matches using exact content hash or fuzzy matching
19/// 2. Score candidates by counting matching descendants (DFS match)
20/// 3. Select best candidate, resolving ambiguities using sibling context
21/// 4. Remove small copies below copy_threshold
22/// 5. Match similar unmatched nodes to neighbors' siblings
23/// 6. Set match types (FULL/CONTENT/CHILDREN) for copies
24pub struct HeuristicMatching {
25    /// Root of the base tree.
26    base_root: Option<NodeRef>,
27    /// Root of the branch tree.
28    branch_root: Option<NodeRef>,
29    /// Measure instance for distance calculations.
30    measure: Measure,
31    /// Whether this matching is for the left branch (vs right).
32    is_left: bool,
33    /// Threshold for considering a subtree to be copied.
34    /// Subtrees smaller than this (in info bytes) are not considered copies.
35    copy_threshold: i32,
36}
37
38impl Default for HeuristicMatching {
39    fn default() -> Self {
40        Self::new()
41    }
42}
43
44impl HeuristicMatching {
45    /// Creates a new HeuristicMatching instance with default copy threshold.
46    pub fn new() -> Self {
47        Self::with_copy_threshold(COPY_THRESHOLD)
48    }
49
50    /// Creates a new HeuristicMatching instance with custom copy threshold.
51    pub fn with_copy_threshold(copy_threshold: i32) -> Self {
52        HeuristicMatching {
53            base_root: None,
54            branch_root: None,
55            measure: Measure::new(),
56            is_left: true,
57            copy_threshold,
58        }
59    }
60
61    /// Creates a new HeuristicMatching and builds the matching.
62    pub fn with_trees(base: &NodeRef, branch: &NodeRef) -> Self {
63        let mut matching = Self::new();
64        matching.build_matching(base, branch);
65        matching
66    }
67
68    /// Sets whether this matching is for the left branch.
69    pub fn set_is_left(&mut self, is_left: bool) {
70        self.is_left = is_left;
71    }
72
73    /// Returns the copy threshold.
74    pub fn copy_threshold(&self) -> i32 {
75        self.copy_threshold
76    }
77
78    /// Main matching entry point for subtrees.
79    fn match_subtrees(&mut self, base: &NodeRef, branch: &NodeRef) {
80        // Find candidates for node branch in base
81        let candidates = self.find_candidates(base, branch);
82
83        // Find best matching trees by scoring with DFS
84        let mut best_count = 0;
85        let mut best_candidates: Vec<CandidateEntry> = Vec::new();
86
87        for candidate in candidates {
88            let this_count = self.dfs_match_count(&candidate.candidate, branch, 0);
89            if this_count == best_count {
90                best_candidates.push(candidate);
91            } else if this_count > best_count {
92                best_count = this_count;
93                best_candidates.clear();
94                best_candidates.push(candidate);
95            }
96        }
97
98        // Add matchings and find nodes below matching subtree
99        let best = self.get_best_candidate(branch, &mut best_candidates, best_count);
100        let mut stop_nodes: Vec<NodeRef> = Vec::new();
101
102        if let Some(best_entry) = best {
103            let match_area = Rc::new(MatchArea::new(Rc::downgrade(branch)));
104            self.dfs_match_add(&best_entry.candidate, branch, &mut stop_nodes, &match_area);
105        } else {
106            // Unmatched - all children become stop nodes
107            let branch_borrowed = branch.borrow();
108            for child in branch_borrowed.children() {
109                stop_nodes.push(child.clone());
110            }
111        }
112
113        // Recurse for stop nodes
114        for stop_node in stop_nodes {
115            self.match_subtrees(base, &stop_node);
116        }
117    }
118
119    /// Find matching candidates for a node.
120    fn find_candidates(&mut self, tree: &NodeRef, key: &NodeRef) -> Vec<CandidateEntry> {
121        let mut candidates = Vec::new();
122        self.find_exact_matches(tree, key, &mut candidates);
123        if candidates.is_empty() {
124            self.find_fuzzy_matches(tree, key, &mut candidates);
125        }
126        candidates
127    }
128
129    /// Find exact content matches.
130    fn find_exact_matches(
131        &self,
132        tree: &NodeRef,
133        key: &NodeRef,
134        candidates: &mut Vec<CandidateEntry>,
135    ) {
136        let key_borrowed = key.borrow();
137        let key_content = key_borrowed.content();
138
139        for cand in DfsTreeIterator::new(tree.clone()) {
140            let cand_borrowed = cand.borrow();
141            if let (Some(key_c), Some(cand_c)) = (key_content, cand_borrowed.content()) {
142                if cand_c.content_equals(key_c) {
143                    drop(cand_borrowed);
144                    candidates.push(CandidateEntry::new(cand, 0.0, -1.0));
145                }
146            }
147        }
148    }
149
150    /// Find fuzzy matches for a node.
151    fn find_fuzzy_matches(
152        &mut self,
153        tree: &NodeRef,
154        key: &NodeRef,
155        candidates: &mut Vec<CandidateEntry>,
156    ) {
157        let mut cset: Vec<CandidateEntry> = Vec::new();
158        let mut cutoff = 2.0 * MAX_FUZZY_MATCH;
159
160        for cand in DfsTreeIterator::new(tree.clone()) {
161            let lrd_dist = self
162                .get_distance_of_left(key, &cand)
163                .min(self.get_distance_of_right(key, &cand))
164                .min(self.measure.child_list_distance(key, &cand));
165
166            let content_dist = self.measure.get_distance(Some(&cand), Some(key));
167            let min_dist = lrd_dist.min(content_dist);
168
169            if min_dist < 2.0 * MAX_FUZZY_MATCH {
170                cset.push(CandidateEntry::new(cand, min_dist, lrd_dist));
171                cutoff = cutoff.min(2.0 * min_dist);
172            }
173        }
174
175        // Sort by distance
176        cset.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap());
177
178        // Add candidates within cutoff
179        for entry in cset {
180            if entry.distance > cutoff {
181                break;
182            }
183            candidates.push(entry);
184        }
185    }
186
187    /// Count matching nodes in subtrees (DFS match without adding matchings).
188    fn dfs_match_count(&self, a: &NodeRef, b: &NodeRef, count: i32) -> i32 {
189        let a_borrowed = a.borrow();
190        let b_borrowed = b.borrow();
191
192        let mut children_match = a_borrowed.child_count() == b_borrowed.child_count();
193
194        if children_match {
195            // Only match children if there are equally many
196            for i in 0..a_borrowed.child_count() {
197                let a_child = &a_borrowed.children()[i];
198                let b_child = &b_borrowed.children()[i];
199
200                let a_child_content = a_child.borrow().content().cloned();
201                let b_child_content = b_child.borrow().content().cloned();
202
203                if let (Some(ac), Some(bc)) = (a_child_content, b_child_content) {
204                    if !ac.content_equals(&bc) {
205                        children_match = false;
206                        break;
207                    }
208                } else {
209                    children_match = false;
210                    break;
211                }
212            }
213        }
214
215        let mut result = count + 1;
216
217        if children_match && a_borrowed.child_count() > 0 {
218            let a_child = a_borrowed.children()[0].clone();
219            let b_child = b_borrowed.children()[0].clone();
220            drop(a_borrowed);
221            drop(b_borrowed);
222            result += self.dfs_match_count(&a_child, &b_child, 0);
223            result += self.dfs_match_count_remaining(a, b, 1);
224        }
225
226        result
227    }
228
229    /// Helper to count remaining children after one iteration.
230    fn dfs_match_count_remaining(&self, a: &NodeRef, b: &NodeRef, start: usize) -> i32 {
231        let a_borrowed = a.borrow();
232        let b_borrowed = b.borrow();
233
234        if start < a_borrowed.child_count() {
235            let a_child = a_borrowed.children()[start].clone();
236            let b_child = b_borrowed.children()[start].clone();
237            drop(a_borrowed);
238            drop(b_borrowed);
239            let result = self.dfs_match_count(&a_child, &b_child, 0);
240            result + self.dfs_match_count_remaining(a, b, start + 1)
241        } else {
242            0
243        }
244    }
245
246    /// DFS match with adding matchings and collecting stop nodes.
247    fn dfs_match_add(
248        &mut self,
249        a: &NodeRef,
250        b: &NodeRef,
251        stop_nodes: &mut Vec<NodeRef>,
252        match_area: &Rc<MatchArea>,
253    ) {
254        // Add matching
255        self.add_matching(b, a);
256
257        // Update match area info
258        {
259            let a_borrowed = a.borrow();
260            if let Some(content) = a_borrowed.content() {
261                match_area.add_info_bytes(content.info_size());
262            }
263        }
264
265        // Set match area on branch node
266        b.borrow_mut().set_match_area(Some(match_area.clone()));
267
268        let a_borrowed = a.borrow();
269        let b_borrowed = b.borrow();
270
271        let mut children_match = a_borrowed.child_count() == b_borrowed.child_count();
272
273        if children_match {
274            for i in 0..a_borrowed.child_count() {
275                let a_child = &a_borrowed.children()[i];
276                let b_child = &b_borrowed.children()[i];
277
278                let a_child_content = a_child.borrow().content().cloned();
279                let b_child_content = b_child.borrow().content().cloned();
280
281                if let (Some(ac), Some(bc)) = (a_child_content, b_child_content) {
282                    if !ac.content_equals(&bc) {
283                        children_match = false;
284                        break;
285                    }
286                } else {
287                    children_match = false;
288                    break;
289                }
290            }
291        }
292
293        if !children_match {
294            // Mark all children as stop nodes
295            for child in b_borrowed.children() {
296                stop_nodes.push(child.clone());
297            }
298        } else {
299            // All children match - add edge info and recurse
300            match_area.add_info_bytes(a_borrowed.child_count() as i32 * EDGE_BYTES);
301
302            let children_a: Vec<NodeRef> = a_borrowed.children().to_vec();
303            let children_b: Vec<NodeRef> = b_borrowed.children().to_vec();
304            drop(a_borrowed);
305            drop(b_borrowed);
306
307            for (a_child, b_child) in children_a.iter().zip(children_b.iter()) {
308                self.dfs_match_add(a_child, b_child, stop_nodes, match_area);
309            }
310        }
311    }
312
313    /// Select the best matching candidate from a set of candidates.
314    fn get_best_candidate(
315        &mut self,
316        branch: &NodeRef,
317        best_candidates: &mut [CandidateEntry],
318        best_count: i32,
319    ) -> Option<CandidateEntry> {
320        if best_candidates.is_empty() {
321            return None;
322        }
323
324        // Resolve ambiguities if multiple candidates
325        if best_candidates.len() > 1 {
326            // Check if left neighbor of candidate matches left of base
327            let branch_borrowed = branch.borrow();
328            if let Some(left_sibling) = NodeInner::left_sibling_of_ref(branch) {
329                let left_base_match = {
330                    let left_borrowed = left_sibling.borrow();
331                    left_borrowed.get_base_match().and_then(|w| w.upgrade())
332                };
333
334                if let Some(left_base) = left_base_match {
335                    for candidate in best_candidates.iter() {
336                        let cand_left = NodeInner::left_sibling_of_ref(&candidate.candidate);
337                        if let Some(cl) = cand_left {
338                            if cl.borrow().id() == left_base.borrow().id() {
339                                return Some(candidate.clone());
340                            }
341                        }
342                    }
343                }
344            }
345            drop(branch_borrowed);
346
347            // Calculate missing left-right-down correlations
348            for candidate in best_candidates.iter_mut() {
349                if candidate.left_right_down < 0.0 {
350                    let child_dist = self
351                        .measure
352                        .child_list_distance(&candidate.candidate, branch);
353                    let left_dist = self.get_distance_of_left(&candidate.candidate, branch);
354                    let right_dist = self.get_distance_of_right(&candidate.candidate, branch);
355                    candidate.left_right_down = child_dist.min(left_dist).min(right_dist);
356                }
357            }
358
359            // Sort by left-right-down distance
360            best_candidates
361                .sort_by(|a, b| a.left_right_down.partial_cmp(&b.left_right_down).unwrap());
362        }
363
364        let best = best_candidates.first().cloned();
365
366        // Reject if single node match with poor context
367        if let Some(ref b) = best {
368            if best_count == 1 && b.left_right_down.min(b.distance) > 0.1 {
369                return None;
370            }
371        }
372
373        best
374    }
375
376    /// Match similar unmatched nodes to neighbors' siblings.
377    fn match_similar_unmatched(&mut self, _base: &NodeRef, branch: &NodeRef) {
378        // Traverse in preorder
379        let children: Vec<NodeRef> = branch.borrow().children().to_vec();
380        for child in &children {
381            self.match_similar_unmatched(_base, child);
382        }
383
384        let base_match = {
385            let branch_borrowed = branch.borrow();
386            branch_borrowed.get_base_match().and_then(|w| w.upgrade())
387        };
388
389        if let Some(base_match) = base_match {
390            let base_child_count = base_match.borrow().child_count();
391            if base_child_count == 0 {
392                return;
393            }
394
395            let branch_children: Vec<NodeRef> = branch.borrow().children().to_vec();
396            let branch_child_count = branch_children.len();
397
398            for (i, n) in branch_children.iter().enumerate() {
399                // Skip if already matched
400                if n.borrow().get_base_match().is_some() {
401                    continue;
402                }
403
404                let last_base_child = base_child_count - 1;
405
406                // Check endpoints
407                if i == 0 {
408                    let first_base_child = base_match.borrow().child(0).cloned();
409                    if let Some(fbc) = first_base_child {
410                        if !self.is_matched(&fbc) {
411                            self.add_matching_if_same_type(n, &fbc);
412                            continue;
413                        }
414                    }
415                } else if i == branch_child_count - 1 {
416                    let last_base = base_match.borrow().child(last_base_child).cloned();
417                    if let Some(lbc) = last_base {
418                        if !self.is_matched(&lbc) {
419                            self.add_matching_if_same_type(n, &lbc);
420                            continue;
421                        }
422                    }
423                }
424
425                // Check predecessor's right sibling
426                if i > 0 {
427                    let pred = &branch_children[i - 1];
428                    let x = pred.borrow().get_base_match().and_then(|w| w.upgrade());
429                    if let Some(x) = x {
430                        if x.borrow().has_right_sibling() {
431                            let right_sib = NodeInner::right_sibling_of_ref(&x);
432                            if let Some(rs) = right_sib {
433                                if !self.is_matched(&rs) {
434                                    self.add_matching_if_same_type(n, &rs);
435                                    continue;
436                                }
437                            }
438                        }
439                    }
440                }
441
442                // Check successor's left sibling
443                if i < branch_child_count - 1 {
444                    let succ = &branch_children[i + 1];
445                    let x = succ.borrow().get_base_match().and_then(|w| w.upgrade());
446                    if let Some(x) = x {
447                        if x.borrow().has_left_sibling() {
448                            let left_sib = NodeInner::left_sibling_of_ref(&x);
449                            if let Some(ls) = left_sib {
450                                if !self.is_matched(&ls) {
451                                    self.add_matching_if_same_type(n, &ls);
452                                    continue;
453                                }
454                            }
455                        }
456                    }
457                }
458            }
459        }
460    }
461
462    /// Remove copies smaller than COPY_THRESHOLD.
463    fn remove_small_copies(&mut self, root: &NodeRef) {
464        let base = {
465            let root_borrowed = root.borrow();
466            root_borrowed.get_base_match().and_then(|w| w.upgrade())
467        };
468
469        if let Some(base) = base {
470            let match_count = self.get_match_count(&base);
471
472            if match_count > 1 {
473                let matches = self.get_matches(&base);
474                let mut deletia: HashSet<u64> = HashSet::new();
475
476                // Find copies that are too small
477                for copy in &matches {
478                    let info_bytes = {
479                        let copy_borrowed = copy.borrow();
480                        copy_borrowed
481                            .match_area()
482                            .map(|ma| ma.info_bytes())
483                            .unwrap_or(0)
484                    };
485
486                    if info_bytes < self.copy_threshold {
487                        deletia.insert(copy.borrow().id());
488                    }
489                }
490
491                // If deleting all matches, keep the "original" instance
492                if deletia.len() == matches.len() {
493                    let mut max_copy_bytes = 0;
494                    let mut orig_instance: Option<NodeRef> = None;
495
496                    for copy in &matches {
497                        let copy_bytes = self.calc_copy_context_bytes(copy);
498                        if copy_bytes > max_copy_bytes {
499                            max_copy_bytes = copy_bytes;
500                            orig_instance = Some(copy.clone());
501                        }
502                    }
503
504                    if let Some(orig) = orig_instance {
505                        deletia.remove(&orig.borrow().id());
506                        // Mark as original by adding info bytes
507                        if let Some(ma) = orig.borrow().match_area() {
508                            ma.add_info_bytes(self.copy_threshold + 1);
509                        }
510                    }
511                }
512
513                // Collect match areas to delete first
514                let match_areas_to_delete: Vec<Rc<MatchArea>> = deletia
515                    .iter()
516                    .filter_map(|id| {
517                        matches
518                            .iter()
519                            .find(|c| c.borrow().id() == *id)
520                            .and_then(|copy| copy.borrow().match_area().cloned())
521                    })
522                    .collect();
523
524                // Now delete them (no active borrows from matches)
525                for ma in match_areas_to_delete {
526                    self.del_match_area(&ma);
527                }
528            }
529        }
530
531        // Recurse for children
532        let children: Vec<NodeRef> = root.borrow().children().to_vec();
533        for child in children {
534            self.remove_small_copies(&child);
535        }
536    }
537
538    /// Calculate copy context bytes by scanning left and right siblings.
539    fn calc_copy_context_bytes(&self, copy: &NodeRef) -> i32 {
540        let match_area = copy.borrow().match_area().cloned();
541        let mut copy_bytes = match_area.as_ref().map(|ma| ma.info_bytes()).unwrap_or(0);
542
543        if copy_bytes >= self.copy_threshold {
544            return copy_bytes;
545        }
546
547        let copy_root = match_area.as_ref().and_then(|ma| ma.root_strong());
548        if copy_root.is_none() {
549            return copy_bytes;
550        }
551
552        let copy_root = copy_root.unwrap();
553        let copy_base = {
554            let cr_borrowed = copy_root.borrow();
555            cr_borrowed.get_base_match().and_then(|w| w.upgrade())
556        };
557
558        if copy_base.is_none() {
559            return copy_bytes;
560        }
561
562        // Scan left siblings
563        let mut curr_copy = copy_root.clone();
564        let mut curr_base = copy_base.clone().unwrap();
565
566        while copy_bytes < self.copy_threshold {
567            let left_copy = NodeInner::left_sibling_of_ref(&curr_copy);
568            let left_base = NodeInner::left_sibling_of_ref(&curr_base);
569
570            match (left_copy, left_base) {
571                (Some(lc), Some(lb)) => {
572                    let lc_base = lc.borrow().get_base_match().and_then(|w| w.upgrade());
573                    if lc_base.map(|b| b.borrow().id()) == Some(lb.borrow().id()) {
574                        if let Some(ma) = lc.borrow().match_area() {
575                            copy_bytes += ma.info_bytes();
576                        }
577                        curr_copy = lc;
578                        curr_base = lb;
579                    } else {
580                        break;
581                    }
582                }
583                _ => break,
584            }
585        }
586
587        // Scan right siblings - get fresh copy_root reference
588        let copy_root2 = match_area.as_ref().and_then(|ma| ma.root_strong());
589        if let Some(cr2) = copy_root2 {
590            curr_copy = cr2;
591            curr_base = copy_base.unwrap();
592
593            while copy_bytes < self.copy_threshold {
594                let right_copy = NodeInner::right_sibling_of_ref(&curr_copy);
595                let right_base = NodeInner::right_sibling_of_ref(&curr_base);
596
597                match (right_copy, right_base) {
598                    (Some(rc), Some(rb)) => {
599                        let rc_base = rc.borrow().get_base_match().and_then(|w| w.upgrade());
600                        if rc_base.map(|b| b.borrow().id()) == Some(rb.borrow().id()) {
601                            if let Some(ma) = rc.borrow().match_area() {
602                                copy_bytes += ma.info_bytes();
603                            }
604                            curr_copy = rc;
605                            curr_base = rb;
606                        } else {
607                            break;
608                        }
609                    }
610                    _ => break,
611                }
612            }
613        }
614
615        copy_bytes
616    }
617
618    /// Set match types for nodes with multiple matches.
619    fn set_match_types(&mut self, base: &NodeRef) {
620        // Postorder traversal
621        let children: Vec<NodeRef> = base.borrow().children().to_vec();
622        for child in children {
623            self.set_match_types(&child);
624        }
625
626        let match_count = self.get_match_count(base);
627        if match_count > 1 {
628            // Has multiple matches - determine type of each copy
629            let matches = self.get_matches(base);
630            let mut min_dist = i32::MAX;
631            let mut min_content_dist = f64::MAX;
632            let mut master: Option<NodeRef> = None;
633
634            for cand in &matches {
635                let dist = if self.exact_child_list_match(base, cand) {
636                    0
637                } else {
638                    self.measure
639                        .matched_child_list_distance(base, cand, self.is_left)
640                };
641
642                if dist < min_dist {
643                    min_dist = dist;
644                    master = Some(cand.clone());
645                } else if dist == min_dist {
646                    // Calculate content distance for tie-breaking
647                    let c_dist = self.measure.get_distance(Some(base), Some(cand));
648                    if master.is_none() || c_dist < min_content_dist {
649                        min_content_dist = c_dist;
650                        master = Some(cand.clone());
651                    }
652                }
653            }
654
655            // Set match types for non-master copies
656            let mut removed_matchings: Vec<NodeRef> = Vec::new();
657
658            for cand in &matches {
659                if master.as_ref().map(|m| m.borrow().id()) == Some(cand.borrow().id()) {
660                    continue;
661                }
662
663                let struct_match = self.exact_child_list_match(base, cand);
664                let cont_match = {
665                    let cand_borrowed = cand.borrow();
666                    let base_borrowed = base.borrow();
667                    match (cand_borrowed.content(), base_borrowed.content()) {
668                        (Some(cc), Some(bc)) => cc.content_equals(bc),
669                        _ => false,
670                    }
671                };
672
673                if !struct_match && !cont_match {
674                    removed_matchings.push(cand.clone());
675                } else {
676                    let mut match_type = MatchType::NONE;
677                    if cont_match {
678                        match_type |= MatchType::CONTENT;
679                    }
680                    if struct_match {
681                        match_type |= MatchType::CHILDREN;
682                    }
683                    cand.borrow_mut().set_match_type(match_type);
684                }
685            }
686
687            // Delete removed matchings
688            for cand in removed_matchings {
689                if let Some(base_match) = cand.borrow().get_base_match().and_then(|w| w.upgrade()) {
690                    self.del_matching(&cand, &base_match);
691                }
692            }
693        }
694    }
695
696    /// Check if child lists are matched exactly.
697    fn exact_child_list_match(&self, base: &NodeRef, branch: &NodeRef) -> bool {
698        let base_borrowed = base.borrow();
699        let branch_borrowed = branch.borrow();
700
701        if base_borrowed.child_count() != branch_borrowed.child_count() {
702            return false;
703        }
704
705        for i in 0..branch_borrowed.child_count() {
706            let branch_child = &branch_borrowed.children()[i];
707            let base_child = &base_borrowed.children()[i];
708
709            let branch_child_base = branch_child
710                .borrow()
711                .get_base_match()
712                .and_then(|w| w.upgrade());
713
714            match branch_child_base {
715                Some(bcb) if bcb.borrow().id() == base_child.borrow().id() => continue,
716                _ => return false,
717            }
718        }
719
720        true
721    }
722
723    /// Get content distance of left siblings.
724    fn get_distance_of_left(&mut self, a: &NodeRef, b: &NodeRef) -> f64 {
725        let a_borrowed = a.borrow();
726        let b_borrowed = b.borrow();
727
728        if a_borrowed.parent().upgrade().is_none() || b_borrowed.parent().upgrade().is_none() {
729            return crate::measure::MAX_DIST;
730        }
731
732        let a_pos = a_borrowed.child_pos();
733        let b_pos = b_borrowed.child_pos();
734
735        drop(a_borrowed);
736        drop(b_borrowed);
737
738        if a_pos > 0 && b_pos > 0 {
739            let a_left = NodeInner::left_sibling_of_ref(a);
740            let b_left = NodeInner::left_sibling_of_ref(b);
741
742            if let (Some(al), Some(bl)) = (a_left, b_left) {
743                return self.measure.get_distance(Some(&al), Some(&bl));
744            }
745        }
746
747        END_MATCH
748    }
749
750    /// Get content distance of right siblings.
751    fn get_distance_of_right(&mut self, a: &NodeRef, b: &NodeRef) -> f64 {
752        let a_borrowed = a.borrow();
753        let b_borrowed = b.borrow();
754
755        let a_parent = a_borrowed.parent().upgrade();
756        let b_parent = b_borrowed.parent().upgrade();
757
758        if a_parent.is_none() || b_parent.is_none() {
759            return crate::measure::MAX_DIST;
760        }
761
762        let a_pos = a_borrowed.child_pos() as usize;
763        let b_pos = b_borrowed.child_pos() as usize;
764        let a_parent_count = a_parent.as_ref().unwrap().borrow().child_count();
765        let b_parent_count = b_parent.as_ref().unwrap().borrow().child_count();
766
767        drop(a_borrowed);
768        drop(b_borrowed);
769
770        if a_pos < a_parent_count - 1 && b_pos < b_parent_count - 1 {
771            let a_right = NodeInner::right_sibling_of_ref(a);
772            let b_right = NodeInner::right_sibling_of_ref(b);
773
774            if let (Some(ar), Some(br)) = (a_right, b_right) {
775                return self.measure.get_distance(Some(&ar), Some(&br));
776            }
777        }
778
779        END_MATCH
780    }
781
782    /// Add matching if node types match (both text or both element).
783    fn add_matching_if_same_type(&mut self, branch: &NodeRef, base: &NodeRef) {
784        let branch_content = branch.borrow().content().cloned();
785        let base_content = base.borrow().content().cloned();
786
787        let same_type = matches!(
788            (&branch_content, &base_content),
789            (Some(XmlContent::Text(_)), Some(XmlContent::Text(_)))
790                | (Some(XmlContent::Element(_)), Some(XmlContent::Element(_)))
791        );
792
793        if same_type {
794            self.add_matching(branch, base);
795        }
796    }
797
798    /// Add a matching between branch and base nodes.
799    fn add_matching(&mut self, branch: &NodeRef, base: &NodeRef) {
800        branch.borrow_mut().set_base_match(base);
801        branch.borrow_mut().set_match_type(MatchType::FULL);
802
803        // Add to base's left matches
804        let base_borrowed = base.borrow();
805        if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
806            left.borrow_mut().add_match(branch.clone());
807        }
808    }
809
810    /// Delete a matching between branch and base nodes.
811    fn del_matching(&mut self, branch: &NodeRef, base: &NodeRef) {
812        branch.borrow_mut().del_base_match();
813
814        // Remove from base's left matches
815        let base_borrowed = base.borrow();
816        if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
817            left.borrow_mut().del_match(branch);
818        }
819    }
820
821    /// Delete a match area and all its matchings.
822    fn del_match_area(&mut self, match_area: &Rc<MatchArea>) {
823        if let Some(root) = match_area.root_strong() {
824            self.del_match_area_recursive(&root, match_area);
825        }
826    }
827
828    /// Recursive helper for deleting match area.
829    fn del_match_area_recursive(&mut self, node: &NodeRef, match_area: &Rc<MatchArea>) {
830        // Check if this node's match area matches what we're deleting
831        // Use explicit scope to ensure borrow is dropped before mutable borrow
832        let should_process = {
833            let borrowed = node.borrow();
834            borrowed
835                .match_area()
836                .map(|ma| Rc::ptr_eq(ma, match_area))
837                .unwrap_or(false)
838        };
839
840        if should_process {
841            node.borrow_mut().set_match_area(None);
842
843            // Get base match in its own scope
844            let base_match = {
845                let borrowed = node.borrow();
846                borrowed.get_base_match().and_then(|w| w.upgrade())
847            };
848            if let Some(base_match) = base_match {
849                self.del_matching(node, &base_match);
850            }
851
852            // Get children in their own scope
853            let children: Vec<NodeRef> = node.borrow().children().to_vec();
854            for child in children {
855                self.del_match_area_recursive(&child, match_area);
856            }
857        }
858    }
859
860    /// Check if a base node has any matches.
861    fn is_matched(&self, base: &NodeRef) -> bool {
862        self.get_match_count(base) > 0
863    }
864
865    /// Get match count for a base node.
866    fn get_match_count(&self, base: &NodeRef) -> usize {
867        let base_borrowed = base.borrow();
868        if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
869            left.borrow().match_count()
870        } else {
871            0
872        }
873    }
874
875    /// Get matches for a base node.
876    fn get_matches(&self, base: &NodeRef) -> Vec<NodeRef> {
877        let base_borrowed = base.borrow();
878        if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
879            left.borrow().matches().to_vec()
880        } else {
881            Vec::new()
882        }
883    }
884}
885
886impl Matching for HeuristicMatching {
887    fn build_matching(&mut self, base: &NodeRef, branch: &NodeRef) {
888        self.base_root = Some(base.clone());
889        self.branch_root = Some(branch.clone());
890
891        self.match_subtrees(base, branch);
892        self.remove_small_copies(branch);
893        self.match_similar_unmatched(base, branch);
894        self.set_match_types(base);
895
896        // Artificial roots always matched
897        branch.borrow_mut().set_base_match(base);
898        branch.borrow_mut().set_match_type(MatchType::FULL);
899    }
900
901    fn get_area_stop_nodes(&self, stop_nodes: &mut Vec<NodeRef>, node: &NodeRef) {
902        let parent_area = node.borrow().match_area().cloned();
903
904        if parent_area.is_none() {
905            panic!("ASSERT FAILED: node has no match area");
906        }
907
908        let parent_area = parent_area.unwrap();
909        let mut children_in_same_area = true;
910
911        {
912            let node_borrowed = node.borrow();
913            for child in node_borrowed.children() {
914                let child_area = child.borrow().match_area().cloned();
915                if child_area.as_ref().map(|ca| Rc::ptr_eq(ca, &parent_area)) != Some(true) {
916                    children_in_same_area = false;
917                    break;
918                }
919            }
920
921            // BUGFIX 020115
922            if node_borrowed.child_count() == 0 {
923                if let Some(base_match) = node_borrowed.get_base_match().and_then(|w| w.upgrade()) {
924                    if base_match.borrow().child_count() != 0 {
925                        children_in_same_area = false;
926                    }
927                }
928            }
929        }
930
931        if !children_in_same_area {
932            stop_nodes.push(node.clone());
933        } else {
934            let children: Vec<NodeRef> = node.borrow().children().to_vec();
935            for child in children {
936                self.get_area_stop_nodes(stop_nodes, &child);
937            }
938        }
939    }
940
941    fn base_root(&self) -> Option<&NodeRef> {
942        self.base_root.as_ref()
943    }
944
945    fn branch_root(&self) -> Option<&NodeRef> {
946        self.branch_root.as_ref()
947    }
948}
949
950#[cfg(test)]
951mod tests {
952    use super::*;
953    use crate::node::{new_base_node, new_branch_node, XmlText};
954    use std::collections::HashMap;
955
956    fn make_element(name: &str) -> XmlContent {
957        XmlContent::Element(crate::node::XmlElement::new(
958            name.to_string(),
959            HashMap::new(),
960        ))
961    }
962
963    fn make_text(text: &str) -> XmlContent {
964        XmlContent::Text(XmlText::new(text))
965    }
966
967    #[test]
968    fn test_simple_matching() {
969        // Base: <root><a/></root>
970        // Branch: <root><a/></root>
971        let base_root = new_base_node(Some(make_element("$ROOT$")));
972        let base_a = new_base_node(Some(make_element("root")));
973        let base_b = new_base_node(Some(make_element("a")));
974        NodeInner::add_child_to_ref(&base_root, base_a.clone());
975        NodeInner::add_child_to_ref(&base_a, base_b.clone());
976
977        let branch_root = new_branch_node(Some(make_element("$ROOT$")));
978        let branch_a = new_branch_node(Some(make_element("root")));
979        let branch_b = new_branch_node(Some(make_element("a")));
980        NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
981        NodeInner::add_child_to_ref(&branch_a, branch_b.clone());
982
983        let mut matching = HeuristicMatching::new();
984        matching.build_matching(&base_root, &branch_root);
985
986        // All nodes should be matched
987        assert!(branch_root.borrow().get_base_match().is_some());
988        assert!(branch_a.borrow().get_base_match().is_some());
989        assert!(branch_b.borrow().get_base_match().is_some());
990    }
991
992    #[test]
993    fn test_text_matching() {
994        let base_root = new_base_node(Some(make_element("$ROOT$")));
995        let base_text = new_base_node(Some(make_text("hello world")));
996        NodeInner::add_child_to_ref(&base_root, base_text.clone());
997
998        let branch_root = new_branch_node(Some(make_element("$ROOT$")));
999        let branch_text = new_branch_node(Some(make_text("hello world")));
1000        NodeInner::add_child_to_ref(&branch_root, branch_text.clone());
1001
1002        let mut matching = HeuristicMatching::new();
1003        matching.build_matching(&base_root, &branch_root);
1004
1005        assert!(branch_text.borrow().get_base_match().is_some());
1006    }
1007
1008    #[test]
1009    fn test_unmatched_node() {
1010        // Base: <root><a/></root>
1011        // Branch: <root><b/></root>  (b doesn't exist in base)
1012        let base_root = new_base_node(Some(make_element("$ROOT$")));
1013        let base_a = new_base_node(Some(make_element("root")));
1014        let base_b = new_base_node(Some(make_element("a")));
1015        NodeInner::add_child_to_ref(&base_root, base_a.clone());
1016        NodeInner::add_child_to_ref(&base_a, base_b);
1017
1018        let branch_root = new_branch_node(Some(make_element("$ROOT$")));
1019        let branch_a = new_branch_node(Some(make_element("root")));
1020        let branch_b = new_branch_node(Some(make_element("different_tag")));
1021        NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
1022        NodeInner::add_child_to_ref(&branch_a, branch_b.clone());
1023
1024        let mut matching = HeuristicMatching::new();
1025        matching.build_matching(&base_root, &branch_root);
1026
1027        // Root and 'a' should match, 'b' should not (different tag)
1028        assert!(branch_root.borrow().get_base_match().is_some());
1029        assert!(branch_a.borrow().get_base_match().is_some());
1030        // 'different_tag' won't find exact match, fuzzy might or might not match
1031    }
1032
1033    #[test]
1034    fn test_dfs_iterator() {
1035        let root = new_base_node(Some(make_element("root")));
1036        let a = new_base_node(Some(make_element("a")));
1037        let b = new_base_node(Some(make_element("b")));
1038        NodeInner::add_child_to_ref(&root, a.clone());
1039        NodeInner::add_child_to_ref(&root, b.clone());
1040
1041        let iter = DfsTreeIterator::new(root);
1042        let count = iter.count();
1043        assert_eq!(count, 3);
1044    }
1045}