Skip to main content

xml_3dm/matching/
diff.rs

1//! Diff-optimized tree matching.
2//!
3//! `DiffMatching` is a specialized matching algorithm for producing diffs.
4//! Compared to the standard `HeuristicMatching`, it:
5//! - Never uses fuzzy matching (only exact content matches)
6//! - Prefers sequential matches (for efficient run-length encoding in diffs)
7//! - Skips copy detection and similar-unmatched matching
8
9use std::collections::HashMap;
10
11use crate::node::{NodeInner, NodeRef};
12
13use super::{CandidateEntry, DfsTreeIterator, Matching};
14
15/// Tree matching algorithm optimized for diff generation.
16///
17/// This matching algorithm only uses exact content matches and prefers
18/// sequential matches to enable efficient run-length encoding in the
19/// generated diff output.
20#[derive(Default)]
21pub struct DiffMatching {
22    base_root: Option<NodeRef>,
23    branch_root: Option<NodeRef>,
24    /// Hash index for fast candidate lookup.
25    hash_index: HashMap<i32, Vec<NodeRef>>,
26}
27
28impl DiffMatching {
29    /// Creates a new DiffMatching instance.
30    pub fn new() -> Self {
31        DiffMatching {
32            base_root: None,
33            branch_root: None,
34            hash_index: HashMap::new(),
35        }
36    }
37
38    /// Builds a hash index of all nodes in the base tree for fast lookup.
39    fn build_hash_index(&mut self, base: &NodeRef) {
40        self.hash_index.clear();
41
42        for node in DfsTreeIterator::new(base.clone()) {
43            if let Some(content) = node.borrow().content() {
44                let hash = content.content_hash();
45                self.hash_index.entry(hash).or_default().push(node.clone());
46            }
47        }
48    }
49
50    /// Matches subtrees starting from the given roots.
51    fn match_subtrees(&mut self, base: &NodeRef, branch: &NodeRef) {
52        self.build_hash_index(base);
53
54        // Process all branch nodes
55        let mut queue: Vec<NodeRef> = vec![branch.clone()];
56
57        while let Some(current) = queue.pop() {
58            // Find candidates for this node
59            let candidates = self.find_candidates(&current);
60
61            if candidates.is_empty() {
62                // No match - all children become candidates
63                let children: Vec<NodeRef> = current.borrow().children().to_vec();
64                for child in children {
65                    queue.push(child);
66                }
67            } else {
68                // Get best candidate and add matching
69                let best = self.get_best_candidate(&current, candidates);
70
71                if let Some(best_entry) = best {
72                    // Add matching and recurse for children
73                    self.add_matching(&current, &best_entry.candidate);
74
75                    // Get stop nodes (unmatched children)
76                    let stop_nodes = self.get_stop_nodes(&best_entry.candidate, &current);
77                    for stop_node in stop_nodes {
78                        queue.push(stop_node);
79                    }
80                } else {
81                    // No good candidate - all children become candidates
82                    let children: Vec<NodeRef> = current.borrow().children().to_vec();
83                    for child in children {
84                        queue.push(child);
85                    }
86                }
87            }
88        }
89    }
90
91    /// Finds exact match candidates for a branch node.
92    ///
93    /// Unlike HeuristicMatching, DiffMatching only uses exact content matches.
94    fn find_candidates(&self, branch: &NodeRef) -> Vec<CandidateEntry> {
95        let mut candidates = Vec::new();
96
97        let branch_borrowed = branch.borrow();
98        if let Some(content) = branch_borrowed.content() {
99            let hash = content.content_hash();
100
101            if let Some(nodes) = self.hash_index.get(&hash) {
102                for node in nodes {
103                    // Verify exact content match
104                    let node_borrowed = node.borrow();
105                    if let Some(node_content) = node_borrowed.content() {
106                        if content.content_equals(node_content) {
107                            drop(node_borrowed);
108                            candidates.push(CandidateEntry::new(node.clone(), 0.0, -1.0));
109                        }
110                    }
111                }
112            }
113        }
114
115        candidates
116    }
117
118    /// Selects the best candidate, preferring sequential matches.
119    ///
120    /// To enable efficient run-length encoding in diffs, we prefer candidates
121    /// that are next to the previously matched node's base match.
122    fn get_best_candidate(
123        &self,
124        branch: &NodeRef,
125        candidates: Vec<CandidateEntry>,
126    ) -> Option<CandidateEntry> {
127        if candidates.is_empty() {
128            return None;
129        }
130
131        // If multiple candidates, prefer one adjacent to left sibling's match
132        if candidates.len() > 1 {
133            if let Some(left_sibling) = NodeInner::left_sibling_of_ref(branch) {
134                let left_base_match = left_sibling
135                    .borrow()
136                    .get_base_match()
137                    .and_then(|w| w.upgrade());
138
139                if let Some(left_base) = left_base_match {
140                    // Find a candidate that is the right sibling of left's base match
141                    for candidate in &candidates {
142                        if let Some(cand_left) =
143                            NodeInner::left_sibling_of_ref(&candidate.candidate)
144                        {
145                            if cand_left.borrow().id() == left_base.borrow().id() {
146                                return Some(candidate.clone());
147                            }
148                        }
149                    }
150                }
151            }
152        }
153
154        // Return first candidate
155        candidates.into_iter().next()
156    }
157
158    /// Gets unmatched children after matching (stop nodes for further processing).
159    fn get_stop_nodes(&self, base: &NodeRef, branch: &NodeRef) -> Vec<NodeRef> {
160        let mut stop_nodes = Vec::new();
161
162        let base_borrowed = base.borrow();
163        let branch_borrowed = branch.borrow();
164
165        let base_child_count = base_borrowed.child_count();
166        let branch_child_count = branch_borrowed.child_count();
167
168        // If same number of children, try to match them in order
169        // Collect children first to avoid borrow issues
170        let base_children: Vec<NodeRef> = base_borrowed.children().to_vec();
171        let branch_children: Vec<NodeRef> = branch_borrowed.children().to_vec();
172        drop(base_borrowed);
173        drop(branch_borrowed);
174
175        if base_child_count == branch_child_count {
176            for i in 0..branch_child_count {
177                let base_child = &base_children[i];
178                let branch_child = &branch_children[i];
179
180                // Check if contents match
181                let base_content = base_child.borrow().content().cloned();
182                let branch_content = branch_child.borrow().content().cloned();
183
184                let matches = match (&base_content, &branch_content) {
185                    (Some(bc), Some(rc)) => bc.content_equals(rc),
186                    _ => false,
187                };
188
189                if matches {
190                    // Add matching for children
191                    self.add_matching(branch_child, base_child);
192
193                    // Recursively get stop nodes from matched children
194                    let child_stops = self.get_stop_nodes(base_child, branch_child);
195                    stop_nodes.extend(child_stops);
196                } else {
197                    stop_nodes.push(branch_child.clone());
198                }
199            }
200        } else {
201            // Different child counts - all branch children are stop nodes
202            for child in &branch_children {
203                stop_nodes.push(child.clone());
204            }
205        }
206
207        stop_nodes
208    }
209
210    /// Adds a matching between branch and base nodes.
211    fn add_matching(&self, branch: &NodeRef, base: &NodeRef) {
212        branch.borrow_mut().set_base_match(base);
213
214        // Add to base's left matches
215        let base_borrowed = base.borrow();
216        if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
217            left.borrow_mut().add_match(branch.clone());
218        }
219    }
220
221    /// Returns a reference to the branch root.
222    pub fn branch_root_ref(&self) -> Option<&NodeRef> {
223        self.branch_root.as_ref()
224    }
225}
226
227impl Matching for DiffMatching {
228    fn build_matching(&mut self, base: &NodeRef, branch: &NodeRef) {
229        self.base_root = Some(base.clone());
230        self.branch_root = Some(branch.clone());
231
232        self.match_subtrees(base, branch);
233
234        // Artificial roots always matched
235        branch.borrow_mut().set_base_match(base);
236    }
237
238    fn get_area_stop_nodes(&self, stop_nodes: &mut Vec<NodeRef>, node: &NodeRef) {
239        // For DiffMatching, stop nodes are simply unmatched children
240        let children: Vec<NodeRef> = node.borrow().children().to_vec();
241        for child in &children {
242            if child.borrow().get_base_match().is_none() {
243                stop_nodes.push(child.clone());
244            } else {
245                self.get_area_stop_nodes(stop_nodes, child);
246            }
247        }
248    }
249
250    fn base_root(&self) -> Option<&NodeRef> {
251        self.base_root.as_ref()
252    }
253
254    fn branch_root(&self) -> Option<&NodeRef> {
255        self.branch_root.as_ref()
256    }
257}
258
259#[cfg(test)]
260mod tests {
261    use super::*;
262    use crate::node::{new_base_node, new_branch_node, XmlContent, XmlElement, XmlText};
263
264    fn make_element(name: &str) -> XmlContent {
265        XmlContent::Element(XmlElement::new(
266            name.to_string(),
267            std::collections::HashMap::new(),
268        ))
269    }
270
271    fn make_text(text: &str) -> XmlContent {
272        XmlContent::Text(XmlText::new(text))
273    }
274
275    #[test]
276    fn test_diff_matching_exact_only() {
277        // Verify that DiffMatching only uses exact matches
278        let base_root = new_base_node(Some(make_element("root")));
279        let base_a = new_base_node(Some(make_element("a")));
280        NodeInner::add_child_to_ref(&base_root, base_a.clone());
281
282        let branch_root = new_branch_node(Some(make_element("root")));
283        let branch_a = new_branch_node(Some(make_element("a")));
284        NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
285
286        let mut matching = DiffMatching::new();
287        matching.build_matching(&base_root, &branch_root);
288
289        // Should match exactly
290        assert!(branch_a.borrow().get_base_match().is_some());
291    }
292
293    #[test]
294    fn test_diff_matching_no_fuzzy() {
295        // Verify that similar but not identical content does NOT match
296        let base_root = new_base_node(Some(make_element("root")));
297        let base_text = new_base_node(Some(make_text("hello world")));
298        NodeInner::add_child_to_ref(&base_root, base_text);
299
300        let branch_root = new_branch_node(Some(make_element("root")));
301        let branch_text = new_branch_node(Some(make_text("hello world!"))); // Different!
302        NodeInner::add_child_to_ref(&branch_root, branch_text.clone());
303
304        let mut matching = DiffMatching::new();
305        matching.build_matching(&base_root, &branch_root);
306
307        // Should NOT match (no fuzzy matching in DiffMatching)
308        assert!(branch_text.borrow().get_base_match().is_none());
309    }
310
311    #[test]
312    fn test_diff_matching_sequential_preference() {
313        // Verify that DiffMatching prefers sequential matches
314        // Base:  <root><a/><b/><a/></root>
315        // Branch: <root><a/><b/><a/></root>
316        // The second 'a' should match the second base 'a' (sequential), not the first
317
318        let base_root = new_base_node(Some(make_element("root")));
319        let base_a1 = new_base_node(Some(make_element("a")));
320        let base_b = new_base_node(Some(make_element("b")));
321        let base_a2 = new_base_node(Some(make_element("a")));
322        NodeInner::add_child_to_ref(&base_root, base_a1.clone());
323        NodeInner::add_child_to_ref(&base_root, base_b.clone());
324        NodeInner::add_child_to_ref(&base_root, base_a2.clone());
325
326        let branch_root = new_branch_node(Some(make_element("root")));
327        let branch_a1 = new_branch_node(Some(make_element("a")));
328        let branch_b = new_branch_node(Some(make_element("b")));
329        let branch_a2 = new_branch_node(Some(make_element("a")));
330        NodeInner::add_child_to_ref(&branch_root, branch_a1.clone());
331        NodeInner::add_child_to_ref(&branch_root, branch_b.clone());
332        NodeInner::add_child_to_ref(&branch_root, branch_a2.clone());
333
334        let mut matching = DiffMatching::new();
335        matching.build_matching(&base_root, &branch_root);
336
337        // First 'a' should match first base 'a'
338        let match1 = branch_a1
339            .borrow()
340            .get_base_match()
341            .and_then(|w| w.upgrade());
342        assert!(match1.is_some());
343        assert_eq!(match1.unwrap().borrow().id(), base_a1.borrow().id());
344
345        // Second 'a' should match second base 'a' (sequential preference)
346        let match2 = branch_a2
347            .borrow()
348            .get_base_match()
349            .and_then(|w| w.upgrade());
350        assert!(match2.is_some());
351        assert_eq!(match2.unwrap().borrow().id(), base_a2.borrow().id());
352    }
353
354    #[test]
355    fn test_get_area_stop_nodes_all_siblings() {
356        // Regression: get_area_stop_nodes must visit ALL children, not return
357        // after the first matched child.
358        // Base:   <root><a/><b/><c/></root>
359        // Branch: <root><a/><x/><c/></root>
360        // After matching, a and c are matched, x is unmatched.
361        // Stop nodes from root should include: x (unmatched) and nothing else
362        // (a and c are matched leaves with no children, so recursing into them yields nothing).
363        // Previously the return bug would only recurse into a and then stop.
364
365        let base_root = new_base_node(Some(make_element("root")));
366        let base_a = new_base_node(Some(make_element("a")));
367        let base_b = new_base_node(Some(make_element("b")));
368        let base_c = new_base_node(Some(make_element("c")));
369        NodeInner::add_child_to_ref(&base_root, base_a.clone());
370        NodeInner::add_child_to_ref(&base_root, base_b.clone());
371        NodeInner::add_child_to_ref(&base_root, base_c.clone());
372
373        let branch_root = new_branch_node(Some(make_element("root")));
374        let branch_a = new_branch_node(Some(make_element("a")));
375        let branch_x = new_branch_node(Some(make_element("x"))); // unmatched
376        let branch_c = new_branch_node(Some(make_element("c")));
377        NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
378        NodeInner::add_child_to_ref(&branch_root, branch_x.clone());
379        NodeInner::add_child_to_ref(&branch_root, branch_c.clone());
380
381        let mut matching = DiffMatching::new();
382        matching.build_matching(&base_root, &branch_root);
383
384        // a and c should be matched
385        assert!(
386            branch_a.borrow().get_base_match().is_some(),
387            "a should match"
388        );
389        assert!(
390            branch_c.borrow().get_base_match().is_some(),
391            "c should match"
392        );
393        assert!(
394            branch_x.borrow().get_base_match().is_none(),
395            "x should not match"
396        );
397
398        // get_area_stop_nodes on root should find x as a stop node
399        let mut stop_nodes = Vec::new();
400        matching.get_area_stop_nodes(&mut stop_nodes, &branch_root);
401
402        // x must be in stop nodes (this failed before the fix)
403        let stop_ids: Vec<_> = stop_nodes.iter().map(|n| n.borrow().id()).collect();
404        assert!(
405            stop_ids.contains(&branch_x.borrow().id()),
406            "unmatched sibling x must appear in stop nodes, got {:?}",
407            stop_ids
408        );
409    }
410}