Skip to main content

xml_3dm/matching/
diff.rs

1//! Diff-optimized tree matching.
2//!
3//! `DiffMatching` is a specialized matching algorithm for producing diffs.
4//! Compared to the standard `HeuristicMatching`, it:
5//! - Never uses fuzzy matching (only exact content matches)
6//! - Prefers sequential matches (for efficient run-length encoding in diffs)
7//! - Skips copy detection and similar-unmatched matching
8
9use std::collections::HashMap;
10
11use crate::node::{NodeInner, NodeRef};
12
13use super::{CandidateEntry, DfsTreeIterator, Matching};
14
15/// Tree matching algorithm optimized for diff generation.
16///
17/// This matching algorithm only uses exact content matches and prefers
18/// sequential matches to enable efficient run-length encoding in the
19/// generated diff output.
20#[derive(Default)]
21pub struct DiffMatching {
22    base_root: Option<NodeRef>,
23    branch_root: Option<NodeRef>,
24    /// Hash index for fast candidate lookup.
25    hash_index: HashMap<i32, Vec<NodeRef>>,
26}
27
28impl DiffMatching {
29    /// Creates a new DiffMatching instance.
30    pub fn new() -> Self {
31        DiffMatching {
32            base_root: None,
33            branch_root: None,
34            hash_index: HashMap::new(),
35        }
36    }
37
38    /// Builds a hash index of all nodes in the base tree for fast lookup.
39    fn build_hash_index(&mut self, base: &NodeRef) {
40        self.hash_index.clear();
41
42        for node in DfsTreeIterator::new(base.clone()) {
43            if let Some(content) = node.borrow().content() {
44                let hash = content.content_hash();
45                self.hash_index.entry(hash).or_default().push(node.clone());
46            }
47        }
48    }
49
50    /// Matches subtrees starting from the given roots.
51    fn match_subtrees(&mut self, base: &NodeRef, branch: &NodeRef) {
52        self.build_hash_index(base);
53
54        // Process all branch nodes
55        let mut queue: Vec<NodeRef> = vec![branch.clone()];
56
57        while let Some(current) = queue.pop() {
58            // Find candidates for this node
59            let candidates = self.find_candidates(&current);
60
61            if candidates.is_empty() {
62                // No match - all children become candidates
63                let children: Vec<NodeRef> = current.borrow().children().to_vec();
64                for child in children {
65                    queue.push(child);
66                }
67            } else {
68                // Get best candidate and add matching
69                let best = self.get_best_candidate(&current, candidates);
70
71                if let Some(best_entry) = best {
72                    // Add matching and recurse for children
73                    self.add_matching(&current, &best_entry.candidate);
74
75                    // Get stop nodes (unmatched children)
76                    let stop_nodes = self.get_stop_nodes(&best_entry.candidate, &current);
77                    for stop_node in stop_nodes {
78                        queue.push(stop_node);
79                    }
80                } else {
81                    // No good candidate - all children become candidates
82                    let children: Vec<NodeRef> = current.borrow().children().to_vec();
83                    for child in children {
84                        queue.push(child);
85                    }
86                }
87            }
88        }
89    }
90
91    /// Finds exact match candidates for a branch node.
92    ///
93    /// Unlike HeuristicMatching, DiffMatching only uses exact content matches.
94    fn find_candidates(&self, branch: &NodeRef) -> Vec<CandidateEntry> {
95        let mut candidates = Vec::new();
96
97        let branch_borrowed = branch.borrow();
98        if let Some(content) = branch_borrowed.content() {
99            let hash = content.content_hash();
100
101            if let Some(nodes) = self.hash_index.get(&hash) {
102                for node in nodes {
103                    // Verify exact content match
104                    let node_borrowed = node.borrow();
105                    if let Some(node_content) = node_borrowed.content() {
106                        if content.content_equals(node_content) {
107                            drop(node_borrowed);
108                            candidates.push(CandidateEntry::new(node.clone(), 0.0, -1.0));
109                        }
110                    }
111                }
112            }
113        }
114
115        candidates
116    }
117
118    /// Selects the best candidate, preferring sequential matches.
119    ///
120    /// To enable efficient run-length encoding in diffs, we prefer candidates
121    /// that are next to the previously matched node's base match.
122    fn get_best_candidate(
123        &self,
124        branch: &NodeRef,
125        candidates: Vec<CandidateEntry>,
126    ) -> Option<CandidateEntry> {
127        if candidates.is_empty() {
128            return None;
129        }
130
131        // If multiple candidates, prefer one adjacent to left sibling's match
132        if candidates.len() > 1 {
133            if let Some(left_sibling) = NodeInner::left_sibling_of_ref(branch) {
134                let left_base_match = left_sibling
135                    .borrow()
136                    .get_base_match()
137                    .and_then(|w| w.upgrade());
138
139                if let Some(left_base) = left_base_match {
140                    // Find a candidate that is the right sibling of left's base match
141                    for candidate in &candidates {
142                        if let Some(cand_left) =
143                            NodeInner::left_sibling_of_ref(&candidate.candidate)
144                        {
145                            if cand_left.borrow().id() == left_base.borrow().id() {
146                                return Some(candidate.clone());
147                            }
148                        }
149                    }
150                }
151            }
152        }
153
154        // Return first candidate
155        candidates.into_iter().next()
156    }
157
158    /// Gets unmatched children after matching (stop nodes for further processing).
159    fn get_stop_nodes(&self, base: &NodeRef, branch: &NodeRef) -> Vec<NodeRef> {
160        let mut stop_nodes = Vec::new();
161
162        let base_borrowed = base.borrow();
163        let branch_borrowed = branch.borrow();
164
165        let base_child_count = base_borrowed.child_count();
166        let branch_child_count = branch_borrowed.child_count();
167
168        // If same number of children, try to match them in order
169        // Collect children first to avoid borrow issues
170        let base_children: Vec<NodeRef> = base_borrowed.children().to_vec();
171        let branch_children: Vec<NodeRef> = branch_borrowed.children().to_vec();
172        drop(base_borrowed);
173        drop(branch_borrowed);
174
175        if base_child_count == branch_child_count {
176            for i in 0..branch_child_count {
177                let base_child = &base_children[i];
178                let branch_child = &branch_children[i];
179
180                // Check if contents match
181                let base_content = base_child.borrow().content().cloned();
182                let branch_content = branch_child.borrow().content().cloned();
183
184                let matches = match (&base_content, &branch_content) {
185                    (Some(bc), Some(rc)) => bc.content_equals(rc),
186                    _ => false,
187                };
188
189                if matches {
190                    // Add matching for children
191                    self.add_matching(branch_child, base_child);
192
193                    // Recursively get stop nodes from matched children
194                    let child_stops = self.get_stop_nodes(base_child, branch_child);
195                    stop_nodes.extend(child_stops);
196                } else {
197                    stop_nodes.push(branch_child.clone());
198                }
199            }
200        } else {
201            // Different child counts - all branch children are stop nodes
202            for child in &branch_children {
203                stop_nodes.push(child.clone());
204            }
205        }
206
207        stop_nodes
208    }
209
210    /// Adds a matching between branch and base nodes.
211    fn add_matching(&self, branch: &NodeRef, base: &NodeRef) {
212        branch.borrow_mut().set_base_match(base);
213
214        // Add to base's left matches
215        let base_borrowed = base.borrow();
216        if let crate::node::NodeKind::Base { left, .. } = base_borrowed.kind() {
217            left.borrow_mut().add_match(branch.clone());
218        }
219    }
220
221    /// Returns a reference to the branch root.
222    pub fn branch_root_ref(&self) -> Option<&NodeRef> {
223        self.branch_root.as_ref()
224    }
225}
226
227impl Matching for DiffMatching {
228    fn build_matching(&mut self, base: &NodeRef, branch: &NodeRef) {
229        self.base_root = Some(base.clone());
230        self.branch_root = Some(branch.clone());
231
232        self.match_subtrees(base, branch);
233
234        // Artificial roots always matched
235        branch.borrow_mut().set_base_match(base);
236    }
237
238    fn get_area_stop_nodes(&self, stop_nodes: &mut Vec<NodeRef>, node: &NodeRef) {
239        // For DiffMatching, stop nodes are simply unmatched children
240        let children: Vec<NodeRef> = node.borrow().children().to_vec();
241        for child in &children {
242            if child.borrow().get_base_match().is_none() {
243                stop_nodes.push(child.clone());
244            } else {
245                self.get_area_stop_nodes(stop_nodes, child);
246                return;
247            }
248        }
249    }
250
251    fn base_root(&self) -> Option<&NodeRef> {
252        self.base_root.as_ref()
253    }
254
255    fn branch_root(&self) -> Option<&NodeRef> {
256        self.branch_root.as_ref()
257    }
258}
259
260#[cfg(test)]
261mod tests {
262    use super::*;
263    use crate::node::{new_base_node, new_branch_node, XmlContent, XmlElement, XmlText};
264
265    fn make_element(name: &str) -> XmlContent {
266        XmlContent::Element(XmlElement::new(
267            name.to_string(),
268            std::collections::HashMap::new(),
269        ))
270    }
271
272    fn make_text(text: &str) -> XmlContent {
273        XmlContent::Text(XmlText::new(text))
274    }
275
276    #[test]
277    fn test_diff_matching_exact_only() {
278        // Verify that DiffMatching only uses exact matches
279        let base_root = new_base_node(Some(make_element("root")));
280        let base_a = new_base_node(Some(make_element("a")));
281        NodeInner::add_child_to_ref(&base_root, base_a.clone());
282
283        let branch_root = new_branch_node(Some(make_element("root")));
284        let branch_a = new_branch_node(Some(make_element("a")));
285        NodeInner::add_child_to_ref(&branch_root, branch_a.clone());
286
287        let mut matching = DiffMatching::new();
288        matching.build_matching(&base_root, &branch_root);
289
290        // Should match exactly
291        assert!(branch_a.borrow().get_base_match().is_some());
292    }
293
294    #[test]
295    fn test_diff_matching_no_fuzzy() {
296        // Verify that similar but not identical content does NOT match
297        let base_root = new_base_node(Some(make_element("root")));
298        let base_text = new_base_node(Some(make_text("hello world")));
299        NodeInner::add_child_to_ref(&base_root, base_text);
300
301        let branch_root = new_branch_node(Some(make_element("root")));
302        let branch_text = new_branch_node(Some(make_text("hello world!"))); // Different!
303        NodeInner::add_child_to_ref(&branch_root, branch_text.clone());
304
305        let mut matching = DiffMatching::new();
306        matching.build_matching(&base_root, &branch_root);
307
308        // Should NOT match (no fuzzy matching in DiffMatching)
309        assert!(branch_text.borrow().get_base_match().is_none());
310    }
311
312    #[test]
313    fn test_diff_matching_sequential_preference() {
314        // Verify that DiffMatching prefers sequential matches
315        // Base:  <root><a/><b/><a/></root>
316        // Branch: <root><a/><b/><a/></root>
317        // The second 'a' should match the second base 'a' (sequential), not the first
318
319        let base_root = new_base_node(Some(make_element("root")));
320        let base_a1 = new_base_node(Some(make_element("a")));
321        let base_b = new_base_node(Some(make_element("b")));
322        let base_a2 = new_base_node(Some(make_element("a")));
323        NodeInner::add_child_to_ref(&base_root, base_a1.clone());
324        NodeInner::add_child_to_ref(&base_root, base_b.clone());
325        NodeInner::add_child_to_ref(&base_root, base_a2.clone());
326
327        let branch_root = new_branch_node(Some(make_element("root")));
328        let branch_a1 = new_branch_node(Some(make_element("a")));
329        let branch_b = new_branch_node(Some(make_element("b")));
330        let branch_a2 = new_branch_node(Some(make_element("a")));
331        NodeInner::add_child_to_ref(&branch_root, branch_a1.clone());
332        NodeInner::add_child_to_ref(&branch_root, branch_b.clone());
333        NodeInner::add_child_to_ref(&branch_root, branch_a2.clone());
334
335        let mut matching = DiffMatching::new();
336        matching.build_matching(&base_root, &branch_root);
337
338        // First 'a' should match first base 'a'
339        let match1 = branch_a1
340            .borrow()
341            .get_base_match()
342            .and_then(|w| w.upgrade());
343        assert!(match1.is_some());
344        assert_eq!(match1.unwrap().borrow().id(), base_a1.borrow().id());
345
346        // Second 'a' should match second base 'a' (sequential preference)
347        let match2 = branch_a2
348            .borrow()
349            .get_base_match()
350            .and_then(|w| w.upgrade());
351        assert!(match2.is_some());
352        assert_eq!(match2.unwrap().borrow().id(), base_a2.borrow().id());
353    }
354}