xml_3dm/diff/
generator.rs

1//! Diff generation algorithm.
2//!
3//! Produces an XML diff document that encodes the transformations needed
4//! to convert the base tree into the branch tree.
5
6use std::io::Write;
7
8use crate::matching::Matching;
9use crate::node::{BranchNode, NodeRef, XmlContent};
10
11use super::bfs_index::BfsIndex;
12use super::diff_operation::{DiffOpType, DiffOperation};
13use super::{
14    DIFF_COPY_TAG, DIFF_CPYDST_ATTR, DIFF_CPYRUN_ATTR, DIFF_CPYSRC_ATTR, DIFF_INSERT_TAG,
15    DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS, DIFF_ROOT_TAG,
16};
17
18/// Sequence for run-length encoding of consecutive copies.
19struct Sequence {
20    /// Source node ID for start of sequence.
21    src: Option<u64>,
22    /// Destination node ID.
23    dst: Option<u64>,
24    /// Tail node ID (last in sequence).
25    tail: Option<u64>,
26    /// Number of consecutive nodes.
27    run: i64,
28}
29
30impl Sequence {
31    fn new() -> Self {
32        Sequence {
33            src: None,
34            dst: None,
35            tail: None,
36            run: -1,
37        }
38    }
39
40    fn set_empty(&mut self) {
41        self.run = -1;
42    }
43
44    fn is_empty(&self) -> bool {
45        self.run == -1
46    }
47
48    fn init(&mut self, src: u64, dst: Option<u64>) {
49        self.src = Some(src);
50        self.tail = Some(src);
51        self.dst = dst;
52        self.run = 1;
53    }
54
55    fn append(&mut self, src: u64) {
56        self.run += 1;
57        self.tail = Some(src);
58    }
59}
60
61/// Diff generator for matched trees.
62pub struct Diff<'a> {
63    /// The matching between base and branch trees.
64    matching: &'a dyn Matching,
65    /// BFS index for the base tree.
66    index: BfsIndex,
67    /// Current indentation level for formatted output.
68    indent: std::cell::Cell<usize>,
69}
70
71impl<'a> Diff<'a> {
72    /// Creates a new diff generator.
73    pub fn new(matching: &'a dyn Matching) -> Option<Self> {
74        let base_root = matching.base_root()?;
75        let index = BfsIndex::new(base_root);
76
77        Some(Diff {
78            matching,
79            index,
80            indent: std::cell::Cell::new(0),
81        })
82    }
83
84    /// Creates a diff generator with a custom index.
85    pub fn with_index(matching: &'a dyn Matching, index: BfsIndex) -> Self {
86        Diff {
87            matching,
88            index,
89            indent: std::cell::Cell::new(0),
90        }
91    }
92
93    /// Generates the diff and writes it to the given writer.
94    pub fn diff<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
95        if let Some(branch_root) = self.matching.branch_root() {
96            self.diff_from(writer, branch_root)
97        } else {
98            Err(std::io::Error::new(
99                std::io::ErrorKind::InvalidInput,
100                "No branch root",
101            ))
102        }
103    }
104
105    /// Generates diff starting from a specific branch node.
106    fn diff_from<W: Write>(&self, writer: &mut W, branch_root: &NodeRef) -> std::io::Result<()> {
107        let root_has_match = BranchNode::base_match(branch_root).is_some();
108
109        writeln!(writer, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
110
111        if root_has_match {
112            // Root matches - use copy operation
113            let stop_nodes = self.get_stop_nodes(branch_root);
114            let op = DiffOperation::root_copy();
115            self.write_op_open(writer, &op)?;
116            self.write_copy(writer, &stop_nodes)?;
117            self.write_op_close(writer, &op)?;
118        } else {
119            // Root doesn't match - insert operation
120            let op = DiffOperation::root_insert();
121            self.write_op_open(writer, &op)?;
122            self.write_insert(writer, branch_root)?;
123            self.write_op_close(writer, &op)?;
124        }
125
126        Ok(())
127    }
128
129    /// Gets stop nodes for a matched subtree.
130    fn get_stop_nodes(&self, node: &NodeRef) -> Vec<NodeRef> {
131        let mut stop_nodes = Vec::new();
132        self.matching.get_area_stop_nodes(&mut stop_nodes, node);
133        stop_nodes
134    }
135
136    /// Writes the copy operations for the given stop nodes.
137    fn write_copy<W: Write>(&self, writer: &mut W, stop_nodes: &[NodeRef]) -> std::io::Result<()> {
138        let mut seq = Sequence::new();
139
140        for stop_node in stop_nodes {
141            let dst = BranchNode::base_match(stop_node).and_then(|n| self.index.get_id(&n));
142
143            if !self.emit_child_list(writer, &mut seq, stop_node, dst, false)? {
144                // Empty insert tag for nodes with no children
145                let op = DiffOperation::insert(dst);
146                self.write_op_open(writer, &op)?;
147                self.write_op_close(writer, &op)?;
148            }
149        }
150
151        Ok(())
152    }
153
154    /// Writes an insert operation for a branch node.
155    fn write_insert<W: Write>(&self, writer: &mut W, branch: &NodeRef) -> std::io::Result<()> {
156        let mut seq = Sequence::new();
157        self.write_content_open(writer, branch)?;
158        self.emit_child_list(writer, &mut seq, branch, None, true)?;
159        self.write_content_close(writer, branch)?;
160        Ok(())
161    }
162
163    /// Emits the child list with run-length encoding for copies.
164    fn emit_child_list<W: Write>(
165        &self,
166        writer: &mut W,
167        seq: &mut Sequence,
168        parent: &NodeRef,
169        dst: Option<u64>,
170        ins_mode: bool,
171    ) -> std::io::Result<bool> {
172        let children: Vec<_> = parent.borrow().children().to_vec();
173        let child_count = children.len();
174
175        for (ic, child) in children.iter().enumerate() {
176            let last_stop_node = ic == child_count - 1;
177            let base_match = BranchNode::base_match(child);
178
179            if let Some(ref base) = base_match {
180                // Child has a base match - potential copy
181                let child_stop_nodes = self.get_stop_nodes(child);
182                let src = self.index.get_id(base);
183
184                if let Some(src_id) = src {
185                    if child_stop_nodes.is_empty() && !last_stop_node {
186                        // Can potentially extend sequence
187                        if seq.is_empty() {
188                            seq.init(src_id, dst);
189                            continue;
190                        } else if self.sequence_appends(seq, src_id, dst) {
191                            seq.append(src_id);
192                            continue;
193                        }
194                    }
195
196                    // Did not append to sequence (or at last stop node) => emit sequence
197                    if !self.sequence_appends(seq, src_id, dst) {
198                        // Current does not append to prev seq -> output prev seq + new in separate tags
199                        if !seq.is_empty() {
200                            let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
201                            self.write_op_open(writer, &op)?;
202                            self.write_op_close(writer, &op)?;
203                        }
204
205                        if !child_stop_nodes.is_empty() || last_stop_node {
206                            let op = DiffOperation::copy(src_id, dst, 1);
207                            self.write_op_open(writer, &op)?;
208                            self.write_copy(writer, &child_stop_nodes)?;
209                            self.write_op_close(writer, &op)?;
210                            seq.set_empty();
211                        } else {
212                            seq.init(src_id, dst);
213                        }
214                    } else {
215                        // Appends to open sequence (other reason for seq break)
216                        seq.append(src_id);
217                        let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
218                        self.write_op_open(writer, &op)?;
219                        self.write_copy(writer, &child_stop_nodes)?;
220                        self.write_op_close(writer, &op)?;
221                        seq.set_empty();
222                    }
223                }
224            } else {
225                // No base match - insert
226                if !seq.is_empty() {
227                    let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
228                    self.write_op_open(writer, &op)?;
229                    self.write_op_close(writer, &op)?;
230                    seq.set_empty();
231                }
232
233                if !ins_mode {
234                    // Check if we need to open an insert tag (SHORTINS optimization)
235                    let prev_has_match =
236                        ic > 0 && BranchNode::base_match(&children[ic - 1]).is_some();
237                    if ic == 0 || prev_has_match {
238                        let op = DiffOperation::insert(dst);
239                        self.write_op_open(writer, &op)?;
240                    }
241                }
242
243                self.write_insert(writer, child)?;
244
245                if !ins_mode {
246                    // Check if we need to close the insert tag (SHORTINS optimization)
247                    let next_has_match = !last_stop_node
248                        && ic + 1 < child_count
249                        && BranchNode::base_match(&children[ic + 1]).is_some();
250                    if last_stop_node || next_has_match {
251                        let op = DiffOperation::insert(None);
252                        self.write_op_close(writer, &op)?;
253                    }
254                }
255            }
256        }
257
258        Ok(!children.is_empty())
259    }
260
261    /// Checks if a source ID appends to the current sequence.
262    fn sequence_appends(&self, seq: &Sequence, src: u64, dst: Option<u64>) -> bool {
263        if seq.is_empty() {
264            return false;
265        }
266        if dst != seq.dst {
267            return false;
268        }
269        match (seq.tail, self.index.lookup(seq.tail.unwrap())) {
270            (Some(_), Some(tail_node)) => {
271                if let Some(next_node) = self.index.lookup(src) {
272                    self.index.appends(&tail_node, &next_node)
273                } else {
274                    false
275                }
276            }
277            _ => false,
278        }
279    }
280
281    /// Writes the opening tag for a diff operation.
282    fn write_op_open<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
283        let indent_str = Self::indent_str_for(self.indent.get());
284        match op.op_type {
285            DiffOpType::RootCopy => {
286                writeln!(writer, "{}<{}>", indent_str, DIFF_ROOT_TAG)?;
287                self.indent.set(self.indent.get() + 1);
288            }
289            DiffOpType::RootInsert => {
290                writeln!(
291                    writer,
292                    "{}<{} {}=\"{}\">",
293                    indent_str, DIFF_ROOT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS
294                )?;
295                self.indent.set(self.indent.get() + 1);
296            }
297            DiffOpType::Copy => {
298                write!(writer, "{}<{}", indent_str, DIFF_COPY_TAG)?;
299                if let Some(src) = op.source {
300                    write!(writer, " {}=\"{}\"", DIFF_CPYSRC_ATTR, src)?;
301                }
302                if let Some(dst) = op.destination {
303                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
304                }
305                if let Some(run) = op.run {
306                    if run > 1 {
307                        write!(writer, " {}=\"{}\"", DIFF_CPYRUN_ATTR, run)?;
308                    }
309                }
310                writeln!(writer, ">")?;
311                self.indent.set(self.indent.get() + 1);
312            }
313            DiffOpType::Insert => {
314                write!(writer, "{}<{}", indent_str, DIFF_INSERT_TAG)?;
315                if let Some(dst) = op.destination {
316                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
317                }
318                writeln!(writer, ">")?;
319                self.indent.set(self.indent.get() + 1);
320            }
321        }
322        Ok(())
323    }
324
325    /// Writes the closing tag for a diff operation.
326    fn write_op_close<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
327        self.indent.set(self.indent.get() - 1);
328        let indent_str = Self::indent_str_for(self.indent.get());
329        match op.op_type {
330            DiffOpType::RootCopy | DiffOpType::RootInsert => {
331                writeln!(writer, "{}</{}>", indent_str, DIFF_ROOT_TAG)?;
332            }
333            DiffOpType::Copy => {
334                writeln!(writer, "{}</{}>", indent_str, DIFF_COPY_TAG)?;
335            }
336            DiffOpType::Insert => {
337                writeln!(writer, "{}</{}>", indent_str, DIFF_INSERT_TAG)?;
338            }
339        }
340        Ok(())
341    }
342
343    /// Writes the opening content for a node.
344    fn write_content_open<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
345        let borrowed = node.borrow();
346        match borrowed.content() {
347            Some(XmlContent::Text(text)) => {
348                // Convert char slice to String
349                let text_str: String = text.text().iter().collect();
350                write!(
351                    writer,
352                    "{}{}",
353                    Self::indent_str_for(self.indent.get()),
354                    escape_xml(&text_str)
355                )?;
356            }
357            Some(XmlContent::Comment(comment)) => {
358                // Output comment
359                let comment_text: String = comment.text().iter().collect();
360                writeln!(
361                    writer,
362                    "{}<!-- {} -->",
363                    Self::indent_str_for(self.indent.get()),
364                    comment_text
365                )?;
366            }
367            Some(XmlContent::Element(elem)) => {
368                write!(
369                    writer,
370                    "{}<{}",
371                    Self::indent_str_for(self.indent.get()),
372                    elem.qname()
373                )?;
374                // Attributes is HashMap<String, String>
375                let mut attr_names: Vec<_> = elem.attributes().keys().collect();
376                attr_names.sort();
377                for name in attr_names {
378                    if let Some(value) = elem.attributes().get(name) {
379                        write!(writer, " {}=\"{}\"", name, escape_xml_attr(value))?;
380                    }
381                }
382                writeln!(writer, ">")?;
383                self.indent.set(self.indent.get() + 1);
384            }
385            None => {}
386        }
387        Ok(())
388    }
389
390    /// Writes the closing content for a node.
391    fn write_content_close<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
392        let borrowed = node.borrow();
393        if let Some(XmlContent::Element(elem)) = borrowed.content() {
394            self.indent.set(self.indent.get() - 1);
395            writeln!(
396                writer,
397                "{}</{}>",
398                Self::indent_str_for(self.indent.get()),
399                elem.qname()
400            )?;
401        }
402        Ok(())
403    }
404
405    /// Returns the indentation string for a given level.
406    fn indent_str_for(level: usize) -> String {
407        "  ".repeat(level)
408    }
409}
410
411/// Escapes special XML characters in text content.
412fn escape_xml(s: &str) -> String {
413    s.replace('&', "&amp;")
414        .replace('<', "&lt;")
415        .replace('>', "&gt;")
416}
417
418/// Escapes special XML characters in attribute values.
419fn escape_xml_attr(s: &str) -> String {
420    s.replace('&', "&amp;")
421        .replace('<', "&lt;")
422        .replace('>', "&gt;")
423        .replace('"', "&quot;")
424}
425
426#[cfg(test)]
427mod tests {
428    use super::*;
429    use crate::matching::HeuristicMatching;
430    use crate::node::{new_base_node, new_branch_node, NodeInner, XmlElement};
431    use std::collections::HashMap;
432
433    #[test]
434    fn test_escape_xml() {
435        assert_eq!(escape_xml("hello"), "hello");
436        assert_eq!(escape_xml("<test>"), "&lt;test&gt;");
437        assert_eq!(escape_xml("a & b"), "a &amp; b");
438    }
439
440    #[test]
441    fn test_escape_xml_attr() {
442        assert_eq!(escape_xml_attr("hello"), "hello");
443        assert_eq!(escape_xml_attr("say \"hi\""), "say &quot;hi&quot;");
444    }
445
446    #[test]
447    fn test_diff_identical_trees() {
448        // Create identical base and branch trees
449        let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
450            "root".to_string(),
451            HashMap::new(),
452        ))));
453        let base_child = new_base_node(Some(XmlContent::Element(XmlElement::new(
454            "child".to_string(),
455            HashMap::new(),
456        ))));
457        NodeInner::add_child_to_ref(&base, base_child);
458
459        let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
460            "root".to_string(),
461            HashMap::new(),
462        ))));
463        let branch_child = new_branch_node(Some(XmlContent::Element(XmlElement::new(
464            "child".to_string(),
465            HashMap::new(),
466        ))));
467        NodeInner::add_child_to_ref(&branch, branch_child);
468
469        // Create matching
470        let mut matching = HeuristicMatching::new();
471        matching.build_matching(&base, &branch);
472
473        // Generate diff
474        if let Some(diff) = Diff::new(&matching) {
475            let mut output = Vec::new();
476            diff.diff(&mut output).unwrap();
477
478            let result = String::from_utf8(output).unwrap();
479            assert!(result.contains("<diff>"));
480            assert!(result.contains("</diff>"));
481        }
482    }
483}