Skip to main content

xml_3dm/diff/
generator.rs

1//! Diff generation algorithm.
2//!
3//! Produces an XML diff document that encodes the transformations needed
4//! to convert the base tree into the branch tree.
5
6use std::io::Write;
7
8use crate::matching::Matching;
9use crate::node::{BranchNode, NodeRef, XmlContent};
10
11use super::bfs_index::BfsIndex;
12use super::diff_operation::{DiffOpType, DiffOperation};
13use super::{
14    DIFF_COPY_TAG, DIFF_CPYDST_ATTR, DIFF_CPYRUN_ATTR, DIFF_CPYSRC_ATTR, DIFF_ESC_TAG,
15    DIFF_INSERT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS, DIFF_ROOT_TAG,
16};
17
18/// Sequence for run-length encoding of consecutive copies.
19struct Sequence {
20    /// Source node ID for start of sequence.
21    src: Option<u64>,
22    /// Destination node ID.
23    dst: Option<u64>,
24    /// Tail node ID (last in sequence).
25    tail: Option<u64>,
26    /// Number of consecutive nodes.
27    run: i64,
28}
29
30impl Sequence {
31    fn new() -> Self {
32        Sequence {
33            src: None,
34            dst: None,
35            tail: None,
36            run: -1,
37        }
38    }
39
40    fn set_empty(&mut self) {
41        self.run = -1;
42    }
43
44    fn is_empty(&self) -> bool {
45        self.run == -1
46    }
47
48    fn init(&mut self, src: u64, dst: Option<u64>) {
49        self.src = Some(src);
50        self.tail = Some(src);
51        self.dst = dst;
52        self.run = 1;
53    }
54
55    fn append(&mut self, src: u64) {
56        self.run += 1;
57        self.tail = Some(src);
58    }
59}
60
61/// Diff generator for matched trees.
62pub struct Diff<'a> {
63    /// The matching between base and branch trees.
64    matching: &'a dyn Matching,
65    /// BFS index for the base tree.
66    index: BfsIndex,
67    /// Current indentation level for formatted output.
68    indent: std::cell::Cell<usize>,
69}
70
71impl<'a> Diff<'a> {
72    /// Creates a new diff generator.
73    pub fn new(matching: &'a dyn Matching) -> Option<Self> {
74        let base_root = matching.base_root()?;
75        let index = BfsIndex::new(base_root);
76
77        Some(Diff {
78            matching,
79            index,
80            indent: std::cell::Cell::new(0),
81        })
82    }
83
84    /// Creates a diff generator with a custom index.
85    pub fn with_index(matching: &'a dyn Matching, index: BfsIndex) -> Self {
86        Diff {
87            matching,
88            index,
89            indent: std::cell::Cell::new(0),
90        }
91    }
92
93    /// Generates the diff and writes it to the given writer.
94    pub fn diff<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
95        if let Some(branch_root) = self.matching.branch_root() {
96            self.diff_from(writer, branch_root)
97        } else {
98            Err(std::io::Error::new(
99                std::io::ErrorKind::InvalidInput,
100                "No branch root",
101            ))
102        }
103    }
104
105    /// Generates diff starting from a specific branch node.
106    fn diff_from<W: Write>(&self, writer: &mut W, branch_root: &NodeRef) -> std::io::Result<()> {
107        let root_has_match = BranchNode::base_match(branch_root).is_some();
108
109        writeln!(writer, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
110
111        if root_has_match {
112            // Root matches - use copy operation
113            let stop_nodes = self.get_stop_nodes(branch_root);
114            let op = DiffOperation::root_copy();
115            self.write_op_open(writer, &op)?;
116            self.write_copy(writer, &stop_nodes)?;
117            self.write_op_close(writer, &op)?;
118        } else {
119            // Root doesn't match - insert operation
120            let op = DiffOperation::root_insert();
121            self.write_op_open(writer, &op)?;
122            self.write_insert(writer, branch_root)?;
123            self.write_op_close(writer, &op)?;
124        }
125
126        Ok(())
127    }
128
129    /// Gets stop nodes for a matched subtree.
130    fn get_stop_nodes(&self, node: &NodeRef) -> Vec<NodeRef> {
131        let mut stop_nodes = Vec::new();
132        self.matching.get_area_stop_nodes(&mut stop_nodes, node);
133        stop_nodes
134    }
135
136    /// Writes the copy operations for the given stop nodes.
137    fn write_copy<W: Write>(&self, writer: &mut W, stop_nodes: &[NodeRef]) -> std::io::Result<()> {
138        let mut seq = Sequence::new();
139
140        for stop_node in stop_nodes {
141            let dst = BranchNode::base_match(stop_node).and_then(|n| self.index.get_id(&n));
142
143            if !self.emit_child_list(writer, &mut seq, stop_node, dst, false)? {
144                // Empty insert tag for nodes with no children
145                let op = DiffOperation::insert(dst);
146                self.write_op_open(writer, &op)?;
147                self.write_op_close(writer, &op)?;
148            }
149        }
150
151        Ok(())
152    }
153
154    /// Writes an insert operation for a branch node.
155    fn write_insert<W: Write>(&self, writer: &mut W, branch: &NodeRef) -> std::io::Result<()> {
156        let mut seq = Sequence::new();
157        self.write_content_open(writer, branch)?;
158        self.emit_child_list(writer, &mut seq, branch, None, true)?;
159        self.write_content_close(writer, branch)?;
160        Ok(())
161    }
162
163    /// Emits the child list with run-length encoding for copies.
164    fn emit_child_list<W: Write>(
165        &self,
166        writer: &mut W,
167        seq: &mut Sequence,
168        parent: &NodeRef,
169        dst: Option<u64>,
170        ins_mode: bool,
171    ) -> std::io::Result<bool> {
172        let children: Vec<_> = parent.borrow().children().to_vec();
173        let child_count = children.len();
174
175        for (ic, child) in children.iter().enumerate() {
176            let last_stop_node = ic == child_count - 1;
177            let base_match = BranchNode::base_match(child);
178
179            if let Some(ref base) = base_match {
180                // Child has a base match - potential copy
181                let child_stop_nodes = self.get_stop_nodes(child);
182                let src = self.index.get_id(base);
183
184                if let Some(src_id) = src {
185                    if child_stop_nodes.is_empty() && !last_stop_node {
186                        // Can potentially extend sequence
187                        if seq.is_empty() {
188                            seq.init(src_id, dst);
189                            continue;
190                        } else if self.sequence_appends(seq, src_id, dst) {
191                            seq.append(src_id);
192                            continue;
193                        }
194                    }
195
196                    // Did not append to sequence (or at last stop node) => emit sequence
197                    if !self.sequence_appends(seq, src_id, dst) {
198                        // Current does not append to prev seq -> output prev seq + new in separate tags
199                        if !seq.is_empty() {
200                            let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
201                            self.write_op_open(writer, &op)?;
202                            self.write_op_close(writer, &op)?;
203                        }
204
205                        if !child_stop_nodes.is_empty() || last_stop_node {
206                            let op = DiffOperation::copy(src_id, dst, 1);
207                            self.write_op_open(writer, &op)?;
208                            self.write_copy(writer, &child_stop_nodes)?;
209                            self.write_op_close(writer, &op)?;
210                            seq.set_empty();
211                        } else {
212                            seq.init(src_id, dst);
213                        }
214                    } else {
215                        // Appends to open sequence (other reason for seq break)
216                        seq.append(src_id);
217                        let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
218                        self.write_op_open(writer, &op)?;
219                        self.write_copy(writer, &child_stop_nodes)?;
220                        self.write_op_close(writer, &op)?;
221                        seq.set_empty();
222                    }
223                }
224            } else {
225                // No base match - insert
226                if !seq.is_empty() {
227                    let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
228                    self.write_op_open(writer, &op)?;
229                    self.write_op_close(writer, &op)?;
230                    seq.set_empty();
231                }
232
233                if !ins_mode {
234                    // Check if we need to open an insert tag (SHORTINS optimization)
235                    let prev_has_match =
236                        ic > 0 && BranchNode::base_match(&children[ic - 1]).is_some();
237                    if ic == 0 || prev_has_match {
238                        let op = DiffOperation::insert(dst);
239                        self.write_op_open(writer, &op)?;
240                    }
241                }
242
243                self.write_insert(writer, child)?;
244
245                if !ins_mode {
246                    // Check if we need to close the insert tag (SHORTINS optimization)
247                    let next_has_match = !last_stop_node
248                        && ic + 1 < child_count
249                        && BranchNode::base_match(&children[ic + 1]).is_some();
250                    if last_stop_node || next_has_match {
251                        let op = DiffOperation::insert(None);
252                        self.write_op_close(writer, &op)?;
253                    }
254                }
255            }
256        }
257
258        Ok(!children.is_empty())
259    }
260
261    /// Checks if a source ID appends to the current sequence.
262    fn sequence_appends(&self, seq: &Sequence, src: u64, dst: Option<u64>) -> bool {
263        if seq.is_empty() {
264            return false;
265        }
266        if dst != seq.dst {
267            return false;
268        }
269        match (seq.tail, self.index.lookup(seq.tail.unwrap())) {
270            (Some(_), Some(tail_node)) => {
271                if let Some(next_node) = self.index.lookup(src) {
272                    self.index.appends(&tail_node, &next_node)
273                } else {
274                    false
275                }
276            }
277            _ => false,
278        }
279    }
280
281    /// Writes the opening tag for a diff operation.
282    fn write_op_open<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
283        let indent_str = Self::indent_str_for(self.indent.get());
284        match op.op_type {
285            DiffOpType::RootCopy => {
286                writeln!(writer, "{}<{}>", indent_str, DIFF_ROOT_TAG)?;
287                self.indent.set(self.indent.get() + 1);
288            }
289            DiffOpType::RootInsert => {
290                writeln!(
291                    writer,
292                    "{}<{} {}=\"{}\">",
293                    indent_str, DIFF_ROOT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS
294                )?;
295                self.indent.set(self.indent.get() + 1);
296            }
297            DiffOpType::Copy => {
298                write!(writer, "{}<{}", indent_str, DIFF_COPY_TAG)?;
299                if let Some(src) = op.source {
300                    write!(writer, " {}=\"{}\"", DIFF_CPYSRC_ATTR, src)?;
301                }
302                if let Some(dst) = op.destination {
303                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
304                }
305                if let Some(run) = op.run {
306                    if run > 1 {
307                        write!(writer, " {}=\"{}\"", DIFF_CPYRUN_ATTR, run)?;
308                    }
309                }
310                writeln!(writer, ">")?;
311                self.indent.set(self.indent.get() + 1);
312            }
313            DiffOpType::Insert => {
314                write!(writer, "{}<{}", indent_str, DIFF_INSERT_TAG)?;
315                if let Some(dst) = op.destination {
316                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
317                }
318                writeln!(writer, ">")?;
319                self.indent.set(self.indent.get() + 1);
320            }
321        }
322        Ok(())
323    }
324
325    /// Writes the closing tag for a diff operation.
326    fn write_op_close<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
327        self.indent.set(self.indent.get() - 1);
328        let indent_str = Self::indent_str_for(self.indent.get());
329        match op.op_type {
330            DiffOpType::RootCopy | DiffOpType::RootInsert => {
331                writeln!(writer, "{}</{}>", indent_str, DIFF_ROOT_TAG)?;
332            }
333            DiffOpType::Copy => {
334                writeln!(writer, "{}</{}>", indent_str, DIFF_COPY_TAG)?;
335            }
336            DiffOpType::Insert => {
337                writeln!(writer, "{}</{}>", indent_str, DIFF_INSERT_TAG)?;
338            }
339        }
340        Ok(())
341    }
342
343    /// Checks if an element name conflicts with diff reserved tags.
344    fn needs_escape(qname: &str) -> bool {
345        qname == DIFF_COPY_TAG
346            || qname == DIFF_INSERT_TAG
347            || qname == DIFF_ESC_TAG
348            || qname == DIFF_ROOT_TAG
349    }
350
351    /// Writes the opening content for a node.
352    fn write_content_open<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
353        let borrowed = node.borrow();
354        match borrowed.content() {
355            Some(XmlContent::Text(text)) => {
356                // Convert char slice to String
357                let text_str: String = text.text().iter().collect();
358                write!(
359                    writer,
360                    "{}{}",
361                    Self::indent_str_for(self.indent.get()),
362                    escape_xml(&text_str)
363                )?;
364            }
365            Some(XmlContent::Comment(comment)) => {
366                // Output comment
367                let comment_text: String = comment.text().iter().collect();
368                writeln!(
369                    writer,
370                    "{}<!-- {} -->",
371                    Self::indent_str_for(self.indent.get()),
372                    comment_text
373                )?;
374            }
375            Some(XmlContent::Element(elem)) => {
376                // Check if element name conflicts with diff reserved tags
377                let needs_esc = Self::needs_escape(elem.qname());
378                if needs_esc {
379                    writeln!(
380                        writer,
381                        "{}<{}>",
382                        Self::indent_str_for(self.indent.get()),
383                        DIFF_ESC_TAG
384                    )?;
385                    self.indent.set(self.indent.get() + 1);
386                }
387
388                write!(
389                    writer,
390                    "{}<{}",
391                    Self::indent_str_for(self.indent.get()),
392                    elem.qname()
393                )?;
394                // Attributes is HashMap<String, String>
395                let mut attr_names: Vec<_> = elem.attributes().keys().collect();
396                attr_names.sort();
397                for name in attr_names {
398                    if let Some(value) = elem.attributes().get(name) {
399                        write!(writer, " {}=\"{}\"", name, escape_xml_attr(value))?;
400                    }
401                }
402                writeln!(writer, ">")?;
403                self.indent.set(self.indent.get() + 1);
404            }
405            None => {}
406        }
407        Ok(())
408    }
409
410    /// Writes the closing content for a node.
411    fn write_content_close<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
412        let borrowed = node.borrow();
413        if let Some(XmlContent::Element(elem)) = borrowed.content() {
414            self.indent.set(self.indent.get() - 1);
415            writeln!(
416                writer,
417                "{}</{}>",
418                Self::indent_str_for(self.indent.get()),
419                elem.qname()
420            )?;
421
422            // Close escape wrapper if element name was reserved
423            if Self::needs_escape(elem.qname()) {
424                self.indent.set(self.indent.get() - 1);
425                writeln!(
426                    writer,
427                    "{}</{}>",
428                    Self::indent_str_for(self.indent.get()),
429                    DIFF_ESC_TAG
430                )?;
431            }
432        }
433        Ok(())
434    }
435
436    /// Returns the indentation string for a given level.
437    fn indent_str_for(level: usize) -> String {
438        "  ".repeat(level)
439    }
440}
441
442/// Escapes special XML characters in text content.
443fn escape_xml(s: &str) -> String {
444    s.replace('&', "&amp;")
445        .replace('<', "&lt;")
446        .replace('>', "&gt;")
447}
448
449/// Escapes special XML characters in attribute values.
450fn escape_xml_attr(s: &str) -> String {
451    s.replace('&', "&amp;")
452        .replace('<', "&lt;")
453        .replace('>', "&gt;")
454        .replace('"', "&quot;")
455}
456
457#[cfg(test)]
458mod tests {
459    use super::*;
460    use crate::matching::HeuristicMatching;
461    use crate::node::{new_base_node, new_branch_node, NodeInner, XmlElement};
462    use std::collections::HashMap;
463
464    #[test]
465    fn test_escape_xml() {
466        assert_eq!(escape_xml("hello"), "hello");
467        assert_eq!(escape_xml("<test>"), "&lt;test&gt;");
468        assert_eq!(escape_xml("a & b"), "a &amp; b");
469    }
470
471    #[test]
472    fn test_escape_xml_attr() {
473        assert_eq!(escape_xml_attr("hello"), "hello");
474        assert_eq!(escape_xml_attr("say \"hi\""), "say &quot;hi&quot;");
475    }
476
477    #[test]
478    fn test_diff_identical_trees() {
479        // Create identical base and branch trees
480        let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
481            "root".to_string(),
482            HashMap::new(),
483        ))));
484        let base_child = new_base_node(Some(XmlContent::Element(XmlElement::new(
485            "child".to_string(),
486            HashMap::new(),
487        ))));
488        NodeInner::add_child_to_ref(&base, base_child);
489
490        let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
491            "root".to_string(),
492            HashMap::new(),
493        ))));
494        let branch_child = new_branch_node(Some(XmlContent::Element(XmlElement::new(
495            "child".to_string(),
496            HashMap::new(),
497        ))));
498        NodeInner::add_child_to_ref(&branch, branch_child);
499
500        // Create matching
501        let mut matching = HeuristicMatching::new();
502        matching.build_matching(&base, &branch);
503
504        // Generate diff
505        if let Some(diff) = Diff::new(&matching) {
506            let mut output = Vec::new();
507            diff.diff(&mut output).unwrap();
508
509            let result = String::from_utf8(output).unwrap();
510            assert!(result.contains("<diff>"));
511            assert!(result.contains("</diff>"));
512        }
513    }
514}