Skip to main content

xml_3dm/diff/
generator.rs

1//! Diff generation algorithm.
2//!
3//! Produces an XML diff document that encodes the transformations needed
4//! to convert the base tree into the branch tree.
5
6use std::io::Write;
7
8use crate::matching::Matching;
9use crate::node::{BranchNode, NodeRef, XmlContent};
10
11use super::bfs_index::BfsIndex;
12use super::diff_operation::{DiffOpType, DiffOperation};
13use super::{
14    DIFF_COPY_TAG, DIFF_CPYDST_ATTR, DIFF_CPYRUN_ATTR, DIFF_CPYSRC_ATTR, DIFF_ESC_TAG,
15    DIFF_INSERT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS, DIFF_ROOT_TAG,
16};
17
18/// Sequence for run-length encoding of consecutive copies.
19struct Sequence {
20    /// Source node ID for start of sequence.
21    src: Option<u64>,
22    /// Destination node ID.
23    dst: Option<u64>,
24    /// Tail node ID (last in sequence).
25    tail: Option<u64>,
26    /// Number of consecutive nodes.
27    run: i64,
28}
29
30impl Sequence {
31    fn new() -> Self {
32        Sequence {
33            src: None,
34            dst: None,
35            tail: None,
36            run: -1,
37        }
38    }
39
40    fn set_empty(&mut self) {
41        self.run = -1;
42    }
43
44    fn is_empty(&self) -> bool {
45        self.run == -1
46    }
47
48    fn init(&mut self, src: u64, dst: Option<u64>) {
49        self.src = Some(src);
50        self.tail = Some(src);
51        self.dst = dst;
52        self.run = 1;
53    }
54
55    fn append(&mut self, src: u64) {
56        self.run += 1;
57        self.tail = Some(src);
58    }
59}
60
61/// Diff generator for matched trees.
62pub struct Diff<'a> {
63    /// The matching between base and branch trees.
64    matching: &'a dyn Matching,
65    /// BFS index for the base tree.
66    index: BfsIndex,
67    /// Current indentation level for formatted output.
68    indent: std::cell::Cell<usize>,
69}
70
71impl<'a> Diff<'a> {
72    /// Creates a new diff generator.
73    pub fn new(matching: &'a dyn Matching) -> Option<Self> {
74        let base_root = matching.base_root()?;
75        let index = BfsIndex::new(base_root);
76
77        Some(Diff {
78            matching,
79            index,
80            indent: std::cell::Cell::new(0),
81        })
82    }
83
84    /// Creates a diff generator with a custom index.
85    pub fn with_index(matching: &'a dyn Matching, index: BfsIndex) -> Self {
86        Diff {
87            matching,
88            index,
89            indent: std::cell::Cell::new(0),
90        }
91    }
92
93    /// Generates the diff and writes it to the given writer.
94    pub fn diff<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
95        if let Some(branch_root) = self.matching.branch_root() {
96            self.diff_from(writer, branch_root)
97        } else {
98            Err(std::io::Error::new(
99                std::io::ErrorKind::InvalidInput,
100                "No branch root",
101            ))
102        }
103    }
104
105    /// Generates diff starting from a specific branch node.
106    fn diff_from<W: Write>(&self, writer: &mut W, branch_root: &NodeRef) -> std::io::Result<()> {
107        let root_has_match = BranchNode::base_match(branch_root).is_some();
108
109        writeln!(writer, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
110
111        if root_has_match {
112            // Root matches - use copy operation
113            let stop_nodes = self.get_stop_nodes(branch_root);
114            let op = DiffOperation::root_copy();
115            self.write_op_open(writer, &op)?;
116            self.write_copy(writer, &stop_nodes)?;
117            self.write_op_close(writer, &op)?;
118        } else {
119            // Root doesn't match - insert operation
120            let op = DiffOperation::root_insert();
121            self.write_op_open(writer, &op)?;
122            self.write_insert(writer, branch_root)?;
123            self.write_op_close(writer, &op)?;
124        }
125
126        Ok(())
127    }
128
129    /// Gets stop nodes for a matched subtree.
130    fn get_stop_nodes(&self, node: &NodeRef) -> Vec<NodeRef> {
131        let mut stop_nodes = Vec::new();
132        self.matching.get_area_stop_nodes(&mut stop_nodes, node);
133        stop_nodes
134    }
135
136    /// Writes the copy operations for the given stop nodes.
137    fn write_copy<W: Write>(&self, writer: &mut W, stop_nodes: &[NodeRef]) -> std::io::Result<()> {
138        let mut seq = Sequence::new();
139
140        for stop_node in stop_nodes {
141            let dst = BranchNode::base_match(stop_node).and_then(|n| self.index.get_id(&n));
142
143            if !self.emit_child_list(writer, &mut seq, stop_node, dst, false)? {
144                // Leaf node (e.g. PI, comment, text) with no children
145                let op = DiffOperation::insert(dst);
146                self.write_op_open(writer, &op)?;
147                self.write_content_open(writer, stop_node)?;
148                self.write_content_close(writer, stop_node)?;
149                self.write_op_close(writer, &op)?;
150            }
151        }
152
153        Ok(())
154    }
155
156    /// Writes an insert operation for a branch node.
157    fn write_insert<W: Write>(&self, writer: &mut W, branch: &NodeRef) -> std::io::Result<()> {
158        let mut seq = Sequence::new();
159        self.write_content_open(writer, branch)?;
160        self.emit_child_list(writer, &mut seq, branch, None, true)?;
161        self.write_content_close(writer, branch)?;
162        Ok(())
163    }
164
165    /// Emits the child list with run-length encoding for copies.
166    fn emit_child_list<W: Write>(
167        &self,
168        writer: &mut W,
169        seq: &mut Sequence,
170        parent: &NodeRef,
171        dst: Option<u64>,
172        ins_mode: bool,
173    ) -> std::io::Result<bool> {
174        let children: Vec<_> = parent.borrow().children().to_vec();
175        let child_count = children.len();
176
177        for (ic, child) in children.iter().enumerate() {
178            let last_stop_node = ic == child_count - 1;
179            let base_match = BranchNode::base_match(child);
180
181            if let Some(ref base) = base_match {
182                // Child has a base match - potential copy
183                let child_stop_nodes = self.get_stop_nodes(child);
184                let src = self.index.get_id(base);
185
186                if let Some(src_id) = src {
187                    if child_stop_nodes.is_empty() && !last_stop_node {
188                        // Can potentially extend sequence
189                        if seq.is_empty() {
190                            seq.init(src_id, dst);
191                            continue;
192                        } else if self.sequence_appends(seq, src_id, dst) {
193                            seq.append(src_id);
194                            continue;
195                        }
196                    }
197
198                    // Did not append to sequence (or at last stop node) => emit sequence
199                    if !self.sequence_appends(seq, src_id, dst) {
200                        // Current does not append to prev seq -> output prev seq + new in separate tags
201                        if !seq.is_empty() {
202                            let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
203                            self.write_op_open(writer, &op)?;
204                            self.write_op_close(writer, &op)?;
205                        }
206
207                        if !child_stop_nodes.is_empty() || last_stop_node {
208                            let op = DiffOperation::copy(src_id, dst, 1);
209                            self.write_op_open(writer, &op)?;
210                            self.write_copy(writer, &child_stop_nodes)?;
211                            self.write_op_close(writer, &op)?;
212                            seq.set_empty();
213                        } else {
214                            seq.init(src_id, dst);
215                        }
216                    } else {
217                        // Appends to open sequence (other reason for seq break)
218                        seq.append(src_id);
219                        let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
220                        self.write_op_open(writer, &op)?;
221                        self.write_copy(writer, &child_stop_nodes)?;
222                        self.write_op_close(writer, &op)?;
223                        seq.set_empty();
224                    }
225                }
226            } else {
227                // No base match - insert
228                if !seq.is_empty() {
229                    let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
230                    self.write_op_open(writer, &op)?;
231                    self.write_op_close(writer, &op)?;
232                    seq.set_empty();
233                }
234
235                if !ins_mode {
236                    // Check if we need to open an insert tag (SHORTINS optimization)
237                    let prev_has_match =
238                        ic > 0 && BranchNode::base_match(&children[ic - 1]).is_some();
239                    if ic == 0 || prev_has_match {
240                        let op = DiffOperation::insert(dst);
241                        self.write_op_open(writer, &op)?;
242                    }
243                }
244
245                self.write_insert(writer, child)?;
246
247                if !ins_mode {
248                    // Check if we need to close the insert tag (SHORTINS optimization)
249                    let next_has_match = !last_stop_node
250                        && ic + 1 < child_count
251                        && BranchNode::base_match(&children[ic + 1]).is_some();
252                    if last_stop_node || next_has_match {
253                        let op = DiffOperation::insert(None);
254                        self.write_op_close(writer, &op)?;
255                    }
256                }
257            }
258        }
259
260        Ok(!children.is_empty())
261    }
262
263    /// Checks if a source ID appends to the current sequence.
264    fn sequence_appends(&self, seq: &Sequence, src: u64, dst: Option<u64>) -> bool {
265        if seq.is_empty() {
266            return false;
267        }
268        if dst != seq.dst {
269            return false;
270        }
271        match (seq.tail, self.index.lookup(seq.tail.unwrap())) {
272            (Some(_), Some(tail_node)) => {
273                if let Some(next_node) = self.index.lookup(src) {
274                    self.index.appends(&tail_node, &next_node)
275                } else {
276                    false
277                }
278            }
279            _ => false,
280        }
281    }
282
283    /// Writes the opening tag for a diff operation.
284    fn write_op_open<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
285        let indent_str = Self::indent_str_for(self.indent.get());
286        match op.op_type {
287            DiffOpType::RootCopy => {
288                writeln!(writer, "{}<{}>", indent_str, DIFF_ROOT_TAG)?;
289                self.indent.set(self.indent.get() + 1);
290            }
291            DiffOpType::RootInsert => {
292                writeln!(
293                    writer,
294                    "{}<{} {}=\"{}\">",
295                    indent_str, DIFF_ROOT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS
296                )?;
297                self.indent.set(self.indent.get() + 1);
298            }
299            DiffOpType::Copy => {
300                write!(writer, "{}<{}", indent_str, DIFF_COPY_TAG)?;
301                if let Some(src) = op.source {
302                    write!(writer, " {}=\"{}\"", DIFF_CPYSRC_ATTR, src)?;
303                }
304                if let Some(dst) = op.destination {
305                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
306                }
307                if let Some(run) = op.run {
308                    if run > 1 {
309                        write!(writer, " {}=\"{}\"", DIFF_CPYRUN_ATTR, run)?;
310                    }
311                }
312                writeln!(writer, ">")?;
313                self.indent.set(self.indent.get() + 1);
314            }
315            DiffOpType::Insert => {
316                write!(writer, "{}<{}", indent_str, DIFF_INSERT_TAG)?;
317                if let Some(dst) = op.destination {
318                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
319                }
320                writeln!(writer, ">")?;
321                self.indent.set(self.indent.get() + 1);
322            }
323        }
324        Ok(())
325    }
326
327    /// Writes the closing tag for a diff operation.
328    fn write_op_close<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
329        self.indent.set(self.indent.get() - 1);
330        let indent_str = Self::indent_str_for(self.indent.get());
331        match op.op_type {
332            DiffOpType::RootCopy | DiffOpType::RootInsert => {
333                writeln!(writer, "{}</{}>", indent_str, DIFF_ROOT_TAG)?;
334            }
335            DiffOpType::Copy => {
336                writeln!(writer, "{}</{}>", indent_str, DIFF_COPY_TAG)?;
337            }
338            DiffOpType::Insert => {
339                writeln!(writer, "{}</{}>", indent_str, DIFF_INSERT_TAG)?;
340            }
341        }
342        Ok(())
343    }
344
345    /// Checks if an element name conflicts with diff reserved tags.
346    fn needs_escape(qname: &str) -> bool {
347        qname == DIFF_COPY_TAG
348            || qname == DIFF_INSERT_TAG
349            || qname == DIFF_ESC_TAG
350            || qname == DIFF_ROOT_TAG
351    }
352
353    /// Writes the opening content for a node.
354    fn write_content_open<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
355        let borrowed = node.borrow();
356        match borrowed.content() {
357            Some(XmlContent::Text(text)) => {
358                // Convert char slice to String
359                let text_str: String = text.text().iter().collect();
360                write!(
361                    writer,
362                    "{}{}",
363                    Self::indent_str_for(self.indent.get()),
364                    escape_xml(&text_str)
365                )?;
366            }
367            Some(XmlContent::Comment(comment)) => {
368                // Output comment
369                let comment_text: String = comment.text().iter().collect();
370                writeln!(
371                    writer,
372                    "{}<!-- {} -->",
373                    Self::indent_str_for(self.indent.get()),
374                    comment_text
375                )?;
376            }
377            Some(XmlContent::ProcessingInstruction(pi)) => {
378                // Output processing instruction
379                if pi.content().is_empty() {
380                    writeln!(
381                        writer,
382                        "{}<?{}?>",
383                        Self::indent_str_for(self.indent.get()),
384                        pi.target()
385                    )?;
386                } else {
387                    writeln!(
388                        writer,
389                        "{}<?{} {}?>",
390                        Self::indent_str_for(self.indent.get()),
391                        pi.target(),
392                        pi.content()
393                    )?;
394                }
395            }
396            Some(XmlContent::Element(elem)) => {
397                // Check if element name conflicts with diff reserved tags
398                let needs_esc = Self::needs_escape(elem.qname());
399                if needs_esc {
400                    writeln!(
401                        writer,
402                        "{}<{}>",
403                        Self::indent_str_for(self.indent.get()),
404                        DIFF_ESC_TAG
405                    )?;
406                    self.indent.set(self.indent.get() + 1);
407                }
408
409                write!(
410                    writer,
411                    "{}<{}",
412                    Self::indent_str_for(self.indent.get()),
413                    elem.qname()
414                )?;
415                // Namespace declarations (sorted for deterministic output)
416                let mut ns_prefixes: Vec<_> = elem.namespace_decls().keys().collect();
417                ns_prefixes.sort();
418                for prefix in ns_prefixes {
419                    if let Some(uri) = elem.namespace_decls().get(prefix) {
420                        if prefix.is_empty() {
421                            write!(writer, " xmlns=\"{}\"", escape_xml_attr(uri))?;
422                        } else {
423                            write!(writer, " xmlns:{}=\"{}\"", prefix, escape_xml_attr(uri))?;
424                        }
425                    }
426                }
427                // Attributes (sorted for deterministic output)
428                let mut attr_names: Vec<_> = elem.attributes().keys().collect();
429                attr_names.sort();
430                for name in attr_names {
431                    if let Some(value) = elem.attributes().get(name) {
432                        write!(writer, " {}=\"{}\"", name, escape_xml_attr(value))?;
433                    }
434                }
435                writeln!(writer, ">")?;
436                self.indent.set(self.indent.get() + 1);
437            }
438            None => {}
439        }
440        Ok(())
441    }
442
443    /// Writes the closing content for a node.
444    fn write_content_close<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
445        let borrowed = node.borrow();
446        if let Some(XmlContent::Element(elem)) = borrowed.content() {
447            self.indent.set(self.indent.get() - 1);
448            writeln!(
449                writer,
450                "{}</{}>",
451                Self::indent_str_for(self.indent.get()),
452                elem.qname()
453            )?;
454
455            // Close escape wrapper if element name was reserved
456            if Self::needs_escape(elem.qname()) {
457                self.indent.set(self.indent.get() - 1);
458                writeln!(
459                    writer,
460                    "{}</{}>",
461                    Self::indent_str_for(self.indent.get()),
462                    DIFF_ESC_TAG
463                )?;
464            }
465        }
466        Ok(())
467    }
468
469    /// Returns the indentation string for a given level.
470    fn indent_str_for(level: usize) -> String {
471        "  ".repeat(level)
472    }
473}
474
475/// Escapes special XML characters in text content.
476fn escape_xml(s: &str) -> String {
477    s.replace('&', "&amp;")
478        .replace('<', "&lt;")
479        .replace('>', "&gt;")
480}
481
482/// Escapes special XML characters in attribute values.
483fn escape_xml_attr(s: &str) -> String {
484    s.replace('&', "&amp;")
485        .replace('<', "&lt;")
486        .replace('>', "&gt;")
487        .replace('"', "&quot;")
488}
489
490#[cfg(test)]
491mod tests {
492    use super::*;
493    use crate::matching::HeuristicMatching;
494    use crate::node::{
495        new_base_node, new_branch_node, NodeInner, XmlContent, XmlElement, XmlProcessingInstruction,
496    };
497    use std::collections::HashMap;
498
499    #[test]
500    fn test_escape_xml() {
501        assert_eq!(escape_xml("hello"), "hello");
502        assert_eq!(escape_xml("<test>"), "&lt;test&gt;");
503        assert_eq!(escape_xml("a & b"), "a &amp; b");
504    }
505
506    #[test]
507    fn test_escape_xml_attr() {
508        assert_eq!(escape_xml_attr("hello"), "hello");
509        assert_eq!(escape_xml_attr("say \"hi\""), "say &quot;hi&quot;");
510    }
511
512    #[test]
513    fn test_diff_identical_trees() {
514        // Create identical base and branch trees
515        let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
516            "root".to_string(),
517            HashMap::new(),
518        ))));
519        let base_child = new_base_node(Some(XmlContent::Element(XmlElement::new(
520            "child".to_string(),
521            HashMap::new(),
522        ))));
523        NodeInner::add_child_to_ref(&base, base_child);
524
525        let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
526            "root".to_string(),
527            HashMap::new(),
528        ))));
529        let branch_child = new_branch_node(Some(XmlContent::Element(XmlElement::new(
530            "child".to_string(),
531            HashMap::new(),
532        ))));
533        NodeInner::add_child_to_ref(&branch, branch_child);
534
535        // Create matching
536        let mut matching = HeuristicMatching::new();
537        matching.build_matching(&base, &branch);
538
539        // Generate diff
540        if let Some(diff) = Diff::new(&matching) {
541            let mut output = Vec::new();
542            diff.diff(&mut output).unwrap();
543
544            let result = String::from_utf8(output).unwrap();
545            assert!(result.contains("<diff>"));
546            assert!(result.contains("</diff>"));
547        }
548    }
549
550    #[test]
551    fn test_diff_pi_insertion() {
552        // Base: $ROOT$ -> root element (no PI)
553        let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
554            "$ROOT$".to_string(),
555            HashMap::new(),
556        ))));
557        let base_root = new_base_node(Some(XmlContent::Element(XmlElement::new(
558            "root".to_string(),
559            HashMap::new(),
560        ))));
561        NodeInner::add_child_to_ref(&base, base_root);
562
563        // Branch: $ROOT$ -> PI + root element
564        let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
565            "$ROOT$".to_string(),
566            HashMap::new(),
567        ))));
568        let pi = new_branch_node(Some(XmlContent::ProcessingInstruction(
569            XmlProcessingInstruction::new("my-pi", "some data"),
570        )));
571        NodeInner::add_child_to_ref(&branch, pi);
572        let branch_root = new_branch_node(Some(XmlContent::Element(XmlElement::new(
573            "root".to_string(),
574            HashMap::new(),
575        ))));
576        NodeInner::add_child_to_ref(&branch, branch_root);
577
578        let mut matching = HeuristicMatching::new();
579        matching.build_matching(&base, &branch);
580
581        if let Some(diff) = Diff::new(&matching) {
582            let mut output = Vec::new();
583            diff.diff(&mut output).unwrap();
584
585            let result = String::from_utf8(output).unwrap();
586            assert!(
587                result.contains("<?my-pi some data?>"),
588                "Diff output should contain PI content, got: {}",
589                result
590            );
591        }
592    }
593}