Skip to main content

xml_3dm/diff/
generator.rs

1//! Diff generation algorithm.
2//!
3//! Produces an XML diff document that encodes the transformations needed
4//! to convert the base tree into the branch tree.
5
6use std::io::Write;
7
8use crate::matching::Matching;
9use crate::node::{BranchNode, NodeRef, XmlContent};
10
11use super::bfs_index::BfsIndex;
12use super::diff_operation::{DiffOpType, DiffOperation};
13use super::{
14    DIFF_COPY_TAG, DIFF_CPYDST_ATTR, DIFF_CPYRUN_ATTR, DIFF_CPYSRC_ATTR, DIFF_ESC_TAG,
15    DIFF_INSERT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS, DIFF_ROOT_TAG,
16};
17
18/// Sequence for run-length encoding of consecutive copies.
19struct Sequence {
20    /// Source node ID for start of sequence.
21    src: Option<u64>,
22    /// Destination node ID.
23    dst: Option<u64>,
24    /// Tail node ID (last in sequence).
25    tail: Option<u64>,
26    /// Number of consecutive nodes.
27    run: i64,
28}
29
30impl Sequence {
31    fn new() -> Self {
32        Sequence {
33            src: None,
34            dst: None,
35            tail: None,
36            run: -1,
37        }
38    }
39
40    fn set_empty(&mut self) {
41        self.run = -1;
42    }
43
44    fn is_empty(&self) -> bool {
45        self.run == -1
46    }
47
48    fn init(&mut self, src: u64, dst: Option<u64>) {
49        self.src = Some(src);
50        self.tail = Some(src);
51        self.dst = dst;
52        self.run = 1;
53    }
54
55    fn append(&mut self, src: u64) {
56        self.run += 1;
57        self.tail = Some(src);
58    }
59}
60
61/// Diff generator for matched trees.
62pub struct Diff<'a> {
63    /// The matching between base and branch trees.
64    matching: &'a dyn Matching,
65    /// BFS index for the base tree.
66    index: BfsIndex,
67    /// Current indentation level for formatted output.
68    indent: std::cell::Cell<usize>,
69}
70
71impl<'a> Diff<'a> {
72    /// Creates a new diff generator.
73    pub fn new(matching: &'a dyn Matching) -> Option<Self> {
74        let base_root = matching.base_root()?;
75        let index = BfsIndex::new(base_root);
76
77        Some(Diff {
78            matching,
79            index,
80            indent: std::cell::Cell::new(0),
81        })
82    }
83
84    /// Creates a diff generator with a custom index.
85    pub fn with_index(matching: &'a dyn Matching, index: BfsIndex) -> Self {
86        Diff {
87            matching,
88            index,
89            indent: std::cell::Cell::new(0),
90        }
91    }
92
93    /// Generates the diff and writes it to the given writer.
94    pub fn diff<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
95        if let Some(branch_root) = self.matching.branch_root() {
96            self.diff_from(writer, branch_root)
97        } else {
98            Err(std::io::Error::new(
99                std::io::ErrorKind::InvalidInput,
100                "No branch root",
101            ))
102        }
103    }
104
105    /// Generates diff starting from a specific branch node.
106    fn diff_from<W: Write>(&self, writer: &mut W, branch_root: &NodeRef) -> std::io::Result<()> {
107        let root_has_match = BranchNode::base_match(branch_root).is_some();
108
109        writeln!(writer, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
110
111        if root_has_match {
112            // Root matches - use copy operation
113            let stop_nodes = self.get_stop_nodes(branch_root);
114            let op = DiffOperation::root_copy();
115            self.write_op_open(writer, &op)?;
116            self.write_copy(writer, &stop_nodes)?;
117            self.write_op_close(writer, &op)?;
118        } else {
119            // Root doesn't match - insert operation
120            let op = DiffOperation::root_insert();
121            self.write_op_open(writer, &op)?;
122            self.write_insert(writer, branch_root)?;
123            self.write_op_close(writer, &op)?;
124        }
125
126        Ok(())
127    }
128
129    /// Gets stop nodes for a matched subtree.
130    fn get_stop_nodes(&self, node: &NodeRef) -> Vec<NodeRef> {
131        let mut stop_nodes = Vec::new();
132        self.matching.get_area_stop_nodes(&mut stop_nodes, node);
133        stop_nodes
134    }
135
136    /// Writes the copy operations for the given stop nodes.
137    fn write_copy<W: Write>(&self, writer: &mut W, stop_nodes: &[NodeRef]) -> std::io::Result<()> {
138        let mut seq = Sequence::new();
139
140        for stop_node in stop_nodes {
141            let dst = BranchNode::base_match(stop_node).and_then(|n| self.index.get_id(&n));
142
143            if !self.emit_child_list(writer, &mut seq, stop_node, dst, false)? {
144                // Leaf node (e.g. PI, comment, text) with no children
145                let op = DiffOperation::insert(dst);
146                self.write_op_open(writer, &op)?;
147                self.write_content_open(writer, stop_node)?;
148                self.write_content_close(writer, stop_node)?;
149                self.write_op_close(writer, &op)?;
150            }
151        }
152
153        Ok(())
154    }
155
156    /// Writes an insert operation for a branch node.
157    fn write_insert<W: Write>(&self, writer: &mut W, branch: &NodeRef) -> std::io::Result<()> {
158        let mut seq = Sequence::new();
159        self.write_content_open(writer, branch)?;
160        self.emit_child_list(writer, &mut seq, branch, None, true)?;
161        self.write_content_close(writer, branch)?;
162        Ok(())
163    }
164
165    /// Emits the child list with run-length encoding for copies.
166    fn emit_child_list<W: Write>(
167        &self,
168        writer: &mut W,
169        seq: &mut Sequence,
170        parent: &NodeRef,
171        dst: Option<u64>,
172        ins_mode: bool,
173    ) -> std::io::Result<bool> {
174        let children: Vec<_> = parent.borrow().children().to_vec();
175        let child_count = children.len();
176
177        for (ic, child) in children.iter().enumerate() {
178            let last_stop_node = ic == child_count - 1;
179            let base_match = BranchNode::base_match(child);
180
181            if let Some(ref base) = base_match {
182                // Child has a base match - potential copy
183                let child_stop_nodes = self.get_stop_nodes(child);
184                let src = self.index.get_id(base);
185
186                if let Some(src_id) = src {
187                    if child_stop_nodes.is_empty() && !last_stop_node {
188                        // Can potentially extend sequence
189                        if seq.is_empty() {
190                            seq.init(src_id, dst);
191                            continue;
192                        } else if self.sequence_appends(seq, src_id, dst) {
193                            seq.append(src_id);
194                            continue;
195                        }
196                    }
197
198                    // Did not append to sequence (or at last stop node) => emit sequence
199                    if !self.sequence_appends(seq, src_id, dst) {
200                        // Current does not append to prev seq -> output prev seq + new in separate tags
201                        if !seq.is_empty() {
202                            let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
203                            self.write_op_open(writer, &op)?;
204                            self.write_op_close(writer, &op)?;
205                        }
206
207                        if !child_stop_nodes.is_empty() || last_stop_node {
208                            let op = DiffOperation::copy(src_id, dst, 1);
209                            self.write_op_open(writer, &op)?;
210                            self.write_copy(writer, &child_stop_nodes)?;
211                            self.write_op_close(writer, &op)?;
212                            seq.set_empty();
213                        } else {
214                            seq.init(src_id, dst);
215                        }
216                    } else {
217                        // Appends to open sequence (other reason for seq break)
218                        seq.append(src_id);
219                        let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
220                        self.write_op_open(writer, &op)?;
221                        self.write_copy(writer, &child_stop_nodes)?;
222                        self.write_op_close(writer, &op)?;
223                        seq.set_empty();
224                    }
225                }
226            } else {
227                // No base match - insert
228                if !seq.is_empty() {
229                    let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
230                    self.write_op_open(writer, &op)?;
231                    self.write_op_close(writer, &op)?;
232                    seq.set_empty();
233                }
234
235                if !ins_mode {
236                    // Check if we need to open an insert tag (SHORTINS optimization)
237                    let prev_has_match =
238                        ic > 0 && BranchNode::base_match(&children[ic - 1]).is_some();
239                    if ic == 0 || prev_has_match {
240                        let op = DiffOperation::insert(dst);
241                        self.write_op_open(writer, &op)?;
242                    }
243                }
244
245                self.write_insert(writer, child)?;
246
247                if !ins_mode {
248                    // Check if we need to close the insert tag (SHORTINS optimization)
249                    let next_has_match = !last_stop_node
250                        && ic + 1 < child_count
251                        && BranchNode::base_match(&children[ic + 1]).is_some();
252                    if last_stop_node || next_has_match {
253                        let op = DiffOperation::insert(None);
254                        self.write_op_close(writer, &op)?;
255                    }
256                }
257            }
258        }
259
260        Ok(!children.is_empty())
261    }
262
263    /// Checks if a source ID appends to the current sequence.
264    fn sequence_appends(&self, seq: &Sequence, src: u64, dst: Option<u64>) -> bool {
265        if seq.is_empty() {
266            return false;
267        }
268        if dst != seq.dst {
269            return false;
270        }
271        match (seq.tail, self.index.lookup(seq.tail.unwrap())) {
272            (Some(_), Some(tail_node)) => {
273                if let Some(next_node) = self.index.lookup(src) {
274                    self.index.appends(&tail_node, &next_node)
275                } else {
276                    false
277                }
278            }
279            _ => false,
280        }
281    }
282
283    /// Writes the opening tag for a diff operation.
284    fn write_op_open<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
285        let indent_str = Self::indent_str_for(self.indent.get());
286        match op.op_type {
287            DiffOpType::RootCopy => {
288                writeln!(writer, "{}<{}>", indent_str, DIFF_ROOT_TAG)?;
289                self.indent.set(self.indent.get() + 1);
290            }
291            DiffOpType::RootInsert => {
292                writeln!(
293                    writer,
294                    "{}<{} {}=\"{}\">",
295                    indent_str, DIFF_ROOT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS
296                )?;
297                self.indent.set(self.indent.get() + 1);
298            }
299            DiffOpType::Copy => {
300                write!(writer, "{}<{}", indent_str, DIFF_COPY_TAG)?;
301                if let Some(src) = op.source {
302                    write!(writer, " {}=\"{}\"", DIFF_CPYSRC_ATTR, src)?;
303                }
304                if let Some(dst) = op.destination {
305                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
306                }
307                if let Some(run) = op.run {
308                    if run > 1 {
309                        write!(writer, " {}=\"{}\"", DIFF_CPYRUN_ATTR, run)?;
310                    }
311                }
312                writeln!(writer, ">")?;
313                self.indent.set(self.indent.get() + 1);
314            }
315            DiffOpType::Insert => {
316                write!(writer, "{}<{}", indent_str, DIFF_INSERT_TAG)?;
317                if let Some(dst) = op.destination {
318                    write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
319                }
320                writeln!(writer, ">")?;
321                self.indent.set(self.indent.get() + 1);
322            }
323        }
324        Ok(())
325    }
326
327    /// Writes the closing tag for a diff operation.
328    fn write_op_close<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
329        self.indent.set(self.indent.get() - 1);
330        let indent_str = Self::indent_str_for(self.indent.get());
331        match op.op_type {
332            DiffOpType::RootCopy | DiffOpType::RootInsert => {
333                writeln!(writer, "{}</{}>", indent_str, DIFF_ROOT_TAG)?;
334            }
335            DiffOpType::Copy => {
336                writeln!(writer, "{}</{}>", indent_str, DIFF_COPY_TAG)?;
337            }
338            DiffOpType::Insert => {
339                writeln!(writer, "{}</{}>", indent_str, DIFF_INSERT_TAG)?;
340            }
341        }
342        Ok(())
343    }
344
345    /// Checks if an element name conflicts with diff reserved tags.
346    fn needs_escape(qname: &str) -> bool {
347        qname == DIFF_COPY_TAG || qname == DIFF_INSERT_TAG || qname == DIFF_ESC_TAG
348    }
349
350    /// Writes the opening content for a node.
351    fn write_content_open<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
352        let borrowed = node.borrow();
353        match borrowed.content() {
354            Some(XmlContent::Text(text)) => {
355                // Convert char slice to String
356                let text_str: String = text.text().iter().collect();
357                write!(
358                    writer,
359                    "{}{}",
360                    Self::indent_str_for(self.indent.get()),
361                    escape_xml(&text_str)
362                )?;
363            }
364            Some(XmlContent::Comment(comment)) => {
365                // Output comment
366                let comment_text: String = comment.text().iter().collect();
367                writeln!(
368                    writer,
369                    "{}<!-- {} -->",
370                    Self::indent_str_for(self.indent.get()),
371                    comment_text
372                )?;
373            }
374            Some(XmlContent::ProcessingInstruction(pi)) => {
375                // Output processing instruction
376                if pi.content().is_empty() {
377                    writeln!(
378                        writer,
379                        "{}<?{}?>",
380                        Self::indent_str_for(self.indent.get()),
381                        pi.target()
382                    )?;
383                } else {
384                    writeln!(
385                        writer,
386                        "{}<?{} {}?>",
387                        Self::indent_str_for(self.indent.get()),
388                        pi.target(),
389                        pi.content()
390                    )?;
391                }
392            }
393            Some(XmlContent::Element(elem)) => {
394                // Check if element name conflicts with diff reserved tags
395                let needs_esc = Self::needs_escape(elem.qname());
396                if needs_esc {
397                    writeln!(
398                        writer,
399                        "{}<{}>",
400                        Self::indent_str_for(self.indent.get()),
401                        DIFF_ESC_TAG
402                    )?;
403                    self.indent.set(self.indent.get() + 1);
404                }
405
406                write!(
407                    writer,
408                    "{}<{}",
409                    Self::indent_str_for(self.indent.get()),
410                    elem.qname()
411                )?;
412                // Namespace declarations (sorted for deterministic output)
413                let mut ns_prefixes: Vec<_> = elem.namespace_decls().keys().collect();
414                ns_prefixes.sort();
415                for prefix in ns_prefixes {
416                    if let Some(uri) = elem.namespace_decls().get(prefix) {
417                        if prefix.is_empty() {
418                            write!(writer, " xmlns=\"{}\"", escape_xml_attr(uri))?;
419                        } else {
420                            write!(writer, " xmlns:{}=\"{}\"", prefix, escape_xml_attr(uri))?;
421                        }
422                    }
423                }
424                // Attributes (sorted for deterministic output)
425                let mut attr_names: Vec<_> = elem.attributes().keys().collect();
426                attr_names.sort();
427                for name in attr_names {
428                    if let Some(value) = elem.attributes().get(name) {
429                        write!(writer, " {}=\"{}\"", name, escape_xml_attr(value))?;
430                    }
431                }
432                writeln!(writer, ">")?;
433                self.indent.set(self.indent.get() + 1);
434            }
435            None => {}
436        }
437        Ok(())
438    }
439
440    /// Writes the closing content for a node.
441    fn write_content_close<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
442        let borrowed = node.borrow();
443        if let Some(XmlContent::Element(elem)) = borrowed.content() {
444            self.indent.set(self.indent.get() - 1);
445            writeln!(
446                writer,
447                "{}</{}>",
448                Self::indent_str_for(self.indent.get()),
449                elem.qname()
450            )?;
451
452            // Close escape wrapper if element name was reserved
453            if Self::needs_escape(elem.qname()) {
454                self.indent.set(self.indent.get() - 1);
455                writeln!(
456                    writer,
457                    "{}</{}>",
458                    Self::indent_str_for(self.indent.get()),
459                    DIFF_ESC_TAG
460                )?;
461            }
462        }
463        Ok(())
464    }
465
466    /// Returns the indentation string for a given level.
467    fn indent_str_for(level: usize) -> String {
468        "  ".repeat(level)
469    }
470}
471
472/// Escapes special XML characters in text content.
473fn escape_xml(s: &str) -> String {
474    s.replace('&', "&amp;")
475        .replace('<', "&lt;")
476        .replace('>', "&gt;")
477}
478
479/// Escapes special XML characters in attribute values.
480fn escape_xml_attr(s: &str) -> String {
481    s.replace('&', "&amp;")
482        .replace('<', "&lt;")
483        .replace('>', "&gt;")
484        .replace('"', "&quot;")
485}
486
487#[cfg(test)]
488mod tests {
489    use super::*;
490    use crate::matching::HeuristicMatching;
491    use crate::node::{
492        new_base_node, new_branch_node, NodeInner, XmlContent, XmlElement, XmlProcessingInstruction,
493    };
494    use std::collections::HashMap;
495
496    #[test]
497    fn test_escape_xml() {
498        assert_eq!(escape_xml("hello"), "hello");
499        assert_eq!(escape_xml("<test>"), "&lt;test&gt;");
500        assert_eq!(escape_xml("a & b"), "a &amp; b");
501    }
502
503    #[test]
504    fn test_escape_xml_attr() {
505        assert_eq!(escape_xml_attr("hello"), "hello");
506        assert_eq!(escape_xml_attr("say \"hi\""), "say &quot;hi&quot;");
507    }
508
509    #[test]
510    fn test_diff_identical_trees() {
511        // Create identical base and branch trees
512        let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
513            "root".to_string(),
514            HashMap::new(),
515        ))));
516        let base_child = new_base_node(Some(XmlContent::Element(XmlElement::new(
517            "child".to_string(),
518            HashMap::new(),
519        ))));
520        NodeInner::add_child_to_ref(&base, base_child);
521
522        let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
523            "root".to_string(),
524            HashMap::new(),
525        ))));
526        let branch_child = new_branch_node(Some(XmlContent::Element(XmlElement::new(
527            "child".to_string(),
528            HashMap::new(),
529        ))));
530        NodeInner::add_child_to_ref(&branch, branch_child);
531
532        // Create matching
533        let mut matching = HeuristicMatching::new();
534        matching.build_matching(&base, &branch);
535
536        // Generate diff
537        if let Some(diff) = Diff::new(&matching) {
538            let mut output = Vec::new();
539            diff.diff(&mut output).unwrap();
540
541            let result = String::from_utf8(output).unwrap();
542            assert!(result.contains("<diff>"));
543            assert!(result.contains("</diff>"));
544        }
545    }
546
547    #[test]
548    fn test_diff_pi_insertion() {
549        // Base: $ROOT$ -> root element (no PI)
550        let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
551            "$ROOT$".to_string(),
552            HashMap::new(),
553        ))));
554        let base_root = new_base_node(Some(XmlContent::Element(XmlElement::new(
555            "root".to_string(),
556            HashMap::new(),
557        ))));
558        NodeInner::add_child_to_ref(&base, base_root);
559
560        // Branch: $ROOT$ -> PI + root element
561        let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
562            "$ROOT$".to_string(),
563            HashMap::new(),
564        ))));
565        let pi = new_branch_node(Some(XmlContent::ProcessingInstruction(
566            XmlProcessingInstruction::new("my-pi", "some data"),
567        )));
568        NodeInner::add_child_to_ref(&branch, pi);
569        let branch_root = new_branch_node(Some(XmlContent::Element(XmlElement::new(
570            "root".to_string(),
571            HashMap::new(),
572        ))));
573        NodeInner::add_child_to_ref(&branch, branch_root);
574
575        let mut matching = HeuristicMatching::new();
576        matching.build_matching(&base, &branch);
577
578        if let Some(diff) = Diff::new(&matching) {
579            let mut output = Vec::new();
580            diff.diff(&mut output).unwrap();
581
582            let result = String::from_utf8(output).unwrap();
583            assert!(
584                result.contains("<?my-pi some data?>"),
585                "Diff output should contain PI content, got: {}",
586                result
587            );
588        }
589    }
590}