1use std::io::Write;
7
8use crate::matching::Matching;
9use crate::node::{BranchNode, NodeRef, XmlContent};
10
11use super::bfs_index::BfsIndex;
12use super::diff_operation::{DiffOpType, DiffOperation};
13use super::{
14 DIFF_COPY_TAG, DIFF_CPYDST_ATTR, DIFF_CPYRUN_ATTR, DIFF_CPYSRC_ATTR, DIFF_ESC_TAG,
15 DIFF_INSERT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS, DIFF_ROOT_TAG,
16};
17
18struct Sequence {
20 src: Option<u64>,
22 dst: Option<u64>,
24 tail: Option<u64>,
26 run: i64,
28}
29
30impl Sequence {
31 fn new() -> Self {
32 Sequence {
33 src: None,
34 dst: None,
35 tail: None,
36 run: -1,
37 }
38 }
39
40 fn set_empty(&mut self) {
41 self.run = -1;
42 }
43
44 fn is_empty(&self) -> bool {
45 self.run == -1
46 }
47
48 fn init(&mut self, src: u64, dst: Option<u64>) {
49 self.src = Some(src);
50 self.tail = Some(src);
51 self.dst = dst;
52 self.run = 1;
53 }
54
55 fn append(&mut self, src: u64) {
56 self.run += 1;
57 self.tail = Some(src);
58 }
59}
60
61pub struct Diff<'a> {
63 matching: &'a dyn Matching,
65 index: BfsIndex,
67 indent: std::cell::Cell<usize>,
69}
70
71impl<'a> Diff<'a> {
72 pub fn new(matching: &'a dyn Matching) -> Option<Self> {
74 let base_root = matching.base_root()?;
75 let index = BfsIndex::new(base_root);
76
77 Some(Diff {
78 matching,
79 index,
80 indent: std::cell::Cell::new(0),
81 })
82 }
83
84 pub fn with_index(matching: &'a dyn Matching, index: BfsIndex) -> Self {
86 Diff {
87 matching,
88 index,
89 indent: std::cell::Cell::new(0),
90 }
91 }
92
93 pub fn diff<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
95 if let Some(branch_root) = self.matching.branch_root() {
96 self.diff_from(writer, branch_root)
97 } else {
98 Err(std::io::Error::new(
99 std::io::ErrorKind::InvalidInput,
100 "No branch root",
101 ))
102 }
103 }
104
105 fn diff_from<W: Write>(&self, writer: &mut W, branch_root: &NodeRef) -> std::io::Result<()> {
107 let root_has_match = BranchNode::base_match(branch_root).is_some();
108
109 writeln!(writer, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")?;
110
111 if root_has_match {
112 let stop_nodes = self.get_stop_nodes(branch_root);
114 let op = DiffOperation::root_copy();
115 self.write_op_open(writer, &op)?;
116 self.write_copy(writer, &stop_nodes)?;
117 self.write_op_close(writer, &op)?;
118 } else {
119 let op = DiffOperation::root_insert();
121 self.write_op_open(writer, &op)?;
122 self.write_insert(writer, branch_root)?;
123 self.write_op_close(writer, &op)?;
124 }
125
126 Ok(())
127 }
128
129 fn get_stop_nodes(&self, node: &NodeRef) -> Vec<NodeRef> {
131 let mut stop_nodes = Vec::new();
132 self.matching.get_area_stop_nodes(&mut stop_nodes, node);
133 stop_nodes
134 }
135
136 fn write_copy<W: Write>(&self, writer: &mut W, stop_nodes: &[NodeRef]) -> std::io::Result<()> {
138 let mut seq = Sequence::new();
139
140 for stop_node in stop_nodes {
141 let dst = BranchNode::base_match(stop_node).and_then(|n| self.index.get_id(&n));
142
143 if !self.emit_child_list(writer, &mut seq, stop_node, dst, false)? {
144 let op = DiffOperation::insert(dst);
146 self.write_op_open(writer, &op)?;
147 self.write_content_open(writer, stop_node)?;
148 self.write_content_close(writer, stop_node)?;
149 self.write_op_close(writer, &op)?;
150 }
151 }
152
153 Ok(())
154 }
155
156 fn write_insert<W: Write>(&self, writer: &mut W, branch: &NodeRef) -> std::io::Result<()> {
158 let mut seq = Sequence::new();
159 self.write_content_open(writer, branch)?;
160 self.emit_child_list(writer, &mut seq, branch, None, true)?;
161 self.write_content_close(writer, branch)?;
162 Ok(())
163 }
164
165 fn emit_child_list<W: Write>(
167 &self,
168 writer: &mut W,
169 seq: &mut Sequence,
170 parent: &NodeRef,
171 dst: Option<u64>,
172 ins_mode: bool,
173 ) -> std::io::Result<bool> {
174 let children: Vec<_> = parent.borrow().children().to_vec();
175 let child_count = children.len();
176
177 for (ic, child) in children.iter().enumerate() {
178 let last_stop_node = ic == child_count - 1;
179 let base_match = BranchNode::base_match(child);
180
181 if let Some(ref base) = base_match {
182 let child_stop_nodes = self.get_stop_nodes(child);
184 let src = self.index.get_id(base);
185
186 if let Some(src_id) = src {
187 if child_stop_nodes.is_empty() && !last_stop_node {
188 if seq.is_empty() {
190 seq.init(src_id, dst);
191 continue;
192 } else if self.sequence_appends(seq, src_id, dst) {
193 seq.append(src_id);
194 continue;
195 }
196 }
197
198 if !self.sequence_appends(seq, src_id, dst) {
200 if !seq.is_empty() {
202 let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
203 self.write_op_open(writer, &op)?;
204 self.write_op_close(writer, &op)?;
205 }
206
207 if !child_stop_nodes.is_empty() || last_stop_node {
208 let op = DiffOperation::copy(src_id, dst, 1);
209 self.write_op_open(writer, &op)?;
210 self.write_copy(writer, &child_stop_nodes)?;
211 self.write_op_close(writer, &op)?;
212 seq.set_empty();
213 } else {
214 seq.init(src_id, dst);
215 }
216 } else {
217 seq.append(src_id);
219 let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
220 self.write_op_open(writer, &op)?;
221 self.write_copy(writer, &child_stop_nodes)?;
222 self.write_op_close(writer, &op)?;
223 seq.set_empty();
224 }
225 }
226 } else {
227 if !seq.is_empty() {
229 let op = DiffOperation::copy(seq.src.unwrap(), seq.dst, seq.run as u64);
230 self.write_op_open(writer, &op)?;
231 self.write_op_close(writer, &op)?;
232 seq.set_empty();
233 }
234
235 if !ins_mode {
236 let prev_has_match =
238 ic > 0 && BranchNode::base_match(&children[ic - 1]).is_some();
239 if ic == 0 || prev_has_match {
240 let op = DiffOperation::insert(dst);
241 self.write_op_open(writer, &op)?;
242 }
243 }
244
245 self.write_insert(writer, child)?;
246
247 if !ins_mode {
248 let next_has_match = !last_stop_node
250 && ic + 1 < child_count
251 && BranchNode::base_match(&children[ic + 1]).is_some();
252 if last_stop_node || next_has_match {
253 let op = DiffOperation::insert(None);
254 self.write_op_close(writer, &op)?;
255 }
256 }
257 }
258 }
259
260 Ok(!children.is_empty())
261 }
262
263 fn sequence_appends(&self, seq: &Sequence, src: u64, dst: Option<u64>) -> bool {
265 if seq.is_empty() {
266 return false;
267 }
268 if dst != seq.dst {
269 return false;
270 }
271 match (seq.tail, self.index.lookup(seq.tail.unwrap())) {
272 (Some(_), Some(tail_node)) => {
273 if let Some(next_node) = self.index.lookup(src) {
274 self.index.appends(&tail_node, &next_node)
275 } else {
276 false
277 }
278 }
279 _ => false,
280 }
281 }
282
283 fn write_op_open<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
285 let indent_str = Self::indent_str_for(self.indent.get());
286 match op.op_type {
287 DiffOpType::RootCopy => {
288 writeln!(writer, "{}<{}>", indent_str, DIFF_ROOT_TAG)?;
289 self.indent.set(self.indent.get() + 1);
290 }
291 DiffOpType::RootInsert => {
292 writeln!(
293 writer,
294 "{}<{} {}=\"{}\">",
295 indent_str, DIFF_ROOT_TAG, DIFF_ROOTOP_ATTR, DIFF_ROOTOP_INS
296 )?;
297 self.indent.set(self.indent.get() + 1);
298 }
299 DiffOpType::Copy => {
300 write!(writer, "{}<{}", indent_str, DIFF_COPY_TAG)?;
301 if let Some(src) = op.source {
302 write!(writer, " {}=\"{}\"", DIFF_CPYSRC_ATTR, src)?;
303 }
304 if let Some(dst) = op.destination {
305 write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
306 }
307 if let Some(run) = op.run {
308 if run > 1 {
309 write!(writer, " {}=\"{}\"", DIFF_CPYRUN_ATTR, run)?;
310 }
311 }
312 writeln!(writer, ">")?;
313 self.indent.set(self.indent.get() + 1);
314 }
315 DiffOpType::Insert => {
316 write!(writer, "{}<{}", indent_str, DIFF_INSERT_TAG)?;
317 if let Some(dst) = op.destination {
318 write!(writer, " {}=\"{}\"", DIFF_CPYDST_ATTR, dst)?;
319 }
320 writeln!(writer, ">")?;
321 self.indent.set(self.indent.get() + 1);
322 }
323 }
324 Ok(())
325 }
326
327 fn write_op_close<W: Write>(&self, writer: &mut W, op: &DiffOperation) -> std::io::Result<()> {
329 self.indent.set(self.indent.get() - 1);
330 let indent_str = Self::indent_str_for(self.indent.get());
331 match op.op_type {
332 DiffOpType::RootCopy | DiffOpType::RootInsert => {
333 writeln!(writer, "{}</{}>", indent_str, DIFF_ROOT_TAG)?;
334 }
335 DiffOpType::Copy => {
336 writeln!(writer, "{}</{}>", indent_str, DIFF_COPY_TAG)?;
337 }
338 DiffOpType::Insert => {
339 writeln!(writer, "{}</{}>", indent_str, DIFF_INSERT_TAG)?;
340 }
341 }
342 Ok(())
343 }
344
345 fn needs_escape(qname: &str) -> bool {
347 qname == DIFF_COPY_TAG || qname == DIFF_INSERT_TAG || qname == DIFF_ESC_TAG
348 }
349
350 fn write_content_open<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
352 let borrowed = node.borrow();
353 match borrowed.content() {
354 Some(XmlContent::Text(text)) => {
355 let text_str: String = text.text().iter().collect();
357 write!(
358 writer,
359 "{}{}",
360 Self::indent_str_for(self.indent.get()),
361 escape_xml(&text_str)
362 )?;
363 }
364 Some(XmlContent::Comment(comment)) => {
365 let comment_text: String = comment.text().iter().collect();
367 writeln!(
368 writer,
369 "{}<!-- {} -->",
370 Self::indent_str_for(self.indent.get()),
371 comment_text
372 )?;
373 }
374 Some(XmlContent::ProcessingInstruction(pi)) => {
375 if pi.content().is_empty() {
377 writeln!(
378 writer,
379 "{}<?{}?>",
380 Self::indent_str_for(self.indent.get()),
381 pi.target()
382 )?;
383 } else {
384 writeln!(
385 writer,
386 "{}<?{} {}?>",
387 Self::indent_str_for(self.indent.get()),
388 pi.target(),
389 pi.content()
390 )?;
391 }
392 }
393 Some(XmlContent::Element(elem)) => {
394 let needs_esc = Self::needs_escape(elem.qname());
396 if needs_esc {
397 writeln!(
398 writer,
399 "{}<{}>",
400 Self::indent_str_for(self.indent.get()),
401 DIFF_ESC_TAG
402 )?;
403 self.indent.set(self.indent.get() + 1);
404 }
405
406 write!(
407 writer,
408 "{}<{}",
409 Self::indent_str_for(self.indent.get()),
410 elem.qname()
411 )?;
412 let mut ns_prefixes: Vec<_> = elem.namespace_decls().keys().collect();
414 ns_prefixes.sort();
415 for prefix in ns_prefixes {
416 if let Some(uri) = elem.namespace_decls().get(prefix) {
417 if prefix.is_empty() {
418 write!(writer, " xmlns=\"{}\"", escape_xml_attr(uri))?;
419 } else {
420 write!(writer, " xmlns:{}=\"{}\"", prefix, escape_xml_attr(uri))?;
421 }
422 }
423 }
424 let mut attr_names: Vec<_> = elem.attributes().keys().collect();
426 attr_names.sort();
427 for name in attr_names {
428 if let Some(value) = elem.attributes().get(name) {
429 write!(writer, " {}=\"{}\"", name, escape_xml_attr(value))?;
430 }
431 }
432 writeln!(writer, ">")?;
433 self.indent.set(self.indent.get() + 1);
434 }
435 None => {}
436 }
437 Ok(())
438 }
439
440 fn write_content_close<W: Write>(&self, writer: &mut W, node: &NodeRef) -> std::io::Result<()> {
442 let borrowed = node.borrow();
443 if let Some(XmlContent::Element(elem)) = borrowed.content() {
444 self.indent.set(self.indent.get() - 1);
445 writeln!(
446 writer,
447 "{}</{}>",
448 Self::indent_str_for(self.indent.get()),
449 elem.qname()
450 )?;
451
452 if Self::needs_escape(elem.qname()) {
454 self.indent.set(self.indent.get() - 1);
455 writeln!(
456 writer,
457 "{}</{}>",
458 Self::indent_str_for(self.indent.get()),
459 DIFF_ESC_TAG
460 )?;
461 }
462 }
463 Ok(())
464 }
465
466 fn indent_str_for(level: usize) -> String {
468 " ".repeat(level)
469 }
470}
471
472fn escape_xml(s: &str) -> String {
474 s.replace('&', "&")
475 .replace('<', "<")
476 .replace('>', ">")
477}
478
479fn escape_xml_attr(s: &str) -> String {
481 s.replace('&', "&")
482 .replace('<', "<")
483 .replace('>', ">")
484 .replace('"', """)
485}
486
487#[cfg(test)]
488mod tests {
489 use super::*;
490 use crate::matching::HeuristicMatching;
491 use crate::node::{
492 new_base_node, new_branch_node, NodeInner, XmlContent, XmlElement, XmlProcessingInstruction,
493 };
494 use std::collections::HashMap;
495
496 #[test]
497 fn test_escape_xml() {
498 assert_eq!(escape_xml("hello"), "hello");
499 assert_eq!(escape_xml("<test>"), "<test>");
500 assert_eq!(escape_xml("a & b"), "a & b");
501 }
502
503 #[test]
504 fn test_escape_xml_attr() {
505 assert_eq!(escape_xml_attr("hello"), "hello");
506 assert_eq!(escape_xml_attr("say \"hi\""), "say "hi"");
507 }
508
509 #[test]
510 fn test_diff_identical_trees() {
511 let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
513 "root".to_string(),
514 HashMap::new(),
515 ))));
516 let base_child = new_base_node(Some(XmlContent::Element(XmlElement::new(
517 "child".to_string(),
518 HashMap::new(),
519 ))));
520 NodeInner::add_child_to_ref(&base, base_child);
521
522 let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
523 "root".to_string(),
524 HashMap::new(),
525 ))));
526 let branch_child = new_branch_node(Some(XmlContent::Element(XmlElement::new(
527 "child".to_string(),
528 HashMap::new(),
529 ))));
530 NodeInner::add_child_to_ref(&branch, branch_child);
531
532 let mut matching = HeuristicMatching::new();
534 matching.build_matching(&base, &branch);
535
536 if let Some(diff) = Diff::new(&matching) {
538 let mut output = Vec::new();
539 diff.diff(&mut output).unwrap();
540
541 let result = String::from_utf8(output).unwrap();
542 assert!(result.contains("<diff>"));
543 assert!(result.contains("</diff>"));
544 }
545 }
546
547 #[test]
548 fn test_diff_pi_insertion() {
549 let base = new_base_node(Some(XmlContent::Element(XmlElement::new(
551 "$ROOT$".to_string(),
552 HashMap::new(),
553 ))));
554 let base_root = new_base_node(Some(XmlContent::Element(XmlElement::new(
555 "root".to_string(),
556 HashMap::new(),
557 ))));
558 NodeInner::add_child_to_ref(&base, base_root);
559
560 let branch = new_branch_node(Some(XmlContent::Element(XmlElement::new(
562 "$ROOT$".to_string(),
563 HashMap::new(),
564 ))));
565 let pi = new_branch_node(Some(XmlContent::ProcessingInstruction(
566 XmlProcessingInstruction::new("my-pi", "some data"),
567 )));
568 NodeInner::add_child_to_ref(&branch, pi);
569 let branch_root = new_branch_node(Some(XmlContent::Element(XmlElement::new(
570 "root".to_string(),
571 HashMap::new(),
572 ))));
573 NodeInner::add_child_to_ref(&branch, branch_root);
574
575 let mut matching = HeuristicMatching::new();
576 matching.build_matching(&base, &branch);
577
578 if let Some(diff) = Diff::new(&matching) {
579 let mut output = Vec::new();
580 diff.diff(&mut output).unwrap();
581
582 let result = String::from_utf8(output).unwrap();
583 assert!(
584 result.contains("<?my-pi some data?>"),
585 "Diff output should contain PI content, got: {}",
586 result
587 );
588 }
589 }
590}