natural_xml_diff/
edits.rs

1use dissimilar::{diff, Chunk};
2use xot::{Element, NameId, Node, Text, Value, Xot};
3
4use crate::comparison::Comparison;
5use crate::vtree::{Status, Vtree};
6
7/// An edit describes a change to an XML document.
8/// Nodes are addressed with `usize`, which is an index
9/// into the complete descendants in the tree (including the root node 0)
10/// in document order (pre-order). The indexing includes non-element
11/// nodes.
12#[derive(Debug, PartialEq, Eq)]
13pub enum Edit {
14    /// Insert content at a given positionf
15    Insert(InsertPosition, InsertContent),
16    /// Delete content at a given position. The usize is the
17    /// index of the node to delete.
18    Delete(usize),
19    /// Update text content of a node. The precise changes are
20    /// described by the vector.
21    TextUpdate(usize, Vec<TextChange>),
22    /// Update attributes of a node. The precise changes are
23    /// described by the vector.
24    AttributeUpdate(usize, Vec<AttributeChange>),
25}
26
27/// The position where content is inserted.
28#[derive(Debug, PartialEq, Eq)]
29pub struct InsertPosition {
30    /// The index of the parent node.
31    pub parent_node_id: usize,
32    /// Child node osition where to insert in the parent. Existing
33    /// nodes at this position and after are shifted to the right.
34    pub child_position: u32,
35    /// The number of intervening descendant nodes the parent node has
36    /// This is used to help sort deletes before any inserts into that
37    /// parent happen
38    pub descendant_count: u32,
39}
40
41/// What to insert
42#[derive(Debug, PartialEq, Eq)]
43pub enum InsertContent {
44    /// Insert a copy of this XML node as given by a [`xot`] node reference into
45    /// document B.
46    XmlNode(Node),
47    /// Text content to insert.
48    Text(String),
49}
50
51/// Describe how a text node should be updated.
52#[derive(Debug, PartialEq, Eq)]
53pub enum TextChange {
54    ///This text is equal in both documents.
55    Equal(String),
56    /// This text was deleted.
57    Delete(String),
58    /// This text was inserted.
59    Insert(String),
60}
61
62/// Describe how a element node's attributes should be updated.
63#[derive(Debug, PartialEq, Eq)]
64pub enum AttributeChange {
65    /// Update an attribute with a new value.
66    Update(NameId, String),
67    /// Delete an attribute.
68    Delete(NameId),
69    /// Insert an attribute.
70    Insert(NameId, String),
71}
72
73impl Comparison {
74    /// Returns a list of edits that transform document A into document B.
75    ///
76    /// The edits can be applied to the document A to obtain the diff document,
77    /// by using `apply_edits`.
78    ///
79    /// This is a low-level API to see the details of the edit operations.
80    /// Normally you'd use the higher level `Comparison::diff` method to
81    /// obtain the diff document directly.
82    pub(crate) fn edits(&mut self, xot: &mut Xot) -> Vec<Edit> {
83        // first execute the diff and update the status information
84        // in the vnodes
85        self.diff_status(xot);
86
87        let mut edits = Vec::new();
88        for delete_id in self.deletes() {
89            edits.push(Edit::Delete(delete_id));
90        }
91        for (node_id, parent_node_id, (child_position, intervening)) in self.inserts() {
92            let insert_node = self.get_node_b(node_id);
93            let content = match xot.value(insert_node) {
94                Value::Element(_) => InsertContent::XmlNode(insert_node),
95                Value::Text(text) => {
96                    let text = text.get();
97                    InsertContent::Text(text.to_owned())
98                }
99                Value::ProcessingInstruction(_) => InsertContent::XmlNode(insert_node),
100                Value::Comment(_) => InsertContent::XmlNode(insert_node),
101                Value::Root => {
102                    panic!("Unsupported root node type");
103                }
104            };
105            edits.push(Edit::Insert(
106                InsertPosition {
107                    parent_node_id,
108                    child_position,
109                    descendant_count: intervening,
110                },
111                content,
112            ));
113        }
114
115        for (node_id_a, node_id_b) in self.updates() {
116            let node_a = self.get_node_a(node_id_a);
117            let node_b = self.get_node_b(node_id_b);
118            let value_pair = (xot.value(node_a), xot.value(node_b));
119            match value_pair {
120                (Value::Text(text_a), Value::Text(text_b)) => {
121                    edits.push(Edit::TextUpdate(
122                        node_id_a,
123                        get_text_changes(text_a, text_b),
124                    ));
125                }
126                (Value::Element(element_a), Value::Element(element_b)) => {
127                    edits.push(Edit::AttributeUpdate(
128                        node_id_a,
129                        get_attribute_changes(element_a, element_b),
130                    ));
131                }
132                (Value::Root, Value::Root) => {
133                    // nothing to do for root
134                }
135                _ => {
136                    panic!("Unsupported node values for update");
137                }
138            }
139        }
140
141        edits.sort_by_key(|edit| match edit {
142            Edit::Delete(delete_id) => (*delete_id, 0),
143            Edit::TextUpdate(update_id, _changes) => (*update_id, 0),
144            Edit::AttributeUpdate(update_id, _changes) => (*update_id, 0),
145            // sort inserts after deletes that can affect them
146            // if an insert is for the same node as one that is deleted, delete
147            // must come first
148            Edit::Insert(position, _content) => (
149                position.parent_node_id + position.descendant_count as usize,
150                1,
151            ),
152        });
153        edits
154    }
155
156    // deletes are node ids to be deleted in tree a
157    fn deletes(&self) -> impl Iterator<Item = usize> + '_ {
158        let vtree_a = &self.vtree_a;
159        vtree_a
160            .nodes
161            .iter()
162            .enumerate()
163            .filter(|(_, vnode)| match vnode.status {
164                Status::Equal(_id_b) => false,
165                Status::Update(_id_b) => false,
166                Status::Different => true,
167            })
168            // we don't want to delete any nodes that have a parent that is
169            // different; their parent already is deleted
170            .filter(|(_, vnode)| {
171                let parent_id = vnode.parent_id;
172                if let Some(parent_id) = parent_id {
173                    let parent = &vtree_a.nodes[parent_id];
174                    match parent.status {
175                        Status::Equal(_id_b) => true,
176                        Status::Update(_id_b) => true,
177                        Status::Different => false,
178                    }
179                } else {
180                    // never delete the root node
181                    false
182                }
183            })
184            .map(|(id, _)| id)
185    }
186
187    // insert is the node from tree b to insert, the node in tree a to
188    // insert under, and the position in the children where to insert
189    fn inserts(&self) -> impl Iterator<Item = (usize, usize, (u32, u32))> + '_ {
190        let vtree_b = &self.vtree_b;
191        vtree_b
192            .nodes
193            .iter()
194            .enumerate()
195            .filter(|(_, vnode)| match vnode.status {
196                Status::Equal(_id_a) => false,
197                Status::Update(_id_a) => false,
198                Status::Different => true,
199            })
200            // we don't want to insert any nodes that have a parent that is
201            // different; their parent already is inserted
202            // we also want to determine the insertion parent's id in tree a, and
203            // the position if its child
204            .filter_map(|(id, vnode)| {
205                let parent_id = vnode.parent_id;
206                if let Some(parent_id) = parent_id {
207                    let vnode_parent = &self.vtree_b.nodes[parent_id];
208                    match vnode_parent.status {
209                        Status::Equal(id_a) | Status::Update(id_a) => Some((
210                            id,
211                            id_a as usize,
212                            get_child_position(&self.vtree_b, parent_id, id),
213                        )),
214                        Status::Different => None,
215                    }
216                } else {
217                    // never insert the root node
218                    None
219                }
220            })
221    }
222
223    fn updates(&self) -> impl Iterator<Item = (usize, usize)> + '_ {
224        let vtree_a = &self.vtree_a;
225        vtree_a
226            .nodes
227            .iter()
228            .enumerate()
229            .filter_map(|(index, node)| match node.status {
230                Status::Equal(_id_b) => None,
231                Status::Update(id_b) => Some((index, id_b as usize)),
232                Status::Different => None,
233            })
234    }
235}
236
237fn get_child_position(vtree: &Vtree, parent: usize, child: usize) -> (u32, u32) {
238    let parent_vnode = &vtree.nodes[parent];
239    let child_vnode = &vtree.nodes[child];
240    for (i, candidate_child) in vtree.children(parent_vnode).enumerate() {
241        if std::ptr::eq(candidate_child, child_vnode) {
242            return (i as u32, parent_vnode.descendant_count);
243        }
244    }
245    unreachable!("child not found in parent");
246}
247
248fn get_text_changes(text_a: &Text, text_b: &Text) -> Vec<TextChange> {
249    diff(text_a.get(), text_b.get())
250        .iter()
251        .map(|chunk| match chunk {
252            Chunk::Equal(text) => TextChange::Equal(text.to_string()),
253            Chunk::Delete(text) => TextChange::Delete(text.to_string()),
254            Chunk::Insert(text) => TextChange::Insert(text.to_string()),
255        })
256        .collect()
257}
258
259fn get_attribute_changes(element_a: &Element, element_b: &Element) -> Vec<AttributeChange> {
260    let mut changes = Vec::new();
261    for (name, value) in element_a.attributes().iter() {
262        if let Some(value_b) = element_b.attributes().get(name) {
263            if value != value_b {
264                changes.push(AttributeChange::Update(*name, value_b.clone()));
265            }
266        } else {
267            changes.push(AttributeChange::Delete(*name));
268        }
269    }
270    for (name, value) in element_b.attributes().iter() {
271        if !element_a.attributes().contains_key(name) {
272            changes.push(AttributeChange::Insert(*name, value.clone()));
273        }
274    }
275    changes
276}
277
278#[cfg(test)]
279mod tests {
280    use super::*;
281    use xot::{ValueType, Xot};
282
283    #[test]
284    fn test_deletes() {
285        let mut xot = Xot::new();
286        // example taken from the jndiff paper, page 11
287        let doc_a = xot
288            .parse(concat!(
289                "<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
290                "<chapter><title>Text 4</title><para>Text 5</para></chapter>",
291                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
292                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
293                "<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
294            ))
295            .unwrap();
296        let doc_b = xot
297            .parse(concat!(
298                "<book><chapter><para>Text 2</para></chapter>",
299                "<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
300                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
301                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
302                "<chapter><para>Text 12</para></chapter></book>"
303            ))
304            .unwrap();
305
306        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
307        comparison.diff_status(&xot);
308        let deletes = comparison.deletes().collect::<Vec<_>>();
309        assert_eq!(deletes, vec![3, 25]);
310    }
311
312    #[test]
313    fn test_inserts() {
314        let mut xot = Xot::new();
315        // example taken from the jndiff paper, page 11
316        let doc_a = xot
317            .parse(concat!(
318                "<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
319                "<chapter><title>Text 4</title><para>Text 5</para></chapter>",
320                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
321                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
322                "<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
323            ))
324            .unwrap();
325        let doc_b = xot
326            .parse(concat!(
327                "<book><chapter><para>Text 2</para></chapter>",
328                "<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
329                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
330                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
331                "<chapter><para>Text 12</para></chapter></book>"
332            ))
333            .unwrap();
334
335        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
336        comparison.diff_status(&xot);
337        let inserts = comparison.inserts().collect::<Vec<_>>();
338        assert_eq!(inserts, vec![(10, 7, (2, 6))]);
339    }
340
341    #[test]
342    fn test_updates() {
343        let mut xot = Xot::new();
344        // example taken from the jndiff paper, page 11
345        let doc_a = xot
346            .parse(concat!(
347                "<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
348                "<chapter><title>Text 4</title><para>Text 5</para></chapter>",
349                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
350                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
351                "<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
352            ))
353            .unwrap();
354        let doc_b = xot
355            .parse(concat!(
356                "<book><chapter><para>Text 2</para></chapter>",
357                "<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
358                "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
359                "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
360                "<chapter><para>Text 12</para></chapter></book>"
361            ))
362            .unwrap();
363
364        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
365        comparison.diff_status(&xot);
366        let updates = comparison.updates().collect::<Vec<_>>();
367        assert_eq!(updates, vec![(0, 0), (11, 9)]);
368    }
369
370    #[test]
371    fn test_edits() {
372        let mut xot = Xot::new();
373        // example taken from the jndiff paper, page 11
374        let xml_a = concat!(
375            "<book><chapter><title>Text 1</title><para>Text 2</para></chapter>",
376            "<chapter><title>Text 4</title><para>Text 5</para></chapter>",
377            "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
378            "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
379            "<chapter><para>Text 11</para><para>Text 12</para></chapter></book>"
380        );
381        let xml_b = concat!(
382            "<book><chapter><para>Text 2</para></chapter>",
383            "<chapter><title>Text 4</title><para>Text 25</para><para>Text 11</para></chapter>",
384            "<chapter><title>Text 6</title><para>Text 7<img/>Text 8</para></chapter>",
385            "<chapter><title>Text 9</title><para>Text 10</para></chapter>",
386            "<chapter><para>Text 12</para></chapter></book>"
387        );
388        let doc_a = xot.parse(xml_a).unwrap();
389        let doc_b = xot.parse(xml_b).unwrap();
390
391        let inserted_node = xot
392            .descendants(doc_b)
393            .find(|node| xot.text_content_str(*node) == Some("Text 11"))
394            .unwrap();
395
396        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
397        let edits = comparison.edits(&mut xot);
398        assert_eq!(
399            edits,
400            vec![
401                Edit::Delete(3),
402                Edit::TextUpdate(
403                    11,
404                    vec![
405                        (TextChange::Equal("Text ".to_string())),
406                        (TextChange::Insert("2".to_string())),
407                        (TextChange::Equal("5".to_string()))
408                    ]
409                ),
410                Edit::Insert(
411                    InsertPosition {
412                        parent_node_id: 7,
413                        child_position: 2,
414                        descendant_count: 6
415                    },
416                    InsertContent::XmlNode(inserted_node)
417                ),
418                Edit::Delete(25)
419            ]
420        );
421    }
422
423    #[test]
424    fn test_edits_problematic() {
425        let mut xot = Xot::new();
426
427        // a particularly problematic case which has whitespace in it
428        let xml_a = r#"<section>
429        <title>T</title>
430        <para>P</para>
431
432        <section>First longer</section>
433
434        <section>Second</section>
435    </section>"#;
436        let xml_b = r#"<section>
437        <title>T</title>
438        <para>P</para>
439
440        <section>Second</section>
441
442        <section>First longer</section>
443    </section>"#;
444
445        let doc_a = xot.parse(xml_a).unwrap();
446        let doc_b = xot.parse(xml_b).unwrap();
447
448        let inserted_node = xot
449            .descendants(doc_b)
450            .find(|node| xot.text_content_str(*node) == Some("Second"))
451            .unwrap();
452
453        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
454        let edits = comparison.edits(&mut xot);
455
456        assert_eq!(
457            edits,
458            vec![
459                Edit::Delete(11),
460                Edit::Delete(12),
461                Edit::Insert(
462                    InsertPosition {
463                        parent_node_id: 1,
464                        child_position: 5,
465                        descendant_count: 13
466                    },
467                    InsertContent::XmlNode(inserted_node)
468                ),
469                Edit::Insert(
470                    InsertPosition {
471                        parent_node_id: 1,
472                        child_position: 6,
473                        descendant_count: 13
474                    },
475                    InsertContent::Text("\n\n        ".to_owned())
476                ),
477            ]
478        );
479    }
480
481    #[test]
482    fn test_deletes_problematic() {
483        let mut xot = Xot::new();
484        let xml_a = r#"<section>
485    <title>Section title</title>
486    <para>Compare documents with different root element</para>
487
488    <section>
489        <title>Sit Ipsum Consectetur Sem Ligula</title>
490        <para>Etiam porta sem malesuada <phrase>magna mollis euismod</phrase>. Lorem ipsum dolor sit amet, consectetur adipiscing elit.</para>
491    </section>
492
493    <section>
494        <title>Ultricies Mollis Mattis Ullamcorper Ridiculus</title>
495        <para>Morbi leo <code>porta ac consectetur</code>risus,  ac, vestibulum at eros. Cras mattis consectetur purus sit amet fermentum. Donec id elit non mi porta gravida at eget metus. <phrase>magna mollis euismod</phrase>. Lorem ipsum dolor sit amet, consectetur adipiscing elit.</para>
496    </section>
497</section>"#;
498        let xml_b = r#"<section>
499    <title>Section title</title>
500    <para>Compare documents with different root element</para>
501
502    <section>
503        <title>Ultricies Mollis Mattis Ullamcorper Ridiculus</title>
504        <para>Morbi leo <code>porta ac consectetur</code>risus,  ac, vestibulum at eros. Cras mattis consectetur purus sit amet fermentum. Donec id elit non mi porta gravida at eget metus. <phrase>magna mollis euismod</phrase>. Lorem ipsum dolor sit amet, consectetur adipiscing elit.</para>
505    </section>
506
507    <section>
508        <title>Sit Ipsum Consectetur Sem Ligula</title>
509        <para>Etiam porta sem malesuada <phrase>magna mollis euismod</phrase>. Lorem ipsum dolor sit amet, consectetur adipiscing elit.</para>
510    </section>
511</section>"#;
512        let doc_a = xot.parse(xml_a).unwrap();
513        let doc_b = xot.parse(xml_b).unwrap();
514        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
515        comparison.diff_status(&xot);
516        let deletes = comparison.deletes().collect::<Vec<_>>();
517        assert_eq!(deletes, vec![8, 9]);
518    }
519
520    #[test]
521    fn test_attribute_update() {
522        let mut xot = Xot::new();
523        let xml_a = r#"<r><a/></r>"#;
524        let xml_b = r#"<r><a id="1"/></r>"#;
525        let doc_a = xot.parse(xml_a).unwrap();
526        let doc_b = xot.parse(xml_b).unwrap();
527        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
528        comparison.diff_status(&xot);
529        let updates = comparison.updates().collect::<Vec<_>>();
530        assert_eq!(updates, vec![(0, 0), (2, 2)]);
531    }
532
533    #[test]
534    fn test_attribute_update_edits() -> Result<(), xot::Error> {
535        let mut xot = Xot::new();
536        let xml_a = r#"<r><a/></r>"#;
537        let xml_b = r#"<r><a id="1"/></r>"#;
538        let doc_a = xot.parse(xml_a)?;
539        let doc_b = xot.parse(xml_b)?;
540        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
541        let edits = comparison.edits(&mut xot);
542        let name_id = xot.add_name("id");
543        assert_eq!(
544            edits,
545            vec![Edit::AttributeUpdate(
546                2,
547                vec![AttributeChange::Insert(name_id, "1".to_string())]
548            )]
549        );
550        Ok(())
551    }
552
553    #[test]
554    fn test_attribute_propagation_edits() -> Result<(), xot::Error> {
555        let mut xot = Xot::new();
556        let xml_a = r#"<r><a>Text</a></r>"#;
557        let xml_b = r#"<r><a id="1">Text</a></r>"#;
558        let doc_a = xot.parse(xml_a)?;
559        let doc_b = xot.parse(xml_b)?;
560        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
561        let edits = comparison.edits(&mut xot);
562        let name_id = xot.add_name("id");
563        assert_eq!(
564            edits,
565            vec![Edit::AttributeUpdate(
566                2,
567                vec![AttributeChange::Insert(name_id, "1".to_string())]
568            )]
569        );
570        Ok(())
571    }
572
573    #[test]
574    fn test_root_no_update_detected() -> Result<(), xot::Error> {
575        let mut xot = Xot::new();
576        // this is not detected as a text update, and it's the root element
577        let xml_a = r#"<p>Hello</p>"#;
578        let xml_b = r#"<p>Bye</p>"#;
579        let doc_a = xot.parse(xml_a)?;
580        let doc_b = xot.parse(xml_b)?;
581
582        let inserted_node = xot.document_element(doc_b)?;
583
584        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
585        let edits = comparison.edits(&mut xot);
586        assert_eq!(
587            edits,
588            [
589                Edit::Delete(1),
590                Edit::Insert(
591                    InsertPosition {
592                        parent_node_id: 0,
593                        child_position: 0,
594                        descendant_count: 2
595                    },
596                    InsertContent::XmlNode(inserted_node)
597                )
598            ]
599        );
600        Ok(())
601    }
602
603    #[test]
604    fn test_processing_instruction_insert() {
605        let mut xot = Xot::new();
606        let doc_a = xot.parse("<book><p/></book>").unwrap();
607        let doc_b = xot.parse("<book><?pi?><p/></book>").unwrap();
608
609        let mut comparison = Comparison::new(&xot, doc_a, doc_b);
610        let edits = comparison.edits(&mut xot);
611
612        let inserted_node = xot
613            .descendants(doc_b)
614            .find(|node| xot.value_type(*node) == ValueType::ProcessingInstruction)
615            .unwrap();
616
617        assert_eq!(
618            edits,
619            vec![Edit::Insert(
620                InsertPosition {
621                    parent_node_id: 1,
622                    child_position: 0,
623                    descendant_count: 2
624                },
625                InsertContent::XmlNode(inserted_node)
626            ),]
627        );
628    }
629}