Skip to main content

edgeparse_core/utils/
diff.rs

1//! Document diff — compares two PdfDocument instances to detect structural changes.
2//!
3//! Useful for regression testing and verifying parsing consistency.
4
5use crate::models::content::ContentElement;
6use crate::models::document::PdfDocument;
7
/// Type of change detected between two documents.
///
/// This is a plain field-less tag, so it is `Copy` and can be compared,
/// hashed, and used as a map key (for example when tallying changes by kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChangeKind {
    /// An element was added in the new document.
    Added,
    /// An element was removed from the old document.
    Removed,
    /// An element was modified (same position, different content).
    Modified,
}
18
19/// A single detected change between two documents.
20#[derive(Debug, Clone)]
21pub struct Change {
22    /// Type of change.
23    pub kind: ChangeKind,
24    /// Element index in the respective document's kids list.
25    pub index: usize,
26    /// Description of the change.
27    pub description: String,
28}
29
30/// Result of comparing two documents.
31#[derive(Debug, Clone)]
32pub struct DiffResult {
33    /// List of detected changes.
34    pub changes: Vec<Change>,
35    /// Whether metadata fields changed.
36    pub metadata_changed: bool,
37    /// Whether page count changed.
38    pub page_count_changed: bool,
39}
40
41impl DiffResult {
42    /// Whether the two documents are structurally identical.
43    pub fn is_identical(&self) -> bool {
44        self.changes.is_empty() && !self.metadata_changed && !self.page_count_changed
45    }
46
47    /// Number of changes detected.
48    pub fn change_count(&self) -> usize {
49        self.changes.len()
50    }
51
52    /// Summary of the diff.
53    pub fn summary(&self) -> String {
54        if self.is_identical() {
55            return "Documents are identical.".to_string();
56        }
57        let added = self
58            .changes
59            .iter()
60            .filter(|c| c.kind == ChangeKind::Added)
61            .count();
62        let removed = self
63            .changes
64            .iter()
65            .filter(|c| c.kind == ChangeKind::Removed)
66            .count();
67        let modified = self
68            .changes
69            .iter()
70            .filter(|c| c.kind == ChangeKind::Modified)
71            .count();
72        format!(
73            "{} change(s): {} added, {} removed, {} modified{}{}",
74            self.changes.len(),
75            added,
76            removed,
77            modified,
78            if self.metadata_changed {
79                ", metadata changed"
80            } else {
81                ""
82            },
83            if self.page_count_changed {
84                ", page count changed"
85            } else {
86                ""
87            },
88        )
89    }
90}
91
92/// Compare two PdfDocuments and produce a diff.
93pub fn diff_documents(old: &PdfDocument, new: &PdfDocument) -> DiffResult {
94    let mut changes = Vec::new();
95
96    let metadata_changed = old.author != new.author
97        || old.title != new.title
98        || old.creation_date != new.creation_date
99        || old.producer != new.producer
100        || old.creator != new.creator
101        || old.subject != new.subject
102        || old.keywords != new.keywords;
103
104    let page_count_changed = old.number_of_pages != new.number_of_pages;
105
106    // Compare elements using a simple sequential diff.
107    let max_len = old.kids.len().max(new.kids.len());
108    for i in 0..max_len {
109        match (old.kids.get(i), new.kids.get(i)) {
110            (Some(old_elem), Some(new_elem)) => {
111                if !elements_equal(old_elem, new_elem) {
112                    changes.push(Change {
113                        kind: ChangeKind::Modified,
114                        index: i,
115                        description: format!(
116                            "Element {} changed: {:?} -> {:?}",
117                            i,
118                            element_tag(old_elem),
119                            element_tag(new_elem)
120                        ),
121                    });
122                }
123            }
124            (Some(_), None) => {
125                changes.push(Change {
126                    kind: ChangeKind::Removed,
127                    index: i,
128                    description: format!("Element {} removed", i),
129                });
130            }
131            (None, Some(_)) => {
132                changes.push(Change {
133                    kind: ChangeKind::Added,
134                    index: i,
135                    description: format!("Element {} added", i),
136                });
137            }
138            (None, None) => unreachable!(),
139        }
140    }
141
142    DiffResult {
143        changes,
144        metadata_changed,
145        page_count_changed,
146    }
147}
148
149/// Simple equality check for two content elements using their debug representation.
150fn elements_equal(a: &ContentElement, b: &ContentElement) -> bool {
151    // Compare using variant tag and bounding box as a proxy for equality.
152    // Full deep comparison would require PartialEq on all subtypes.
153    element_tag(a) == element_tag(b) && a.bbox() == b.bbox()
154}
155
156/// Get a string tag for a content element variant.
157fn element_tag(elem: &ContentElement) -> &'static str {
158    match elem {
159        ContentElement::TextChunk(_) => "TextChunk",
160        ContentElement::TextLine(_) => "TextLine",
161        ContentElement::TextBlock(_) => "TextBlock",
162        ContentElement::Paragraph(_) => "Paragraph",
163        ContentElement::Heading(_) => "Heading",
164        ContentElement::NumberHeading(_) => "NumberHeading",
165        ContentElement::Table(_) => "Table",
166        ContentElement::Figure(_) => "Figure",
167        ContentElement::Formula(_) => "Formula",
168        ContentElement::Picture(_) => "Picture",
169        ContentElement::Caption(_) => "Caption",
170        ContentElement::HeaderFooter(_) => "HeaderFooter",
171        ContentElement::Image(_) => "Image",
172        ContentElement::Line(_) => "Line",
173        ContentElement::LineArt(_) => "LineArt",
174        ContentElement::List(_) => "List",
175        ContentElement::TableBorder(_) => "TableBorder",
176    }
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::chunks::TextChunk;
    use crate::models::document::PdfDocument;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Build a text chunk element with the given text and bounding box; all
    /// other fields are fixed placeholder values.
    fn make_text_chunk_in(text: &str, bbox: BoundingBox) -> ContentElement {
        ContentElement::TextChunk(TextChunk {
            value: text.to_string(),
            bbox,
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        })
    }

    /// Build a text chunk element at the default test bounding box.
    fn make_text_chunk(text: &str) -> ContentElement {
        make_text_chunk_in(text, BoundingBox::new(None, 0.0, 0.0, 100.0, 10.0))
    }

    #[test]
    fn test_identical_documents() {
        let doc = PdfDocument::new("test.pdf".to_string());
        let result = diff_documents(&doc, &doc);
        assert!(result.is_identical());
        assert_eq!(result.change_count(), 0);
    }

    #[test]
    fn test_identical_elements_produce_no_changes() {
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.kids.push(make_text_chunk("hello"));
        doc.kids.push(make_text_chunk("world"));
        let result = diff_documents(&doc, &doc);
        assert!(result.is_identical());
        assert_eq!(result.change_count(), 0);
    }

    #[test]
    fn test_metadata_changed() {
        let mut old = PdfDocument::new("test.pdf".to_string());
        let mut new = PdfDocument::new("test.pdf".to_string());
        old.author = Some("Alice".to_string());
        new.author = Some("Bob".to_string());
        let result = diff_documents(&old, &new);
        assert!(result.metadata_changed);
        assert!(!result.page_count_changed);
        assert!(!result.is_identical());
        assert!(result.summary().contains(", metadata changed"));
    }

    #[test]
    fn test_element_added() {
        let old = PdfDocument::new("test.pdf".to_string());
        let mut new = PdfDocument::new("test.pdf".to_string());
        new.kids.push(make_text_chunk("hello"));
        let result = diff_documents(&old, &new);
        assert_eq!(result.change_count(), 1);
        assert_eq!(result.changes[0].kind, ChangeKind::Added);
        assert_eq!(result.changes[0].index, 0);
        assert_eq!(result.changes[0].description, "Element 0 added");
    }

    #[test]
    fn test_element_removed() {
        let mut old = PdfDocument::new("test.pdf".to_string());
        old.kids.push(make_text_chunk("hello"));
        let new = PdfDocument::new("test.pdf".to_string());
        let result = diff_documents(&old, &new);
        assert_eq!(result.change_count(), 1);
        assert_eq!(result.changes[0].kind, ChangeKind::Removed);
        assert_eq!(result.changes[0].index, 0);
        assert_eq!(result.changes[0].description, "Element 0 removed");
    }

    #[test]
    fn test_element_modified() {
        // Same variant, different bounding box: reported as a modification.
        let mut old = PdfDocument::new("test.pdf".to_string());
        old.kids.push(make_text_chunk_in(
            "hello",
            BoundingBox::new(None, 0.0, 0.0, 100.0, 10.0),
        ));
        let mut new = PdfDocument::new("test.pdf".to_string());
        new.kids.push(make_text_chunk_in(
            "hello",
            BoundingBox::new(None, 0.0, 0.0, 200.0, 10.0),
        ));
        let result = diff_documents(&old, &new);
        assert_eq!(result.change_count(), 1);
        assert_eq!(result.changes[0].kind, ChangeKind::Modified);
        assert_eq!(result.changes[0].index, 0);
        assert_eq!(
            result.changes[0].description,
            "Element 0 changed: \"TextChunk\" -> \"TextChunk\""
        );
    }

    #[test]
    fn test_summary() {
        let old = PdfDocument::new("test.pdf".to_string());
        let result = diff_documents(&old, &old);
        assert_eq!(result.summary(), "Documents are identical.");
    }

    #[test]
    fn test_summary_with_changes() {
        let old = PdfDocument::new("test.pdf".to_string());
        let mut new = PdfDocument::new("test.pdf".to_string());
        new.kids.push(make_text_chunk("hello"));
        let result = diff_documents(&old, &new);
        // Only the element counts are pinned; trailing flag notes are covered
        // by other tests.
        assert!(result
            .summary()
            .starts_with("1 change(s): 1 added, 0 removed, 0 modified"));
    }
}