oxidize_pdf/verification/
comparators.rs

//! PDF Comparators for Verification
//!
//! This module provides functions to compare generated PDFs with reference PDFs
//! to verify structural and content equivalence for ISO compliance testing.
5
6use crate::error::Result;
7use crate::verification::parser::{parse_pdf, ParsedPdf};
8use std::collections::HashMap;
9
10/// Difference between two PDFs
11#[derive(Debug, Clone)]
12pub struct PdfDifference {
13    pub location: String,
14    pub expected: String,
15    pub actual: String,
16    pub severity: DifferenceSeverity,
17}
18
/// How strongly a detected difference affects ISO compliance.
///
/// This is a plain fieldless enum, so it is `Copy` and fully comparable and
/// hashable in addition to being `Clone`/`PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DifferenceSeverity {
    /// Critical differences that break ISO compliance
    Critical,
    /// Important differences that may affect functionality
    Important,
    /// Minor differences that don't affect compliance
    Minor,
    /// Cosmetic differences (timestamps, IDs, etc.)
    Cosmetic,
}
30
31/// Result of PDF comparison
32#[derive(Debug, Clone)]
33pub struct ComparisonResult {
34    pub structurally_equivalent: bool,
35    pub content_equivalent: bool,
36    pub differences: Vec<PdfDifference>,
37    pub similarity_score: f64, // 0.0 to 1.0
38}
39
40/// Compare two PDFs for structural equivalence
41pub fn compare_pdfs(generated: &[u8], reference: &[u8]) -> Result<ComparisonResult> {
42    let parsed_generated = parse_pdf(generated)?;
43    let parsed_reference = parse_pdf(reference)?;
44
45    let differences = find_differences(&parsed_generated, &parsed_reference);
46    let similarity_score = calculate_similarity_score(&differences);
47
48    let structurally_equivalent = differences.iter().all(|diff| {
49        diff.severity == DifferenceSeverity::Cosmetic || diff.severity == DifferenceSeverity::Minor
50    });
51
52    let content_equivalent = differences
53        .iter()
54        .all(|diff| diff.severity == DifferenceSeverity::Cosmetic);
55
56    Ok(ComparisonResult {
57        structurally_equivalent,
58        content_equivalent,
59        differences,
60        similarity_score,
61    })
62}
63
64/// Find differences between two parsed PDFs
65fn find_differences(generated: &ParsedPdf, reference: &ParsedPdf) -> Vec<PdfDifference> {
66    let mut differences = Vec::new();
67
68    // Compare versions (minor difference unless major version change)
69    if generated.version != reference.version {
70        let severity = if generated.version.chars().next() != reference.version.chars().next() {
71            DifferenceSeverity::Important
72        } else {
73            DifferenceSeverity::Minor
74        };
75
76        differences.push(PdfDifference {
77            location: "PDF Version".to_string(),
78            expected: reference.version.clone(),
79            actual: generated.version.clone(),
80            severity,
81        });
82    }
83
84    // Compare catalogs
85    differences.extend(compare_catalogs(&generated.catalog, &reference.catalog));
86
87    // Compare page trees
88    differences.extend(compare_page_trees(
89        &generated.page_tree,
90        &reference.page_tree,
91    ));
92
93    // Compare fonts
94    differences.extend(compare_fonts(&generated.fonts, &reference.fonts));
95
96    // Compare color spaces
97    differences.extend(compare_color_spaces(generated, reference));
98
99    // Compare graphics states
100    differences.extend(compare_graphics_states(
101        &generated.graphics_states,
102        &reference.graphics_states,
103    ));
104
105    // Compare text objects
106    differences.extend(compare_text_objects(
107        &generated.text_objects,
108        &reference.text_objects,
109    ));
110
111    // Compare annotations
112    differences.extend(compare_annotations(
113        &generated.annotations,
114        &reference.annotations,
115    ));
116
117    // Compare cross-reference validity
118    if generated.xref_valid != reference.xref_valid {
119        differences.push(PdfDifference {
120            location: "Cross-reference table".to_string(),
121            expected: reference.xref_valid.to_string(),
122            actual: generated.xref_valid.to_string(),
123            severity: DifferenceSeverity::Critical,
124        });
125    }
126
127    differences
128}
129
130/// Compare document catalogs
131fn compare_catalogs(
132    generated: &Option<HashMap<String, String>>,
133    reference: &Option<HashMap<String, String>>,
134) -> Vec<PdfDifference> {
135    let mut differences = Vec::new();
136
137    match (generated, reference) {
138        (Some(gen_catalog), Some(ref_catalog)) => {
139            // Check required entries
140            for key in ["Type", "Pages"] {
141                match (gen_catalog.get(key), ref_catalog.get(key)) {
142                    (Some(gen_val), Some(ref_val)) => {
143                        if gen_val != ref_val {
144                            differences.push(PdfDifference {
145                                location: format!("Catalog/{}", key),
146                                expected: ref_val.clone(),
147                                actual: gen_val.clone(),
148                                severity: DifferenceSeverity::Critical,
149                            });
150                        }
151                    }
152                    (None, Some(ref_val)) => {
153                        differences.push(PdfDifference {
154                            location: format!("Catalog/{}", key),
155                            expected: ref_val.clone(),
156                            actual: "missing".to_string(),
157                            severity: DifferenceSeverity::Critical,
158                        });
159                    }
160                    (Some(gen_val), None) => {
161                        differences.push(PdfDifference {
162                            location: format!("Catalog/{}", key),
163                            expected: "missing".to_string(),
164                            actual: gen_val.clone(),
165                            severity: DifferenceSeverity::Minor,
166                        });
167                    }
168                    (None, None) => {} // Both missing - check if required
169                }
170            }
171        }
172        (None, Some(_)) => {
173            differences.push(PdfDifference {
174                location: "Document Catalog".to_string(),
175                expected: "present".to_string(),
176                actual: "missing".to_string(),
177                severity: DifferenceSeverity::Critical,
178            });
179        }
180        (Some(_), None) => {
181            differences.push(PdfDifference {
182                location: "Document Catalog".to_string(),
183                expected: "missing".to_string(),
184                actual: "present".to_string(),
185                severity: DifferenceSeverity::Minor,
186            });
187        }
188        (None, None) => {
189            differences.push(PdfDifference {
190                location: "Document Catalog".to_string(),
191                expected: "present".to_string(),
192                actual: "missing".to_string(),
193                severity: DifferenceSeverity::Critical,
194            });
195        }
196    }
197
198    differences
199}
200
201/// Compare page trees
202fn compare_page_trees(
203    generated: &Option<crate::verification::parser::PageTree>,
204    reference: &Option<crate::verification::parser::PageTree>,
205) -> Vec<PdfDifference> {
206    let mut differences = Vec::new();
207
208    match (generated, reference) {
209        (Some(gen_tree), Some(ref_tree)) => {
210            if gen_tree.page_count != ref_tree.page_count {
211                differences.push(PdfDifference {
212                    location: "Page Tree/Count".to_string(),
213                    expected: ref_tree.page_count.to_string(),
214                    actual: gen_tree.page_count.to_string(),
215                    severity: DifferenceSeverity::Critical,
216                });
217            }
218
219            if gen_tree.root_type != ref_tree.root_type {
220                differences.push(PdfDifference {
221                    location: "Page Tree/Type".to_string(),
222                    expected: ref_tree.root_type.clone(),
223                    actual: gen_tree.root_type.clone(),
224                    severity: DifferenceSeverity::Critical,
225                });
226            }
227        }
228        (None, Some(_)) => {
229            differences.push(PdfDifference {
230                location: "Page Tree".to_string(),
231                expected: "present".to_string(),
232                actual: "missing".to_string(),
233                severity: DifferenceSeverity::Critical,
234            });
235        }
236        (Some(_), None) => {
237            differences.push(PdfDifference {
238                location: "Page Tree".to_string(),
239                expected: "missing".to_string(),
240                actual: "present".to_string(),
241                severity: DifferenceSeverity::Minor,
242            });
243        }
244        (None, None) => {} // Both missing - may be ok for minimal PDFs
245    }
246
247    differences
248}
249
250/// Compare font lists
251fn compare_fonts(generated: &[String], reference: &[String]) -> Vec<PdfDifference> {
252    let mut differences = Vec::new();
253
254    // Check for missing fonts
255    for ref_font in reference {
256        if !generated.contains(ref_font) {
257            differences.push(PdfDifference {
258                location: format!("Fonts/{}", ref_font),
259                expected: "present".to_string(),
260                actual: "missing".to_string(),
261                severity: DifferenceSeverity::Important,
262            });
263        }
264    }
265
266    // Check for extra fonts (usually not a problem)
267    for gen_font in generated {
268        if !reference.contains(gen_font) {
269            differences.push(PdfDifference {
270                location: format!("Fonts/{}", gen_font),
271                expected: "missing".to_string(),
272                actual: "present".to_string(),
273                severity: DifferenceSeverity::Minor,
274            });
275        }
276    }
277
278    differences
279}
280
281/// Compare color space usage
282fn compare_color_spaces(generated: &ParsedPdf, reference: &ParsedPdf) -> Vec<PdfDifference> {
283    let mut differences = Vec::new();
284
285    if generated.uses_device_rgb != reference.uses_device_rgb {
286        differences.push(PdfDifference {
287            location: "Color Spaces/DeviceRGB".to_string(),
288            expected: reference.uses_device_rgb.to_string(),
289            actual: generated.uses_device_rgb.to_string(),
290            severity: DifferenceSeverity::Important,
291        });
292    }
293
294    if generated.uses_device_cmyk != reference.uses_device_cmyk {
295        differences.push(PdfDifference {
296            location: "Color Spaces/DeviceCMYK".to_string(),
297            expected: reference.uses_device_cmyk.to_string(),
298            actual: generated.uses_device_cmyk.to_string(),
299            severity: DifferenceSeverity::Important,
300        });
301    }
302
303    if generated.uses_device_gray != reference.uses_device_gray {
304        differences.push(PdfDifference {
305            location: "Color Spaces/DeviceGray".to_string(),
306            expected: reference.uses_device_gray.to_string(),
307            actual: generated.uses_device_gray.to_string(),
308            severity: DifferenceSeverity::Important,
309        });
310    }
311
312    differences
313}
314
315/// Compare graphics states
316fn compare_graphics_states(
317    generated: &[crate::verification::parser::GraphicsState],
318    reference: &[crate::verification::parser::GraphicsState],
319) -> Vec<PdfDifference> {
320    let mut differences = Vec::new();
321
322    if generated.len() != reference.len() {
323        differences.push(PdfDifference {
324            location: "Graphics States/Count".to_string(),
325            expected: reference.len().to_string(),
326            actual: generated.len().to_string(),
327            severity: DifferenceSeverity::Important,
328        });
329    }
330
331    // Compare first few graphics states (detailed comparison would be complex)
332    let min_len = generated.len().min(reference.len());
333    for i in 0..min_len.min(3) {
334        // Only compare first 3 for performance
335        let gen_state = &generated[i];
336        let ref_state = &reference[i];
337
338        if gen_state.line_width != ref_state.line_width {
339            differences.push(PdfDifference {
340                location: format!("Graphics State {}/LineWidth", i),
341                expected: format!("{:?}", ref_state.line_width),
342                actual: format!("{:?}", gen_state.line_width),
343                severity: DifferenceSeverity::Minor,
344            });
345        }
346    }
347
348    differences
349}
350
351/// Compare text objects
352fn compare_text_objects(
353    generated: &[crate::verification::parser::TextObject],
354    reference: &[crate::verification::parser::TextObject],
355) -> Vec<PdfDifference> {
356    let mut differences = Vec::new();
357
358    if generated.len() != reference.len() {
359        differences.push(PdfDifference {
360            location: "Text Objects/Count".to_string(),
361            expected: reference.len().to_string(),
362            actual: generated.len().to_string(),
363            severity: DifferenceSeverity::Important,
364        });
365    }
366
367    // Compare text content (simplified)
368    let min_len = generated.len().min(reference.len());
369    for i in 0..min_len {
370        let gen_text = &generated[i];
371        let ref_text = &reference[i];
372
373        if gen_text.text_content != ref_text.text_content {
374            differences.push(PdfDifference {
375                location: format!("Text Object {}/Content", i),
376                expected: ref_text.text_content.clone(),
377                actual: gen_text.text_content.clone(),
378                severity: DifferenceSeverity::Important,
379            });
380        }
381    }
382
383    differences
384}
385
386/// Compare annotations
387fn compare_annotations(
388    generated: &[crate::verification::parser::Annotation],
389    reference: &[crate::verification::parser::Annotation],
390) -> Vec<PdfDifference> {
391    let mut differences = Vec::new();
392
393    if generated.len() != reference.len() {
394        differences.push(PdfDifference {
395            location: "Annotations/Count".to_string(),
396            expected: reference.len().to_string(),
397            actual: generated.len().to_string(),
398            severity: DifferenceSeverity::Important,
399        });
400    }
401
402    differences
403}
404
405/// Calculate similarity score based on differences
406fn calculate_similarity_score(differences: &[PdfDifference]) -> f64 {
407    if differences.is_empty() {
408        return 1.0;
409    }
410
411    let mut penalty = 0.0;
412    for diff in differences {
413        penalty += match diff.severity {
414            DifferenceSeverity::Critical => 0.3,
415            DifferenceSeverity::Important => 0.1,
416            DifferenceSeverity::Minor => 0.05,
417            DifferenceSeverity::Cosmetic => 0.01,
418        };
419    }
420
421    (1.0f64 - penalty).max(0.0)
422}
423
424/// Check if two PDFs are structurally equivalent for ISO compliance
425pub fn pdfs_structurally_equivalent(generated: &[u8], reference: &[u8]) -> bool {
426    match compare_pdfs(generated, reference) {
427        Ok(result) => result.structurally_equivalent,
428        Err(_) => false,
429    }
430}
431
432/// Extract structural differences between PDFs
433pub fn extract_pdf_differences(generated: &[u8], reference: &[u8]) -> Result<Vec<PdfDifference>> {
434    let result = compare_pdfs(generated, reference)?;
435    Ok(result.differences)
436}
437
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when asserting on floating-point scores, so the
    /// tests do not depend on exact binary rounding of the penalty arithmetic.
    const SCORE_TOLERANCE: f64 = 1e-9;

    /// Build a minimal PDF byte stream with the given header version and a
    /// single object whose `/Type` is `catalog_type`.
    fn create_test_pdf(version: &str, catalog_type: &str) -> Vec<u8> {
        format!(
            "%PDF-{}\n1 0 obj\n<<\n/Type /{}\n>>\nendobj\n%%EOF",
            version, catalog_type
        )
        .into_bytes()
    }

    #[test]
    fn test_identical_pdfs() {
        let pdf1 = create_test_pdf("1.4", "Catalog");
        let pdf2 = create_test_pdf("1.4", "Catalog");

        let result = compare_pdfs(&pdf1, &pdf2).unwrap();
        assert!(result.content_equivalent);
        assert!(
            (result.similarity_score - 1.0).abs() < SCORE_TOLERANCE,
            "expected a score of 1.0, got {}",
            result.similarity_score
        );
    }

    #[test]
    fn test_version_difference() {
        let pdf1 = create_test_pdf("1.4", "Catalog");
        let pdf2 = create_test_pdf("1.7", "Catalog");

        let result = compare_pdfs(&pdf1, &pdf2).unwrap();
        assert!(!result.content_equivalent);
        assert!(result.similarity_score < 1.0);
        assert!(result
            .differences
            .iter()
            .any(|d| d.location == "PDF Version"));
    }

    #[test]
    fn test_structural_difference() {
        let pdf1 = create_test_pdf("1.4", "Catalog");
        let pdf2 = create_test_pdf("1.7", "Catalog"); // Different version should be minor difference

        let result = compare_pdfs(&pdf1, &pdf2).unwrap();

        // Version differences are minor, so should still be structurally equivalent
        assert!(result.structurally_equivalent);
        assert!(!result.differences.is_empty()); // But should have differences

        // Check that version difference was detected
        assert!(result
            .differences
            .iter()
            .any(|d| d.location == "PDF Version"));
    }

    #[test]
    fn test_calculate_similarity_score() {
        let differences = vec![PdfDifference {
            location: "test".to_string(),
            expected: "a".to_string(),
            actual: "b".to_string(),
            severity: DifferenceSeverity::Critical,
        }];

        let score = calculate_similarity_score(&differences);
        // 1.0 - 0.3 (critical penalty)
        assert!(
            (score - 0.7).abs() < SCORE_TOLERANCE,
            "expected a score of 0.7, got {}",
            score
        );
    }
}