Skip to main content

pdf_ast/validation/
pdfa.rs

1use crate::ast::{NodeType, PdfDocument};
2use crate::types::{PdfDictionary, PdfValue};
3use crate::validation::{ValidationIssue, ValidationReport, ValidationSeverity};
4
/// PDF/A-1b validator implementing ISO 19005-1:2005 Level B requirements.
///
/// Build one with `PdfA1bValidator::new()` (strict by default) and optionally
/// relax it with `with_strict_mode(false)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfA1bValidator {
    /// When `true`, the validator additionally emits advisory warnings
    /// (missing OutputIntents, LZWDecode usage, metadata synchronization).
    strict_mode: bool,
}
9
10impl PdfA1bValidator {
11    pub fn new() -> Self {
12        Self { strict_mode: true }
13    }
14
15    pub fn with_strict_mode(mut self, strict: bool) -> Self {
16        self.strict_mode = strict;
17        self
18    }
19
20    pub fn validate(&self, document: &PdfDocument) -> ValidationReport {
21        let mut report = ValidationReport::new("PDF/A-1b".to_string(), "1.0".to_string());
22
23        self.validate_version(&mut report, document);
24        report.statistics.total_checks += 1;
25
26        self.validate_color_spaces(&mut report, document);
27        report.statistics.total_checks += 1;
28
29        self.validate_fonts(&mut report, document);
30        report.statistics.total_checks += 1;
31
32        self.validate_images(&mut report, document);
33        report.statistics.total_checks += 1;
34
35        self.validate_multimedia_content(&mut report, document);
36        report.statistics.total_checks += 1;
37
38        self.validate_javascript(&mut report, document);
39        report.statistics.total_checks += 1;
40
41        self.validate_annotations(&mut report, document);
42        report.statistics.total_checks += 1;
43
44        self.validate_forms(&mut report, document);
45        report.statistics.total_checks += 1;
46
47        self.validate_encryption(&mut report, document);
48        report.statistics.total_checks += 1;
49
50        self.validate_metadata(&mut report, document);
51        report.statistics.total_checks += 1;
52
53        self.validate_transparency(&mut report, document);
54        report.statistics.total_checks += 1;
55
56        self.validate_file_specification(&mut report, document);
57        report.statistics.total_checks += 1;
58
59        self.validate_cross_reference(&mut report, document);
60        report.statistics.total_checks += 1;
61
62        // Update passed checks based on total - failed
63        report.statistics.passed_checks = report
64            .statistics
65            .total_checks
66            .saturating_sub(report.statistics.failed_checks);
67
68        report
69    }
70
71    fn validate_version(&self, report: &mut ValidationReport, document: &PdfDocument) {
72        if document.version.major != 1 || document.version.minor > 4 {
73            report.add_issue(ValidationIssue {
74                severity: ValidationSeverity::Error,
75                code: "PDF_A_VERSION".to_string(),
76                message: "PDF/A-1 must be based on PDF version 1.4 or earlier".to_string(),
77                node_id: None,
78                location: Some("Document version".to_string()),
79                suggestion: Some(format!(
80                    "Found version {}.{}",
81                    document.version.major, document.version.minor
82                )),
83            });
84        }
85    }
86
87    fn validate_color_spaces(&self, report: &mut ValidationReport, document: &PdfDocument) {
88        let mut has_device_colors = false;
89        let mut missing_output_intent = true;
90
91        if let Some(catalog_dict) = document.get_catalog() {
92            if catalog_dict.contains_key("OutputIntents") {
93                missing_output_intent = false;
94            }
95        }
96
97        for node in document.ast.get_all_nodes() {
98            match &node.node_type {
99                NodeType::Image => {
100                    if let Some(dict) = node.as_dict() {
101                        if let Some(colorspace_value) = dict.get("ColorSpace") {
102                            if let Some(colorspace_name) = colorspace_value.as_name() {
103                                match colorspace_name.without_slash() {
104                                    "DeviceRGB" | "DeviceGray" | "DeviceCMYK" => {
105                                        has_device_colors = true;
106                                    }
107                                    _ => {}
108                                }
109                            }
110                        }
111                    }
112                }
113                NodeType::Page => {
114                    if let Some(dict) = node.as_dict() {
115                        self.check_resources_for_device_colors(dict, &mut has_device_colors);
116                    }
117                }
118                _ => {}
119            }
120        }
121
122        if has_device_colors && missing_output_intent {
123            report.add_issue(ValidationIssue {
124                severity: ValidationSeverity::Error,
125                code: "PDF_A_COLOR_SPACE".to_string(),
126                message: "Device color spaces require OutputIntent specification".to_string(),
127                node_id: None,
128                location: Some("Color management".to_string()),
129                suggestion: Some(
130                    "Found device color spaces but no OutputIntents in catalog".to_string(),
131                ),
132            });
133        }
134
135        if missing_output_intent && self.strict_mode {
136            report.add_issue(ValidationIssue {
137                severity: ValidationSeverity::Warning,
138                code: "PDF_A_OUTPUT_INTENT".to_string(),
139                message: "PDF/A-1b should include OutputIntents for color management".to_string(),
140                node_id: None,
141                location: Some("Color management".to_string()),
142                suggestion: None,
143            });
144        }
145    }
146
147    fn check_resources_for_device_colors(
148        &self,
149        page_dict: &PdfDictionary,
150        has_device_colors: &mut bool,
151    ) {
152        if let Some(resources_value) = page_dict.get("Resources") {
153            if let Some(resources_dict) = resources_value.as_dict() {
154                if let Some(colorspaces_value) = resources_dict.get("ColorSpace") {
155                    if let Some(colorspaces_dict) = colorspaces_value.as_dict() {
156                        for (_name, colorspace_value) in colorspaces_dict.iter() {
157                            if let Some(colorspace_name) = colorspace_value.as_name() {
158                                match colorspace_name.without_slash() {
159                                    "DeviceRGB" | "DeviceGray" | "DeviceCMYK" => {
160                                        *has_device_colors = true;
161                                    }
162                                    _ => {}
163                                }
164                            }
165                        }
166                    }
167                }
168            }
169        }
170    }
171
172    fn validate_fonts(&self, report: &mut ValidationReport, document: &PdfDocument) {
173        let mut unembedded_fonts = Vec::new();
174        let mut invalid_encodings = Vec::new();
175
176        for node in document.ast.get_all_nodes() {
177            if matches!(
178                node.node_type,
179                NodeType::Font
180                    | NodeType::Type1Font
181                    | NodeType::TrueTypeFont
182                    | NodeType::Type3Font
183                    | NodeType::CIDFont
184            ) {
185                if let Some(font_dict) = node.as_dict() {
186                    let font_name = font_dict
187                        .get("BaseFont")
188                        .and_then(|v| v.as_name())
189                        .map(|n| n.without_slash())
190                        .unwrap_or("Unknown");
191
192                    // PDF/A-1b requires ALL fonts to be embedded, including the standard 14 fonts
193                    let is_embedded = self.is_font_embedded(font_dict);
194                    if !is_embedded {
195                        unembedded_fonts.push(font_name.to_string());
196
197                        report.add_issue(ValidationIssue {
198                            severity: ValidationSeverity::Error,
199                            code: "PDF_A_FONT_EMBEDDING".to_string(),
200                            message: "All fonts must be embedded in PDF/A-1b".to_string(),
201                            node_id: Some(node.id),
202                            location: Some("Font embedding".to_string()),
203                            suggestion: Some(format!("Font '{}' is not embedded", font_name)),
204                        });
205                    }
206
207                    if let Some(subtype) = font_dict.get("Subtype").and_then(|v| v.as_name()) {
208                        if subtype.without_slash() != "Type3" {
209                            self.validate_font_encoding(
210                                font_dict,
211                                font_name,
212                                &mut invalid_encodings,
213                            );
214                        }
215                    }
216                }
217            }
218        }
219
220        for encoding_issue in invalid_encodings {
221            report.add_issue(ValidationIssue {
222                severity: ValidationSeverity::Error,
223                code: "PDF_A_FONT_ENCODING".to_string(),
224                message: "Font encoding must be specified or use standard encoding".to_string(),
225                node_id: None,
226                location: Some("Font encoding".to_string()),
227                suggestion: Some(encoding_issue),
228            });
229        }
230    }
231
232    fn is_font_embedded(&self, font_dict: &PdfDictionary) -> bool {
233        font_dict.contains_key("FontFile") ||
234        font_dict.contains_key("FontFile2") ||
235        font_dict.contains_key("FontFile3") ||
236        // CID fonts store embedding info in DescendantFonts
237        font_dict.get("DescendantFonts")
238            .and_then(|v| v.as_array())
239            // Assumes embedded if DescendantFonts exists; full validation requires resolving references
240            .map(|arr| !arr.is_empty())
241            .unwrap_or(false)
242    }
243
244    fn is_standard_font(&self, font_name: &str) -> bool {
245        matches!(
246            font_name,
247            "Times-Roman"
248                | "Times-Bold"
249                | "Times-Italic"
250                | "Times-BoldItalic"
251                | "Helvetica"
252                | "Helvetica-Bold"
253                | "Helvetica-Oblique"
254                | "Helvetica-BoldOblique"
255                | "Courier"
256                | "Courier-Bold"
257                | "Courier-Oblique"
258                | "Courier-BoldOblique"
259                | "Symbol"
260                | "ZapfDingbats"
261        )
262    }
263
264    fn validate_font_encoding(
265        &self,
266        font_dict: &PdfDictionary,
267        font_name: &str,
268        invalid_encodings: &mut Vec<String>,
269    ) {
270        if !font_dict.contains_key("Encoding") && !self.is_standard_font(font_name) {
271            if let Some(subtype) = font_dict.get("Subtype").and_then(|v| v.as_name()) {
272                if matches!(subtype.without_slash(), "Type1" | "MMType1" | "TrueType") {
273                    invalid_encodings
274                        .push(format!("Font '{}' lacks encoding specification", font_name));
275                }
276            }
277        }
278    }
279
280    fn validate_images(&self, report: &mut ValidationReport, document: &PdfDocument) {
281        for node in document.ast.get_all_nodes() {
282            if matches!(node.node_type, NodeType::Image | NodeType::ImageXObject) {
283                if let Some(image_dict) = node.as_dict() {
284                    if let Some(filter_value) = image_dict.get("Filter") {
285                        let has_lzw = match filter_value {
286                            PdfValue::Name(name) => name.without_slash() == "LZWDecode",
287                            PdfValue::Array(filters) => filters.iter().any(|f| {
288                                f.as_name()
289                                    .map(|n| n.without_slash() == "LZWDecode")
290                                    .unwrap_or(false)
291                            }),
292                            _ => false,
293                        };
294
295                        if has_lzw && self.strict_mode {
296                            report.add_issue(ValidationIssue {
297                                severity: ValidationSeverity::Warning,
298                                code: "PDF_A_LZW_DECODE".to_string(),
299                                message: "LZWDecode filter should be avoided in PDF/A-1"
300                                    .to_string(),
301                                node_id: None,
302                                location: Some("Image compression".to_string()),
303                                suggestion: Some("Consider using FlateDecode instead".to_string()),
304                            });
305                        }
306                    }
307                }
308            }
309        }
310    }
311
312    fn validate_multimedia_content(&self, report: &mut ValidationReport, document: &PdfDocument) {
313        let mut has_multimedia = false;
314
315        for node in document.ast.get_all_nodes() {
316            if node.node_type == NodeType::Annotation {
317                if let Some(annot_dict) = node.as_dict() {
318                    if let Some(subtype) = annot_dict.get("Subtype").and_then(|v| v.as_name()) {
319                        match subtype.without_slash() {
320                            "Movie" | "Sound" | "Screen" | "RichMedia" => {
321                                has_multimedia = true;
322                                break;
323                            }
324                            _ => {}
325                        }
326                    }
327                }
328            }
329        }
330
331        if has_multimedia {
332            report.add_issue(ValidationIssue {
333                severity: ValidationSeverity::Error,
334                code: "PDF_A_MULTIMEDIA".to_string(),
335                message: "PDF/A-1b does not permit multimedia content".to_string(),
336                node_id: None,
337                location: Some("Multimedia restrictions".to_string()),
338                suggestion: Some(
339                    "Remove multimedia annotations like Movie, Sound, or Screen".to_string(),
340                ),
341            });
342        }
343    }
344
345    fn validate_javascript(&self, report: &mut ValidationReport, document: &PdfDocument) {
346        for node in document.ast.get_all_nodes() {
347            if matches!(node.node_type, NodeType::JavaScriptAction) {
348                report.add_issue(ValidationIssue {
349                    severity: ValidationSeverity::Error,
350                    code: "PDF_A_JAVASCRIPT".to_string(),
351                    message: "JavaScript is not permitted in PDF/A-1b".to_string(),
352                    node_id: Some(node.id),
353                    location: Some("JavaScript action node".to_string()),
354                    suggestion: Some("Remove all JavaScript actions".to_string()),
355                });
356                return;
357            }
358        }
359
360        let mut has_javascript = false;
361
362        if let Some(catalog_dict) = document.get_catalog() {
363            if let Some(names_value) = catalog_dict.get("Names") {
364                if let Some(names_dict) = names_value.as_dict() {
365                    if names_dict.contains_key("JavaScript") {
366                        has_javascript = true;
367                    }
368                }
369            }
370
371            if let Some(open_action) = catalog_dict.get("OpenAction") {
372                if let Some(action_dict) = open_action.as_dict() {
373                    if let Some(s_value) = action_dict.get("S") {
374                        if let Some(s_name) = s_value.as_name() {
375                            if s_name.without_slash() == "JavaScript" {
376                                has_javascript = true;
377                            }
378                        }
379                    }
380                }
381            }
382        }
383
384        for node in document.ast.get_all_nodes() {
385            if let Some(dict) = node.as_dict() {
386                if let Some(type_value) = dict.get("Type") {
387                    if let Some(type_name) = type_value.as_name() {
388                        if type_name.without_slash() == "Action" {
389                            if let Some(s_value) = dict.get("S") {
390                                if let Some(s_name) = s_value.as_name() {
391                                    if s_name.without_slash() == "JavaScript" {
392                                        has_javascript = true;
393                                        break;
394                                    }
395                                }
396                            }
397                        }
398                    }
399                }
400                if matches!(node.node_type, NodeType::Annotation | NodeType::Action) {
401                    if let Some(s_value) = dict.get("S") {
402                        if let Some(s_name) = s_value.as_name() {
403                            if s_name.without_slash() == "JavaScript" {
404                                has_javascript = true;
405                                break;
406                            }
407                        }
408                    }
409                }
410            }
411        }
412
413        if has_javascript {
414            report.add_issue(ValidationIssue {
415                severity: ValidationSeverity::Error,
416                code: "PDF_A_JAVASCRIPT".to_string(),
417                message: "PDF/A-1b does not permit JavaScript".to_string(),
418                node_id: None,
419                location: Some("JavaScript restrictions".to_string()),
420                suggestion: Some("Remove all JavaScript actions and scripts".to_string()),
421            });
422        }
423    }
424
425    fn validate_annotations(&self, report: &mut ValidationReport, document: &PdfDocument) {
426        let prohibited_subtypes = ["Movie", "Sound", "FileAttachment"];
427
428        for node in document.ast.get_all_nodes() {
429            if matches!(node.node_type, NodeType::Annotation) {
430                if let Some(annot_dict) = node.as_dict() {
431                    if let Some(subtype) = annot_dict.get("Subtype").and_then(|v| v.as_name()) {
432                        let subtype_str = subtype.without_slash();
433                        if prohibited_subtypes.contains(&subtype_str) {
434                            report.add_issue(ValidationIssue {
435                                severity: ValidationSeverity::Error,
436                                code: "PDF_A_ANNOTATION_TYPE".to_string(),
437                                message: format!(
438                                    "Annotation subtype '{}' not permitted in PDF/A-1b",
439                                    subtype_str
440                                ),
441                                node_id: None,
442                                location: Some("Annotation restrictions".to_string()),
443                                suggestion: None,
444                            });
445                        }
446
447                        if !annot_dict.contains_key("AP") && subtype_str != "Popup" {
448                            report.add_issue(ValidationIssue {
449                                severity: ValidationSeverity::Warning,
450                                code: "PDF_A_ANNOTATION_APPEARANCE".to_string(),
451                                message: "Annotations should have appearance streams in PDF/A-1b"
452                                    .to_string(),
453                                node_id: None,
454                                location: Some("Annotation appearance".to_string()),
455                                suggestion: Some(format!(
456                                    "Annotation of type '{}' lacks appearance",
457                                    subtype_str
458                                )),
459                            });
460                        }
461                    }
462                }
463            }
464        }
465    }
466
467    fn validate_forms(&self, report: &mut ValidationReport, document: &PdfDocument) {
468        if let Some(catalog_dict) = document.get_catalog() {
469            if let Some(acroform_value) = catalog_dict.get("AcroForm") {
470                if let Some(acroform_dict) = acroform_value.as_dict() {
471                    if acroform_dict.contains_key("XFA") {
472                        report.add_issue(ValidationIssue {
473                            severity: ValidationSeverity::Error,
474                            code: "PDF_A_XFA".to_string(),
475                            message: "XFA forms are not permitted in PDF/A-1b".to_string(),
476                            node_id: None,
477                            location: Some("Form restrictions".to_string()),
478                            suggestion: Some("Use AcroForm instead of XFA".to_string()),
479                        });
480                    }
481                }
482            }
483        }
484    }
485
486    fn validate_encryption(&self, report: &mut ValidationReport, document: &PdfDocument) {
487        if document.metadata.encrypted {
488            report.add_issue(ValidationIssue {
489                severity: ValidationSeverity::Error,
490                code: "PDF_A_ENCRYPTION".to_string(),
491                message: "PDF/A-1b documents must not be encrypted".to_string(),
492                node_id: None,
493                location: Some("Encryption restrictions".to_string()),
494                suggestion: Some("Remove all encryption from the document".to_string()),
495            });
496        }
497    }
498
499    fn validate_metadata(&self, report: &mut ValidationReport, document: &PdfDocument) {
500        let mut has_xmp_metadata = false;
501
502        if let Some(catalog_dict) = document.get_catalog() {
503            if catalog_dict.contains_key("Metadata") {
504                has_xmp_metadata = true;
505            }
506        }
507
508        if !has_xmp_metadata {
509            report.add_issue(ValidationIssue {
510                severity: ValidationSeverity::Error,
511                code: "PDF_A_XMP_METADATA".to_string(),
512                message: "PDF/A-1b requires XMP metadata in catalog".to_string(),
513                node_id: None,
514                location: Some("Metadata requirements".to_string()),
515                suggestion: Some("Add XMP metadata stream to document catalog".to_string()),
516            });
517        }
518
519        // Full XMP-Info synchronization requires parsing XMP content
520        if self.strict_mode {
521            report.add_issue(ValidationIssue {
522                severity: ValidationSeverity::Warning,
523                code: "PDF_A_METADATA_SYNC".to_string(),
524                message: "Verify XMP metadata synchronization with Info dictionary".to_string(),
525                node_id: None,
526                location: Some("Metadata synchronization".to_string()),
527                suggestion: None,
528            });
529        }
530    }
531
532    fn validate_transparency(&self, report: &mut ValidationReport, document: &PdfDocument) {
533        for node in document.ast.get_all_nodes() {
534            if let Some(dict) = node.as_dict() {
535                // BM=blend mode, CA/ca=opacity, SMask=soft mask - all indicate transparency
536                if dict.contains_key("BM")
537                    || dict.contains_key("CA")
538                    || dict.contains_key("ca")
539                    || dict.contains_key("SMask")
540                {
541                    report.add_issue(ValidationIssue {
542                        severity: ValidationSeverity::Error,
543                        code: "PDF_A_TRANSPARENCY".to_string(),
544                        message: "PDF/A-1b does not permit transparency in graphics states"
545                            .to_string(),
546                        node_id: Some(node.id),
547                        location: Some("Graphics state".to_string()),
548                        suggestion: Some("Remove transparency effects from ExtGState".to_string()),
549                    });
550                    return; // Found transparency, report and exit
551                }
552
553                if let Some(type_value) = dict.get("Type") {
554                    if let Some(type_name) = type_value.as_name() {
555                        if type_name.without_slash() == "Group" {
556                            if let Some(s_value) = dict.get("S") {
557                                if let Some(s_name) = s_value.as_name() {
558                                    if s_name.without_slash() == "Transparency" {
559                                        report.add_issue(ValidationIssue {
560                                            severity: ValidationSeverity::Error,
561                                            code: "PDF_A_TRANSPARENCY".to_string(),
562                                            message: "PDF/A-1b does not permit transparency groups"
563                                                .to_string(),
564                                            node_id: Some(node.id),
565                                            location: Some("Transparency group".to_string()),
566                                            suggestion: Some(
567                                                "Remove transparency group specification"
568                                                    .to_string(),
569                                            ),
570                                        });
571                                        return;
572                                    }
573                                }
574                            }
575                        }
576                    }
577                }
578
579                if let Some(s_value) = dict.get("S") {
580                    if let Some(s_name) = s_value.as_name() {
581                        if s_name.without_slash() == "Transparency" {
582                            if let Some(type_value) = dict.get("Type") {
583                                if let Some(type_name) = type_value.as_name() {
584                                    if type_name.without_slash() == "Group" {
585                                        report.add_issue(ValidationIssue {
586                                            severity: ValidationSeverity::Error,
587                                            code: "PDF_A_TRANSPARENCY".to_string(),
588                                            message: "PDF/A-1b does not permit transparency groups"
589                                                .to_string(),
590                                            node_id: Some(node.id),
591                                            location: Some("Transparency group".to_string()),
592                                            suggestion: Some(
593                                                "Remove transparency group specification"
594                                                    .to_string(),
595                                            ),
596                                        });
597                                        return;
598                                    }
599                                }
600                            }
601                        }
602                    }
603                }
604
605                if let Some(group_value) = dict.get("Group") {
606                    if let Some(group_dict) = group_value.as_dict() {
607                        if let Some(s_value) = group_dict.get("S") {
608                            if let Some(s_name) = s_value.as_name() {
609                                if s_name.without_slash() == "Transparency" {
610                                    report.add_issue(ValidationIssue {
611                                        severity: ValidationSeverity::Error,
612                                        code: "PDF_A_TRANSPARENCY".to_string(),
613                                        message: "PDF/A-1b does not permit transparency groups"
614                                            .to_string(),
615                                        node_id: Some(node.id),
616                                        location: Some("Transparency group".to_string()),
617                                        suggestion: Some(
618                                            "Remove transparency group specification".to_string(),
619                                        ),
620                                    });
621                                    return;
622                                }
623                            }
624                        }
625                    }
626                }
627            }
628        }
629    }
630
631    fn validate_file_specification(&self, report: &mut ValidationReport, document: &PdfDocument) {
632        if document.metadata.has_embedded_files {
633            report.add_issue(ValidationIssue {
634                severity: ValidationSeverity::Error,
635                code: "PDF_A_EMBEDDED_FILES".to_string(),
636                message: "PDF/A-1b does not permit embedded files".to_string(),
637                node_id: None,
638                location: Some("File specification restrictions".to_string()),
639                suggestion: Some("Remove all embedded file attachments".to_string()),
640            });
641        }
642    }
643
644    fn validate_cross_reference(&self, report: &mut ValidationReport, document: &PdfDocument) {
645        // PDF/A-1b allows tables or streams, but mixing both is discouraged
646        let has_xref_tables = !document.xref.entries.is_empty();
647        let has_xref_streams = !document.xref.streams.is_empty();
648
649        if has_xref_tables && has_xref_streams {
650            report.add_issue(ValidationIssue {
651                severity: ValidationSeverity::Warning,
652                code: "PDF_A_XREF_FORMAT".to_string(),
653                message: "Mixed cross-reference formats detected".to_string(),
654                node_id: None,
655                location: Some("Cross-reference validation".to_string()),
656                suggestion: Some(
657                    "Consider using consistent cross-reference format throughout".to_string(),
658                ),
659            });
660        }
661    }
662}
663
664impl Default for PdfA1bValidator {
665    fn default() -> Self {
666        Self::new()
667    }
668}