Skip to main content

pdf_compliance/
lib.rs

1#![deny(missing_docs)]
2//! PDF compliance checking: PDF/A, PDF/UA, and PDF/X.
3//!
4//! Validates PDF documents against conformance profiles defined by:
5//! - **ISO 19005** — PDF/A archival format (parts 1–4)
6//! - **ISO 14289** — PDF/UA accessibility
7//! - **ISO 15930** — PDF/X prepress exchange
8//!
9//! # Quick Start
10//!
11//! ```no_run
12//! use std::sync::Arc;
13//! use pdf_syntax::Pdf;
14//! use pdf_compliance::{preferred_pdfa_level, validate_pdfa, Severity};
15//!
16//! let data = Arc::new(std::fs::read("document.pdf").unwrap());
17//! let pdf = Pdf::new(data).unwrap();
18//!
19//! // Prefer the declared level, but promote PDF/A-1 inputs to PDF/A-2B when
20//! // the source uses features like xref streams, transparency, or JPEG2000.
21//! let level = preferred_pdfa_level(&pdf);
22//! let report = validate_pdfa(&pdf, level);
23//!
24//! if report.is_compliant() {
25//!     println!("PDF/A-{}{} compliant", level.part(), level.conformance());
26//! } else {
27//!     println!("{} error(s), {} warning(s)", report.error_count(), report.warning_count());
28//!     for issue in &report.issues {
29//!         if issue.severity == Severity::Error {
30//!             println!("  [{}] {:?}: {}", issue.rule, issue.severity, issue.message);
31//!         }
32//!     }
33//! }
34//! ```
35//!
36//! # Key Types
37//!
38//! | Type | Description |
39//! |---|---|
40//! | [`PdfALevel`] | PDF/A conformance level: `A1b`, `A2b`, `A2u`, `A3b`, `A4`, … |
41//! | [`PdfXLevel`] | PDF/X level: `X1a2003`, `X32003`, `X4` |
42//! | [`ComplianceReport`] | Validation outcome with issue list and pass/fail flag |
43//! | [`ComplianceIssue`] | Rule ID, severity, message, and optional location |
44//! | [`Severity`] | `Error`, `Warning`, `Info` |
45
46pub(crate) mod pdfa;
47pub(crate) mod pdfua;
48pub(crate) mod pdfx;
49pub mod tagged;
50
51pub(crate) mod check;
52mod xmp;
53
54use pdf_syntax::Pdf;
55
/// PDF/A conformance level, covering ISO 19005 parts 1 through 4.
///
/// Every variant combines a part number (see [`PdfALevel::part`]) with a
/// conformance letter (see [`PdfALevel::conformance`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PdfALevel {
    /// PDF/A-1a: part 1, conformance level A (tagged, accessible).
    A1a,
    /// PDF/A-1b: part 1, conformance level B (basic).
    A1b,
    /// PDF/A-2a: part 2, conformance level A (tagged, accessible).
    A2a,
    /// PDF/A-2b: part 2, conformance level B (basic).
    A2b,
    /// PDF/A-2u: part 2, conformance level U (Unicode).
    A2u,
    /// PDF/A-3a: part 3, conformance level A (tagged, accessible).
    A3a,
    /// PDF/A-3b: part 3, conformance level B (basic).
    A3b,
    /// PDF/A-3u: part 3, conformance level U (Unicode).
    A3u,
    /// PDF/A-4: the ISO 19005-4 base level, which has no conformance letter.
    A4,
    /// PDF/A-4f: part 4 variant that allows file attachments.
    A4f,
    /// PDF/A-4e: part 4 variant that allows engineering content (3D, rich media).
    A4e,
}

impl PdfALevel {
    /// Returns the ISO 19005 part number (1 to 4) this level belongs to.
    pub fn part(self) -> u8 {
        match self {
            Self::A4 | Self::A4e | Self::A4f => 4,
            Self::A3a | Self::A3b | Self::A3u => 3,
            Self::A2a | Self::A2b | Self::A2u => 2,
            Self::A1a | Self::A1b => 1,
        }
    }

    /// Returns the conformance letter in upper case: `"A"`, `"B"`, `"U"`,
    /// `"F"` or `"E"`. The PDF/A-4 base level yields the empty string.
    pub fn conformance(self) -> &'static str {
        match self {
            Self::A4 => "",
            Self::A4e => "E",
            Self::A4f => "F",
            Self::A2u | Self::A3u => "U",
            Self::A1b | Self::A2b | Self::A3b => "B",
            Self::A1a | Self::A2a | Self::A3a => "A",
        }
    }

    /// Whether this level requires tagged PDF (conformance level "A").
    pub fn requires_tagged(self) -> bool {
        // Only PDF/A-1A, 2A and 3A carry the "A" letter. PDF/A-4 and its
        // variants (4, 4e, 4f) have no "A" conformance level and therefore
        // never require tagged content. (#FP-6.6.1)
        self.conformance() == "A"
    }

    /// Builds a level from a part number and a conformance letter.
    ///
    /// The letter is compared case-insensitively (ASCII). For a known part,
    /// any letter that is not explicitly recognised falls back to that part's
    /// baseline level: `A1b`, `A2b`, `A3b`, or `A4`. Returns `None` only when
    /// `part` is not in `1..=4`.
    pub fn from_parts(part: u8, conformance: &str) -> Option<Self> {
        let letter = conformance.to_ascii_uppercase();
        let level = match part {
            1 => match letter.as_str() {
                "A" => Self::A1a,
                _ => Self::A1b,
            },
            2 => match letter.as_str() {
                "A" => Self::A2a,
                "U" => Self::A2u,
                _ => Self::A2b,
            },
            3 => match letter.as_str() {
                "A" => Self::A3a,
                "U" => Self::A3u,
                _ => Self::A3b,
            },
            4 => match letter.as_str() {
                "F" => Self::A4f,
                "E" => Self::A4e,
                _ => Self::A4,
            },
            _ => return None,
        };
        Some(level)
    }
}
131
/// PDF/X conformance level, as standardised by ISO 15930.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PdfXLevel {
    /// PDF/X-1a:2003: CMYK-only, no transparency.
    X1a2003,
    /// PDF/X-3:2003: allows color-managed workflows, no transparency.
    X32003,
    /// PDF/X-4: allows transparency and ICC-based colors.
    X4,
}

impl PdfXLevel {
    /// Returns `true` for the levels that do not permit transparency
    /// (PDF/X-1a:2003 and PDF/X-3:2003), `false` for PDF/X-4.
    pub fn forbids_transparency(self) -> bool {
        match self {
            Self::X4 => false,
            Self::X1a2003 | Self::X32003 => true,
        }
    }

    /// Returns the display name of this level, e.g. `"PDF/X-4"`.
    pub fn version_string(self) -> &'static str {
        match self {
            Self::X4 => "PDF/X-4",
            Self::X32003 => "PDF/X-3:2003",
            Self::X1a2003 => "PDF/X-1a:2003",
        }
    }

    /// Returns the GTS version identifier used in XMP metadata.
    ///
    /// The identifiers currently coincide with [`PdfXLevel::version_string`],
    /// but the table is kept separate so the two can diverge independently.
    pub fn gts_version(self) -> &'static str {
        match self {
            Self::X4 => "PDF/X-4",
            Self::X32003 => "PDF/X-3:2003",
            Self::X1a2003 => "PDF/X-1a:2003",
        }
    }
}
167
/// A single compliance issue found during checking.
///
/// Issues are collected in [`ComplianceReport::issues`]; the [`Severity`]
/// of each issue decides whether it counts against compliance.
#[derive(Debug, Clone)]
pub struct ComplianceIssue {
    /// Rule identifier (e.g., "6.1.2" for PDF/A clause).
    pub rule: String,
    /// Issue severity; only [`Severity::Error`] makes
    /// [`ComplianceReport::is_compliant`] return `false`.
    pub severity: Severity,
    /// Human-readable description.
    pub message: String,
    /// Location in the document (object number, page, etc.), or `None`
    /// when no location is recorded for the issue.
    pub location: Option<String>,
}
180
/// Severity of a compliance issue.
///
/// [`ComplianceReport::is_compliant`] only considers [`Severity::Error`];
/// warnings and informational issues do not affect its result.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Severity {
    /// Conformance violation — document is non-compliant.
    Error,
    /// Potential issue that may affect compliance.
    Warning,
    /// Informational observation.
    Info,
}
191
192/// A complete compliance report.
193#[derive(Debug, Clone, Default)]
194pub struct ComplianceReport {
195    /// All issues found during the check.
196    pub issues: Vec<ComplianceIssue>,
197    /// The checked conformance level (if PDF/A).
198    pub pdfa_level: Option<PdfALevel>,
199    /// Whether the document is compliant.
200    pub compliant: bool,
201}
202
203impl ComplianceReport {
204    /// Returns `true` if no errors were found (warnings/info are allowed).
205    pub fn is_compliant(&self) -> bool {
206        !self.issues.iter().any(|i| i.severity == Severity::Error)
207    }
208
209    /// Number of errors.
210    pub fn error_count(&self) -> usize {
211        self.issues
212            .iter()
213            .filter(|i| i.severity == Severity::Error)
214            .count()
215    }
216
217    /// Number of warnings.
218    pub fn warning_count(&self) -> usize {
219        self.issues
220            .iter()
221            .filter(|i| i.severity == Severity::Warning)
222            .count()
223    }
224}
225
/// Validate a PDF against a PDF/A conformance level.
///
/// This is a thin public entry point that forwards to `pdfa::validate`.
/// Inspect the returned [`ComplianceReport`] (for example via
/// [`ComplianceReport::is_compliant`]) to obtain the verdict.
///
/// A level to check against can be obtained from [`detect_pdfa_level`] or
/// [`preferred_pdfa_level`].
#[must_use]
pub fn validate_pdfa(pdf: &Pdf, level: PdfALevel) -> ComplianceReport {
    pdfa::validate(pdf, level)
}
231
/// Like [`validate_pdfa`] but prints per-check timing to stderr.
///
/// Forwards to `pdfa::validate_timed`; takes the same arguments and returns
/// the same report type as [`validate_pdfa`].
#[must_use]
pub fn validate_pdfa_timed(pdf: &Pdf, level: PdfALevel) -> ComplianceReport {
    pdfa::validate_timed(pdf, level)
}
237
/// Like [`validate_pdfa`] but updates a progress tracker with the name of the
/// current check.  Useful for diagnosing timeouts — the caller can read the
/// tracker to see which check was last running.
///
/// `progress` is the caller-owned tracker; it is passed unchanged to
/// `pdfa::validate_with_progress`, which performs the actual updates.
#[must_use]
pub fn validate_pdfa_with_progress(
    pdf: &Pdf,
    level: PdfALevel,
    progress: &std::sync::Mutex<String>,
) -> ComplianceReport {
    pdfa::validate_with_progress(pdf, level, progress)
}
249
250/// Detect the PDF/A level declared in XMP metadata.
251///
252/// Uses lenient parsing: extracts part/conformance even when the pdfaid namespace URI is
253/// wrong, matching veraPDF's profile-selection behaviour. Compliance violations (§6.7.9,
254/// §6.7.11) are still reported by the strict checks in `validate_pdfa`.
255#[must_use]
256pub fn detect_pdfa_level(pdf: &Pdf) -> Option<PdfALevel> {
257    let xmp = check::get_xmp_metadata(pdf)?;
258    let (part, conformance) = check::parse_xmp_pdfa_lenient(&xmp)?;
259    PdfALevel::from_parts(part, &conformance)
260}
261
262/// Choose the preferred PDF/A level for validating or converting a source PDF.
263///
264/// Policy:
265/// - keep the declared XMP level when it is already PDF/A-2 or later;
266/// - promote declared PDF/A-1 documents to `A2b` when the source uses
267///   cross-reference streams, transparency, or JPEG2000;
268/// - default to `A2b` when no PDF/A level is declared.
269#[must_use]
270pub fn preferred_pdfa_level(pdf: &Pdf) -> PdfALevel {
271    match detect_pdfa_level(pdf) {
272        Some(level) if level.part() >= 2 => level,
273        Some(_level)
274            if check::has_xref_streams(pdf)
275                || check::uses_transparency(pdf)
276                || check::uses_jpeg2000(pdf) =>
277        {
278            PdfALevel::A2b
279        }
280        Some(level) => level,
281        None => PdfALevel::A2b,
282    }
283}
284
285/// Validate a PDF against PDF/UA-1 (ISO 14289-1).
286pub fn validate_pdfua(pdf: &Pdf) -> ComplianceReport {
287    pdfua::validate(pdf)
288}
289
290/// Validate a PDF against a PDF/X conformance level.
291pub fn validate_pdfx(pdf: &Pdf, level: PdfXLevel) -> ComplianceReport {
292    pdfx::validate(pdf, level)
293}
294
295/// Parse the structure tree from a PDF.
296pub fn parse_structure_tree(pdf: &Pdf) -> Option<tagged::StructureTree> {
297    tagged::parse(pdf)
298}
299
#[cfg(test)]
mod tests {
    use super::{detect_pdfa_level, preferred_pdfa_level, PdfALevel};
    use pdf_syntax::Pdf;

    // -----------------------------------------------------------------------
    // Minimal PDF builder (no external test deps: pure bytes)
    // -----------------------------------------------------------------------

    /// Build a minimal PDF from `(obj_number, body_bytes)` pairs.
    ///
    /// Objects are emitted with a correct cross-reference table so that
    /// pdf-syntax can parse them. Gaps in the numbering become free entries.
    /// The trailer always names object 1 as `/Root`.
    fn build_pdf(objs: &[(u32, &[u8])]) -> Vec<u8> {
        let max = objs.iter().map(|(n, _)| *n).max().unwrap_or(0);
        let mut pdf: Vec<u8> = b"%PDF-1.4\n".to_vec();
        let mut offsets = std::collections::HashMap::new();
        for (n, body) in objs {
            // Record the byte offset of each object for the xref table.
            offsets.insert(*n, pdf.len());
            pdf.extend_from_slice(format!("{n} 0 obj\n").as_bytes());
            pdf.extend_from_slice(body);
            pdf.extend_from_slice(b"\nendobj\n");
        }
        let xref_off = pdf.len();
        pdf.extend_from_slice(format!("xref\n0 {}\n", max + 1).as_bytes());
        // Object 0 is always the head of the free list.
        pdf.extend_from_slice(b"0000000000 65535 f \n");
        for n in 1..=max {
            match offsets.get(&n) {
                Some(off) => pdf.extend_from_slice(format!("{off:010} 00000 n \n").as_bytes()),
                None => pdf.extend_from_slice(b"0000000000 65535 f \n"),
            }
        }
        pdf.extend_from_slice(
            format!(
                "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF",
                max + 1
            )
            .as_bytes(),
        );
        pdf
    }

    /// XMP packet declaring PDF/A-1b (`pdfaid:part` 1, `pdfaid:conformance` B).
    fn pdfa1b_xmp() -> Vec<u8> {
        br#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description rdf:about="" xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
      <pdfaid:part>1</pdfaid:part>
      <pdfaid:conformance>B</pdfaid:conformance>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>"#
            .to_vec()
    }

    /// Body of a `/Metadata` stream object wrapping the PDF/A-1b XMP packet.
    ///
    /// Shared by every test that needs a declared PDF/A-1b level; `/Length`
    /// is computed from the packet so the stream dictionary stays accurate.
    fn pdfa1b_metadata_object() -> Vec<u8> {
        let xmp = pdfa1b_xmp();
        let len = xmp.len();
        let mut body = format!(
            "<< /Type /Metadata /Subtype /XML /Length {len} >>\nstream\n"
        )
        .into_bytes();
        body.extend_from_slice(&xmp);
        body.extend_from_slice(b"\nendstream");
        body
    }

    fn parse_pdf(bytes: Vec<u8>) -> Pdf {
        Pdf::new(bytes).unwrap()
    }

    // -----------------------------------------------------------------------
    // Tests
    // -----------------------------------------------------------------------

    #[test]
    fn preferred_level_defaults_to_a2b_without_xmp() {
        // Minimal 1-page PDF with no XMP — no declared level.
        let bytes = build_pdf(&[
            (1, b"<< /Type /Catalog /Pages 2 0 R >>"),
            (2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>"),
            (3, b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>"),
        ]);
        let pdf = parse_pdf(bytes);
        assert_eq!(detect_pdfa_level(&pdf), None);
        assert_eq!(preferred_pdfa_level(&pdf), PdfALevel::A2b);
    }

    #[test]
    fn preferred_level_keeps_declared_a1b_when_source_is_part1_compatible() {
        // 1-page PDF with PDF/A-1b XMP, classic xref table, no transparency, no JPX.
        let xmp_body = pdfa1b_metadata_object();
        let bytes = build_pdf(&[
            (1, b"<< /Type /Catalog /Pages 2 0 R /Metadata 4 0 R >>"),
            (2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>"),
            (3, b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>"),
            (4, &xmp_body),
        ]);
        let pdf = parse_pdf(bytes);
        assert_eq!(detect_pdfa_level(&pdf), Some(PdfALevel::A1b));
        assert!(!crate::check::has_xref_streams(&pdf));
        assert!(!crate::check::uses_transparency(&pdf));
        assert!(!crate::check::uses_jpeg2000(&pdf));
        assert_eq!(preferred_pdfa_level(&pdf), PdfALevel::A1b);
    }

    #[test]
    fn preferred_level_promotes_declared_a1b_for_xref_streams() {
        // PDF/A-1b XMP + cross-reference stream (has_xref_streams checks for
        // "/XRefStm" in the raw bytes OR a startxref pointing to a stream).
        // Easiest: embed "/XRefStm 0" in the file so the byte scan hits it.
        let xmp_body = pdfa1b_metadata_object();
        // Build the normal table-based PDF, then append an "/XRefStm" hint.
        let mut bytes = build_pdf(&[
            (1, b"<< /Type /Catalog /Pages 2 0 R /Metadata 4 0 R >>"),
            (2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>"),
            (3, b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>"),
            (4, &xmp_body),
        ]);
        // Inject the marker so has_xref_streams returns true.
        bytes.extend_from_slice(b"\n%% /XRefStm 0\n");
        let pdf = parse_pdf(bytes);
        assert_eq!(detect_pdfa_level(&pdf), Some(PdfALevel::A1b));
        assert!(crate::check::has_xref_streams(&pdf));
        assert_eq!(preferred_pdfa_level(&pdf), PdfALevel::A2b);
    }

    #[test]
    fn preferred_level_promotes_declared_a1b_for_transparency() {
        // PDF/A-1b XMP + page with a transparent ExtGState (ca < 1).
        let xmp_body = pdfa1b_metadata_object();
        let bytes = build_pdf(&[
            (1, b"<< /Type /Catalog /Pages 2 0 R /Metadata 5 0 R >>"),
            (2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>"),
            // Page has /Resources with an ExtGState containing /ca 0.5
            (
                3,
                b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] \
                  /Resources << /ExtGState << /GS1 4 0 R >> >> >>",
            ),
            (4, b"<< /Type /ExtGState /ca 0.5 >>"),
            (5, &xmp_body),
        ]);
        let pdf = parse_pdf(bytes);
        assert_eq!(detect_pdfa_level(&pdf), Some(PdfALevel::A1b));
        assert!(crate::check::uses_transparency(&pdf));
        assert_eq!(preferred_pdfa_level(&pdf), PdfALevel::A2b);
    }

    #[test]
    fn preferred_level_promotes_declared_a1b_for_jpeg2000() {
        // PDF/A-1b XMP + an image stream with /Filter /JPXDecode.
        let xmp_body = pdfa1b_metadata_object();
        // 8 placeholder bytes as "image" content (not real JPEG2000; the check
        // only looks for /Filter /JPXDecode, not decoding validity).
        let img_body =
            b"<< /Type /XObject /Subtype /Image /Width 1 /Height 1 \
              /BitsPerComponent 8 /ColorSpace /DeviceGray /Filter /JPXDecode \
              /Length 8 >>\nstream\n00000000\nendstream";
        let bytes = build_pdf(&[
            (1, b"<< /Type /Catalog /Pages 2 0 R /Metadata 5 0 R >>"),
            (2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>"),
            (3, b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>"),
            (4, img_body),
            (5, &xmp_body),
        ]);
        let pdf = parse_pdf(bytes);
        assert_eq!(detect_pdfa_level(&pdf), Some(PdfALevel::A1b));
        assert!(crate::check::uses_jpeg2000(&pdf));
        assert_eq!(preferred_pdfa_level(&pdf), PdfALevel::A2b);
    }
}