pdf_oxide 0.3.22

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
//! PDF/A compliance types and data structures.

use std::fmt;

/// PDF/A conformance level.
///
/// Every variant pairs a part of the standard (1, 2 or 3) with a conformance
/// letter (`a`, `b` or `u`); see [`PdfALevel::part`] and
/// [`PdfALevel::conformance`] for the two halves.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PdfALevel {
    /// PDF/A-1a: part 1, full conformance including logical structure.
    A1a,
    /// PDF/A-1b: part 1, basic conformance (visual preservation).
    A1b,
    /// PDF/A-2a: part 2 (PDF 1.7 based), full conformance.
    A2a,
    /// PDF/A-2b: part 2 (PDF 1.7 based), basic conformance.
    A2b,
    /// PDF/A-2u: PDF/A-2b with the addition of Unicode mapping.
    A2u,
    /// PDF/A-3a: PDF/A-2a with the addition of embedded files.
    A3a,
    /// PDF/A-3b: PDF/A-2b with the addition of embedded files.
    A3b,
    /// PDF/A-3u: PDF/A-3b with the addition of Unicode mapping.
    A3u,
}

impl PdfALevel {
    /// Returns the part of the standard (1, 2, or 3) this level belongs to.
    pub fn part(&self) -> PdfAPart {
        match *self {
            Self::A1a | Self::A1b => PdfAPart::Part1,
            Self::A2a | Self::A2b | Self::A2u => PdfAPart::Part2,
            Self::A3a | Self::A3b | Self::A3u => PdfAPart::Part3,
        }
    }

    /// Returns the conformance letter in uppercase: `'A'`, `'B'` or `'U'`.
    pub fn conformance(&self) -> char {
        match *self {
            Self::A1a | Self::A2a | Self::A3a => 'A',
            Self::A1b | Self::A2b | Self::A3b => 'B',
            Self::A2u | Self::A3u => 'U',
        }
    }

    /// Returns `true` when the level requires logical structure (Tagged PDF).
    ///
    /// That is exactly the set of levels whose conformance letter is `A`.
    pub fn requires_structure(&self) -> bool {
        self.conformance() == 'A'
    }

    /// Returns `true` when the level requires Unicode mapping.
    ///
    /// This holds for the `A` and `U` conformance letters and never for `B`.
    pub fn requires_unicode(&self) -> bool {
        matches!(self.conformance(), 'A' | 'U')
    }

    /// Returns `true` when transparency is allowed (every part except part 1).
    pub fn allows_transparency(&self) -> bool {
        self.part() != PdfAPart::Part1
    }

    /// Returns `true` when JPEG2000 is allowed (every part except part 1).
    pub fn allows_jpeg2000(&self) -> bool {
        self.part() != PdfAPart::Part1
    }

    /// Returns `true` when arbitrary embedded files are allowed (part 3 only).
    pub fn allows_embedded_files(&self) -> bool {
        self.part() == PdfAPart::Part3
    }

    /// Returns the value to use for the XMP `pdfaid:part` property.
    pub fn xmp_part(&self) -> &'static str {
        match *self {
            Self::A1a | Self::A1b => "1",
            Self::A2a | Self::A2b | Self::A2u => "2",
            Self::A3a | Self::A3b | Self::A3u => "3",
        }
    }

    /// Returns the value to use for the XMP `pdfaid:conformance` property.
    pub fn xmp_conformance(&self) -> &'static str {
        match *self {
            Self::A1a | Self::A2a | Self::A3a => "A",
            Self::A1b | Self::A2b | Self::A3b => "B",
            Self::A2u | Self::A3u => "U",
        }
    }

    /// Parses a level from XMP `pdfaid:part` and `pdfaid:conformance` values.
    ///
    /// `part` must match `"1"`, `"2"` or `"3"` exactly; `conformance` is
    /// compared case-insensitively. Returns `None` for any combination that is
    /// not one of the eight variants (for example part 1 with letter `U`).
    pub fn from_xmp(part: &str, conformance: &str) -> Option<Self> {
        let letter = conformance.to_uppercase();
        match part {
            "1" => match letter.as_str() {
                "A" => Some(Self::A1a),
                "B" => Some(Self::A1b),
                _ => None,
            },
            "2" => match letter.as_str() {
                "A" => Some(Self::A2a),
                "B" => Some(Self::A2b),
                "U" => Some(Self::A2u),
                _ => None,
            },
            "3" => match letter.as_str() {
                "A" => Some(Self::A3a),
                "B" => Some(Self::A3b),
                "U" => Some(Self::A3u),
                _ => None,
            },
            _ => None,
        }
    }
}

impl fmt::Display for PdfALevel {
    /// Writes the conventional name of the level, e.g. `PDF/A-2u`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            Self::A1a => "PDF/A-1a",
            Self::A1b => "PDF/A-1b",
            Self::A2a => "PDF/A-2a",
            Self::A2b => "PDF/A-2b",
            Self::A2u => "PDF/A-2u",
            Self::A3a => "PDF/A-3a",
            Self::A3b => "PDF/A-3b",
            Self::A3u => "PDF/A-3u",
        };
        f.write_str(label)
    }
}

/// PDF/A part, i.e. the version of the standard.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PdfAPart {
    /// PDF/A-1 (based on PDF 1.4).
    Part1,
    /// PDF/A-2 (based on PDF 1.7).
    Part2,
    /// PDF/A-3 (based on PDF 1.7, with embedded files).
    Part3,
}

impl fmt::Display for PdfAPart {
    /// Writes the part name without a conformance letter, e.g. `PDF/A-2`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            Self::Part1 => "PDF/A-1",
            Self::Part2 => "PDF/A-2",
            Self::Part3 => "PDF/A-3",
        };
        f.write_str(label)
    }
}

/// Outcome of validating a document against a PDF/A level.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// `true` when the document complies with the target level.
    pub is_compliant: bool,
    /// The level the document was validated against.
    pub level: PdfALevel,
    /// PDF/A level detected from XMP metadata, if any.
    pub detected_level: Option<PdfALevel>,
    /// Compliance errors (violations).
    pub errors: Vec<ComplianceError>,
    /// Compliance warnings (non-fatal issues).
    pub warnings: Vec<ComplianceWarning>,
    /// Summary statistics.
    pub stats: ValidationStats,
}

impl Default for ValidationResult {
    /// Builds an empty, non-compliant result that targets [`PdfALevel::A2b`].
    fn default() -> Self {
        Self::new(PdfALevel::A2b)
    }
}

impl ValidationResult {
    /// Creates an empty result for `level`.
    ///
    /// The result starts out non-compliant, with no detected level, no errors,
    /// no warnings and default statistics.
    pub fn new(level: PdfALevel) -> Self {
        Self {
            is_compliant: false,
            level,
            detected_level: None,
            errors: Vec::new(),
            warnings: Vec::new(),
            stats: ValidationStats::default(),
        }
    }

    /// Records a violation and marks the result as non-compliant.
    pub fn add_error(&mut self, error: ComplianceError) {
        self.is_compliant = false;
        self.errors.push(error);
    }

    /// Records a warning; the compliance flag is left untouched.
    pub fn add_warning(&mut self, warning: ComplianceWarning) {
        self.warnings.push(warning);
    }

    /// Returns `true` if at least one error has been recorded.
    pub fn has_errors(&self) -> bool {
        !self.errors.is_empty()
    }

    /// Returns `true` if at least one warning has been recorded.
    pub fn has_warnings(&self) -> bool {
        !self.warnings.is_empty()
    }
}

/// Validation statistics.
///
/// A plain bag of counters; the derived `Default` starts every counter at zero.
#[derive(Debug, Clone, Default)]
pub struct ValidationStats {
    /// Number of fonts checked.
    pub fonts_checked: usize,
    /// Number of fonts embedded.
    pub fonts_embedded: usize,
    /// Number of images checked.
    pub images_checked: usize,
    /// Number of color spaces checked.
    pub color_spaces_checked: usize,
    /// Number of annotations checked.
    pub annotations_checked: usize,
    /// Number of pages checked.
    pub pages_checked: usize,
}

/// A single PDF/A violation found during validation.
#[derive(Debug, Clone)]
pub struct ComplianceError {
    /// Machine-readable error code.
    pub code: ErrorCode,
    /// Human-readable description of the violation.
    pub message: String,
    /// Where in the document the violation occurred, if applicable.
    pub location: Option<String>,
    /// Reference to the relevant clause of the standard, if known.
    pub clause: Option<String>,
}

impl ComplianceError {
    /// Creates an error with the given code and message and no location or clause.
    pub fn new(code: ErrorCode, message: impl Into<String>) -> Self {
        let message = message.into();
        Self {
            code,
            message,
            location: None,
            clause: None,
        }
    }

    /// Builder-style setter: returns the error with `location` attached.
    pub fn with_location(self, location: impl Into<String>) -> Self {
        Self {
            location: Some(location.into()),
            ..self
        }
    }

    /// Builder-style setter: returns the error with the `clause` reference attached.
    pub fn with_clause(self, clause: impl Into<String>) -> Self {
        Self {
            clause: Some(clause.into()),
            ..self
        }
    }
}

impl fmt::Display for ComplianceError {
    /// Formats as `[CODE] message`, followed by ` (at LOCATION)` when a
    /// location is set. The clause reference is not part of the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "[{}] {}", self.code, self.message)?;
        match self.location.as_deref() {
            Some(place) => write!(f, " (at {})", place),
            None => Ok(()),
        }
    }
}

/// Error codes for PDF/A violations.
///
/// The `Display` implementation renders each code as a short identifier made
/// of a category prefix and a number, e.g. `FONT-001`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorCode {
    // ---- Metadata ----
    /// XMP metadata is missing.
    MissingXmpMetadata,
    /// The XMP metadata lacks PDF/A identification.
    MissingPdfaIdentification,
    /// The PDF/A identification is invalid.
    InvalidPdfaIdentification,
    /// XMP metadata is not synchronized with the document info.
    XmpMetadataMismatch,

    // ---- Fonts ----
    /// A font is not embedded.
    FontNotEmbedded,
    /// A font is missing required tables.
    FontMissingTables,
    /// A font has an invalid encoding.
    FontInvalidEncoding,
    /// A font is missing its ToUnicode CMap.
    FontMissingToUnicode,

    // ---- Color ----
    /// Device-dependent color is used without an output intent.
    DeviceColorWithoutIntent,
    /// The output intent is missing.
    MissingOutputIntent,
    /// An ICC profile is invalid.
    InvalidIccProfile,
    /// An ICC profile has an incompatible version.
    IccProfileVersionMismatch,

    // ---- Images ----
    /// An image uses unsupported compression.
    UnsupportedImageCompression,
    /// An image has an invalid color space.
    InvalidImageColorSpace,
    /// LZW compression is not allowed.
    LzwCompressionNotAllowed,

    // ---- Structure ----
    /// Document structure is missing (for level A).
    MissingDocumentStructure,
    /// The structure tree is invalid.
    InvalidStructureTree,
    /// The language specification is missing.
    MissingLanguage,

    // ---- Content ----
    /// Transparency is used (PDF/A-1).
    TransparencyNotAllowed,
    /// JavaScript is present.
    JavaScriptNotAllowed,
    /// Audio/video content is present.
    MultimediaNotAllowed,
    /// An external content reference is present.
    ExternalContentNotAllowed,
    /// Encryption is present.
    EncryptionNotAllowed,

    // ---- Annotations ----
    /// An annotation is invalid.
    InvalidAnnotation,
    /// A widget annotation has no appearance stream.
    MissingAppearanceStream,

    // ---- Actions ----
    /// An action has an invalid type.
    InvalidAction,
    /// A launch action is present but not allowed.
    LaunchActionNotAllowed,

    // ---- Files ----
    /// An embedded file is not allowed (PDF/A-1, PDF/A-2).
    EmbeddedFileNotAllowed,
    /// An embedded file is missing its AF relationship (PDF/A-3).
    MissingAfRelationship,

    // ---- Other ----
    /// A PostScript XObject is not allowed.
    PostScriptNotAllowed,
    /// A reference XObject is not allowed.
    ReferenceXObjectNotAllowed,
    /// There is an issue with optional content (layers).
    OptionalContentIssue,
}

impl fmt::Display for ErrorCode {
    /// Writes the short identifier for this code, e.g. `XMP-001`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match *self {
            // Metadata
            Self::MissingXmpMetadata => "XMP-001",
            Self::MissingPdfaIdentification => "XMP-002",
            Self::InvalidPdfaIdentification => "XMP-003",
            Self::XmpMetadataMismatch => "XMP-004",
            // Fonts
            Self::FontNotEmbedded => "FONT-001",
            Self::FontMissingTables => "FONT-002",
            Self::FontInvalidEncoding => "FONT-003",
            Self::FontMissingToUnicode => "FONT-004",
            // Color
            Self::DeviceColorWithoutIntent => "COLOR-001",
            Self::MissingOutputIntent => "COLOR-002",
            Self::InvalidIccProfile => "COLOR-003",
            Self::IccProfileVersionMismatch => "COLOR-004",
            // Images
            Self::UnsupportedImageCompression => "IMAGE-001",
            Self::InvalidImageColorSpace => "IMAGE-002",
            Self::LzwCompressionNotAllowed => "IMAGE-003",
            // Structure
            Self::MissingDocumentStructure => "STRUCT-001",
            Self::InvalidStructureTree => "STRUCT-002",
            Self::MissingLanguage => "STRUCT-003",
            // Content
            Self::TransparencyNotAllowed => "CONTENT-001",
            Self::JavaScriptNotAllowed => "CONTENT-002",
            Self::MultimediaNotAllowed => "CONTENT-003",
            Self::ExternalContentNotAllowed => "CONTENT-004",
            Self::EncryptionNotAllowed => "CONTENT-005",
            // Annotations
            Self::InvalidAnnotation => "ANNOT-001",
            Self::MissingAppearanceStream => "ANNOT-002",
            // Actions
            Self::InvalidAction => "ACTION-001",
            Self::LaunchActionNotAllowed => "ACTION-002",
            // Files
            Self::EmbeddedFileNotAllowed => "FILE-001",
            Self::MissingAfRelationship => "FILE-002",
            // Other
            Self::PostScriptNotAllowed => "XOBJ-001",
            Self::ReferenceXObjectNotAllowed => "XOBJ-002",
            Self::OptionalContentIssue => "OC-001",
        })
    }
}

/// A non-fatal issue found during validation.
#[derive(Debug, Clone)]
pub struct ComplianceWarning {
    /// Machine-readable warning code.
    pub code: WarningCode,
    /// Human-readable description of the issue.
    pub message: String,
    /// Where in the document the issue occurred, if applicable.
    pub location: Option<String>,
}

impl ComplianceWarning {
    /// Creates a warning with the given code and message and no location.
    pub fn new(code: WarningCode, message: impl Into<String>) -> Self {
        let message = message.into();
        Self {
            code,
            message,
            location: None,
        }
    }

    /// Builder-style setter: returns the warning with `location` attached.
    pub fn with_location(self, location: impl Into<String>) -> Self {
        Self {
            location: Some(location.into()),
            ..self
        }
    }
}

impl fmt::Display for ComplianceWarning {
    /// Formats as `[CODE] message`, followed by ` (at LOCATION)` when a
    /// location is set.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "[{}] {}", self.code, self.message)?;
        match self.location.as_deref() {
            Some(place) => write!(f, " (at {})", place),
            None => Ok(()),
        }
    }
}

/// Warning codes.
///
/// The `Display` implementation renders each code as `WARN-NNN`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WarningCode {
    /// A deprecated feature is used.
    DeprecatedFeature,
    /// The file size is large.
    LargeFileSize,
    /// Recommended metadata is missing.
    MissingRecommendedMetadata,
    /// A font subset is very small.
    SmallFontSubset,
    /// An image has a high resolution.
    HighResolutionImage,
    /// The structure is complex.
    ComplexStructure,
    /// Only a partial check was performed (full validation requires additional features).
    PartialCheck,
}

impl fmt::Display for WarningCode {
    /// Writes the short identifier for this code, e.g. `WARN-001`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match *self {
            Self::DeprecatedFeature => "WARN-001",
            Self::LargeFileSize => "WARN-002",
            Self::MissingRecommendedMetadata => "WARN-003",
            Self::SmallFontSubset => "WARN-004",
            Self::HighResolutionImage => "WARN-005",
            Self::ComplexStructure => "WARN-006",
            Self::PartialCheck => "WARN-007",
        })
    }
}

// Unit tests for the PDF/A compliance types declared in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // Spot-checks the part mapping and the structure / transparency /
    // embedded-file predicates on a handful of levels.
    #[test]
    fn test_pdf_a_level_properties() {
        assert_eq!(PdfALevel::A1a.part(), PdfAPart::Part1);
        assert_eq!(PdfALevel::A2b.part(), PdfAPart::Part2);
        assert_eq!(PdfALevel::A3u.part(), PdfAPart::Part3);

        assert!(PdfALevel::A1a.requires_structure());
        assert!(!PdfALevel::A1b.requires_structure());
        assert!(PdfALevel::A2a.requires_structure());

        assert!(!PdfALevel::A1a.allows_transparency());
        assert!(PdfALevel::A2b.allows_transparency());

        assert!(!PdfALevel::A2b.allows_embedded_files());
        assert!(PdfALevel::A3b.allows_embedded_files());
    }

    // The XMP accessors return the part digit and the uppercase conformance letter.
    #[test]
    fn test_pdf_a_level_xmp() {
        assert_eq!(PdfALevel::A1b.xmp_part(), "1");
        assert_eq!(PdfALevel::A1b.xmp_conformance(), "B");
        assert_eq!(PdfALevel::A2u.xmp_conformance(), "U");
    }

    // Parsing accepts the conformance letter in either case and rejects an unknown part.
    #[test]
    fn test_pdf_a_level_from_xmp() {
        assert_eq!(PdfALevel::from_xmp("1", "A"), Some(PdfALevel::A1a));
        assert_eq!(PdfALevel::from_xmp("2", "b"), Some(PdfALevel::A2b));
        assert_eq!(PdfALevel::from_xmp("3", "U"), Some(PdfALevel::A3u));
        assert_eq!(PdfALevel::from_xmp("4", "A"), None);
    }

    // `Display` renders the conventional "PDF/A-<part><letter>" name.
    #[test]
    fn test_pdf_a_level_display() {
        assert_eq!(format!("{}", PdfALevel::A1b), "PDF/A-1b");
        assert_eq!(format!("{}", PdfALevel::A2u), "PDF/A-2u");
    }

    // A fresh result is non-compliant and has no errors; adding an error keeps it non-compliant.
    #[test]
    fn test_validation_result() {
        let mut result = ValidationResult::new(PdfALevel::A2b);
        assert!(!result.is_compliant);
        assert!(!result.has_errors());

        result.add_error(ComplianceError::new(
            ErrorCode::FontNotEmbedded,
            "Font 'Arial' is not embedded",
        ));
        assert!(result.has_errors());
        assert!(!result.is_compliant);
    }

    // The rendered error contains the bracketed code and the location.
    #[test]
    fn test_compliance_error_display() {
        let error = ComplianceError::new(ErrorCode::FontNotEmbedded, "Font not embedded")
            .with_location("Page 1");
        let display = format!("{}", error);
        assert!(display.contains("[FONT-001]"));
        assert!(display.contains("Page 1"));
    }
}