oxidize-pdf 2.5.1

A pure Rust PDF generation and manipulation library with zero external dependencies.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
//! PDF Parser for Verification
//!
//! This module provides a simple parser to extract key information from generated PDFs
//! for verification purposes. It's not a complete PDF parser, but focuses on the
//! elements needed to verify ISO compliance.

use crate::error::{PdfError, Result};
use std::collections::HashMap;

/// Parsed representation of a PDF for verification
///
/// Produced by [`parse_pdf`]. Every field is filled by a best-effort text
/// scan of the raw PDF bytes, so values are heuristic rather than
/// authoritative — sufficient for compliance spot-checks, not for rendering.
#[derive(Debug, Clone)]
pub struct ParsedPdf {
    /// PDF version from header
    pub version: String,
    /// Document catalog dictionary
    pub catalog: Option<HashMap<String, String>>,
    /// Page tree information
    pub page_tree: Option<PageTree>,
    /// Font information
    pub fonts: Vec<String>,
    /// True when DeviceRGB usage is detected (name, rg/RG operators, or heuristics)
    pub uses_device_rgb: bool,
    /// True when DeviceCMYK usage is detected (name or k/K operators)
    pub uses_device_cmyk: bool,
    /// True when DeviceGray usage is detected (name, g/G operators, or resources)
    pub uses_device_gray: bool,
    /// Graphics state information
    pub graphics_states: Vec<GraphicsState>,
    /// Text objects found
    pub text_objects: Vec<TextObject>,
    /// Cross-reference / EOF markers present (see `validate_xref`)
    pub xref_valid: bool,
    /// Annotations found
    pub annotations: Vec<Annotation>,
    /// Total objects in PDF (occurrences of " obj")
    pub object_count: usize,
}

/// Page tree root information extracted from the `/Type /Pages` object
#[derive(Debug, Clone)]
pub struct PageTree {
    /// Type name of the tree root (always "Pages" as extracted)
    pub root_type: String,
    /// Value of the /Count entry (0 when missing or unparseable)
    pub page_count: usize,
    /// Raw token lists of any /Kids arrays found
    pub kids_arrays: Vec<Vec<String>>,
}

/// Graphics state parameters observed on a single content-stream line
#[derive(Debug, Clone)]
pub struct GraphicsState {
    /// Line width set by the `w` operator, when extracted
    pub line_width: Option<f64>,
    /// Line cap style set by the `J` operator, when extracted
    pub line_cap: Option<i32>,
    /// Line join style set by the `j` operator, when extracted
    pub line_join: Option<i32>,
    /// Fill color operands preceding an `rg` operator, when extracted
    pub fill_color: Option<String>,
    /// Stroke color operands preceding an `RG` operator, when extracted
    pub stroke_color: Option<String>,
}

/// A text-showing operation found inside a BT…ET block
#[derive(Debug, Clone)]
pub struct TextObject {
    /// Font resource token from the most recent `Tf` operator, if any
    pub font: Option<String>,
    /// Font size from the most recent `Tf` operator, if any
    pub font_size: Option<f64>,
    /// Literal text shown by the `Tj`/`TJ` operator
    pub text_content: String,
}

/// Annotation dictionary summary extracted from a `/Type /Annot` object
#[derive(Debug, Clone)]
pub struct Annotation {
    /// Annotation /Subtype name (e.g. "Link", "Text")
    pub subtype: String,
    /// /Rect array as [x1, y1, x2, y2], when present and fully numeric
    pub rect: Option<[f64; 4]>,
    /// /Contents string, when present
    pub contents: Option<String>,
}

/// Parse PDF bytes and extract verification information
///
/// Performs a lossy UTF-8 conversion of the raw bytes and runs the
/// individual extraction passes over the resulting text. Only fails when
/// no `%PDF-` header can be located.
pub fn parse_pdf(pdf_bytes: &[u8]) -> Result<ParsedPdf> {
    let text = String::from_utf8_lossy(pdf_bytes);

    Ok(ParsedPdf {
        version: extract_version(&text)?,
        catalog: extract_catalog(&text),
        page_tree: extract_page_tree(&text),
        fonts: extract_fonts(&text),
        uses_device_rgb: detect_rgb_usage(&text),
        uses_device_cmyk: detect_cmyk_usage(&text),
        uses_device_gray: detect_gray_usage(&text),
        graphics_states: extract_graphics_states(&text),
        text_objects: extract_text_objects(&text),
        annotations: extract_annotations(&text),
        xref_valid: validate_xref(&text),
        object_count: count_objects(&text),
    })
}

/// Extract PDF version from header
///
/// Reads the `%PDF-M.N` marker on the first line and returns the `M.N`
/// token. Returns `PdfError::ParseError` when no header is present.
fn extract_version(pdf_text: &str) -> Result<String> {
    if let Some(header_line) = pdf_text.lines().next() {
        if let Some(stripped) = header_line.strip_prefix("%PDF-") {
            // PDF permits a lone CR as an end-of-line marker, which
            // `str::lines` does not split on — in that case the whole file
            // would be one "line". Truncate at the first whitespace so the
            // version is just the "M.N" token regardless of EOL convention.
            let version: String = stripped
                .chars()
                .take_while(|c| !c.is_whitespace())
                .collect();
            if !version.is_empty() {
                return Ok(version);
            }
        }
    }
    Err(PdfError::ParseError(
        "No valid PDF header found".to_string(),
    ))
}

/// Extract document catalog information
fn extract_catalog(pdf_text: &str) -> Option<HashMap<String, String>> {
    // Look for catalog object pattern with flexible spacing
    let catalog_patterns = [
        "/Type /Catalog",
        "/Type/Catalog",
        "/Type  /Catalog", // Multiple spaces
        "/Type Catalog",   // Space but no slash before Catalog (writer format)
    ];

    for pattern in &catalog_patterns {
        if let Some(pattern_pos) = pdf_text.find(pattern) {
            // Find the start of the object containing this pattern
            let before_pattern = &pdf_text[..pattern_pos];
            if let Some(obj_start) = before_pattern.rfind(" obj") {
                // Find the complete object from " obj" to "endobj"
                let from_obj = &pdf_text[obj_start..];
                if let Some(end) = from_obj.find("endobj") {
                    let catalog_content = &from_obj[..end];

                    let mut catalog = HashMap::new();

                    // Extract Type - check for any of the patterns
                    for type_pattern in &catalog_patterns {
                        if catalog_content.contains(type_pattern) {
                            catalog.insert("Type".to_string(), "Catalog".to_string());
                            break;
                        }
                    }

                    // Extract Version if present
                    if let Some(version_match) = extract_dict_entry(catalog_content, "Version") {
                        catalog.insert("Version".to_string(), version_match);
                    }

                    // Extract Pages reference
                    if let Some(pages_match) = extract_dict_entry(catalog_content, "Pages") {
                        catalog.insert("Pages".to_string(), pages_match);
                    }

                    return Some(catalog);
                }
            }
        }
    }
    None
}

/// Extract page tree information
///
/// Finds the `/Type /Pages` root object (tolerating the same spacing
/// variants as the catalog scan) and reads its /Count and /Kids entries.
fn extract_page_tree(pdf_text: &str) -> Option<PageTree> {
    const PAGES_PATTERNS: [&str; 4] = [
        "/Type /Pages",
        "/Type/Pages",
        "/Type  /Pages", // Multiple spaces
        "/Type Pages",   // Space but no slash before Pages (writer format)
    ];

    for pattern in &PAGES_PATTERNS {
        let start = match pdf_text.find(pattern) {
            Some(pos) => pos,
            None => continue,
        };
        let section = &pdf_text[start..];
        let end = match section.find("endobj") {
            Some(pos) => pos,
            None => continue,
        };
        let body = &section[..end];

        // /Count may be a bare number ("1") or a reference ("1 0 R"); in
        // either shape the first whitespace-separated token is the count.
        let page_count = extract_dict_entry(body, "Count")
            .and_then(|value| {
                value
                    .split_whitespace()
                    .next()
                    .unwrap_or("0")
                    .parse::<usize>()
                    .ok()
            })
            .unwrap_or(0);

        let kids_arrays = match extract_array_entry(body, "Kids") {
            Some(kids) => vec![kids],
            None => Vec::new(),
        };

        return Some(PageTree {
            root_type: "Pages".to_string(),
            page_count,
            kids_arrays,
        });
    }
    None
}

/// Extract font information
///
/// Scans lines that mention font dictionaries and maps recognized standard
/// font name fragments to canonical names. Returns a sorted, deduplicated
/// list.
fn extract_fonts(pdf_text: &str) -> Vec<String> {
    // (substring to look for, canonical name reported)
    const KNOWN_FONTS: [(&str, &str); 5] = [
        ("Helvetica", "Helvetica"),
        ("Times", "Times-Roman"),
        ("Courier", "Courier"),
        ("Symbol", "Symbol"),
        ("ZapfDingbats", "ZapfDingbats"),
    ];

    let mut fonts: Vec<String> = pdf_text
        .lines()
        .filter(|line| line.contains("/Type /Font") || line.contains("/BaseFont"))
        .flat_map(|line| {
            KNOWN_FONTS
                .iter()
                .filter(move |(needle, _)| line.contains(needle))
                .map(|(_, name)| name.to_string())
        })
        .collect();

    fonts.sort();
    fonts.dedup();
    fonts
}

/// Extract graphics state information
///
/// Scans uncompressed content-stream lines for graphics-state operators
/// (`w` width, `J` cap, `j` join) and RGB color operators (`rg` fill,
/// `RG` stroke), recording one `GraphicsState` per matching line.
fn extract_graphics_states(pdf_text: &str) -> Vec<GraphicsState> {
    let mut states = Vec::new();

    for line in pdf_text.lines() {
        if line.contains(" w")
            || line.contains(" J")
            || line.contains(" j")
            || line.contains(" rg")
            || line.contains(" RG")
        {
            let mut state = GraphicsState {
                line_width: None,
                line_cap: None,
                line_join: None,
                fill_color: None,
                stroke_color: None,
            };

            // Extract line width (pattern: "number w")
            if let Some(value) = extract_graphics_operator(line, "w") {
                state.line_width = value.parse().ok();
            }

            // Extract line cap (pattern: "number J")
            if let Some(value) = extract_graphics_operator(line, "J") {
                state.line_cap = value.parse().ok();
            }

            // Extract line join (pattern: "number j"). The filter above
            // already matched " j" but the field was previously never
            // populated, leaving it always None.
            if let Some(value) = extract_graphics_operator(line, "j") {
                state.line_join = value.parse().ok();
            }

            // Extract fill/stroke colors ("r g b rg" / "r g b RG") —
            // likewise matched by the filter but previously never stored.
            state.fill_color = extract_color_operands(line, "rg");
            state.stroke_color = extract_color_operands(line, "RG");

            states.push(state);
        }
    }

    states
}

/// Helper: collect the three numeric operands preceding an RGB color
/// operator (`rg`/`RG`), joined with single spaces, e.g. "1 0 0".
fn extract_color_operands(line: &str, operator: &str) -> Option<String> {
    let words: Vec<&str> = line.split_whitespace().collect();
    for i in 3..words.len() {
        if words[i] == operator
            && words[i - 3].parse::<f64>().is_ok()
            && words[i - 2].parse::<f64>().is_ok()
            && words[i - 1].parse::<f64>().is_ok()
        {
            return Some(format!("{} {} {}", words[i - 3], words[i - 2], words[i - 1]));
        }
    }
    None
}

/// Extract text objects
///
/// Scans BT…ET blocks, tracking the current font/size set by the `Tf`
/// operator, and records each `Tj`/`TJ` text-showing operation found.
fn extract_text_objects(pdf_text: &str) -> Vec<TextObject> {
    let mut text_objects = Vec::new();

    // State machine over lines: inside BT…ET, remember the last Tf settings.
    let mut in_text_object = false;
    let mut current_font = None;
    let mut current_size = None;

    for line in pdf_text.lines() {
        if line.contains("BT") {
            in_text_object = true;
            current_font = None;
            current_size = None;
        } else if line.contains("ET") {
            in_text_object = false;
        } else if in_text_object {
            // Font selection: "/FontName size Tf". Locate the Tf token and
            // read its two preceding operands; the previous code assumed the
            // operands were the first two tokens on the line, which broke
            // whenever any other operator preceded the Tf on the same line.
            if line.contains(" Tf") {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if let Some(pos) = parts.iter().position(|&p| p == "Tf") {
                    if pos >= 2 {
                        current_font = Some(parts[pos - 2].to_string());
                        current_size = parts[pos - 1].parse().ok();
                    }
                }
            }

            // Text showing: "(text) Tj" or "[(text)] TJ".
            if line.contains(" Tj") || line.contains(" TJ") {
                if let Some(text_content) = extract_text_content(line) {
                    text_objects.push(TextObject {
                        font: current_font.clone(),
                        font_size: current_size,
                        text_content,
                    });
                }
            }
        }
    }

    text_objects
}

/// Extract annotations
///
/// Splits the file on " obj" boundaries and summarizes each section that
/// declares `/Type /Annot` together with a /Subtype. Accepts the same
/// spacing variants that the catalog and page-tree scans tolerate
/// (the previous version only recognized the single-space spelling).
fn extract_annotations(pdf_text: &str) -> Vec<Annotation> {
    let mut annotations = Vec::new();

    // Spacing variants for the annotation type entry.
    let annot_patterns = ["/Type /Annot", "/Type/Annot", "/Type  /Annot"];

    if annot_patterns.iter().any(|p| pdf_text.contains(p)) {
        // This is a simplified extraction - real implementation would be more complex
        // Process each annotation object section
        for section in pdf_text.split(" obj") {
            let is_annot = annot_patterns.iter().any(|p| section.contains(p));
            if is_annot && section.contains("/Subtype") {
                if let Some(subtype) = extract_dict_entry(section, "Subtype") {
                    // Extract rect array [x1 y1 x2 y2]
                    let rect =
                        extract_array_entry(section, "Rect").and_then(|arr| parse_rect_array(&arr));

                    // Extract contents (string value)
                    let contents = extract_string_entry(section, "Contents");

                    annotations.push(Annotation {
                        subtype,
                        rect,
                        contents,
                    });
                }
            }
        }
    }

    annotations
}

/// Validate cross-reference table
fn validate_xref(pdf_text: &str) -> bool {
    pdf_text.contains("xref") && pdf_text.contains("%%EOF")
}

/// Count total objects in PDF
///
/// Counts non-overlapping occurrences of " obj" (the keyword that opens an
/// indirect object, as in "1 0 obj"). "endobj" has no preceding space, so
/// it is not double-counted.
fn count_objects(pdf_text: &str) -> usize {
    pdf_text.match_indices(" obj").count()
}

/// Helper: Extract dictionary entry value
///
/// Finds `/{key}` in `content` and returns the value that follows: either a
/// full indirect reference ("N G R") or the first token (with any leading
/// slash stripped). Requires the key name to end at a delimiter so that,
/// e.g., looking up "Count" no longer matches "/Counter" (the previous
/// version matched any key that merely started with the requested name).
fn extract_dict_entry(content: &str, key: &str) -> Option<String> {
    let pattern = format!("/{}", key);
    for (start, _) in content.match_indices(&pattern) {
        let after_key = &content[start + pattern.len()..];

        // The key must be terminated here: whitespace, a PDF delimiter, or
        // end of input. Otherwise this occurrence is a longer key name.
        let terminated = after_key
            .chars()
            .next()
            .map_or(true, |c| {
                c.is_whitespace() || matches!(c, '/' | '[' | ']' | '(' | '<' | '>')
            });
        if !terminated {
            continue;
        }

        let words: Vec<&str> = after_key.split_whitespace().collect();
        if words.is_empty() {
            continue;
        }
        // Check if it's a PDF reference (3 words: "N G R")
        if words.len() >= 3 && words[2] == "R" {
            return Some(format!("{} {} {}", words[0], words[1], words[2]));
        }
        // Otherwise return first word without slash
        return Some(words[0].trim_start_matches('/').to_string());
    }
    None
}

/// Helper: Parse rect array from string vector to [f64; 4]
///
/// Requires exactly four elements, all parseable as f64; otherwise None.
fn parse_rect_array(arr: &[String]) -> Option<[f64; 4]> {
    match arr {
        [x1, y1, x2, y2] => Some([
            x1.parse().ok()?,
            y1.parse().ok()?,
            x2.parse().ok()?,
            y2.parse().ok()?,
        ]),
        _ => None, // Wrong number of elements
    }
}

/// Helper: Extract string entry (handles both literal and hex strings)
///
/// Returns the body of a literal string "(…)" or a hex string "<…>" (the
/// latter tagged with a "hex:" prefix, left undecoded). Escaped parentheses
/// inside literal strings are not handled by this simplified parser.
fn extract_string_entry(content: &str, key: &str) -> Option<String> {
    let pattern = format!("/{}", key);
    let start = content.find(&pattern)?;
    let value = content[start + pattern.len()..].trim_start();

    if let Some(rest) = value.strip_prefix('(') {
        // Literal string: everything up to the first closing parenthesis.
        return rest.find(')').map(|end| rest[..end].to_string());
    }

    if value.starts_with('<') && !value.starts_with("<<") {
        // Hex string: returned as-is with a marker; a full implementation
        // would decode the hex digits.
        let rest = &value[1..];
        return rest.find('>').map(|end| format!("hex:{}", &rest[..end]));
    }

    None
}

/// Helper: Extract array entry
///
/// Finds `/{key}` followed by a bracketed array and returns its
/// whitespace-separated tokens. Accepts any amount of whitespace (including
/// none) between the key and the '[' — the previous version required
/// exactly one space and therefore missed "/Kids[…]" and "/Kids  […]",
/// inconsistent with the flexible spacing tolerated elsewhere.
fn extract_array_entry(content: &str, key: &str) -> Option<Vec<String>> {
    let pattern = format!("/{}", key);
    for (start, _) in content.match_indices(&pattern) {
        let after_key = &content[start + pattern.len()..];
        let trimmed = after_key.trim_start();
        // Only treat this occurrence as our key if an array opens next;
        // otherwise it was a longer key name (e.g. "/KidsX") — keep looking.
        if let Some(array_body) = trimmed.strip_prefix('[') {
            if let Some(end) = array_body.find(']') {
                let elements: Vec<String> = array_body[..end]
                    .split_whitespace()
                    .map(|s| s.to_string())
                    .collect();
                return Some(elements);
            }
        }
    }
    None
}

/// Helper: Extract graphics operator value
///
/// Returns the token immediately preceding the first occurrence of
/// `operator` on the line (PDF operators take postfix operands). An
/// operator with nothing before it yields None.
fn extract_graphics_operator(line: &str, operator: &str) -> Option<String> {
    let tokens: Vec<&str> = line.split_whitespace().collect();
    tokens
        .windows(2)
        .find(|pair| pair[1] == operator)
        .map(|pair| pair[0].to_string())
}

/// Helper: Extract text content from text showing operator
///
/// Returns the text between the first '(' and the first ')' on the line,
/// provided the ')' comes after the '('. Nested or escaped parentheses are
/// not handled by this simplified parser.
fn extract_text_content(line: &str) -> Option<String> {
    let start = line.find('(')?;
    let end = line.find(')')?;
    if end <= start {
        return None;
    }
    Some(line[start + 1..end].to_string())
}

/// Detect RGB color space usage (literal names or operators)
///
/// Checks, in order: the /DeviceRGB name, "r g b rg"/"r g b RG" operators
/// in uncompressed streams, /ColorSpace resources mentioning RGB, and
/// finally a heuristic — any content stream present implies the PDF
/// default RGB space (streams are typically compressed and unscannable).
fn detect_rgb_usage(pdf_text: &str) -> bool {
    // 1. Explicit color space name anywhere in the file.
    if pdf_text.contains("/DeviceRGB") {
        return true;
    }

    // 2. RGB color operators: three numeric operands followed by rg/RG.
    let is_num = |token: &str| token.parse::<f64>().is_ok();
    let has_rgb_operator = pdf_text.lines().any(|line| {
        let tokens: Vec<&str> = line.split_whitespace().collect();
        tokens.windows(4).any(|w| {
            (w[3] == "rg" || w[3] == "RG") && is_num(w[0]) && is_num(w[1]) && is_num(w[2])
        })
    });
    if has_rgb_operator {
        return true;
    }

    // 3. Color space resources referencing RGB.
    if pdf_text.contains("/ColorSpace") && pdf_text.contains("RGB") {
        return true;
    }

    // 4. Heuristic: content streams exist but can't be scanned — assume the
    //    PDF default RGB color space is in use.
    pdf_text.contains("/Contents") && pdf_text.contains("/Length")
}

/// Detect CMYK color space usage (literal names or operators)
///
/// True when /DeviceCMYK is named, or when a line contains four numeric
/// operands followed by the `k` (fill) or `K` (stroke) operator.
fn detect_cmyk_usage(pdf_text: &str) -> bool {
    // Explicit color space name.
    if pdf_text.contains("/DeviceCMYK") {
        return true;
    }

    // CMYK operators, e.g. "0.5 0.2 0.8 0.1 k" / "0.5 0.2 0.8 0.1 K".
    pdf_text.lines().any(|line| {
        let tokens: Vec<&str> = line.split_whitespace().collect();
        tokens.windows(5).any(|w| {
            (w[4] == "k" || w[4] == "K")
                && w[..4].iter().all(|t| t.parse::<f64>().is_ok())
        })
    })
}

/// Detect grayscale color space usage (literal names or operators)
///
/// True when /DeviceGray is named, when a numeric operand is followed by
/// the `g`/`G` operator, or when /ColorSpace resources mention Gray.
fn detect_gray_usage(pdf_text: &str) -> bool {
    // Explicit color space name.
    if pdf_text.contains("/DeviceGray") {
        return true;
    }

    // Grayscale operators, e.g. "0.5 g" (fill) or "0.5 G" (stroke).
    let has_gray_operator = pdf_text.lines().any(|line| {
        let tokens: Vec<&str> = line.split_whitespace().collect();
        tokens
            .windows(2)
            .any(|w| (w[1] == "g" || w[1] == "G") && w[0].parse::<f64>().is_ok())
    });
    if has_gray_operator {
        return true;
    }

    // Grayscale color space resources.
    pdf_text.contains("/ColorSpace") && pdf_text.contains("Gray")
}

#[cfg(test)]
mod tests {
    use super::*;

    // The version is the text following "%PDF-" on the first line.
    #[test]
    fn test_extract_version() {
        let pdf_content = "%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n%%EOF";
        let result = extract_version(pdf_content).unwrap();
        assert_eq!(result, "1.4");
    }

    // A catalog object yields its Type marker and its Pages reference.
    #[test]
    fn test_extract_catalog() {
        let pdf_content = "1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj";
        let catalog = extract_catalog(pdf_content).unwrap();
        assert_eq!(catalog.get("Type"), Some(&"Catalog".to_string()));
        assert_eq!(catalog.get("Pages"), Some(&"2 0 R".to_string()));
    }

    // Standard font name fragments map to canonical names
    // ("Times" -> "Times-Roman").
    #[test]
    fn test_extract_fonts() {
        let pdf_content =
            "<<\n/Type /Font\n/BaseFont /Helvetica\n>>\n<<\n/BaseFont /Times-Roman\n>>";
        let fonts = extract_fonts(pdf_content);
        assert!(fonts.contains(&"Helvetica".to_string()));
        assert!(fonts.contains(&"Times-Roman".to_string()));
    }

    // Uncompressed "r g b rg" operators and the /DeviceRGB name both count
    // as RGB usage; nothing here should register as CMYK.
    #[test]
    fn test_color_space_detection() {
        let pdf_content = "%PDF-1.4\nstream\n1 0 0 rg\n/DeviceRGB cs\nendstream\n%%EOF";
        let parsed = parse_pdf(pdf_content.as_bytes()).unwrap();
        assert!(parsed.uses_device_rgb);
        assert!(!parsed.uses_device_cmyk);
    }

    // End-to-end: generate a real document with the library and verify the
    // color detection heuristics still fire on its (compressed) output.
    #[test]
    fn test_improved_color_detection() {
        use crate::{Color, Document, Font, Page};

        let mut doc = Document::new();
        doc.set_title("Color Detection Test");

        let mut page = Page::a4();

        // Add text and colored graphics
        page.text()
            .set_font(Font::Helvetica, 12.0)
            .at(50.0, 700.0)
            .write("RGB Color Test")
            .unwrap();

        page.graphics()
            .set_fill_color(Color::rgb(1.0, 0.0, 0.0)) // Red
            .rectangle(50.0, 650.0, 100.0, 30.0)
            .fill();

        doc.add_page(page);
        let pdf_bytes = doc.to_bytes().unwrap();

        // Test improved color detection
        let parsed = parse_pdf(&pdf_bytes).unwrap();

        // Should detect RGB usage through heuristics since content streams are compressed
        assert!(parsed.uses_device_rgb, "Should detect RGB color usage");
        assert!(!parsed.uses_device_cmyk, "Should not detect CMYK");
    }
}