Skip to main content

oxidize_pdf/verification/
parser.rs

//! PDF Parser for Verification
//!
//! This module provides a simple parser that extracts key information from generated PDFs
//! for verification purposes. It is not a complete PDF parser; it focuses on the
//! elements needed to verify ISO compliance.
6
7use crate::error::{PdfError, Result};
8use std::collections::HashMap;
9
10/// Parsed representation of a PDF for verification
11#[derive(Debug, Clone)]
12pub struct ParsedPdf {
13    /// PDF version from header
14    pub version: String,
15    /// Document catalog dictionary
16    pub catalog: Option<HashMap<String, String>>,
17    /// Page tree information
18    pub page_tree: Option<PageTree>,
19    /// Font information
20    pub fonts: Vec<String>,
21    /// Color space usage flags
22    pub uses_device_rgb: bool,
23    pub uses_device_cmyk: bool,
24    pub uses_device_gray: bool,
25    /// Graphics state information
26    pub graphics_states: Vec<GraphicsState>,
27    /// Text objects found
28    pub text_objects: Vec<TextObject>,
29    /// Annotations found
30    pub annotations: Vec<Annotation>,
31    /// Cross-reference table info
32    pub xref_valid: bool,
33    /// Total objects in PDF
34    pub object_count: usize,
35}
36
/// Summary of the page tree node located by the parser.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct PageTree {
    /// Type name of the node the summary was built from (e.g. `"Pages"`).
    pub root_type: String,
    /// Value of the node's `/Count` entry; `0` when missing or unparsable.
    pub page_count: usize,
    /// Whitespace-separated tokens of each `/Kids` array that was found.
    pub kids_arrays: Vec<Vec<String>>,
}
43
/// Graphics parameters observed on one content line.
///
/// Each field is `None` when the corresponding value was not found or could not be
/// parsed.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct GraphicsState {
    /// Line width (operand of the `w` operator).
    pub line_width: Option<f64>,
    /// Line cap style (operand of the `J` operator).
    pub line_cap: Option<i32>,
    /// Line join style, when known.
    pub line_join: Option<i32>,
    /// Fill color, when known.
    pub fill_color: Option<String>,
    /// Stroke color, when known.
    pub stroke_color: Option<String>,
}
52
/// One text-showing operation found inside a `BT … ET` text object.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct TextObject {
    /// Font resource name in effect (from the `Tf` operator), if any was seen.
    pub font: Option<String>,
    /// Font size in effect (from the `Tf` operator), if it parsed as a number.
    pub font_size: Option<f64>,
    /// Raw contents of the string that was shown.
    pub text_content: String,
}
59
/// Summary of one annotation object.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct Annotation {
    /// Value of the `/Subtype` entry (without the leading slash).
    pub subtype: String,
    /// The four numbers of the `/Rect` array, when present and numeric.
    pub rect: Option<[f64; 4]>,
    /// Value of the `/Contents` string entry, when present.
    pub contents: Option<String>,
}
66
67/// Parse PDF bytes and extract verification information
68pub fn parse_pdf(pdf_bytes: &[u8]) -> Result<ParsedPdf> {
69    let pdf_text = String::from_utf8_lossy(pdf_bytes);
70
71    let parsed = ParsedPdf {
72        version: extract_version(&pdf_text)?,
73        catalog: extract_catalog(&pdf_text),
74        page_tree: extract_page_tree(&pdf_text),
75        fonts: extract_fonts(&pdf_text),
76        uses_device_rgb: detect_rgb_usage(&pdf_text),
77        uses_device_cmyk: detect_cmyk_usage(&pdf_text),
78        uses_device_gray: detect_gray_usage(&pdf_text),
79        graphics_states: extract_graphics_states(&pdf_text),
80        text_objects: extract_text_objects(&pdf_text),
81        annotations: extract_annotations(&pdf_text),
82        xref_valid: validate_xref(&pdf_text),
83        object_count: count_objects(&pdf_text),
84    };
85
86    Ok(parsed)
87}
88
89/// Extract PDF version from header
90fn extract_version(pdf_text: &str) -> Result<String> {
91    if let Some(header_line) = pdf_text.lines().next() {
92        if let Some(stripped) = header_line.strip_prefix("%PDF-") {
93            return Ok(stripped.to_string());
94        }
95    }
96    Err(PdfError::ParseError(
97        "No valid PDF header found".to_string(),
98    ))
99}
100
101/// Extract document catalog information
102fn extract_catalog(pdf_text: &str) -> Option<HashMap<String, String>> {
103    // Look for catalog object pattern with flexible spacing
104    let catalog_patterns = [
105        "/Type /Catalog",
106        "/Type/Catalog",
107        "/Type  /Catalog", // Multiple spaces
108        "/Type Catalog",   // Space but no slash before Catalog (writer format)
109    ];
110
111    for pattern in &catalog_patterns {
112        if let Some(pattern_pos) = pdf_text.find(pattern) {
113            // Find the start of the object containing this pattern
114            let before_pattern = &pdf_text[..pattern_pos];
115            if let Some(obj_start) = before_pattern.rfind(" obj") {
116                // Find the complete object from " obj" to "endobj"
117                let from_obj = &pdf_text[obj_start..];
118                if let Some(end) = from_obj.find("endobj") {
119                    let catalog_content = &from_obj[..end];
120
121                    let mut catalog = HashMap::new();
122
123                    // Extract Type - check for any of the patterns
124                    for type_pattern in &catalog_patterns {
125                        if catalog_content.contains(type_pattern) {
126                            catalog.insert("Type".to_string(), "Catalog".to_string());
127                            break;
128                        }
129                    }
130
131                    // Extract Version if present
132                    if let Some(version_match) = extract_dict_entry(catalog_content, "Version") {
133                        catalog.insert("Version".to_string(), version_match);
134                    }
135
136                    // Extract Pages reference
137                    if let Some(pages_match) = extract_dict_entry(catalog_content, "Pages") {
138                        catalog.insert("Pages".to_string(), pages_match);
139                    }
140
141                    return Some(catalog);
142                }
143            }
144        }
145    }
146    None
147}
148
149/// Extract page tree information
150fn extract_page_tree(pdf_text: &str) -> Option<PageTree> {
151    // Look for page tree root with flexible spacing
152    let pages_patterns = [
153        "/Type /Pages",
154        "/Type/Pages",
155        "/Type  /Pages", // Multiple spaces
156        "/Type Pages",   // Space but no slash before Pages (writer format)
157    ];
158
159    for pattern in &pages_patterns {
160        if let Some(pages_start) = pdf_text.find(pattern) {
161            let pages_section = &pdf_text[pages_start..];
162            if let Some(end) = pages_section.find("endobj") {
163                let pages_content = &pages_section[..end];
164
165                let page_count = extract_dict_entry(pages_content, "Count")
166                    .and_then(|s| {
167                        // Handle both "1" and "1 0 R" formats
168                        let cleaned = s.split_whitespace().next().unwrap_or("0");
169                        cleaned.parse::<usize>().ok()
170                    })
171                    .unwrap_or(0);
172
173                let mut kids_arrays = Vec::new();
174                if let Some(kids_match) = extract_array_entry(pages_content, "Kids") {
175                    kids_arrays.push(kids_match);
176                }
177
178                return Some(PageTree {
179                    root_type: "Pages".to_string(),
180                    page_count,
181                    kids_arrays,
182                });
183            }
184        }
185    }
186    None
187}
188
/// Extract font information.
///
/// Scans every line that mentions `/Type /Font` or `/BaseFont` for the names of the
/// standard font families and returns the detected names sorted and without
/// duplicates. Any name containing `Times` is reported as `Times-Roman`.
fn extract_fonts(pdf_text: &str) -> Vec<String> {
    // (substring searched for on the line, name reported when it is found)
    static KNOWN_FONTS: [(&str, &str); 5] = [
        ("Helvetica", "Helvetica"),
        ("Times", "Times-Roman"),
        ("Courier", "Courier"),
        ("Symbol", "Symbol"),
        ("ZapfDingbats", "ZapfDingbats"),
    ];

    let mut fonts: Vec<String> = Vec::new();

    let font_lines = pdf_text
        .lines()
        .filter(|line| line.contains("/Type /Font") || line.contains("/BaseFont"));

    for line in font_lines {
        fonts.extend(
            KNOWN_FONTS
                .iter()
                .filter(|(needle, _)| line.contains(needle))
                .map(|(_, name)| (*name).to_string()),
        );
    }

    fonts.sort();
    fonts.dedup();
    fonts
}
219
220/// Extract graphics state information
221fn extract_graphics_states(pdf_text: &str) -> Vec<GraphicsState> {
222    let mut states = Vec::new();
223
224    // Look for content streams with graphics operators
225    for line in pdf_text.lines() {
226        if line.contains(" w")
227            || line.contains(" J")
228            || line.contains(" j")
229            || line.contains(" rg")
230            || line.contains(" RG")
231        {
232            let mut state = GraphicsState {
233                line_width: None,
234                line_cap: None,
235                line_join: None,
236                fill_color: None,
237                stroke_color: None,
238            };
239
240            // Extract line width (pattern: "number w")
241            if let Some(w_match) = extract_graphics_operator(line, "w") {
242                state.line_width = w_match.parse().ok();
243            }
244
245            // Extract line cap (pattern: "number J")
246            if let Some(j_match) = extract_graphics_operator(line, "J") {
247                state.line_cap = j_match.parse().ok();
248            }
249
250            states.push(state);
251        }
252    }
253
254    states
255}
256
257/// Extract text objects
258fn extract_text_objects(pdf_text: &str) -> Vec<TextObject> {
259    let mut text_objects = Vec::new();
260
261    // Look for text objects (BT...ET blocks)
262    let mut in_text_object = false;
263    let mut current_font = None;
264    let mut current_size = None;
265
266    for line in pdf_text.lines() {
267        if line.contains("BT") {
268            in_text_object = true;
269            current_font = None;
270            current_size = None;
271        } else if line.contains("ET") {
272            in_text_object = false;
273        } else if in_text_object {
274            // Extract font settings (pattern: "/FontName size Tf")
275            if line.contains(" Tf") {
276                let parts: Vec<&str> = line.split_whitespace().collect();
277                if parts.len() >= 3 {
278                    current_font = Some(parts[0].to_string());
279                    current_size = parts[1].parse().ok();
280                }
281            }
282
283            // Extract text content (pattern: "(text) Tj" or "[(text)] TJ")
284            if line.contains(" Tj") || line.contains(" TJ") {
285                if let Some(text_content) = extract_text_content(line) {
286                    text_objects.push(TextObject {
287                        font: current_font.clone(),
288                        font_size: current_size,
289                        text_content,
290                    });
291                }
292            }
293        }
294    }
295
296    text_objects
297}
298
299/// Extract annotations
300fn extract_annotations(pdf_text: &str) -> Vec<Annotation> {
301    let mut annotations = Vec::new();
302
303    // Look for annotation objects
304    if pdf_text.contains("/Type /Annot") {
305        // This is a simplified extraction - real implementation would be more complex
306        // Process each annotation object section
307        let sections = pdf_text.split(" obj").collect::<Vec<&str>>();
308        for section in sections {
309            if section.contains("/Type /Annot") && section.contains("/Subtype") {
310                if let Some(subtype) = extract_dict_entry(section, "Subtype") {
311                    // Extract rect array [x1 y1 x2 y2]
312                    let rect =
313                        extract_array_entry(section, "Rect").and_then(|arr| parse_rect_array(&arr));
314
315                    // Extract contents (string value)
316                    let contents = extract_string_entry(section, "Contents");
317
318                    annotations.push(Annotation {
319                        subtype,
320                        rect,
321                        contents,
322                    });
323                }
324            }
325        }
326    }
327
328    annotations
329}
330
/// Validate the cross-reference table.
///
/// This is a presence check only: it returns `true` when the text contains both the
/// `xref` keyword (which also matches `startxref`) and the `%%EOF` marker. Offsets
/// and entries are not inspected.
fn validate_xref(pdf_text: &str) -> bool {
    ["xref", "%%EOF"]
        .iter()
        .all(|marker| pdf_text.contains(marker))
}
335
/// Count total objects in the PDF.
///
/// Counts occurrences of the ` obj` keyword that look like the end of an object
/// header `N G obj`: the keyword must be preceded (ignoring extra whitespace) by a
/// digit and must not continue into a longer word. This keeps ordinary text such as
/// ` object` or ` objective` from inflating the count.
fn count_objects(pdf_text: &str) -> usize {
    const KEYWORD: &str = " obj";
    let bytes = pdf_text.as_bytes();

    pdf_text
        .match_indices(KEYWORD)
        .filter(|&(pos, _)| {
            // The generation number must directly precede the keyword.
            let after_number = pdf_text[..pos]
                .trim_end()
                .ends_with(|c: char| c.is_ascii_digit());
            // "obj" must be a whole token, not the prefix of a longer word.
            let whole_keyword = bytes
                .get(pos + KEYWORD.len())
                .map_or(true, |next| !next.is_ascii_alphanumeric());
            after_number && whole_keyword
        })
        .count()
}
340
/// Helper: Extract a dictionary entry value.
///
/// Finds `/key` in `content` and returns its value as text:
///
/// * an indirect reference `N G R` is returned as `"N G R"`;
/// * a name such as `/Catalog` is returned without the leading slash;
/// * any other value is returned as its first token.
///
/// Tokens end at whitespace or at a PDF delimiter character, so compact dictionaries
/// such as `<</Type/Catalog/Pages 2 0 R>>` are handled. The key must be a complete
/// name: looking up `Pages` does not match `/PagesCount`. A value that itself starts
/// with a delimiter (array, string, dictionary) is returned as its first
/// whitespace-separated word.
///
/// Returns `None` when the key is absent or nothing follows it.
fn extract_dict_entry(content: &str, key: &str) -> Option<String> {
    /// Characters that terminate a name or number token.
    fn is_delimiter(c: char) -> bool {
        c.is_whitespace() || matches!(c, '/' | '[' | ']' | '<' | '>' | '(' | ')' | '{' | '}' | '%')
    }

    /// Splits `s` into its leading token and the remainder.
    fn split_token(s: &str) -> (&str, &str) {
        let end = s.find(is_delimiter).unwrap_or(s.len());
        s.split_at(end)
    }

    fn is_unsigned_int(token: &str) -> bool {
        !token.is_empty() && token.bytes().all(|b| b.is_ascii_digit())
    }

    let pattern = format!("/{}", key);

    // First occurrence of the key that is not merely a prefix of a longer name.
    let after_key = content
        .match_indices(pattern.as_str())
        .map(|(pos, _)| &content[pos + pattern.len()..])
        .find(|rest| rest.chars().next().map_or(true, is_delimiter))?;

    let value = after_key.trim_start();
    if value.is_empty() {
        return None;
    }

    // Name value: strip the slash and stop at the next delimiter.
    if let Some(name) = value.strip_prefix('/') {
        return Some(split_token(name).0.to_string());
    }

    let (first, rest) = split_token(value);
    if first.is_empty() {
        // Value opens with a delimiter (array, string, dictionary).
        return value.split_whitespace().next().map(str::to_string);
    }

    // Indirect reference: "<object number> <generation> R".
    if is_unsigned_int(first) {
        let (generation, rest) = split_token(rest.trim_start());
        let (marker, _) = split_token(rest.trim_start());
        if is_unsigned_int(generation) && marker == "R" {
            return Some(format!("{} {} R", first, generation));
        }
    }

    Some(first.to_string())
}
358
/// Helper: Convert the tokens of a rectangle array into `[f64; 4]`.
///
/// Returns `None` unless `arr` holds exactly four elements that all parse as `f64`.
fn parse_rect_array(arr: &[String]) -> Option<[f64; 4]> {
    if arr.len() != 4 {
        return None; // Wrong number of elements
    }

    let mut rect = [0.0_f64; 4];
    for (slot, text) in rect.iter_mut().zip(arr) {
        // Bail out on the first element that is not a number.
        *slot = text.parse().ok()?;
    }
    Some(rect)
}
375
/// Helper: Extract a string entry (handles both literal and hex strings).
///
/// Finds the first `/key` in `content` and inspects the value that follows:
///
/// * literal string `(…)` – the raw text between the parentheses is returned.
///   Escape sequences are left untouched, but a backslash-escaped or balanced nested
///   parenthesis does not end the string. If the parentheses never balance, the
///   string is cut at the first `)` instead.
/// * hex string `<…>` – returned undecoded as `hex:<digits>`.
///
/// Returns `None` when the key is absent, the value is not a string (for example a
/// dictionary `<<`), or the string is unterminated.
fn extract_string_entry(content: &str, key: &str) -> Option<String> {
    let pattern = format!("/{}", key);
    let start = content.find(&pattern)?;
    let value = content[start + pattern.len()..].trim_start();

    if let Some(body) = value.strip_prefix('(') {
        // Literal string: find the parenthesis that closes the opening one.
        let mut depth = 0usize;
        let mut escaped = false;
        for (idx, ch) in body.char_indices() {
            if escaped {
                escaped = false;
                continue;
            }
            match ch {
                '\\' => escaped = true,
                '(' => depth += 1,
                ')' if depth == 0 => return Some(body[..idx].to_string()),
                ')' => depth -= 1,
                _ => {}
            }
        }
        // Unbalanced input: settle for the first ')' if there is one.
        body.find(')').map(|end| body[..end].to_string())
    } else if value.starts_with('<') && !value.starts_with("<<") {
        // Hex string: returned as-is, without decoding the hex digits.
        let end = value.find('>')?;
        Some(format!("hex:{}", &value[1..end]))
    } else {
        None
    }
}
403
/// Helper: Extract an array entry.
///
/// Finds the first `/key` that is followed, after optional whitespace, by `[` and
/// returns the whitespace-separated tokens up to the next `]`. Any amount of
/// whitespace (including none or a line break) between key and bracket is accepted,
/// so `/Kids [3 0 R]`, `/Kids[3 0 R]` and `/Kids\n[3 0 R]` all work.
///
/// Nested arrays are not supported: the array is cut at the first `]`.
/// Returns `None` when no such key/array pair exists.
fn extract_array_entry(content: &str, key: &str) -> Option<Vec<String>> {
    let pattern = format!("/{}", key);

    content.match_indices(pattern.as_str()).find_map(|(pos, _)| {
        // Skip occurrences whose value is not an array (or that are longer names).
        let body = content[pos + pattern.len()..]
            .trim_start()
            .strip_prefix('[')?;
        let end = body.find(']')?;
        Some(
            body[..end]
                .split_whitespace()
                .map(str::to_string)
                .collect(),
        )
    })
}
420
/// Helper: Extract the operand of a graphics operator.
///
/// Splits `line` on whitespace and returns the token immediately before the first
/// occurrence of `operator` that has a predecessor. Returns `None` when the operator
/// does not occur after at least one other token.
fn extract_graphics_operator(line: &str, operator: &str) -> Option<String> {
    // Pair every token with its successor: (operand candidate, operator candidate).
    let operands = line.split_whitespace();
    let operators = line.split_whitespace().skip(1);

    operands
        .zip(operators)
        .find(|(_, candidate)| *candidate == operator)
        .map(|(operand, _)| operand.to_string())
}
431
/// Helper: Extract text content from a text-showing operator line.
///
/// Returns the raw contents of the first literal string `(…)` on the line. The
/// closing parenthesis is searched for after the opening one, and backslash-escaped
/// or balanced nested parentheses do not end the string. Escape sequences are
/// returned undecoded. If the parentheses never balance, the string is cut at the
/// first `)` following the opening parenthesis.
///
/// For `TJ` arrays only the first string element is returned.
/// Returns `None` when the line contains no terminated literal string.
fn extract_text_content(line: &str) -> Option<String> {
    let open = line.find('(')?;
    let body = &line[open + 1..];

    let mut depth = 0usize;
    let mut escaped = false;
    for (idx, ch) in body.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '(' => depth += 1,
            ')' if depth == 0 => return Some(body[..idx].to_string()),
            ')' => depth -= 1,
            _ => {}
        }
    }

    // Unbalanced input: settle for the first ')' after the opening parenthesis.
    body.find(')').map(|end| body[..end].to_string())
}
444
/// Detect RGB color space usage (literal names, operators, or heuristics).
///
/// Returns `true` as soon as one of the following holds:
///
/// 1. the text contains the name `/DeviceRGB`;
/// 2. some line contains an `rg` or `RG` operator directly preceded by three numeric
///    operands (only visible in uncompressed content streams);
/// 3. the text contains both `/ColorSpace` and the substring `RGB`;
/// 4. the text contains both `/Contents` and `/Length`, i.e. the document has content
///    streams at all.
///
/// NOTE(review): rule 4 fires for any document with content streams; it does not
/// check that grayscale or CMYK usage is absent. Confirm this is the intended
/// behavior for compressed streams.
fn detect_rgb_usage(pdf_text: &str) -> bool {
    // Operator scan: "<r> <g> <b> rg" (fill) or "<r> <g> <b> RG" (stroke).
    let has_rgb_operator = || {
        pdf_text.lines().any(|line| {
            let tokens: Vec<&str> = line.split_whitespace().collect();
            tokens.windows(4).any(|w| {
                matches!(w[3], "rg" | "RG") && w[..3].iter().all(|t| t.parse::<f64>().is_ok())
            })
        })
    };

    pdf_text.contains("/DeviceRGB")
        || has_rgb_operator()
        || (pdf_text.contains("/ColorSpace") && pdf_text.contains("RGB"))
        || (pdf_text.contains("/Contents") && pdf_text.contains("/Length"))
}
483
/// Detect CMYK color space usage (literal names or operators).
///
/// Returns `true` when the text contains the name `/DeviceCMYK`, or when some line
/// contains a `k` (fill) or `K` (stroke) operator directly preceded by four numeric
/// operands, e.g. `0.5 0.2 0.8 0.1 k`.
fn detect_cmyk_usage(pdf_text: &str) -> bool {
    if pdf_text.contains("/DeviceCMYK") {
        return true;
    }

    // Slide a five-token window over each line: four operands plus the operator.
    pdf_text.lines().any(|line| {
        let tokens: Vec<&str> = line.split_whitespace().collect();
        tokens.windows(5).any(|window| {
            let (operands, operator) = window.split_at(4);
            matches!(operator[0], "k" | "K")
                && operands.iter().all(|token| token.parse::<f64>().is_ok())
        })
    })
}
509
/// Detect grayscale color space usage (literal names or operators).
///
/// Returns `true` when any of the following holds:
///
/// * the text contains the name `/DeviceGray`;
/// * some line contains a `g` or `G` operator directly preceded by a numeric operand
///   (only visible in uncompressed content streams);
/// * the text contains both `/ColorSpace` and the substring `Gray`.
fn detect_gray_usage(pdf_text: &str) -> bool {
    let named_directly = pdf_text.contains("/DeviceGray");

    // Operator scan: "<level> g" (fill) or "<level> G" (stroke).
    let used_by_operator = || {
        pdf_text.lines().any(|line| {
            let tokens: Vec<&str> = line.split_whitespace().collect();
            tokens
                .windows(2)
                .any(|pair| matches!(pair[1], "g" | "G") && pair[0].parse::<f64>().is_ok())
        })
    };

    let listed_as_resource = || pdf_text.contains("/ColorSpace") && pdf_text.contains("Gray");

    named_directly || used_by_operator() || listed_as_resource()
}
534
#[cfg(test)]
mod tests {
    use super::*;

    /// The version is whatever follows `%PDF-` on the header line.
    #[test]
    fn test_extract_version() {
        let pdf_content = "%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n%%EOF";
        let version = extract_version(pdf_content).unwrap();
        assert_eq!(version, "1.4");
    }

    /// A minimal catalog object yields its type and its page tree reference.
    #[test]
    fn test_extract_catalog() {
        let pdf_content = "1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj";
        let catalog = extract_catalog(pdf_content).unwrap();

        let type_entry = catalog.get("Type").map(String::as_str);
        let pages_entry = catalog.get("Pages").map(String::as_str);
        assert_eq!(type_entry, Some("Catalog"));
        assert_eq!(pages_entry, Some("2 0 R"));
    }

    /// Fonts are picked up from `/BaseFont` lines.
    #[test]
    fn test_extract_fonts() {
        let pdf_content =
            "<<\n/Type /Font\n/BaseFont /Helvetica\n>>\n<<\n/BaseFont /Times-Roman\n>>";
        let fonts = extract_fonts(pdf_content);

        for expected in &["Helvetica", "Times-Roman"] {
            assert!(fonts.iter().any(|font| font == expected));
        }
    }

    /// An uncompressed stream with an `rg` operator is reported as RGB, not CMYK.
    #[test]
    fn test_color_space_detection() {
        let pdf_content = "%PDF-1.4\nstream\n1 0 0 rg\n/DeviceRGB cs\nendstream\n%%EOF";
        let parsed = parse_pdf(pdf_content.as_bytes()).unwrap();

        assert!(parsed.uses_device_rgb);
        assert!(!parsed.uses_device_cmyk);
    }

    /// End-to-end check against a document produced by the crate's own writer.
    #[test]
    fn test_improved_color_detection() {
        use crate::{Color, Document, Font, Page};

        let mut doc = Document::new();
        doc.set_title("Color Detection Test");

        let mut page = Page::a4();

        // Some text ...
        page.text()
            .set_font(Font::Helvetica, 12.0)
            .at(50.0, 700.0)
            .write("RGB Color Test")
            .unwrap();

        // ... and a red filled rectangle.
        page.graphics()
            .set_fill_color(Color::rgb(1.0, 0.0, 0.0)) // Red
            .rectangle(50.0, 650.0, 100.0, 30.0)
            .fill();

        doc.add_page(page);
        let pdf_bytes = doc.to_bytes().unwrap();

        let parsed = parse_pdf(&pdf_bytes).unwrap();

        // Should detect RGB usage through heuristics since content streams are compressed
        assert!(parsed.uses_device_rgb, "Should detect RGB color usage");
        assert!(!parsed.uses_device_cmyk, "Should not detect CMYK");
    }
}