oxidize_pdf/parser/
mod.rs

1//! PDF Parser Module - Complete PDF parsing and rendering support
2//!
3//! This module provides a comprehensive, 100% native Rust implementation for parsing PDF files
4//! according to the ISO 32000-1 (PDF 1.7) and ISO 32000-2 (PDF 2.0) specifications.
5//!
6//! # Overview
7//!
8//! The parser is designed to support building PDF renderers, content extractors, and analysis tools.
9//! It provides multiple levels of API access:
10//!
11//! - **High-level**: `PdfDocument` for easy document manipulation
12//! - **Mid-level**: `ParsedPage`, content streams, and resources
13//! - **Low-level**: Direct access to PDF objects and streams
14//!
15//! # Quick Start
16//!
17//! ```rust,no_run
18//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
19//! use oxidize_pdf::parser::content::ContentParser;
20//!
21//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
22//! // Open a PDF document
23//! let reader = PdfReader::open("document.pdf")?;
24//! let document = PdfDocument::new(reader);
25//!
26//! // Get document information
27//! println!("Pages: {}", document.page_count()?);
28//! println!("Version: {}", document.version()?);
29//!
30//! // Process first page
31//! let page = document.get_page(0)?;
32//! println!("Page size: {}x{} points", page.width(), page.height());
33//!
34//! // Parse content streams
35//! let streams = page.content_streams_with_document(&document)?;
36//! for stream in streams {
37//!     let operations = ContentParser::parse(&stream)?;
38//!     println!("Operations: {}", operations.len());
39//! }
40//!
41//! // Extract text
42//! let text = document.extract_text_from_page(0)?;
43//! println!("Text: {}", text.text);
44//! # Ok(())
45//! # }
46//! ```
47//!
48//! # Architecture
49//!
50//! ```text
51//! ┌─────────────────────────────────────────────────┐
52//! │                 PdfDocument                     │ ← High-level API
53//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
54//! │  │PdfReader │ │PageTree  │ │ResourceManager │  │
55//! │  └──────────┘ └──────────┘ └────────────────┘  │
56//! └─────────────────────────────────────────────────┘
57//!            │              │              │
58//!            ↓              ↓              ↓
59//! ┌─────────────────────────────────────────────────┐
60//! │              ParsedPage                         │ ← Page API
61//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
62//! │  │Properties│ │Resources │ │Content Streams │  │
63//! │  └──────────┘ └──────────┘ └────────────────┘  │
64//! └─────────────────────────────────────────────────┘
65//!            │              │              │
66//!            ↓              ↓              ↓
67//! ┌─────────────────────────────────────────────────┐
68//! │         ContentParser & PdfObject               │ ← Low-level API
69//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
70//! │  │Tokenizer │ │Operators │ │Object Types    │  │
71//! │  └──────────┘ └──────────┘ └────────────────┘  │
72//! └─────────────────────────────────────────────────┘
73//! ```
74//!
75//! # Features
76//!
77//! - **Complete PDF Object Model**: All PDF object types supported
78//! - **Content Stream Parsing**: Full operator support for rendering
79//! - **Resource Management**: Fonts, images, color spaces, patterns
80//! - **Text Extraction**: With position and formatting information
81//! - **Page Navigation**: Efficient page tree traversal
82//! - **Stream Filters**: Decompression support (FlateDecode, ASCIIHexDecode, etc.)
83//! - **Reference Resolution**: Automatic handling of indirect objects
84//!
85//! # Example: Building a Simple Renderer
86//!
87//! ```rust,no_run
88//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
89//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
90//!
91//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
92//! struct SimpleRenderer {
93//!     current_path: Vec<(f32, f32)>,
94//! }
95//!
96//! impl SimpleRenderer {
97//!     fn render_page(document: &PdfDocument<std::fs::File>, page_idx: u32) -> Result<(), Box<dyn std::error::Error>> {
98//!         let page = document.get_page(page_idx)?;
99//!         let streams = page.content_streams_with_document(&document)?;
100//!         
101//!         let mut renderer = SimpleRenderer {
102//!             current_path: Vec::new(),
103//!         };
104//!         
105//!         for stream in streams {
106//!             let operations = ContentParser::parse(&stream)?;
107//!             for op in operations {
108//!                 match op {
109//!                     ContentOperation::MoveTo(x, y) => {
110//!                         renderer.current_path.clear();
111//!                         renderer.current_path.push((x, y));
112//!                     }
113//!                     ContentOperation::LineTo(x, y) => {
114//!                         renderer.current_path.push((x, y));
115//!                     }
116//!                     ContentOperation::Stroke => {
117//!                         println!("Draw path with {} points", renderer.current_path.len());
118//!                         renderer.current_path.clear();
119//!                     }
120//!                     ContentOperation::ShowText(text) => {
121//!                         println!("Draw text: {:?}", String::from_utf8_lossy(&text));
122//!                     }
123//!                     _ => {} // Handle other operations
124//!                 }
125//!             }
126//!         }
127//!         Ok(())
128//!     }
129//! }
130//! # Ok(())
131//! # }
132//! ```
133
134pub mod content;
135pub mod document;
136pub mod filters;
137pub mod header;
138pub mod lexer;
139pub mod object_stream;
140pub mod objects;
141pub mod page_tree;
142pub mod reader;
143pub mod trailer;
144pub mod xref;
145
146use crate::error::OxidizePdfError;
147
148// Re-export main types for convenient access
149pub use self::content::{ContentOperation, ContentParser, TextElement};
150pub use self::document::{PdfDocument, ResourceManager};
151pub use self::objects::{PdfArray, PdfDictionary, PdfName, PdfObject, PdfStream, PdfString};
152pub use self::page_tree::ParsedPage;
153pub use self::reader::{DocumentMetadata, PdfReader};
154
155/// Result type for parser operations.
///
/// Shorthand for `Result<T, ParseError>`: the success type `T` is chosen by
/// the caller, while the error type is always [`ParseError`].
156pub type ParseResult<T> = Result<T, ParseError>;
157
158/// PDF Parser errors covering all failure modes during parsing.
159///
/// The `Display` text of each variant comes from its `#[error("...")]`
/// attribute (generated by the `thiserror::Error` derive).
///
160/// # Error Categories
161///
162/// - **I/O Errors**: File access and reading issues
163/// - **Format Errors**: Invalid PDF structure or syntax
164/// - **Unsupported Features**: Encryption, newer PDF versions
165/// - **Reference Errors**: Invalid or circular object references
166/// - **Stream Errors**: Decompression or filter failures
167///
168/// # Example
169///
170/// ```rust
171/// use oxidize_pdf::parser::{PdfReader, ParseError};
172///
173/// # fn example() -> Result<(), ParseError> {
174/// match PdfReader::open("missing.pdf") {
175///     Ok(_) => println!("File opened"),
176///     Err(ParseError::Io(e)) => println!("IO error: {}", e),
177///     Err(ParseError::InvalidHeader) => println!("Not a valid PDF"),
178///     Err(e) => println!("Other error: {}", e),
179/// }
180/// # Ok(())
181/// # }
182/// ```
183#[derive(Debug, thiserror::Error)]
184pub enum ParseError {
185    /// I/O error during file operations.
    ///
    /// The `#[from]` attribute generates `From<std::io::Error>`, so `?` on an
    /// I/O call converts into this variant automatically.
186    #[error("IO error: {0}")]
187    Io(#[from] std::io::Error),
188
189    /// PDF file doesn't start with a valid header (`%PDF-`).
190    #[error("Invalid PDF header")]
191    InvalidHeader,
192
193    /// PDF version is not supported.
    ///
    /// The payload is the version text interpolated into the error message.
194    #[error("Unsupported PDF version: {0}")]
195    UnsupportedVersion(String),
196
197    /// Syntax error in PDF structure.
    ///
    /// Both fields are interpolated into the error message.
    /// NOTE(review): `position` is presumably an offset into the input being
    /// parsed — confirm the unit against the code that constructs this variant.
198    #[error("Syntax error at position {position}: {message}")]
199    SyntaxError { position: usize, message: String },
200
    /// A token other than the expected one was found.
    ///
    /// `expected` and `found` are free-form descriptions that are
    /// interpolated into the error message.
201    #[error("Unexpected token: expected {expected}, found {found}")]
202    UnexpectedToken { expected: String, found: String },
203
204    /// Invalid or non-existent object reference.
    ///
    /// The two fields are rendered in the message as `{0} {1} R`.
    /// NOTE(review): presumably object number and generation number, in that
    /// order — confirm against callers.
205    #[error("Invalid object reference: {0} {1} R")]
206    InvalidReference(u32, u16),
207
208    /// Required dictionary key is missing.
    ///
    /// The payload is the key text shown in the error message.
209    #[error("Missing required key: {0}")]
210    MissingKey(String),
211
    /// The cross-reference (xref) table is invalid.
212    #[error("Invalid xref table")]
213    InvalidXRef,
214
    /// The trailer is invalid.
215    #[error("Invalid trailer")]
216    InvalidTrailer,
217
    /// A circular object reference was detected.
218    #[error("Circular reference detected")]
219    CircularReference,
220
221    /// Error decoding/decompressing stream data.
    ///
    /// The payload is a description appended to the error message.
222    #[error("Stream decode error: {0}")]
223    StreamDecodeError(String),
224
225    /// PDF encryption is not currently supported.
226    #[error("Encryption not supported")]
227    EncryptionNotSupported,
228}
229
230impl From<ParseError> for OxidizePdfError {
231    fn from(err: ParseError) -> Self {
232        OxidizePdfError::ParseError(err.to_string())
233    }
234}