oxidize_pdf/parser/
mod.rs

//! PDF Parser Module - Complete PDF parsing and rendering support
//!
//! This module provides a comprehensive, 100% native Rust implementation for parsing PDF files
//! according to the ISO 32000-1 (PDF 1.7) and ISO 32000-2 (PDF 2.0) specifications.
//!
//! # Overview
//!
//! The parser is designed to support building PDF renderers, content extractors, and analysis tools.
//! It provides multiple levels of API access:
//!
//! - **High-level**: `PdfDocument` for easy document manipulation
//! - **Mid-level**: `ParsedPage`, content streams, and resources
//! - **Low-level**: Direct access to PDF objects and streams
//!
//! # Quick Start
//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//! use oxidize_pdf::parser::content::ContentParser;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Open a PDF document
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//!
//! // Get document information
//! println!("Pages: {}", document.page_count()?);
//! println!("Version: {}", document.version()?);
//!
//! // Process first page
//! let page = document.get_page(0)?;
//! println!("Page size: {}x{} points", page.width(), page.height());
//!
//! // Parse content streams
//! let streams = page.content_streams_with_document(&document)?;
//! for stream in streams {
//!     let operations = ContentParser::parse(&stream)?;
//!     println!("Operations: {}", operations.len());
//! }
//!
//! // Extract text
//! let text = document.extract_text_from_page(0)?;
//! println!("Text: {}", text.text);
//! # Ok(())
//! # }
//! ```
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────┐
//! │                 PdfDocument                     │ ← High-level API
//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
//! │  │PdfReader │ │PageTree  │ │ResourceManager │  │
//! │  └──────────┘ └──────────┘ └────────────────┘  │
//! └─────────────────────────────────────────────────┘
//!            │              │              │
//!            ↓              ↓              ↓
//! ┌─────────────────────────────────────────────────┐
//! │              ParsedPage                         │ ← Page API
//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
//! │  │Properties│ │Resources │ │Content Streams │  │
//! │  └──────────┘ └──────────┘ └────────────────┘  │
//! └─────────────────────────────────────────────────┘
//!            │              │              │
//!            ↓              ↓              ↓
//! ┌─────────────────────────────────────────────────┐
//! │         ContentParser & PdfObject               │ ← Low-level API
//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
//! │  │Tokenizer │ │Operators │ │Object Types    │  │
//! │  └──────────┘ └──────────┘ └────────────────┘  │
//! └─────────────────────────────────────────────────┘
//! ```
//!
//! # Features
//!
//! - **Complete PDF Object Model**: All PDF object types supported
//! - **Content Stream Parsing**: Full operator support for rendering
//! - **Resource Management**: Fonts, images, color spaces, patterns
//! - **Text Extraction**: With position and formatting information
//! - **Page Navigation**: Efficient page tree traversal
//! - **Stream Filters**: Decoding and decompression support (FlateDecode, ASCIIHexDecode, etc.)
//! - **Reference Resolution**: Automatic handling of indirect objects
//!
//! # Example: Building a Simple Renderer
//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! struct SimpleRenderer {
//!     current_path: Vec<(f32, f32)>,
//! }
//!
//! impl SimpleRenderer {
//!     fn render_page(document: &PdfDocument<std::fs::File>, page_idx: u32) -> Result<(), Box<dyn std::error::Error>> {
//!         let page = document.get_page(page_idx)?;
//!         let streams = page.content_streams_with_document(&document)?;
//!
//!         let mut renderer = SimpleRenderer {
//!             current_path: Vec::new(),
//!         };
//!
//!         for stream in streams {
//!             let operations = ContentParser::parse(&stream)?;
//!             for op in operations {
//!                 match op {
//!                     ContentOperation::MoveTo(x, y) => {
//!                         renderer.current_path.clear();
//!                         renderer.current_path.push((x, y));
//!                     }
//!                     ContentOperation::LineTo(x, y) => {
//!                         renderer.current_path.push((x, y));
//!                     }
//!                     ContentOperation::Stroke => {
//!                         println!("Draw path with {} points", renderer.current_path.len());
//!                         renderer.current_path.clear();
//!                     }
//!                     ContentOperation::ShowText(text) => {
//!                         println!("Draw text: {:?}", String::from_utf8_lossy(&text));
//!                     }
//!                     _ => {} // Handle other operations
//!                 }
//!             }
//!         }
//!         Ok(())
//!     }
//! }
//! # Ok(())
//! # }
//! ```
133
134pub mod content;
135pub mod document;
136pub mod filters;
137pub mod header;
138pub mod lexer;
139pub mod object_stream;
140pub mod objects;
141pub mod page_tree;
142pub mod reader;
143pub mod trailer;
144pub mod xref;
145
146#[cfg(test)]
147pub mod test_helpers;
148
149use crate::error::OxidizePdfError;
150
151// Re-export main types for convenient access
152pub use self::content::{ContentOperation, ContentParser, TextElement};
153pub use self::document::{PdfDocument, ResourceManager};
154pub use self::objects::{PdfArray, PdfDictionary, PdfName, PdfObject, PdfStream, PdfString};
155pub use self::page_tree::ParsedPage;
156pub use self::reader::{DocumentMetadata, PdfReader};
157
158/// Result type for parser operations
159pub type ParseResult<T> = Result<T, ParseError>;
160
161/// PDF Parser errors covering all failure modes during parsing.
162///
163/// # Error Categories
164///
165/// - **I/O Errors**: File access and reading issues
166/// - **Format Errors**: Invalid PDF structure or syntax
167/// - **Unsupported Features**: Encryption, newer PDF versions
168/// - **Reference Errors**: Invalid or circular object references
169/// - **Stream Errors**: Decompression or filter failures
170///
171/// # Example
172///
173/// ```rust
174/// use oxidize_pdf::parser::{PdfReader, ParseError};
175///
176/// # fn example() -> Result<(), ParseError> {
177/// match PdfReader::open("missing.pdf") {
178///     Ok(_) => println!("File opened"),
179///     Err(ParseError::Io(e)) => println!("IO error: {}", e),
180///     Err(ParseError::InvalidHeader) => println!("Not a valid PDF"),
181///     Err(e) => println!("Other error: {}", e),
182/// }
183/// # Ok(())
184/// # }
185/// ```
186#[derive(Debug, thiserror::Error)]
187pub enum ParseError {
188    /// I/O error during file operations
189    #[error("IO error: {0}")]
190    Io(#[from] std::io::Error),
191
192    /// PDF file doesn't start with valid header (%PDF-)
193    #[error("Invalid PDF header")]
194    InvalidHeader,
195
196    /// PDF version is not supported
197    #[error("Unsupported PDF version: {0}")]
198    UnsupportedVersion(String),
199
200    /// Syntax error in PDF structure
201    #[error("Syntax error at position {position}: {message}")]
202    SyntaxError { position: usize, message: String },
203
204    #[error("Unexpected token: expected {expected}, found {found}")]
205    UnexpectedToken { expected: String, found: String },
206
207    /// Invalid or non-existent object reference
208    #[error("Invalid object reference: {0} {1} R")]
209    InvalidReference(u32, u16),
210
211    /// Required dictionary key is missing
212    #[error("Missing required key: {0}")]
213    MissingKey(String),
214
215    #[error("Invalid xref table")]
216    InvalidXRef,
217
218    #[error("Invalid trailer")]
219    InvalidTrailer,
220
221    #[error("Circular reference detected")]
222    CircularReference,
223
224    /// Error decoding/decompressing stream data
225    #[error("Stream decode error: {0}")]
226    StreamDecodeError(String),
227
228    /// PDF encryption is not currently supported
229    #[error("Encryption not supported")]
230    EncryptionNotSupported,
231}
232
233impl From<ParseError> for OxidizePdfError {
234    fn from(err: ParseError) -> Self {
235        OxidizePdfError::ParseError(err.to_string())
236    }
237}
238
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: fails to compile if any of the re-exported object types
    /// or their constructors disappears from this module.
    #[test]
    fn test_module_exports() {
        let _obj = PdfObject::Null;
        let _dict = PdfDictionary::new();
        let _array = PdfArray::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());
    }

    /// `ParseError` converts into `OxidizePdfError::ParseError` and carries
    /// the rendered message of the original error.
    #[test]
    fn test_parse_error_conversion() {
        let io_error = std::io::Error::new(std::io::ErrorKind::NotFound, "File not found");
        let oxidize_error: OxidizePdfError = ParseError::Io(io_error).into();

        match oxidize_error {
            OxidizePdfError::ParseError(message) => {
                assert_eq!(message, "IO error: File not found");
            }
            _ => panic!("Expected ParseError variant"),
        }
    }

    /// Every variant except `Io` (covered above) renders exactly the text
    /// declared in its `#[error(...)]` attribute.
    #[test]
    fn test_parse_error_messages() {
        let cases = [
            (ParseError::InvalidHeader, "Invalid PDF header"),
            (
                ParseError::UnsupportedVersion("2.5".to_string()),
                "Unsupported PDF version: 2.5",
            ),
            (
                ParseError::SyntaxError {
                    position: 42,
                    message: "unexpected EOF".to_string(),
                },
                "Syntax error at position 42: unexpected EOF",
            ),
            (
                ParseError::UnexpectedToken {
                    expected: "obj".to_string(),
                    found: "endobj".to_string(),
                },
                "Unexpected token: expected obj, found endobj",
            ),
            (
                ParseError::InvalidReference(12, 0),
                "Invalid object reference: 12 0 R",
            ),
            (
                ParseError::MissingKey("Root".to_string()),
                "Missing required key: Root",
            ),
            (ParseError::InvalidXRef, "Invalid xref table"),
            (ParseError::InvalidTrailer, "Invalid trailer"),
            (ParseError::CircularReference, "Circular reference detected"),
            (
                ParseError::StreamDecodeError("bad data".to_string()),
                "Stream decode error: bad data",
            ),
            (ParseError::EncryptionNotSupported, "Encryption not supported"),
        ];

        for (error, expected) in &cases {
            assert_eq!(error.to_string(), *expected);
        }
    }
}
oxidize_pdf/parser/mod.rs

oxidize_pdf/parser/
mod.rs