oxidize_pdf/parser/mod.rs
1//! PDF Parser Module - Complete PDF parsing and rendering support
2//!
3//! This module provides a comprehensive, 100% native Rust implementation for parsing PDF files
4//! according to the ISO 32000-1 (PDF 1.7) and ISO 32000-2 (PDF 2.0) specifications.
5//!
6//! # Overview
7//!
8//! The parser is designed to support building PDF renderers, content extractors, and analysis tools.
9//! It provides multiple levels of API access:
10//!
11//! - **High-level**: `PdfDocument` for easy document manipulation
12//! - **Mid-level**: `ParsedPage`, content streams, and resources
13//! - **Low-level**: Direct access to PDF objects and streams
14//!
15//! # Quick Start
16//!
17//! ```rust,no_run
18//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
19//! use oxidize_pdf::parser::content::ContentParser;
20//!
21//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
22//! // Open a PDF document
23//! let reader = PdfReader::open("document.pdf")?;
24//! let document = PdfDocument::new(reader);
25//!
26//! // Get document information
27//! println!("Pages: {}", document.page_count()?);
28//! println!("Version: {}", document.version()?);
29//!
30//! // Process first page
31//! let page = document.get_page(0)?;
32//! println!("Page size: {}x{} points", page.width(), page.height());
33//!
34//! // Parse content streams
35//! let streams = page.content_streams_with_document(&document)?;
36//! for stream in streams {
37//! let operations = ContentParser::parse(&stream)?;
38//! println!("Operations: {}", operations.len());
39//! }
40//!
41//! // Extract text
42//! let text = document.extract_text_from_page(0)?;
43//! println!("Text: {}", text.text);
44//! # Ok(())
45//! # }
46//! ```
47//!
48//! # Architecture
49//!
50//! ```text
51//! ┌─────────────────────────────────────────────────┐
52//! │ PdfDocument │ ← High-level API
53//! │ ┌──────────┐ ┌──────────┐ ┌────────────────┐ │
54//! │ │PdfReader │ │PageTree │ │ResourceManager │ │
55//! │ └──────────┘ └──────────┘ └────────────────┘ │
56//! └─────────────────────────────────────────────────┘
57//! │ │ │
58//! ↓ ↓ ↓
59//! ┌─────────────────────────────────────────────────┐
60//! │ ParsedPage │ ← Page API
61//! │ ┌──────────┐ ┌──────────┐ ┌────────────────┐ │
62//! │ │Properties│ │Resources │ │Content Streams │ │
63//! │ └──────────┘ └──────────┘ └────────────────┘ │
64//! └─────────────────────────────────────────────────┘
65//! │ │ │
66//! ↓ ↓ ↓
67//! ┌─────────────────────────────────────────────────┐
68//! │ ContentParser & PdfObject │ ← Low-level API
69//! │ ┌──────────┐ ┌──────────┐ ┌────────────────┐ │
70//! │ │Tokenizer │ │Operators │ │Object Types │ │
71//! │ └──────────┘ └──────────┘ └────────────────┘ │
72//! └─────────────────────────────────────────────────┘
73//! ```
74//!
75//! # Features
76//!
77//! - **Complete PDF Object Model**: All PDF object types supported
78//! - **Content Stream Parsing**: Full operator support for rendering
79//! - **Resource Management**: Fonts, images, color spaces, patterns
80//! - **Text Extraction**: With position and formatting information
81//! - **Page Navigation**: Efficient page tree traversal
82//! - **Stream Filters**: Decompression support (FlateDecode, ASCIIHex, etc.)
83//! - **Reference Resolution**: Automatic handling of indirect objects
84//!
85//! # Example: Building a Simple Renderer
86//!
87//! ```rust,no_run
88//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
89//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
90//!
91//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
92//! struct SimpleRenderer {
93//! current_path: Vec<(f32, f32)>,
94//! }
95//!
96//! impl SimpleRenderer {
97//! fn render_page(document: &PdfDocument<std::fs::File>, page_idx: u32) -> Result<(), Box<dyn std::error::Error>> {
98//! let page = document.get_page(page_idx)?;
99//! let streams = page.content_streams_with_document(&document)?;
100//!
101//! let mut renderer = SimpleRenderer {
102//! current_path: Vec::new(),
103//! };
104//!
105//! for stream in streams {
106//! let operations = ContentParser::parse(&stream)?;
107//! for op in operations {
108//! match op {
109//! ContentOperation::MoveTo(x, y) => {
110//! renderer.current_path.clear();
111//! renderer.current_path.push((x, y));
112//! }
113//! ContentOperation::LineTo(x, y) => {
114//! renderer.current_path.push((x, y));
115//! }
116//! ContentOperation::Stroke => {
117//! println!("Draw path with {} points", renderer.current_path.len());
118//! renderer.current_path.clear();
119//! }
120//! ContentOperation::ShowText(text) => {
121//! println!("Draw text: {:?}", String::from_utf8_lossy(&text));
122//! }
123//! _ => {} // Handle other operations
124//! }
125//! }
126//! }
127//! Ok(())
128//! }
129//! }
130//! # Ok(())
131//! # }
132//! ```
133
134pub mod content;
135pub mod document;
136pub mod filters;
137pub mod header;
138pub mod lexer;
139pub mod object_stream;
140pub mod objects;
141pub mod page_tree;
142pub mod reader;
143pub mod trailer;
144pub mod xref;
145
146#[cfg(test)]
147pub mod test_helpers;
148
149use crate::error::OxidizePdfError;
150
151// Re-export main types for convenient access
152pub use self::content::{ContentOperation, ContentParser, TextElement};
153pub use self::document::{PdfDocument, ResourceManager};
154pub use self::objects::{PdfArray, PdfDictionary, PdfName, PdfObject, PdfStream, PdfString};
155pub use self::page_tree::ParsedPage;
156pub use self::reader::{DocumentMetadata, PdfReader};
157
158/// Result type for parser operations
159pub type ParseResult<T> = Result<T, ParseError>;
160
161/// PDF Parser errors covering all failure modes during parsing.
162///
163/// # Error Categories
164///
165/// - **I/O Errors**: File access and reading issues
166/// - **Format Errors**: Invalid PDF structure or syntax
167/// - **Unsupported Features**: Encryption, newer PDF versions
168/// - **Reference Errors**: Invalid or circular object references
169/// - **Stream Errors**: Decompression or filter failures
170///
171/// # Example
172///
173/// ```rust
174/// use oxidize_pdf::parser::{PdfReader, ParseError};
175///
176/// # fn example() -> Result<(), ParseError> {
177/// match PdfReader::open("missing.pdf") {
178/// Ok(_) => println!("File opened"),
179/// Err(ParseError::Io(e)) => println!("IO error: {}", e),
180/// Err(ParseError::InvalidHeader) => println!("Not a valid PDF"),
181/// Err(e) => println!("Other error: {}", e),
182/// }
183/// # Ok(())
184/// # }
185/// ```
186#[derive(Debug, thiserror::Error)]
187pub enum ParseError {
188 /// I/O error during file operations
189 #[error("IO error: {0}")]
190 Io(#[from] std::io::Error),
191
192 /// PDF file doesn't start with valid header (%PDF-)
193 #[error("Invalid PDF header")]
194 InvalidHeader,
195
196 /// PDF version is not supported
197 #[error("Unsupported PDF version: {0}")]
198 UnsupportedVersion(String),
199
200 /// Syntax error in PDF structure
201 #[error("Syntax error at position {position}: {message}")]
202 SyntaxError { position: usize, message: String },
203
204 #[error("Unexpected token: expected {expected}, found {found}")]
205 UnexpectedToken { expected: String, found: String },
206
207 /// Invalid or non-existent object reference
208 #[error("Invalid object reference: {0} {1} R")]
209 InvalidReference(u32, u16),
210
211 /// Required dictionary key is missing
212 #[error("Missing required key: {0}")]
213 MissingKey(String),
214
215 #[error("Invalid xref table")]
216 InvalidXRef,
217
218 #[error("Invalid trailer")]
219 InvalidTrailer,
220
221 #[error("Circular reference detected")]
222 CircularReference,
223
224 /// Error decoding/decompressing stream data
225 #[error("Stream decode error: {0}")]
226 StreamDecodeError(String),
227
228 /// PDF encryption is not currently supported
229 #[error("Encryption not supported")]
230 EncryptionNotSupported,
231}
232
233impl From<ParseError> for OxidizePdfError {
234 fn from(err: ParseError) -> Self {
235 OxidizePdfError::ParseError(err.to_string())
236 }
237}
238
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the re-exported object types are reachable through this
    /// module and can be constructed.
    #[test]
    fn test_module_exports() {
        let _obj = PdfObject::Null;
        let _dict = PdfDictionary::new();
        let _array = PdfArray::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());
    }

    /// Converting a `ParseError` into `OxidizePdfError` yields the
    /// `ParseError` variant carrying the rendered message.
    #[test]
    fn test_parse_error_conversion() {
        let io_error = std::io::Error::new(std::io::ErrorKind::NotFound, "File not found");
        let oxidize_error: OxidizePdfError = ParseError::Io(io_error).into();

        // Fail explicitly on the wrong variant instead of asserting on a
        // constant (`assert!(true)` / `assert!(false)`).
        match oxidize_error {
            OxidizePdfError::ParseError(message) => {
                assert_eq!(message, "IO error: File not found");
            }
            _ => panic!("Expected ParseError variant"),
        }
    }

    /// Every variant renders the exact message declared in its `#[error]`
    /// attribute, including the ones that interpolate fields.
    #[test]
    fn test_parse_error_messages() {
        let cases = vec![
            (ParseError::InvalidHeader, "Invalid PDF header"),
            (
                ParseError::UnsupportedVersion("2.5".to_string()),
                "Unsupported PDF version: 2.5",
            ),
            (
                ParseError::SyntaxError {
                    position: 42,
                    message: "unexpected end".to_string(),
                },
                "Syntax error at position 42: unexpected end",
            ),
            (
                ParseError::UnexpectedToken {
                    expected: "obj".to_string(),
                    found: "endobj".to_string(),
                },
                "Unexpected token: expected obj, found endobj",
            ),
            (
                ParseError::InvalidReference(12, 0),
                "Invalid object reference: 12 0 R",
            ),
            (
                ParseError::MissingKey("Root".to_string()),
                "Missing required key: Root",
            ),
            (ParseError::InvalidXRef, "Invalid xref table"),
            (ParseError::InvalidTrailer, "Invalid trailer"),
            (ParseError::CircularReference, "Circular reference detected"),
            (
                ParseError::StreamDecodeError("bad zlib header".to_string()),
                "Stream decode error: bad zlib header",
            ),
            (
                ParseError::EncryptionNotSupported,
                "Encryption not supported",
            ),
        ];

        for (error, expected) in cases {
            assert_eq!(error.to_string(), expected);
        }
    }
}