oxidize_pdf/parser/mod.rs
//! PDF Parser Module - Complete PDF parsing and rendering support
//!
//! This module provides a comprehensive, 100% native Rust implementation for parsing PDF files
//! according to the ISO 32000-1 (PDF 1.7) and ISO 32000-2 (PDF 2.0) specifications.
//!
//! # Overview
//!
//! The parser is designed to support building PDF renderers, content extractors, and analysis tools.
//! It provides multiple levels of API access:
//!
//! - **High-level**: `PdfDocument` for easy document manipulation
//! - **Mid-level**: `ParsedPage`, content streams, and resources
//! - **Low-level**: Direct access to PDF objects and streams
//!
//! # Quick Start
//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//! use oxidize_pdf::parser::content::ContentParser;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Open a PDF document
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//!
//! // Get document information
//! println!("Pages: {}", document.page_count()?);
//! println!("Version: {}", document.version()?);
//!
//! // Process first page
//! let page = document.get_page(0)?;
//! println!("Page size: {}x{} points", page.width(), page.height());
//!
//! // Parse content streams
//! let streams = page.content_streams_with_document(&document)?;
//! for stream in streams {
//!     let operations = ContentParser::parse(&stream)?;
//!     println!("Operations: {}", operations.len());
//! }
//!
//! // Extract text
//! let text = document.extract_text_from_page(0)?;
//! println!("Text: {}", text.text);
//! # Ok(())
//! # }
//! ```
//!
//! # Architecture
//!
//! ```text
//! ┌────────────────────────────────────────────────┐
//! │                  PdfDocument                   │ ← High-level API
//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
//! │  │PdfReader │ │PageTree  │ │ResourceManager │  │
//! │  └──────────┘ └──────────┘ └────────────────┘  │
//! └────────────────────────────────────────────────┘
//!         │            │               │
//!         ↓            ↓               ↓
//! ┌────────────────────────────────────────────────┐
//! │                   ParsedPage                   │ ← Page API
//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
//! │  │Properties│ │Resources │ │Content Streams │  │
//! │  └──────────┘ └──────────┘ └────────────────┘  │
//! └────────────────────────────────────────────────┘
//!         │            │               │
//!         ↓            ↓               ↓
//! ┌────────────────────────────────────────────────┐
//! │           ContentParser & PdfObject            │ ← Low-level API
//! │  ┌──────────┐ ┌──────────┐ ┌────────────────┐  │
//! │  │Tokenizer │ │Operators │ │Object Types    │  │
//! │  └──────────┘ └──────────┘ └────────────────┘  │
//! └────────────────────────────────────────────────┘
//! ```
//!
//! # Features
//!
//! - **Complete PDF Object Model**: All PDF object types supported
//! - **Content Stream Parsing**: Full operator support for rendering
//! - **Resource Management**: Fonts, images, color spaces, patterns
//! - **Text Extraction**: With position and formatting information
//! - **Page Navigation**: Efficient page tree traversal
//! - **Stream Filters**: Decompression support (FlateDecode, ASCIIHex, etc.)
//! - **Reference Resolution**: Automatic handling of indirect objects
//!
//! # Example: Building a Simple Renderer
//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! struct SimpleRenderer {
//!     current_path: Vec<(f32, f32)>,
//! }
//!
//! impl SimpleRenderer {
//!     fn render_page(document: &PdfDocument<std::fs::File>, page_idx: u32) -> Result<(), Box<dyn std::error::Error>> {
//!         let page = document.get_page(page_idx)?;
//!         let streams = page.content_streams_with_document(&document)?;
//!
//!         let mut renderer = SimpleRenderer {
//!             current_path: Vec::new(),
//!         };
//!
//!         for stream in streams {
//!             let operations = ContentParser::parse(&stream)?;
//!             for op in operations {
//!                 match op {
//!                     ContentOperation::MoveTo(x, y) => {
//!                         renderer.current_path.clear();
//!                         renderer.current_path.push((x, y));
//!                     }
//!                     ContentOperation::LineTo(x, y) => {
//!                         renderer.current_path.push((x, y));
//!                     }
//!                     ContentOperation::Stroke => {
//!                         println!("Draw path with {} points", renderer.current_path.len());
//!                         renderer.current_path.clear();
//!                     }
//!                     ContentOperation::ShowText(text) => {
//!                         println!("Draw text: {:?}", String::from_utf8_lossy(&text));
//!                     }
//!                     _ => {} // Handle other operations
//!                 }
//!             }
//!         }
//!         Ok(())
//!     }
//! }
//! # Ok(())
//! # }
//! ```

134pub mod content;
135pub mod document;
136pub mod filters;
137pub mod header;
138pub mod lexer;
139pub mod object_stream;
140pub mod objects;
141pub mod page_tree;
142pub mod reader;
143pub mod trailer;
144pub mod xref;
145
146use crate::error::OxidizePdfError;
147
148// Re-export main types for convenient access
149pub use self::content::{ContentOperation, ContentParser, TextElement};
150pub use self::document::{PdfDocument, ResourceManager};
151pub use self::objects::{PdfArray, PdfDictionary, PdfName, PdfObject, PdfStream, PdfString};
152pub use self::page_tree::ParsedPage;
153pub use self::reader::{DocumentMetadata, PdfReader};
154
155/// Result type for parser operations
156pub type ParseResult<T> = Result<T, ParseError>;
157
158/// PDF Parser errors covering all failure modes during parsing.
159///
160/// # Error Categories
161///
162/// - **I/O Errors**: File access and reading issues
163/// - **Format Errors**: Invalid PDF structure or syntax
164/// - **Unsupported Features**: Encryption, newer PDF versions
165/// - **Reference Errors**: Invalid or circular object references
166/// - **Stream Errors**: Decompression or filter failures
167///
168/// # Example
169///
170/// ```rust
171/// use oxidize_pdf::parser::{PdfReader, ParseError};
172///
173/// # fn example() -> Result<(), ParseError> {
174/// match PdfReader::open("missing.pdf") {
175/// Ok(_) => println!("File opened"),
176/// Err(ParseError::Io(e)) => println!("IO error: {}", e),
177/// Err(ParseError::InvalidHeader) => println!("Not a valid PDF"),
178/// Err(e) => println!("Other error: {}", e),
179/// }
180/// # Ok(())
181/// # }
182/// ```
183#[derive(Debug, thiserror::Error)]
184pub enum ParseError {
185 /// I/O error during file operations
186 #[error("IO error: {0}")]
187 Io(#[from] std::io::Error),
188
189 /// PDF file doesn't start with valid header (%PDF-)
190 #[error("Invalid PDF header")]
191 InvalidHeader,
192
193 /// PDF version is not supported
194 #[error("Unsupported PDF version: {0}")]
195 UnsupportedVersion(String),
196
197 /// Syntax error in PDF structure
198 #[error("Syntax error at position {position}: {message}")]
199 SyntaxError { position: usize, message: String },
200
201 #[error("Unexpected token: expected {expected}, found {found}")]
202 UnexpectedToken { expected: String, found: String },
203
204 /// Invalid or non-existent object reference
205 #[error("Invalid object reference: {0} {1} R")]
206 InvalidReference(u32, u16),
207
208 /// Required dictionary key is missing
209 #[error("Missing required key: {0}")]
210 MissingKey(String),
211
212 #[error("Invalid xref table")]
213 InvalidXRef,
214
215 #[error("Invalid trailer")]
216 InvalidTrailer,
217
218 #[error("Circular reference detected")]
219 CircularReference,
220
221 /// Error decoding/decompressing stream data
222 #[error("Stream decode error: {0}")]
223 StreamDecodeError(String),
224
225 /// PDF encryption is not currently supported
226 #[error("Encryption not supported")]
227 EncryptionNotSupported,
228}
229
230impl From<ParseError> for OxidizePdfError {
231 fn from(err: ParseError) -> Self {
232 OxidizePdfError::ParseError(err.to_string())
233 }
234}