oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
13//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
14//! - **Resource Access**: Work with fonts, images, and other PDF resources
15//! - **Pure Rust**: No C dependencies or external libraries
16//! - **100% Native**: Complete PDF implementation from scratch
17//!
18//! ## Quick Start
19//!
20//! ### Creating PDFs
21//!
22//! ```rust
23//! use oxidize_pdf::{Document, Page, Font, Color, Result};
24//!
25//! # fn main() -> Result<()> {
26//! // Create a new document
27//! let mut doc = Document::new();
28//! doc.set_title("My PDF");
29//!
30//! // Create a page
31//! let mut page = Page::a4();
32//!
33//! // Add text
34//! page.text()
35//! .set_font(Font::Helvetica, 24.0)
36//! .at(50.0, 700.0)
37//! .write("Hello, PDF!")?;
38//!
39//! // Add graphics
40//! page.graphics()
41//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
42//! .circle(300.0, 400.0, 50.0)
43//! .fill();
44//!
45//! // Save the document
46//! doc.add_page(page);
47//! doc.save("output.pdf")?;
48//! # Ok(())
49//! # }
50//! ```
51//!
52//! ### Parsing PDFs
53//!
54//! ```rust,no_run
55//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
56//!
57//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
58//! // Open and parse a PDF
59//! let reader = PdfReader::open("document.pdf")?;
60//! let document = PdfDocument::new(reader);
61//!
62//! // Get document information
63//! println!("Pages: {}", document.page_count()?);
64//! println!("Version: {}", document.version()?);
65//!
66//! // Process pages
67//! for i in 0..document.page_count()? {
68//! let page = document.get_page(i)?;
69//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
70//! }
71//!
72//! // Extract text
73//! let text_pages = document.extract_text()?;
74//! for (i, page_text) in text_pages.iter().enumerate() {
75//! println!("Page {} text: {}", i+1, page_text.text);
76//! }
77//! # Ok(())
78//! # }
79//! ```
80//!
81//! ## Modules
82//!
83//! ### Generation Modules
84//! - [`document`] - PDF document creation and management
85//! - [`page`] - Page creation and layout
86//! - [`graphics`] - Vector graphics and images
87//! - [`text`] - Text rendering and flow
88//! - [`writer`] - Low-level PDF writing
89//!
90//! ### Parsing Modules
91//! - [`parser`] - Complete PDF parsing and reading
92//! - [`parser::PdfDocument`] - High-level document interface
93//! - [`parser::ParsedPage`] - Page representation with resources
94//! - [`parser::ContentParser`] - Content stream parsing
95//! - [`parser::PdfObject`] - Low-level PDF objects
96//!
97//! ### Manipulation Modules
98//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
99//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
100//! - [`text::extraction`] - Text extraction with positioning
101//!
102//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
106//!
107//! ## Examples
108//!
109//! ### Content Stream Processing
110//!
111//! ```rust,no_run
112//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
113//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
114//!
115//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
116//! let reader = PdfReader::open("document.pdf")?;
117//! let document = PdfDocument::new(reader);
118//! let page = document.get_page(0)?;
119//!
120//! // Get and parse content streams
121//! let streams = page.content_streams_with_document(&document)?;
122//! for stream in streams {
123//! let operations = ContentParser::parse(&stream)?;
124//!
125//! for op in operations {
126//! match op {
127//! ContentOperation::ShowText(text) => {
128//! println!("Text: {:?}", String::from_utf8_lossy(&text));
129//! }
130//! ContentOperation::SetFont(name, size) => {
131//! println!("Font: {} at {} pt", name, size);
132//! }
133//! ContentOperation::MoveTo(x, y) => {
134//! println!("Move to ({}, {})", x, y);
135//! }
136//! _ => {} // Handle other operations
137//! }
138//! }
139//! }
140//! # Ok(())
141//! # }
142//! ```
143//!
144//! ### Resource Access
145//!
146//! ```rust,no_run
147//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
148//!
149//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
150//! let reader = PdfReader::open("document.pdf")?;
151//! let document = PdfDocument::new(reader);
152//! let page = document.get_page(0)?;
153//!
154//! // Access page resources
155//! if let Some(resources) = page.get_resources() {
156//! // Check fonts
157//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
158//! for (name, _) in &fonts.0 {
159//! println!("Font resource: {}", name.as_str());
160//! }
161//! }
162//!
163//! // Check images/XObjects
164//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
165//! for (name, _) in &xobjects.0 {
166//! println!("XObject resource: {}", name.as_str());
167//! }
168//! }
169//! }
170//! # Ok(())
171//! # }
172//! ```
173
174pub mod document;
175pub mod error;
176pub mod graphics;
177pub mod objects;
178pub mod operations;
179pub mod page;
180pub mod parser;
181pub mod text;
182pub mod writer;
183
184#[cfg(feature = "semantic")]
185pub mod semantic;
186
187// Re-export generation types
188pub use document::{Document, DocumentMetadata};
189pub use error::{OxidizePdfError, PdfError, Result};
190pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
191pub use page::{Margins, Page};
192pub use text::{
193 measure_text, split_into_words, Font, FontFamily, FragmentType, ImagePreprocessing,
194 MockOcrProvider, OcrEngine, OcrError, OcrOptions, OcrProcessingResult, OcrProvider, OcrResult,
195 OcrTextFragment, TextAlign, TextContext, TextFlowContext,
196};
197
198// Re-export parsing types
199pub use parser::{
200 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParsedPage,
201 PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream, PdfString,
202};
203
204// Re-export operations
205pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
206
207/// Current version of oxidize-pdf
208pub const VERSION: &str = env!("CARGO_PKG_VERSION");
209
// NOTE(review): the OCR example below is attached to `pdf_version` only because it
// precedes this module in the file; it is kept here so the doctest is not lost.
// Consider moving it to the crate-level docs or to `operations::page_analysis`.

/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF specification versions this crate declares as
/// fully supported; `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported.
    ///
    /// Each entry is the version written as `"major.minor"`.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

    /// PDF 2.0 support is planned.
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
253
#[cfg(test)]
mod tests {
    use super::*;

    /// A document that has just been constructed holds no pages.
    #[test]
    fn test_create_empty_document() {
        let page_count = Document::new().pages.len();
        assert_eq!(page_count, 0);
    }

    /// `Page::new` reports back the width and height it was constructed with.
    #[test]
    fn test_create_page() {
        let sized_page = Page::new(595.0, 842.0);
        let reported = (sized_page.width(), sized_page.height());
        assert_eq!(reported, (595.0, 842.0));
    }

    /// The crate version string is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert_ne!(VERSION, "");

        let lists_pdf_17 = pdf_version::SUPPORTED_VERSIONS
            .iter()
            .any(|version| *version == "1.7");
        assert!(lists_pdf_17);
    }
}