oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
13//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
14//! - **Resource Access**: Work with fonts, images, and other PDF resources
15//! - **Pure Rust**: No C dependencies or external libraries
16//! - **100% Native**: Complete PDF implementation from scratch
17//!
18//! ## Quick Start
19//!
20//! ### Creating PDFs
21//!
22//! ```rust
23//! use oxidize_pdf::{Document, Page, Font, Color, Result};
24//!
25//! # fn main() -> Result<()> {
26//! // Create a new document
27//! let mut doc = Document::new();
28//! doc.set_title("My PDF");
29//!
30//! // Create a page
31//! let mut page = Page::a4();
32//!
33//! // Add text
34//! page.text()
35//! .set_font(Font::Helvetica, 24.0)
36//! .at(50.0, 700.0)
37//! .write("Hello, PDF!")?;
38//!
39//! // Add graphics
40//! page.graphics()
41//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
42//! .circle(300.0, 400.0, 50.0)
43//! .fill();
44//!
45//! // Save the document
46//! doc.add_page(page);
47//! doc.save("output.pdf")?;
48//! # Ok(())
49//! # }
50//! ```
51//!
52//! ### Parsing PDFs
53//!
54//! ```rust,no_run
55//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
56//!
57//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
58//! // Open and parse a PDF
59//! let reader = PdfReader::open("document.pdf")?;
60//! let document = PdfDocument::new(reader);
61//!
62//! // Get document information
63//! println!("Pages: {}", document.page_count()?);
64//! println!("Version: {}", document.version()?);
65//!
66//! // Process pages
67//! for i in 0..document.page_count()? {
68//! let page = document.get_page(i)?;
69//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
70//! }
71//!
72//! // Extract text
73//! let text_pages = document.extract_text()?;
74//! for (i, page_text) in text_pages.iter().enumerate() {
75//! println!("Page {} text: {}", i+1, page_text.text);
76//! }
77//! # Ok(())
78//! # }
79//! ```
80//!
81//! ## Modules
82//!
83//! ### Generation Modules
84//! - [`document`] - PDF document creation and management
85//! - [`page`] - Page creation and layout
86//! - [`graphics`] - Vector graphics and images
87//! - [`text`] - Text rendering and flow
88//! - [`writer`] - Low-level PDF writing
89//!
90//! ### Parsing Modules
91//! - [`parser`] - Complete PDF parsing and reading
92//! - [`parser::PdfDocument`] - High-level document interface
93//! - [`parser::ParsedPage`] - Page representation with resources
94//! - [`parser::ContentParser`] - Content stream parsing
95//! - [`parser::PdfObject`] - Low-level PDF objects
96//!
97//! ### Manipulation Modules
98//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
99//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
100//! - [`text::extraction`] - Text extraction with positioning
101//!
102//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
106//!
107//! ## Examples
108//!
109//! ### Content Stream Processing
110//!
111//! ```rust,no_run
112//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
113//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
114//!
115//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
116//! let reader = PdfReader::open("document.pdf")?;
117//! let document = PdfDocument::new(reader);
118//! let page = document.get_page(0)?;
119//!
120//! // Get and parse content streams
121//! let streams = page.content_streams_with_document(&document)?;
122//! for stream in streams {
123//! let operations = ContentParser::parse(&stream)?;
124//!
125//! for op in operations {
126//! match op {
127//! ContentOperation::ShowText(text) => {
128//! println!("Text: {:?}", String::from_utf8_lossy(&text));
129//! }
130//! ContentOperation::SetFont(name, size) => {
131//! println!("Font: {} at {} pt", name, size);
132//! }
133//! ContentOperation::MoveTo(x, y) => {
134//! println!("Move to ({}, {})", x, y);
135//! }
136//! _ => {} // Handle other operations
137//! }
138//! }
139//! }
140//! # Ok(())
141//! # }
142//! ```
143//!
144//! ### Resource Access
145//!
146//! ```rust,no_run
147//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
148//!
149//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
150//! let reader = PdfReader::open("document.pdf")?;
151//! let document = PdfDocument::new(reader);
152//! let page = document.get_page(0)?;
153//!
154//! // Access page resources
155//! if let Some(resources) = page.get_resources() {
156//! // Check fonts
157//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
158//! for (name, _) in &fonts.0 {
159//! println!("Font resource: {}", name.as_str());
160//! }
161//! }
162//!
163//! // Check images/XObjects
164//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
165//! for (name, _) in &xobjects.0 {
166//! println!("XObject resource: {}", name.as_str());
167//! }
168//! }
169//! }
170//! # Ok(())
171//! # }
172//! ```
173
174pub mod batch;
175pub mod document;
176pub mod error;
177pub mod graphics;
178pub mod memory;
179pub mod objects;
180pub mod operations;
181pub mod page;
182pub mod parser;
183pub mod recovery;
184pub mod streaming;
185pub mod text;
186pub mod writer;
187
188#[cfg(feature = "semantic")]
189pub mod semantic;
190
191// Re-export generation types
192pub use document::{Document, DocumentMetadata};
193pub use error::{OxidizePdfError, PdfError, Result};
194pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
195pub use page::{Margins, Page};
196pub use text::{
197 measure_text, split_into_words, Font, FontFamily, FragmentType, ImagePreprocessing,
198 MockOcrProvider, OcrEngine, OcrError, OcrOptions, OcrProcessingResult, OcrProvider, OcrResult,
199 OcrTextFragment, TextAlign, TextContext, TextFlowContext,
200};
201
202// Re-export parsing types
203pub use parser::{
204 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParsedPage,
205 PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream, PdfString,
206};
207
208// Re-export operations
209pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
210
211// Re-export memory optimization types
212pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
213
214// Re-export streaming types
215pub use streaming::{
216 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
217 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
218 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
219};
220
221// Re-export batch processing types
222pub use batch::{
223 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
224 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
225 ProgressCallback, ProgressInfo,
226};
227
228// Re-export recovery types
229pub use recovery::{
230 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
231 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
232 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
233};
234
/// Current version of oxidize-pdf
///
/// Expanded at compile time by `env!` from the `CARGO_PKG_VERSION` environment
/// variable, which Cargo sets to the package version declared in `Cargo.toml`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
237
// NOTE(review): the OCR walkthrough in the doc comment below exercises
// `operations::page_analysis` and the OCR types from `text`, not this module.
// It is kept on this item, under its own heading, so the doctest keeps being
// compiled; consider relocating it to the crate-level documentation.
/// Supported PDF versions
///
/// `SUPPORTED_VERSIONS` lists the PDF versions that are fully supported and
/// `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
281
#[cfg(test)]
mod tests {
    // Smoke tests for the crate root. They check that the items re-exported
    // from this file are reachable through `super::*` and that the basic
    // constructors and constants behave as expected.
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {} to be supported",
                version
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not read back here (this test
        // uses no getters); it only ensures the setters don't panic and that
        // setting metadata does not add pages.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The preset and custom page constructors produce the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color (if available)
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are exported.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried by the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are re-exported from the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test operation types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are re-exported and constructible.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` and `measure_text` return sensible results for simple input.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement (with mock font)
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are exported and `from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ImageColorSpace::DeviceRGB;

        // Test that image creation doesn't panic; the returned value is
        // deliberately ignored because the input is not a real JPEG.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Check emptiness first, before any indexing, so an empty version
        // fails with a direct message.
        assert!(!VERSION.is_empty(), "Version should not be empty");

        // Test that version string follows semantic versioning
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}