oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
13//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
14//! - **Resource Access**: Work with fonts, images, and other PDF resources
//! - **Pure Rust**: No C dependencies or external libraries in the core library (the Tesseract OCR provider is behind the optional `ocr-tesseract` feature)
16//! - **100% Native**: Complete PDF implementation from scratch
17//!
18//! ## Quick Start
19//!
20//! ### Creating PDFs
21//!
22//! ```rust
23//! use oxidize_pdf::{Document, Page, Font, Color, Result};
24//!
25//! # fn main() -> Result<()> {
26//! // Create a new document
27//! let mut doc = Document::new();
28//! doc.set_title("My PDF");
29//!
30//! // Create a page
31//! let mut page = Page::a4();
32//!
33//! // Add text
34//! page.text()
35//!     .set_font(Font::Helvetica, 24.0)
36//!     .at(50.0, 700.0)
37//!     .write("Hello, PDF!")?;
38//!
39//! // Add graphics
40//! page.graphics()
41//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
42//!     .circle(300.0, 400.0, 50.0)
43//!     .fill();
44//!
45//! // Save the document
46//! doc.add_page(page);
47//! doc.save("output.pdf")?;
48//! # Ok(())
49//! # }
50//! ```
51//!
52//! ### Parsing PDFs
53//!
54//! ```rust,no_run
55//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
56//!
57//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
58//! // Open and parse a PDF
59//! let reader = PdfReader::open("document.pdf")?;
60//! let document = PdfDocument::new(reader);
61//!
62//! // Get document information
63//! println!("Pages: {}", document.page_count()?);
64//! println!("Version: {}", document.version()?);
65//!
66//! // Process pages
67//! for i in 0..document.page_count()? {
68//!     let page = document.get_page(i)?;
69//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
70//! }
71//!
72//! // Extract text
73//! let text_pages = document.extract_text()?;
74//! for (i, page_text) in text_pages.iter().enumerate() {
75//!     println!("Page {} text: {}", i+1, page_text.text);
76//! }
77//! # Ok(())
78//! # }
79//! ```
80//!
81//! ## Modules
82//!
83//! ### Generation Modules
84//! - [`document`] - PDF document creation and management
85//! - [`page`] - Page creation and layout
86//! - [`graphics`] - Vector graphics and images
87//! - [`text`] - Text rendering and flow
88//! - [`writer`] - Low-level PDF writing
89//!
90//! ### Parsing Modules
91//! - [`parser`] - Complete PDF parsing and reading
92//!   - [`parser::PdfDocument`] - High-level document interface
93//!   - [`parser::ParsedPage`] - Page representation with resources
94//!   - [`parser::ContentParser`] - Content stream parsing
95//!   - [`parser::PdfObject`] - Low-level PDF objects
96//!
97//! ### Manipulation Modules
98//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
99//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
100//! - [`text::extraction`] - Text extraction with positioning
101//!
102//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
106//!
107//! ## Examples
108//!
109//! ### Content Stream Processing
110//!
111//! ```rust,no_run
112//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
113//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
114//!
115//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
116//! let reader = PdfReader::open("document.pdf")?;
117//! let document = PdfDocument::new(reader);
118//! let page = document.get_page(0)?;
119//!
120//! // Get and parse content streams
121//! let streams = page.content_streams_with_document(&document)?;
122//! for stream in streams {
123//!     let operations = ContentParser::parse(&stream)?;
124//!     
125//!     for op in operations {
126//!         match op {
127//!             ContentOperation::ShowText(text) => {
128//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
129//!             }
130//!             ContentOperation::SetFont(name, size) => {
131//!                 println!("Font: {} at {} pt", name, size);
132//!             }
133//!             ContentOperation::MoveTo(x, y) => {
134//!                 println!("Move to ({}, {})", x, y);
135//!             }
136//!             _ => {} // Handle other operations
137//!         }
138//!     }
139//! }
140//! # Ok(())
141//! # }
142//! ```
143//!
144//! ### Resource Access
145//!
146//! ```rust,no_run
147//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
148//!
149//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
150//! let reader = PdfReader::open("document.pdf")?;
151//! let document = PdfDocument::new(reader);
152//! let page = document.get_page(0)?;
153//!
154//! // Access page resources
155//! if let Some(resources) = page.get_resources() {
156//!     // Check fonts
157//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
158//!         for (name, _) in &fonts.0 {
159//!             println!("Font resource: {}", name.as_str());
160//!         }
161//!     }
162//!     
163//!     // Check images/XObjects
164//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
165//!         for (name, _) in &xobjects.0 {
166//!             println!("XObject resource: {}", name.as_str());
167//!         }
168//!     }
169//! }
170//! # Ok(())
171//! # }
172//! ```
173
174pub mod batch;
175pub mod document;
176pub mod error;
177pub mod graphics;
178pub mod memory;
179pub mod objects;
180pub mod operations;
181pub mod page;
182pub mod parser;
183pub mod recovery;
184pub mod streaming;
185pub mod text;
186pub mod writer;
187
188#[cfg(feature = "semantic")]
189pub mod semantic;
190
191// Re-export generation types
192pub use document::{Document, DocumentMetadata};
193pub use error::{OxidizePdfError, PdfError, Result};
194pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
195pub use page::{Margins, Page};
196pub use text::{
197    measure_text, split_into_words, Font, FontFamily, FragmentType, ImagePreprocessing,
198    MockOcrProvider, OcrEngine, OcrError, OcrOptions, OcrProcessingResult, OcrProvider, OcrResult,
199    OcrTextFragment, TextAlign, TextContext, TextFlowContext,
200};
201
202// Re-export parsing types
203pub use parser::{
204    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParsedPage,
205    PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream, PdfString,
206};
207
208// Re-export operations
209pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
210
211// Re-export memory optimization types
212pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
213
214// Re-export streaming types
215pub use streaming::{
216    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
217    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
218    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
219};
220
221// Re-export batch processing types
222pub use batch::{
223    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
224    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
225    ProgressCallback, ProgressInfo,
226};
227
228// Re-export recovery types
229pub use recovery::{
230    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
231    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
232    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
233};
234
/// Current version of oxidize-pdf
///
/// The value is baked in at compile time by `env!` from the `CARGO_PKG_VERSION`
/// environment variable, which Cargo sets from the package manifest's version.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
237
// NOTE(review): the OCR example below is unrelated to PDF versions. It is kept on
// this item so it keeps being compiled as a doctest; consider moving it next to
// `operations::page_analysis`.
/// Supported PDF versions
///
/// Exposes the PDF specification versions handled by this crate as string
/// constants: `SUPPORTED_VERSIONS` for versions that are fully supported and
/// `PLANNED_VERSIONS` for versions whose support is planned.
///
/// # Example: scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
281
// Smoke tests for the crate root: they exercise the re-exported types and the
// constants defined in this file, mostly checking that construction does not panic.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document holds no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// A page reports back the dimensions it was created with.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {} to be supported",
                version
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on an empty document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages. The metadata values themselves
        // are not asserted here; this test only ensures the setters don't panic.
        assert_eq!(doc.pages.len(), 0);
    }

    /// Each page constructor yields the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The font and font-family variants used here are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related re-exports can be constructed.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting finds the expected words and text measurement is positive.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with a built-in font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image enums are reachable and building an image from raw bytes does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ImageColorSpace::DeviceRGB;

        // The bytes are not a real JPEG; the returned value is deliberately
        // ignored because only the absence of a panic is checked here.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The version string is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Check emptiness once, before the components are indexed below.
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}