oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
13//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
14//! - **Resource Access**: Work with fonts, images, and other PDF resources
15//! - **Pure Rust**: No C dependencies or external libraries
16//! - **100% Native**: Complete PDF implementation from scratch
17//!
18//! ## Quick Start
19//!
20//! ### Creating PDFs
21//!
22//! ```rust
23//! use oxidize_pdf::{Document, Page, Font, Color, Result};
24//!
25//! # fn main() -> Result<()> {
26//! // Create a new document
27//! let mut doc = Document::new();
28//! doc.set_title("My PDF");
29//!
30//! // Create a page
31//! let mut page = Page::a4();
32//!
33//! // Add text
34//! page.text()
35//!     .set_font(Font::Helvetica, 24.0)
36//!     .at(50.0, 700.0)
37//!     .write("Hello, PDF!")?;
38//!
39//! // Add graphics
40//! page.graphics()
41//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
42//!     .circle(300.0, 400.0, 50.0)
43//!     .fill();
44//!
45//! // Save the document
46//! doc.add_page(page);
47//! doc.save("output.pdf")?;
48//! # Ok(())
49//! # }
50//! ```
51//!
52//! ### Parsing PDFs
53//!
54//! ```rust,no_run
55//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
56//!
57//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
58//! // Open and parse a PDF
59//! let reader = PdfReader::open("document.pdf")?;
60//! let document = PdfDocument::new(reader);
61//!
62//! // Get document information
63//! println!("Pages: {}", document.page_count()?);
64//! println!("Version: {}", document.version()?);
65//!
66//! // Process pages
67//! for i in 0..document.page_count()? {
68//!     let page = document.get_page(i)?;
69//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
70//! }
71//!
72//! // Extract text
73//! let text_pages = document.extract_text()?;
74//! for (i, page_text) in text_pages.iter().enumerate() {
75//!     println!("Page {} text: {}", i+1, page_text.text);
76//! }
77//! # Ok(())
78//! # }
79//! ```
80//!
81//! ## Modules
82//!
83//! ### Generation Modules
84//! - [`document`] - PDF document creation and management
85//! - [`page`] - Page creation and layout
86//! - [`graphics`] - Vector graphics and images
87//! - [`text`] - Text rendering and flow
88//! - [`writer`] - Low-level PDF writing
89//!
90//! ### Parsing Modules
91//! - [`parser`] - Complete PDF parsing and reading
92//!   - [`parser::PdfDocument`] - High-level document interface
93//!   - [`parser::ParsedPage`] - Page representation with resources
94//!   - [`parser::ContentParser`] - Content stream parsing
95//!   - [`parser::PdfObject`] - Low-level PDF objects
96//!
97//! ### Manipulation Modules
98//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
99//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
100//! - [`text::extraction`] - Text extraction with positioning
101//!
102//! ### OCR Modules (v0.1.3+)
103//! - [`text::ocr`] - OCR trait system and types; the integration point for
104//!   OCR on scanned documents
105//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
106//!
107//! ## Examples
108//!
109//! ### Content Stream Processing
110//!
111//! ```rust,no_run
112//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
113//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
114//!
115//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
116//! let reader = PdfReader::open("document.pdf")?;
117//! let document = PdfDocument::new(reader);
118//! let page = document.get_page(0)?;
119//!
120//! // Get and parse content streams
121//! let streams = page.content_streams_with_document(&document)?;
122//! for stream in streams {
123//!     let operations = ContentParser::parse(&stream)?;
124//!     
125//!     for op in operations {
126//!         match op {
127//!             ContentOperation::ShowText(text) => {
128//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
129//!             }
130//!             ContentOperation::SetFont(name, size) => {
131//!                 println!("Font: {} at {} pt", name, size);
132//!             }
133//!             ContentOperation::MoveTo(x, y) => {
134//!                 println!("Move to ({}, {})", x, y);
135//!             }
136//!             _ => {} // Handle other operations
137//!         }
138//!     }
139//! }
140//! # Ok(())
141//! # }
142//! ```
143//!
144//! ### Resource Access
145//!
146//! ```rust,no_run
147//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
148//!
149//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
150//! let reader = PdfReader::open("document.pdf")?;
151//! let document = PdfDocument::new(reader);
152//! let page = document.get_page(0)?;
153//!
154//! // Access page resources
155//! if let Some(resources) = page.get_resources() {
156//!     // Check fonts
157//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
158//!         for (name, _) in &fonts.0 {
159//!             println!("Font resource: {}", name.as_str());
160//!         }
161//!     }
162//!     
163//!     // Check images/XObjects
164//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
165//!         for (name, _) in &xobjects.0 {
166//!             println!("XObject resource: {}", name.as_str());
167//!         }
168//!     }
169//! }
170//! # Ok(())
171//! # }
172//! ```
173
174pub mod document;
175pub mod error;
176pub mod graphics;
177pub mod objects;
178pub mod operations;
179pub mod page;
180pub mod parser;
181pub mod text;
182pub mod writer;
183
184#[cfg(feature = "semantic")]
185pub mod semantic;
186
187// Re-export generation types
188pub use document::{Document, DocumentMetadata};
189pub use error::{OxidizePdfError, PdfError, Result};
190pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
191pub use page::{Margins, Page};
192pub use text::{
193    measure_text, split_into_words, Font, FontFamily, FragmentType, ImagePreprocessing,
194    MockOcrProvider, OcrEngine, OcrError, OcrOptions, OcrProcessingResult, OcrProvider, OcrResult,
195    OcrTextFragment, TextAlign, TextContext, TextFlowContext,
196};
197
198// Re-export parsing types
199pub use parser::{
200    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParsedPage,
201    PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream, PdfString,
202};
203
204// Re-export operations
205pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
206
207/// Current version of oxidize-pdf, read from `CARGO_PKG_VERSION` at compile time
208pub const VERSION: &str = env!("CARGO_PKG_VERSION");
209
/// Supported PDF versions
///
/// `SUPPORTED_VERSIONS` lists the PDF versions documented as fully supported;
/// `PLANNED_VERSIONS` lists the versions whose support is planned.
///
/// # Example: scanned page analysis and OCR
///
/// This example does not use the version constants in this module; it shows
/// how page analysis and OCR are combined.
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
253
#[cfg(test)]
mod tests {
    //! Smoke tests for the items declared or re-exported at the crate root.

    use super::*;

    /// A freshly created document has no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the width and height it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists hold exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        // Every expected version must be present...
        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {} to be supported",
                version
            );
        }

        // ...and the list must have exactly 8 entries, i.e. no extra ones.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Planned versions: only 2.0.
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // No metadata getters are used here, so the stored values are NOT
        // verified; this only checks that the setters run and add no pages.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom constructors yield the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 is 595.276 x 841.89 points; compared against rounded values with
        // a tolerance.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions are reported back unchanged.
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let _rgb_colors = [
            Color::rgb(1.0, 0.0, 0.0), // red
            Color::rgb(0.0, 1.0, 0.0), // green
            Color::rgb(0.0, 0.0, 1.0), // blue
            Color::rgb(0.0, 0.0, 0.0), // black
            Color::rgb(1.0, 1.0, 1.0), // white
        ];

        // CMYK construction is exercised as well (pure cyan).
        let _cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
    }

    /// The font and font-family variants used here are reachable from the root.
    #[test]
    fn test_font_types() {
        let _fonts = [Font::Helvetica, Font::TimesRoman, Font::Courier];

        let _families = [
            FontFamily::Helvetica,
            FontFamily::Times,
            FontFamily::Courier,
        ];
    }

    /// `PdfError` values can be built and carried in the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let _pdf_error = PdfError::InvalidStructure("test error".to_string());

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation, parsing and layout types are re-exported at the root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types can be constructed from the crate root.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Fragment and preprocessing types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting finds the expected words and measured text has positive width.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";

        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are exported and `Image::from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ImageColorSpace::DeviceRGB;

        // 100 zero bytes are passed as data; the returned value is
        // deliberately ignored, so only "does not panic" is checked here.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The version string starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Only major and minor are checked; later components are not parsed.
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}