oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
13//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
14//! - **Resource Access**: Work with fonts, images, and other PDF resources
15//! - **Pure Rust**: No C dependencies or external libraries
16//! - **100% Native**: Complete PDF implementation from scratch
17//!
18//! ## Quick Start
19//!
20//! ### Creating PDFs
21//!
22//! ```rust
23//! use oxidize_pdf::{Document, Page, Font, Color, Result};
24//!
25//! # fn main() -> Result<()> {
26//! // Create a new document
27//! let mut doc = Document::new();
28//! doc.set_title("My PDF");
29//!
30//! // Create a page
31//! let mut page = Page::a4();
32//!
33//! // Add text
34//! page.text()
35//! .set_font(Font::Helvetica, 24.0)
36//! .at(50.0, 700.0)
37//! .write("Hello, PDF!")?;
38//!
39//! // Add graphics
40//! page.graphics()
41//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
42//! .circle(300.0, 400.0, 50.0)
43//! .fill();
44//!
45//! // Save the document
46//! doc.add_page(page);
47//! doc.save("output.pdf")?;
48//! # Ok(())
49//! # }
50//! ```
51//!
52//! ### Parsing PDFs
53//!
54//! ```rust,no_run
55//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
56//!
57//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
58//! // Open and parse a PDF
59//! let reader = PdfReader::open("document.pdf")?;
60//! let document = PdfDocument::new(reader);
61//!
62//! // Get document information
63//! println!("Pages: {}", document.page_count()?);
64//! println!("Version: {}", document.version()?);
65//!
66//! // Process pages
67//! for i in 0..document.page_count()? {
68//! let page = document.get_page(i)?;
69//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
70//! }
71//!
72//! // Extract text
73//! let text_pages = document.extract_text()?;
74//! for (i, page_text) in text_pages.iter().enumerate() {
75//! println!("Page {} text: {}", i+1, page_text.text);
76//! }
77//! # Ok(())
78//! # }
79//! ```
80//!
81//! ## Modules
82//!
83//! ### Generation Modules
84//! - [`document`] - PDF document creation and management
85//! - [`page`] - Page creation and layout
86//! - [`graphics`] - Vector graphics and images
87//! - [`text`] - Text rendering and flow
88//! - [`writer`] - Low-level PDF writing
89//!
90//! ### Parsing Modules
91//! - [`parser`] - Complete PDF parsing and reading
92//! - [`parser::PdfDocument`] - High-level document interface
93//! - [`parser::ParsedPage`] - Page representation with resources
94//! - [`parser::ContentParser`] - Content stream parsing
95//! - [`parser::PdfObject`] - Low-level PDF objects
96//!
97//! ### Manipulation Modules
98//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
99//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
100//! - [`text::extraction`] - Text extraction with positioning
101//!
102//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
106//!
107//! ## Examples
108//!
109//! ### Content Stream Processing
110//!
111//! ```rust,no_run
112//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
113//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
114//!
115//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
116//! let reader = PdfReader::open("document.pdf")?;
117//! let document = PdfDocument::new(reader);
118//! let page = document.get_page(0)?;
119//!
120//! // Get and parse content streams
121//! let streams = page.content_streams_with_document(&document)?;
122//! for stream in streams {
123//! let operations = ContentParser::parse(&stream)?;
124//!
125//! for op in operations {
126//! match op {
127//! ContentOperation::ShowText(text) => {
128//! println!("Text: {:?}", String::from_utf8_lossy(&text));
129//! }
130//! ContentOperation::SetFont(name, size) => {
131//! println!("Font: {} at {} pt", name, size);
132//! }
133//! ContentOperation::MoveTo(x, y) => {
134//! println!("Move to ({}, {})", x, y);
135//! }
136//! _ => {} // Handle other operations
137//! }
138//! }
139//! }
140//! # Ok(())
141//! # }
142//! ```
143//!
144//! ### Resource Access
145//!
146//! ```rust,no_run
147//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
148//!
149//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
150//! let reader = PdfReader::open("document.pdf")?;
151//! let document = PdfDocument::new(reader);
152//! let page = document.get_page(0)?;
153//!
154//! // Access page resources
155//! if let Some(resources) = page.get_resources() {
156//! // Check fonts
157//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
158//! for (name, _) in &fonts.0 {
159//! println!("Font resource: {}", name.as_str());
160//! }
161//! }
162//!
163//! // Check images/XObjects
164//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
165//! for (name, _) in &xobjects.0 {
166//! println!("XObject resource: {}", name.as_str());
167//! }
168//! }
169//! }
170//! # Ok(())
171//! # }
172//! ```
173
174pub mod document;
175pub mod error;
176pub mod graphics;
177pub mod objects;
178pub mod operations;
179pub mod page;
180pub mod parser;
181pub mod text;
182pub mod writer;
183
184#[cfg(feature = "semantic")]
185pub mod semantic;
186
187// Re-export generation types
188pub use document::{Document, DocumentMetadata};
189pub use error::{OxidizePdfError, PdfError, Result};
190pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
191pub use page::{Margins, Page};
192pub use text::{
193 measure_text, split_into_words, Font, FontFamily, FragmentType, ImagePreprocessing,
194 MockOcrProvider, OcrEngine, OcrError, OcrOptions, OcrProcessingResult, OcrProvider, OcrResult,
195 OcrTextFragment, TextAlign, TextContext, TextFlowContext,
196};
197
198// Re-export parsing types
199pub use parser::{
200 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParsedPage,
201 PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream, PdfString,
202};
203
204// Re-export operations
205pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
206
/// Current version of oxidize-pdf
///
/// The value is read at compile time from the `CARGO_PKG_VERSION` environment
/// variable via `env!`, so building this crate outside of Cargo (where that
/// variable is not set) fails to compile.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
209
/// Supported PDF versions
///
/// `SUPPORTED_VERSIONS` lists the PDF specification versions that are fully
/// supported, and `PLANNED_VERSIONS` lists the versions whose support is
/// planned.
///
/// # Scanned page analysis and OCR example
///
/// The example below is not specific to this module; it shows the crate's
/// scanned-page detection and OCR workflow.
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
253
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root: version constants, re-exported types,
    //! and basic construction of the public generation/parsing/OCR types.

    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the width and height it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists hold exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {} to be supported",
                version
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages.
        assert_eq!(doc.pages.len(), 0);
        // Note: the metadata values themselves are not checked here because
        // this test has no getters to read them back; it only ensures the
        // setters don't panic.
    }

    /// The A4, Letter, and custom page constructors produce the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let _colors = [
            Color::rgb(1.0, 0.0, 0.0), // red
            Color::rgb(0.0, 1.0, 0.0), // green
            Color::rgb(0.0, 0.0, 1.0), // blue
            Color::rgb(0.0, 0.0, 0.0), // black
            Color::rgb(1.0, 1.0, 1.0), // white
        ];

        let _cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
    }

    /// The basic `Font` and `FontFamily` variants are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let _fonts = [Font::Helvetica, Font::TimesRoman, Font::Courier];

        let _families = [
            FontFamily::Helvetica,
            FontFamily::Times,
            FontFamily::Courier,
        ];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let _pdf_error = PdfError::InvalidStructure("test error".to_string());

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation, parsing, and layout types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are re-exported and constructible.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` finds the words of a sentence and `measure_text`
    /// reports a positive width for it.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image enums are re-exported and `Image::from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ImageColorSpace::DeviceRGB;

        // The buffer is 100 zero bytes, not a real JPEG; the return value is
        // discarded, so only the absence of a panic is checked.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
447}