oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
13//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
14//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
15//! - **Resource Access**: Work with fonts, images, and other PDF resources
16//! - **Pure Rust**: No C dependencies or external libraries
17//! - **100% Native**: Complete PDF implementation from scratch
18//!
19//! ## Quick Start
20//!
21//! ### Creating PDFs
22//!
23//! ```rust
24//! use oxidize_pdf::{Document, Page, Font, Color, Result};
25//!
26//! # fn main() -> Result<()> {
27//! // Create a new document
28//! let mut doc = Document::new();
29//! doc.set_title("My PDF");
30//!
31//! // Create a page
32//! let mut page = Page::a4();
33//!
34//! // Add text
35//! page.text()
36//! .set_font(Font::Helvetica, 24.0)
37//! .at(50.0, 700.0)
38//! .write("Hello, PDF!")?;
39//!
40//! // Add graphics
41//! page.graphics()
42//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
43//! .circle(300.0, 400.0, 50.0)
44//! .fill();
45//!
46//! // Save the document
47//! doc.add_page(page);
48//! doc.save("output.pdf")?;
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ### Parsing PDFs
54//!
55//! ```rust,no_run
56//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
57//!
58//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
59//! // Open and parse a PDF
60//! let reader = PdfReader::open("document.pdf")?;
61//! let document = PdfDocument::new(reader);
62//!
63//! // Get document information
64//! println!("Pages: {}", document.page_count()?);
65//! println!("Version: {}", document.version()?);
66//!
67//! // Process pages
68//! for i in 0..document.page_count()? {
69//! let page = document.get_page(i)?;
70//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
71//! }
72//!
73//! // Extract text
74//! let text_pages = document.extract_text()?;
75//! for (i, page_text) in text_pages.iter().enumerate() {
76//! println!("Page {} text: {}", i+1, page_text.text);
77//! }
78//! # Ok(())
79//! # }
80//! ```
81//!
82//! ## Modules
83//!
84//! ### Generation Modules
85//! - [`document`] - PDF document creation and management
86//! - [`page`] - Page creation and layout
87//! - [`graphics`] - Vector graphics and images
88//! - [`text`] - Text rendering and flow
89//! - [`writer`] - Low-level PDF writing
90//!
91//! ### Parsing Modules
92//! - [`parser`] - Complete PDF parsing and reading
93//! - [`parser::PdfDocument`] - High-level document interface
94//! - [`parser::ParsedPage`] - Page representation with resources
95//! - [`parser::ContentParser`] - Content stream parsing
96//! - [`parser::PdfObject`] - Low-level PDF objects
97//!
98//! ### Manipulation Modules
99//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
100//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
101//! - [`text::extraction`] - Text extraction with positioning
102//!
103//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
107//!
108//! ## Examples
109//!
110//! ### Content Stream Processing
111//!
112//! ```rust,no_run
113//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
114//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
115//!
116//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
117//! let reader = PdfReader::open("document.pdf")?;
118//! let document = PdfDocument::new(reader);
119//! let page = document.get_page(0)?;
120//!
121//! // Get and parse content streams
122//! let streams = page.content_streams_with_document(&document)?;
123//! for stream in streams {
124//! let operations = ContentParser::parse(&stream)?;
125//!
126//! for op in operations {
127//! match op {
128//! ContentOperation::ShowText(text) => {
129//! println!("Text: {:?}", String::from_utf8_lossy(&text));
130//! }
131//! ContentOperation::SetFont(name, size) => {
132//! println!("Font: {} at {} pt", name, size);
133//! }
134//! ContentOperation::MoveTo(x, y) => {
135//! println!("Move to ({}, {})", x, y);
136//! }
137//! _ => {} // Handle other operations
138//! }
139//! }
140//! }
141//! # Ok(())
142//! # }
143//! ```
144//!
145//! ### Resource Access
146//!
147//! ```rust,no_run
148//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
149//!
150//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
151//! let reader = PdfReader::open("document.pdf")?;
152//! let document = PdfDocument::new(reader);
153//! let page = document.get_page(0)?;
154//!
155//! // Access page resources
156//! if let Some(resources) = page.get_resources() {
157//! // Check fonts
158//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
159//! for (name, _) in &fonts.0 {
160//! println!("Font resource: {}", name.as_str());
161//! }
162//! }
163//!
164//! // Check images/XObjects
165//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
166//! for (name, _) in &xobjects.0 {
167//! println!("XObject resource: {}", name.as_str());
168//! }
169//! }
170//! }
171//! # Ok(())
172//! # }
173//! ```
174
175pub mod actions;
176pub mod annotations;
177pub mod batch;
178pub mod compression;
179pub mod document;
180pub mod encryption;
181pub mod error;
182pub mod fonts;
183pub mod forms;
184pub mod geometry;
185pub mod graphics;
186pub mod memory;
187pub mod objects;
188pub mod operations;
189pub mod page;
190pub mod page_forms;
191pub mod page_labels;
192pub mod page_lists;
193pub mod page_tables;
194pub mod parser;
195pub mod recovery;
196pub mod streaming;
197pub mod structure;
198pub mod text;
199pub mod writer;
200
201#[cfg(feature = "semantic")]
202pub mod semantic;
203
204// Re-export generation types
205pub use document::{Document, DocumentMetadata};
206pub use error::{OxidizePdfError, PdfError, Result};
207pub use geometry::{Point, Rectangle};
208pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
209pub use page::{Margins, Page};
210pub use page_lists::{ListStyle, ListType, PageLists};
211pub use page_tables::{PageTables, TableStyle};
212pub use text::{
213 measure_text,
214 split_into_words,
215 AdvancedTable,
216 AdvancedTableCell,
217 AdvancedTableOptions,
218 AlternatingRowColors,
219 BorderLine,
220 BorderStyle as TableBorderStyle,
221 BulletStyle,
222 CellContent,
223 CellPadding,
224 ColumnDefinition,
225 ColumnWidth,
226 Font,
227 FontFamily,
228 FragmentType,
229 HeaderStyle,
230 ImagePreprocessing,
231 LineStyle,
232 ListElement,
233 ListOptions,
234 MockOcrProvider,
235 OcrEngine,
236 OcrError,
237 OcrOptions,
238 OcrProcessingResult,
239 OcrProvider,
240 OcrResult,
241 OcrTextFragment,
242 // List exports
243 OrderedList,
244 OrderedListStyle,
245 // Table exports
246 Table,
247 TableCell,
248 TableOptions,
249 TableRow,
250 TextAlign,
251 TextContext,
252 TextFlowContext,
253 UnorderedList,
254 VerticalAlign,
255};
256
257// Re-export font embedding types
258pub use text::fonts::embedding::{
259 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
260 FontEncoding, FontFlags, FontMetrics, FontType,
261};
262
263// Re-export parsing types
264pub use parser::{
265 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
266 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
267 PdfString,
268};
269
270// Re-export operations
271pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
272
273// Re-export memory optimization types
274pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
275
276// Re-export streaming types
277pub use streaming::{
278 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
279 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
280 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
281};
282
283// Re-export batch processing types
284pub use batch::{
285 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
286 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
287 ProgressCallback, ProgressInfo,
288};
289
290// Re-export recovery types
291pub use recovery::{
292 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
293 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
294 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
295};
296
297// Re-export structure types
298pub use structure::{
299 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
300 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
301};
302
303// Re-export action types
304pub use actions::{
305 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
306 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
307};
308
309// Re-export page label types
310pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
311
/// Current version of oxidize-pdf.
///
/// The value is captured at compile time by `env!` from the
/// `CARGO_PKG_VERSION` environment variable, so it is only available when
/// the crate is built through Cargo (which sets that variable).
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
314
/// Supported PDF versions.
///
/// Exposes two constant tables: `SUPPORTED_VERSIONS`, the PDF specification
/// versions handled today, and `PLANNED_VERSIONS`, the versions that are
/// planned but not yet supported.
///
/// NOTE(review): the examples below illustrate scanned-page analysis/OCR and
/// font embedding, not anything defined in this module. They appear to belong
/// in the crate-level documentation - confirm before moving them.
///
/// # Examples
///
/// ### Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                 // Create font subset
///     compress_font_streams: true,  // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
396
/// Smoke tests for the crate root: they exercise the re-exported types and
/// the version constants declared in this file.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed document holds no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// Every PDF version from 1.0 to 1.7 is supported and 2.0 is the only planned one.
    #[test]
    fn test_pdf_version_constants() {
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {} to be supported",
                version
            );
        }

        // Together with the loop above this rules out any extra entries.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The stored metadata is not read back here, so this only proves that
        // the setters do not panic and do not add pages as a side effect.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom constructors yield the expected page sizes.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), hence the tolerance.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions are passed through unchanged.
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Construction is the whole test; the values are not inspected.
        let _colors = [red, green, blue, black, white];

        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        let _fonts = [helvetica, times, courier];

        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and flow through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related re-exports can be constructed.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` returns the input's words and `measure_text` a positive width.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image re-exports are usable and `Image::from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ImageColorSpace::DeviceRGB;

        // 100 zero bytes are not a real JPEG; the returned value is deliberately
        // ignored, so this only checks that the call itself does not panic.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Checked once, before any indexing, so an empty version fails with the
        // clearest message.
        assert!(!VERSION.is_empty(), "Version string should not be empty");

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // The length check above makes indices 0 and 1 safe.
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
607}