oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
13//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
14//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
15//! - **Resource Access**: Work with fonts, images, and other PDF resources
16//! - **Pure Rust**: No C dependencies or external libraries
17//! - **100% Native**: Complete PDF implementation from scratch
18//!
19//! ## Quick Start
20//!
21//! ### Creating PDFs
22//!
//! ```rust
//! use oxidize_pdf::{Document, Page, Font, Color, Result};
//!
//! # fn main() -> Result<()> {
//! // Create a new document
//! let mut doc = Document::new();
//! doc.set_title("My PDF");
//!
//! // Create a page
//! let mut page = Page::a4();
//!
//! // Add text
//! page.text()
//!     .set_font(Font::Helvetica, 24.0)
//!     .at(50.0, 700.0)
//!     .write("Hello, PDF!")?;
//!
//! // Add graphics
//! page.graphics()
//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
//!     .circle(300.0, 400.0, 50.0)
//!     .fill();
//!
//! // Save the document
//! doc.add_page(page);
//! doc.save("output.pdf")?;
//! # Ok(())
//! # }
//! ```
52//!
53//! ### Parsing PDFs
54//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Open and parse a PDF
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//!
//! // Get document information
//! println!("Pages: {}", document.page_count()?);
//! println!("Version: {}", document.version()?);
//!
//! // Process pages
//! for i in 0..document.page_count()? {
//!     let page = document.get_page(i)?;
//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
//! }
//!
//! // Extract text
//! let text_pages = document.extract_text()?;
//! for (i, page_text) in text_pages.iter().enumerate() {
//!     println!("Page {} text: {}", i+1, page_text.text);
//! }
//! # Ok(())
//! # }
//! ```
81//!
82//! ## Modules
83//!
84//! ### Generation Modules
85//! - [`document`] - PDF document creation and management
86//! - [`page`] - Page creation and layout
87//! - [`graphics`] - Vector graphics and images
88//! - [`text`] - Text rendering and flow
89//! - [`writer`] - Low-level PDF writing
90//!
91//! ### Parsing Modules
92//! - [`parser`] - Complete PDF parsing and reading
93//! - [`parser::PdfDocument`] - High-level document interface
94//! - [`parser::ParsedPage`] - Page representation with resources
95//! - [`parser::ContentParser`] - Content stream parsing
96//! - [`parser::PdfObject`] - Low-level PDF objects
97//!
98//! ### Manipulation Modules
99//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
100//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
101//! - [`text::extraction`] - Text extraction with positioning
102//!
103//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
107//!
108//! ## Examples
109//!
110//! ### Content Stream Processing
111//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//! let page = document.get_page(0)?;
//!
//! // Get and parse content streams
//! let streams = page.content_streams_with_document(&document)?;
//! for stream in streams {
//!     let operations = ContentParser::parse(&stream)?;
//!
//!     for op in operations {
//!         match op {
//!             ContentOperation::ShowText(text) => {
//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
//!             }
//!             ContentOperation::SetFont(name, size) => {
//!                 println!("Font: {} at {} pt", name, size);
//!             }
//!             ContentOperation::MoveTo(x, y) => {
//!                 println!("Move to ({}, {})", x, y);
//!             }
//!             _ => {} // Handle other operations
//!         }
//!     }
//! }
//! # Ok(())
//! # }
//! ```
144//!
145//! ### Resource Access
146//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//! let page = document.get_page(0)?;
//!
//! // Access page resources
//! if let Some(resources) = page.get_resources() {
//!     // Check fonts
//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
//!         for (name, _) in &fonts.0 {
//!             println!("Font resource: {}", name.as_str());
//!         }
//!     }
//!
//!     // Check images/XObjects
//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
//!         for (name, _) in &xobjects.0 {
//!             println!("XObject resource: {}", name.as_str());
//!         }
//!     }
//! }
//! # Ok(())
//! # }
//! ```
174
175pub mod actions;
176pub mod annotations;
177pub mod batch;
178pub mod compression;
179pub mod document;
180pub mod encryption;
181pub mod error;
182pub mod fonts;
183pub mod forms;
184pub mod geometry;
185pub mod graphics;
186pub mod memory;
187pub mod objects;
188pub mod operations;
189pub mod page;
190pub mod page_forms;
191pub mod page_labels;
192pub mod page_lists;
193pub mod page_tables;
194pub mod page_transitions;
195pub mod page_tree;
196pub mod parser;
197pub mod recovery;
198pub mod streaming;
199pub mod structure;
200pub mod text;
201pub mod verification;
202pub mod viewer_preferences;
203pub mod writer;
204
205#[cfg(feature = "semantic")]
206pub mod semantic;
207
208// Re-export generation types
209pub use document::{Document, DocumentMetadata};
210pub use error::{OxidizePdfError, PdfError, Result};
211pub use geometry::{Point, Rectangle};
212pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
213pub use page::{Margins, Page};
214pub use page_lists::{ListStyle, ListType, PageLists};
215pub use page_tables::{PageTables, TableStyle};
216pub use text::{
217 measure_text,
218 split_into_words,
219 BulletStyle,
220 Font,
221 FontFamily,
222 FragmentType,
223 HeaderStyle,
224 ImagePreprocessing,
225 ListElement,
226 ListOptions,
227 MockOcrProvider,
228 OcrEngine,
229 OcrError,
230 OcrOptions,
231 OcrProcessingResult,
232 OcrProvider,
233 OcrResult,
234 OcrTextFragment,
235 // List exports
236 OrderedList,
237 OrderedListStyle,
238 // Table exports
239 Table,
240 TableCell,
241 TableOptions,
242 TextAlign,
243 TextContext,
244 TextFlowContext,
245 UnorderedList,
246};
247
248// Re-export forms types
249pub use forms::{
250 calculations::FieldValue,
251 field_actions::{
252 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
253 SpecialFormatType, ValidateActionType,
254 },
255 validation::{
256 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
257 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
258 },
259 BorderStyle, FieldType, TextField, Widget,
260};
261
262// Re-export font embedding types
263pub use text::fonts::embedding::{
264 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
265 FontEncoding, FontFlags, FontMetrics, FontType,
266};
267
268// Re-export font management types
269pub use text::font_manager::{CustomFont, FontManager};
270
271// Re-export parsing types
272pub use parser::{
273 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
274 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
275 PdfString,
276};
277
278// Re-export operations
279pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
280
281// Re-export memory optimization types
282pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
283
284// Re-export streaming types
285pub use streaming::{
286 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
287 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
288 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
289};
290
291// Re-export batch processing types
292pub use batch::{
293 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
294 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
295 ProgressCallback, ProgressInfo,
296};
297
298// Re-export recovery types
299pub use recovery::{
300 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
301 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
302 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
303};
304
305// Re-export structure types
306pub use structure::{
307 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
308 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
309};
310
311// Re-export action types
312pub use actions::{
313 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
314 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
315};
316
317// Re-export page label types
318pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
319
320// Re-export verification types
321pub use verification::comparators::{
322 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
323};
324pub use verification::compliance_report::{
325 format_report_markdown, generate_compliance_report, ComplianceReport,
326};
327pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
328pub use verification::validators::{
329 check_available_validators, validate_external, validate_with_qpdf,
330};
331pub use verification::{
332 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
333 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
334};
335
336/// Current version of oxidize-pdf
337pub const VERSION: &str = env!("CARGO_PKG_VERSION");
338
// NOTE(review): the two examples below are unrelated to PDF versions; they are
// attached to this module's item docs and may belong in the crate-level `//!`
// documentation instead. They are kept here so no documentation is lost.

/// Supported PDF versions.
///
/// Lists the PDF specification versions the crate declares as supported
/// (`SUPPORTED_VERSIONS`) and as planned (`PLANNED_VERSIONS`).
///
/// # Examples
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                // Create font subset
///     compress_font_streams: true, // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
420
// Smoke tests for the crate root: they check that the re-exported names
// resolve, that basic constructors can be called, and that the version
// constants hold the expected values.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// A page built with explicit dimensions reports those dimensions back.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Only the page count is asserted: this test does not read the
        // metadata back, it just ensures the setters above do not panic.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom page constructors yield the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color
        let _cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
    }

    /// The standard font and font-family variants are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// Error values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let _pdf_error = PdfError::InvalidStructure("test error".to_string());

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are re-exported and constructible.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting and text measurement work on a simple ASCII sentence.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with Helvetica at 12 pt
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related types are re-exported and the JPEG constructor can be called.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is 100 zero bytes, not a real JPEG; the returned value is
        // deliberately ignored, so this only checks that the call does not panic.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The version string has at least numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Test that version is not empty
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}