oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
// NOTE(review): this suppresses every default Clippy lint group for the whole crate and no
// rationale is recorded here — confirm it is still required, or narrow it to specific lints.
#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
//! ```rust
//! use oxidize_pdf::{Document, Page, Font, Color, Result};
//!
//! # fn main() -> Result<()> {
//! // Create a new document
//! let mut doc = Document::new();
//! doc.set_title("My PDF");
//!
//! // Create a page
//! let mut page = Page::a4();
//!
//! // Add text
//! page.text()
//!     .set_font(Font::Helvetica, 24.0)
//!     .at(50.0, 700.0)
//!     .write("Hello, PDF!")?;
//!
//! // Add graphics
//! page.graphics()
//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
//!     .circle(300.0, 400.0, 50.0)
//!     .fill();
//!
//! // Save the document
//! doc.add_page(page);
//! doc.save("output.pdf")?;
//! # Ok(())
//! # }
//! ```
54//!
55//! ### Parsing PDFs
56//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Open and parse a PDF
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//!
//! // Get document information
//! println!("Pages: {}", document.page_count()?);
//! println!("Version: {}", document.version()?);
//!
//! // Process pages
//! for i in 0..document.page_count()? {
//!     let page = document.get_page(i)?;
//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
//! }
//!
//! // Extract text
//! let text_pages = document.extract_text()?;
//! for (i, page_text) in text_pages.iter().enumerate() {
//!     println!("Page {} text: {}", i+1, page_text.text);
//! }
//! # Ok(())
//! # }
//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
//!
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//! let page = document.get_page(0)?;
//!
//! // Get and parse content streams
//! let streams = page.content_streams_with_document(&document)?;
//! for stream in streams {
//!     let operations = ContentParser::parse(&stream)?;
//!
//!     for op in operations {
//!         match op {
//!             ContentOperation::ShowText(text) => {
//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
//!             }
//!             ContentOperation::SetFont(name, size) => {
//!                 println!("Font: {} at {} pt", name, size);
//!             }
//!             ContentOperation::MoveTo(x, y) => {
//!                 println!("Move to ({}, {})", x, y);
//!             }
//!             _ => {} // Handle other operations
//!         }
//!     }
//! }
//! # Ok(())
//! # }
//! ```
146//!
147//! ### Resource Access
148//!
//! ```rust,no_run
//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let reader = PdfReader::open("document.pdf")?;
//! let document = PdfDocument::new(reader);
//! let page = document.get_page(0)?;
//!
//! // Access page resources
//! if let Some(resources) = page.get_resources() {
//!     // Check fonts
//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
//!         for (name, _) in &fonts.0 {
//!             println!("Font resource: {}", name.as_str());
//!         }
//!     }
//!
//!     // Check images/XObjects
//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
//!         for (name, _) in &xobjects.0 {
//!             println!("XObject resource: {}", name.as_str());
//!         }
//!     }
//! }
//! # Ok(())
//! # }
//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206pub mod pdfa;
207#[cfg(feature = "performance")]
208pub mod performance;
209pub mod recovery;
210pub mod streaming;
211pub mod structure;
212pub mod templates;
213pub mod text;
214pub mod verification;
215pub mod viewer_preferences;
216pub mod writer;
217
218pub mod semantic;
219pub mod signatures;
220
221// Dashboard and reporting modules
222pub mod dashboard;
223
224// Re-export generation types
225pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
226pub use document::{Document, DocumentMetadata};
227pub use error::{OxidizePdfError, PdfError, Result};
228pub use geometry::{Point, Rectangle};
229pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
230pub use page::{Margins, Page};
231pub use page_lists::{ListStyle, ListType, PageLists};
232pub use page_tables::{PageTables, TableStyle};
233pub use text::{
234 measure_text,
235 split_into_words,
236 BulletStyle,
237 Font,
238 FontFamily,
239 FragmentType,
240 HeaderStyle,
241 ImagePreprocessing,
242 ListElement,
243 ListOptions,
244 MockOcrProvider,
245 OcrEngine,
246 OcrError,
247 OcrOptions,
248 OcrProcessingResult,
249 OcrProvider,
250 OcrResult,
251 OcrTextFragment,
252 // List exports
253 OrderedList,
254 OrderedListStyle,
255 // Table exports
256 Table,
257 TableCell,
258 TableOptions,
259 TextAlign,
260 TextContext,
261 TextFlowContext,
262 UnorderedList,
263};
264
265// Re-export forms types
266pub use forms::{
267 calculations::FieldValue,
268 field_actions::{
269 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
270 SpecialFormatType, ValidateActionType,
271 },
272 validation::{
273 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
274 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
275 },
276 BorderStyle, FieldType, TextField, Widget,
277};
278
279// Re-export font embedding types
280pub use text::fonts::embedding::{
281 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
282 FontEncoding, FontFlags, FontMetrics, FontType,
283};
284
285// Re-export font management types
286pub use text::font_manager::{CustomFont, FontManager};
287
288// Re-export parsing types
289pub use parser::{
290 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
291 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
292 PdfString,
293};
294
295// Re-export operations
296pub use operations::{
297 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
298 ExtractImagesOptions, ExtractedImage, ImageExtractor,
299};
300
301// Re-export dashboard types
302pub use dashboard::{
303 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
304 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
305};
306
307// Re-export memory optimization types
308pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
309
310// Re-export streaming types
311pub use streaming::{
312 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
313 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
314 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
315};
316
317// Re-export batch processing types
318pub use batch::{
319 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
320 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
321 ProgressCallback, ProgressInfo,
322};
323
324// Re-export recovery types
325pub use recovery::{
326 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
327 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
328 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
329};
330
331// Re-export structure types
332pub use structure::{
333 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
334 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
335};
336
337// Re-export action types
338pub use actions::{
339 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
340 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
341};
342
343// Re-export page label types
344pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
345
346// Re-export template types
347pub use templates::{
348 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
349};
350
351// Re-export semantic types for AI-Ready PDFs
352pub use semantic::{
353 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
354 RelationType, SemanticEntity, SemanticMarking,
355};
356
357// Re-export verification types
358pub use verification::comparators::{
359 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
360};
361pub use verification::compliance_report::{
362 format_report_markdown, generate_compliance_report, ComplianceReport,
363};
364pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
365pub use verification::validators::{
366 check_available_validators, validate_external, validate_with_qpdf,
367};
368pub use verification::{
369 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
370 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
371};
372
373// Re-export PDF/A compliance types
374pub use pdfa::{
375 PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
376 ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
377 ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
378};
379
/// Current version of oxidize-pdf.
///
/// The value is captured at compile time from the `CARGO_PKG_VERSION`
/// environment variable, which Cargo sets from the package manifest's
/// `version` field.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
382
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions documented as fully supported,
/// and `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Further crate examples
///
/// The two examples below illustrate other areas of the crate (scanned page
/// analysis with OCR, and font embedding). They are attached to this item so
/// that they keep being compiled as documentation tests.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                // Create font subset
///     compress_font_streams: true, // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
464
// Smoke tests for the crate root. They exercise the items re-exported from this file
// (document, page, color, font, parser, OCR and image types) plus the version constants.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the width and height it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version tables have exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not inspected here; this test only ensures
        // the setters don't panic and that setting metadata does not add any pages.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom constructors produce pages of the expected size.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), hence the tolerance
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK color constructors can be called without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color creation doesn't panic either
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard font and font-family variants are exported and constructible.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable through the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are exported and constructible.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting finds the expected words and text measurement yields a positive width.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with a standard font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related types are exported, and constructing an image from a byte
    /// buffer does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is 100 zero bytes; the outcome of the call is deliberately ignored,
        // only the absence of a panic is checked.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The crate version is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // A non-empty version is the precondition for the component checks below
        assert!(!VERSION.is_empty());

        // Only the leading major.minor components are validated; anything after them
        // is not checked.
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}