oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
//! for stream in streams {
//!     let operations = ContentParser::parse(&stream)?;
//!
//!     for op in operations {
//!         match op {
//!             ContentOperation::ShowText(text) => {
//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
//!             }
//!             ContentOperation::SetFont(name, size) => {
//!                 println!("Font: {} at {} pt", name, size);
//!             }
//!             ContentOperation::MoveTo(x, y) => {
//!                 println!("Move to ({}, {})", x, y);
//!             }
//!             _ => {} // Handle other operations
//!         }
//!     }
//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
//! if let Some(resources) = page.get_resources() {
//!     // Check fonts
//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
//!         for (name, _) in &fonts.0 {
//!             println!("Font resource: {}", name.as_str());
//!         }
//!     }
//!
//!     // Check images/XObjects
//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
//!         for (name, _) in &xobjects.0 {
//!             println!("XObject resource: {}", name.as_str());
//!         }
//!     }
//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206pub mod pdfa;
207#[cfg(feature = "performance")]
208pub mod performance;
209pub mod pipeline;
210pub mod recovery;
211pub mod streaming;
212pub mod structure;
213pub mod templates;
214pub mod text;
215pub mod verification;
216pub mod viewer_preferences;
217pub mod writer;
218
219pub mod semantic;
220pub mod signatures;
221
222// Dashboard and reporting modules
223pub mod dashboard;
224
225// Re-export generation types
226pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
227pub use document::{Document, DocumentMetadata};
228pub use error::{OxidizePdfError, PdfError, Result};
229pub use geometry::{Point, Rectangle};
230pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
231pub use page::{Margins, Page};
232pub use page_lists::{ListStyle, ListType, PageLists};
233pub use page_tables::{PageTables, TableStyle};
234pub use text::{
235 measure_text,
236 split_into_words,
237 BulletStyle,
238 Font,
239 FontFamily,
240 FragmentType,
241 HeaderStyle,
242 ImagePreprocessing,
243 ListElement,
244 ListOptions,
245 MockOcrProvider,
246 OcrEngine,
247 OcrError,
248 OcrOptions,
249 OcrProcessingResult,
250 OcrProvider,
251 OcrResult,
252 OcrTextFragment,
253 // List exports
254 OrderedList,
255 OrderedListStyle,
256 // Table exports
257 Table,
258 TableCell,
259 TableOptions,
260 TextAlign,
261 TextContext,
262 TextFlowContext,
263 UnorderedList,
264};
265
266// Re-export forms types
267pub use forms::{
268 calculations::FieldValue,
269 field_actions::{
270 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
271 SpecialFormatType, ValidateActionType,
272 },
273 validation::{
274 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
275 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
276 },
277 BorderStyle, FieldType, TextField, Widget,
278};
279
280// Re-export font embedding types
281pub use text::fonts::embedding::{
282 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
283 FontEncoding, FontFlags, FontMetrics, FontType,
284};
285
286// Re-export font management types
287pub use text::font_manager::{CustomFont, FontManager};
288
289// Re-export parsing types
290pub use parser::{
291 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
292 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
293 PdfString,
294};
295
296// Re-export operations
297pub use operations::{
298 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
299 ExtractImagesOptions, ExtractedImage, ImageExtractor,
300};
301
302// Re-export dashboard types
303pub use dashboard::{
304 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
305 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
306};
307
308// Re-export memory optimization types
309pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
310
311// Re-export streaming types
312pub use streaming::{
313 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
314 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
315 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
316};
317
318// Re-export batch processing types
319pub use batch::{
320 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
321 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
322 ProgressCallback, ProgressInfo,
323};
324
325// Re-export recovery types
326pub use recovery::{
327 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
328 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
329 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
330};
331
332// Re-export structure types
333pub use structure::{
334 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
335 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
336};
337
338// Re-export action types
339pub use actions::{
340 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
341 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
342};
343
344// Re-export page label types
345pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
346
347// Re-export template types
348pub use templates::{
349 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
350};
351
352// Re-export semantic types for AI-Ready PDFs
353pub use semantic::{
354 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
355 RelationType, SemanticEntity, SemanticMarking,
356};
357
358// Re-export verification types
359pub use verification::comparators::{
360 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
361};
362pub use verification::compliance_report::{
363 format_report_markdown, generate_compliance_report, ComplianceReport,
364};
365pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
366pub use verification::validators::{
367 check_available_validators, validate_external, validate_with_qpdf,
368};
369pub use verification::{
370 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
371 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
372};
373
374// Re-export PDF/A compliance types
375pub use pdfa::{
376 PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
377 ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
378 ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
379};
380
381/// Current version of oxidize-pdf
382pub const VERSION: &str = env!("CARGO_PKG_VERSION");
383
/// Supported PDF versions
///
/// The two examples below are general crate usage samples (scanned-page OCR and font
/// embedding); they do not use the constants defined in this module.
///
/// ### Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                 // Create font subset
///     compress_font_streams: true,  // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF specification versions 1.0 through 1.7, which are fully supported.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

    /// PDF specification versions for which support is planned (currently only 2.0).
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
465
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root.
    //!
    //! These tests exercise the items re-exported from this file (via `use super::*`)
    //! and the constants defined here. Most of them only check that construction
    //! succeeds without panicking.

    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the width and height it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version lists contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages.
        assert_eq!(doc.pages.len(), 0);
        // Note: the metadata values themselves are not asserted here; this test
        // only ensures the setters don't panic.
    }

    /// The A4, letter and custom page constructors produce the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color (if available)
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard `Font` and `FontFamily` variants are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation, parsing and layout types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test operation types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types re-exported at the crate root can be constructed.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` yields the input words and `measure_text` returns a positive width.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement (with mock font)
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are reachable and building an image from raw bytes does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic; the returned value is deliberately ignored.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Checked once, up front, before the string is split and indexed.
        assert!(!VERSION.is_empty());

        // Test that version string follows semantic versioning
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
675}