oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
106//! - [`text::ocr`] - OCR trait system and types
107//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
108//! - [`text::ocr`] - OCR integration for scanned documents
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//! let operations = ContentParser::parse(&stream)?;
127//!
128//! for op in operations {
129//! match op {
130//! ContentOperation::ShowText(text) => {
131//! println!("Text: {:?}", String::from_utf8_lossy(&text));
132//! }
133//! ContentOperation::SetFont(name, size) => {
134//! println!("Font: {} at {} pt", name, size);
135//! }
136//! ContentOperation::MoveTo(x, y) => {
137//! println!("Move to ({}, {})", x, y);
138//! }
139//! _ => {} // Handle other operations
140//! }
141//! }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//! // Check fonts
160//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//! for (name, _) in &fonts.0 {
162//! println!("Font resource: {}", name.as_str());
163//! }
164//! }
165//!
166//! // Check images/XObjects
167//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//! for (name, _) in &xobjects.0 {
169//! println!("XObject resource: {}", name.as_str());
170//! }
171//! }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206pub mod pdfa;
207#[cfg(feature = "performance")]
208pub mod performance;
209pub mod pipeline;
210pub mod recovery;
211pub mod streaming;
212pub mod structure;
213pub mod templates;
214pub mod text;
215pub mod verification;
216pub mod viewer_preferences;
217pub mod writer;
218
219pub mod semantic;
220pub mod signatures;
221
222// Dashboard and reporting modules
223pub mod dashboard;
224
225// Re-export generation types
226pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
227pub use document::{Document, DocumentMetadata};
228pub use error::{OxidizePdfError, PdfError, Result};
229pub use geometry::{Point, Rectangle};
230pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
231pub use page::{Margins, Page};
232pub use page_lists::{ListStyle, ListType, PageLists};
233pub use page_tables::{PageTables, TableStyle};
234pub use text::{
235 measure_text,
236 split_into_words,
237 BulletStyle,
238 Font,
239 FontFamily,
240 FragmentType,
241 HeaderStyle,
242 ImagePreprocessing,
243 ListElement,
244 ListOptions,
245 MockOcrProvider,
246 OcrEngine,
247 OcrError,
248 OcrOptions,
249 OcrProcessingResult,
250 OcrProvider,
251 OcrResult,
252 OcrTextFragment,
253 // List exports
254 OrderedList,
255 OrderedListStyle,
256 // Table exports
257 Table,
258 TableCell,
259 TableOptions,
260 TextAlign,
261 TextContext,
262 TextFlowContext,
263 UnorderedList,
264};
265
266// Re-export forms types
267pub use forms::{
268 calculations::FieldValue,
269 field_actions::{
270 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
271 SpecialFormatType, ValidateActionType,
272 },
273 validation::{
274 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
275 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
276 },
277 BorderStyle, FieldType, TextField, Widget,
278};
279
280// Re-export font embedding types
281pub use text::fonts::embedding::{
282 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
283 FontEncoding, FontFlags, FontMetrics, FontType,
284};
285
286// Re-export font management types
287pub use text::font_manager::{CustomFont, FontManager};
288
289// Re-export parsing types
290pub use parser::{
291 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
292 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
293 PdfString,
294};
295
296// Re-export operations
297pub use operations::{
298 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, move_pdf_page, overlay_pdf,
299 reorder_pdf_pages, reverse_pdf_pages, rotate_pdf_pages, split_pdf, swap_pdf_pages,
300 ExtractImagesOptions, ExtractedImage, ImageExtractor, OverlayOptions, OverlayPosition,
301 ReorderOptions,
302};
303
304// Re-export dashboard types
305pub use dashboard::{
306 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
307 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
308};
309
310// Re-export memory optimization types
311pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
312
313// Re-export streaming types
314pub use streaming::{
315 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
316 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
317 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
318};
319
320// Re-export batch processing types
321pub use batch::{
322 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
323 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
324 ProgressCallback, ProgressInfo,
325};
326
327// Re-export recovery types
328pub use recovery::{
329 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
330 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
331 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
332};
333
334// Re-export structure types
335pub use structure::{
336 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
337 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
338};
339
340// Re-export action types
341pub use actions::{
342 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
343 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
344};
345
346// Re-export page label types
347pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
348
349// Re-export template types
350pub use templates::{
351 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
352};
353
354// Re-export semantic types for AI-Ready PDFs
355pub use semantic::{
356 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
357 RelationType, SemanticEntity, SemanticMarking,
358};
359
360// Re-export verification types
361pub use verification::comparators::{
362 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
363};
364pub use verification::compliance_report::{
365 format_report_markdown, generate_compliance_report, ComplianceReport,
366};
367pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
368pub use verification::validators::{
369 check_available_validators, validate_external, validate_with_qpdf,
370};
371pub use verification::{
372 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
373 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
374};
375
376// Re-export PDF/A compliance types
377pub use pdfa::{
378 PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
379 ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
380 ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
381};
382
383/// Current version of oxidize-pdf
384pub const VERSION: &str = env!("CARGO_PKG_VERSION");
385
386/// Scanned page analysis and OCR example
387///
388/// ```rust,no_run
389/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
390/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
391/// use oxidize_pdf::parser::PdfReader;
392///
393/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
394/// let document = PdfReader::open_document("scanned.pdf")?;
395/// let analyzer = PageContentAnalyzer::new(document);
396///
397/// // Analyze pages for scanned content
398/// let analyses = analyzer.analyze_document()?;
399/// for analysis in analyses {
400/// match analysis.page_type {
401/// PageType::Scanned => {
402/// println!("Page {} is scanned - applying OCR", analysis.page_number);
403///
404/// // Process with OCR
405/// let ocr_provider = MockOcrProvider::new();
406/// let ocr_result = analyzer.extract_text_from_scanned_page(
407/// analysis.page_number,
408/// &ocr_provider
409/// )?;
410///
411/// println!("OCR extracted: {}", ocr_result.text);
412/// println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
413/// }
414/// PageType::Text => println!("Page {} has vector text", analysis.page_number),
415/// PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
416/// }
417/// }
418/// # Ok(())
419/// # }
420/// ```
421///
422/// ### Font Embedding
423///
424/// ```rust,no_run
425/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
426/// use std::collections::HashSet;
427///
428/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
429/// // Create font embedder
430/// let mut embedder = FontEmbedder::new();
431///
432/// // Define used glyphs (example with basic ASCII)
433/// let mut used_glyphs = HashSet::new();
434/// used_glyphs.insert(65); // 'A'
435/// used_glyphs.insert(66); // 'B'
436/// used_glyphs.insert(67); // 'C'
437///
438/// // Configure embedding options
439/// let options = EmbeddingOptions {
440/// subset: true, // Create font subset
441/// compress_font_streams: true, // Compress font data
442/// ..Default::default()
443/// };
444///
445/// // Load font data (example - you'd load actual TrueType data)
446/// let font_data = std::fs::read("path/to/font.ttf")?;
447///
448/// // Embed the font
449/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
450/// println!("Embedded font as: {}", font_name);
451///
452/// // Generate PDF dictionary for the embedded font
453/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
454/// println!("Font dictionary generated successfully");
455/// # Ok(())
456/// # }
457/// ```
458///
459/// Supported PDF versions
pub mod pdf_version {
    /// PDF specification versions scheduled for future support (currently only 2.0).
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];

    /// PDF specification versions that are fully supported: 1.0 through 1.7,
    /// listed in ascending order.
    pub const SUPPORTED_VERSIONS: &[&str] = &[
        "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7",
    ];
}
467
#[cfg(test)]
mod tests {
    // Smoke tests for the crate root: they exercise the items re-exported from
    // this file (`Document`, `Page`, `Color`, `Font`, parser objects, OCR types,
    // text helpers) plus the `VERSION` and `pdf_version` constants defined here.
    use super::*;

    /// A freshly constructed `Document` contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back exactly the width and height it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// `VERSION` is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// `pdf_version` lists exactly PDF 1.0-1.7 as supported and 2.0 as planned.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// Setting title, author and subject does not panic and adds no pages.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not inspected here; only the page
        // count is checked, so this test mainly ensures the setters don't panic.
        assert_eq!(doc.pages.len(), 0);
        // Note: We can't directly test metadata without exposing getters
    }

    /// The `a4`, `letter` and `new` constructors yield the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation),
        // hence the tolerance instead of an exact comparison.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// `Color::rgb` and `Color::cmyk` can be called without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color (if available)
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used below are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// Compile-time check that the major generation and parsing types are re-exported.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test operation types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related re-exports can be constructed with their defaults.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` finds the expected words and `measure_text` returns a positive width.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement using the built-in Helvetica font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related re-exports exist and `Image::from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic.
        // NOTE(review): 100 zero bytes is presumably not a valid JPEG; the return
        // value is discarded, so only "no panic" is verified here - confirm intent.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Test that version string has at least two dot-separated components
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}
677}