Skip to main content

oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206pub mod pdfa;
207#[cfg(feature = "performance")]
208pub mod performance;
209pub mod pipeline;
210pub mod recovery;
211pub mod streaming;
212pub mod structure;
213pub mod templates;
214pub mod text;
215pub mod verification;
216pub mod viewer_preferences;
217pub mod writer;
218
219pub mod semantic;
220pub mod signatures;
221
222// Dashboard and reporting modules
223pub mod dashboard;
224
225// Re-export generation types
226pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
227pub use document::{Document, DocumentMetadata};
228pub use error::{OxidizePdfError, PdfError, Result};
229pub use geometry::{Point, Rectangle};
230pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
231pub use page::{Margins, Page};
232pub use page_lists::{ListStyle, ListType, PageLists};
233pub use page_tables::{PageTables, TableStyle};
234pub use text::{
235    measure_text,
236    split_into_words,
237    BulletStyle,
238    Font,
239    FontFamily,
240    FragmentType,
241    HeaderStyle,
242    ImagePreprocessing,
243    ListElement,
244    ListOptions,
245    MockOcrProvider,
246    OcrEngine,
247    OcrError,
248    OcrOptions,
249    OcrProcessingResult,
250    OcrProvider,
251    OcrResult,
252    OcrTextFragment,
253    // List exports
254    OrderedList,
255    OrderedListStyle,
256    // Table exports
257    Table,
258    TableCell,
259    TableOptions,
260    TextAlign,
261    TextContext,
262    TextFlowContext,
263    UnorderedList,
264};
265
266// Re-export forms types
267pub use forms::{
268    calculations::FieldValue,
269    field_actions::{
270        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
271        SpecialFormatType, ValidateActionType,
272    },
273    validation::{
274        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
275        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
276    },
277    BorderStyle, FieldType, TextField, Widget,
278};
279
280// Re-export font embedding types
281pub use text::fonts::embedding::{
282    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
283    FontEncoding, FontFlags, FontMetrics, FontType,
284};
285
286// Re-export font management types
287pub use text::font_manager::{CustomFont, FontManager};
288
289// Re-export parsing types
290pub use parser::{
291    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
292    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
293    PdfString,
294};
295
296// Re-export operations
297pub use operations::{
298    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
299    ExtractImagesOptions, ExtractedImage, ImageExtractor,
300};
301
302// Re-export dashboard types
303pub use dashboard::{
304    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
305    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
306};
307
308// Re-export memory optimization types
309pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
310
311// Re-export streaming types
312pub use streaming::{
313    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
314    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
315    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
316};
317
318// Re-export batch processing types
319pub use batch::{
320    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
321    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
322    ProgressCallback, ProgressInfo,
323};
324
325// Re-export recovery types
326pub use recovery::{
327    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
328    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
329    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
330};
331
332// Re-export structure types
333pub use structure::{
334    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
335    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
336};
337
338// Re-export action types
339pub use actions::{
340    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
341    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
342};
343
344// Re-export page label types
345pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
346
347// Re-export template types
348pub use templates::{
349    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
350};
351
352// Re-export semantic types for AI-Ready PDFs
353pub use semantic::{
354    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
355    RelationType, SemanticEntity, SemanticMarking,
356};
357
358// Re-export verification types
359pub use verification::comparators::{
360    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
361};
362pub use verification::compliance_report::{
363    format_report_markdown, generate_compliance_report, ComplianceReport,
364};
365pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
366pub use verification::validators::{
367    check_available_validators, validate_external, validate_with_qpdf,
368};
369pub use verification::{
370    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
371    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
372};
373
374// Re-export PDF/A compliance types
375pub use pdfa::{
376    PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
377    ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
378    ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
379};
380
/// Current version of oxidize-pdf.
///
/// The value is read from the `CARGO_PKG_VERSION` environment variable when
/// the crate is compiled.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
383
/// Supported PDF versions
///
/// Lists the PDF specification versions that are fully supported
/// (`SUPPORTED_VERSIONS`) and those for which support is planned
/// (`PLANNED_VERSIONS`).
///
/// NOTE(review): the two examples below (scanned page analysis with OCR, and
/// font embedding) do not use this module. They look like crate-level examples
/// that ended up attached to this item — confirm whether they should move into
/// the crate-level `//!` documentation.
///
/// ### Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
465
466#[cfg(test)]
467mod tests {
468    use super::*;
469
470    #[test]
471    fn test_create_empty_document() {
472        let doc = Document::new();
473        assert_eq!(doc.pages.len(), 0);
474    }
475
476    #[test]
477    fn test_create_page() {
478        let page = Page::new(595.0, 842.0);
479        assert_eq!(page.width(), 595.0);
480        assert_eq!(page.height(), 842.0);
481    }
482
483    #[test]
484    fn test_version_info() {
485        assert!(!VERSION.is_empty());
486        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
487    }
488
489    #[test]
490    fn test_pdf_version_constants() {
491        // Test that all expected PDF versions are supported
492        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
493
494        for version in expected_versions {
495            assert!(
496                pdf_version::SUPPORTED_VERSIONS.contains(&version),
497                "Expected PDF version {version} to be supported"
498            );
499        }
500
501        // Test that we have exactly 8 supported versions
502        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);
503
504        // Test planned versions
505        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
506        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
507    }
508
509    #[test]
510    fn test_document_with_metadata() {
511        let mut doc = Document::new();
512        doc.set_title("Test Document");
513        doc.set_author("Test Author");
514        doc.set_subject("Test Subject");
515
516        // Verify metadata is set (checking internal state)
517        assert_eq!(doc.pages.len(), 0);
518        // Note: We can't directly test metadata without exposing getters
519        // This test ensures the methods don't panic
520    }
521
522    #[test]
523    fn test_page_creation_variants() {
524        // Test different page creation methods
525        let page_a4 = Page::a4();
526        let page_letter = Page::letter();
527        let page_custom = Page::new(400.0, 600.0);
528
529        // A4 dimensions: 595.276 x 841.89 points (approximation)
530        assert!((page_a4.width() - 595.0).abs() < 10.0);
531        assert!((page_a4.height() - 842.0).abs() < 10.0);
532
533        // Letter dimensions: 612 x 792 points
534        assert_eq!(page_letter.width(), 612.0);
535        assert_eq!(page_letter.height(), 792.0);
536
537        // Custom dimensions
538        assert_eq!(page_custom.width(), 400.0);
539        assert_eq!(page_custom.height(), 600.0);
540    }
541
542    #[test]
543    fn test_color_creation() {
544        let red = Color::rgb(1.0, 0.0, 0.0);
545        let green = Color::rgb(0.0, 1.0, 0.0);
546        let blue = Color::rgb(0.0, 0.0, 1.0);
547        let black = Color::rgb(0.0, 0.0, 0.0);
548        let white = Color::rgb(1.0, 1.0, 1.0);
549
550        // Test color creation doesn't panic
551        let _colors = [red, green, blue, black, white];
552
553        // Test CMYK color (if available)
554        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
555        let _cmyk_test = cyan;
556    }
557
558    #[test]
559    fn test_font_types() {
560        let helvetica = Font::Helvetica;
561        let times = Font::TimesRoman;
562        let courier = Font::Courier;
563
564        // Test font creation doesn't panic
565        let _fonts = [helvetica, times, courier];
566
567        // Test font family
568        let helvetica_family = FontFamily::Helvetica;
569        let times_family = FontFamily::Times;
570        let courier_family = FontFamily::Courier;
571
572        let _families = [helvetica_family, times_family, courier_family];
573    }
574
575    #[test]
576    fn test_error_types() {
577        // Test that error types can be created
578        let pdf_error = PdfError::InvalidStructure("test error".to_string());
579        let _error_test = pdf_error;
580
581        // Test result type
582        let ok_result: Result<i32> = Ok(42);
583        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));
584
585        assert!(ok_result.is_ok());
586        assert!(err_result.is_err());
587    }
588
589    #[test]
590    fn test_module_exports() {
591        // Test that all major types are properly exported
592        let _doc = Document::new();
593        let _page = Page::new(100.0, 100.0);
594        let _color = Color::rgb(0.5, 0.5, 0.5);
595        let _font = Font::Helvetica;
596
597        // Test parsing types
598        let _array = PdfArray::new();
599        let _dict = PdfDictionary::new();
600        let _name = PdfName::new("Test".to_string());
601        let _string = PdfString::new(b"Test".to_vec());
602
603        // Test operation types
604        let _margins = Margins {
605            top: 10.0,
606            right: 10.0,
607            bottom: 10.0,
608            left: 10.0,
609        };
610        let _align = TextAlign::Left;
611    }
612
613    #[test]
614    fn test_ocr_types() {
615        // Test OCR-related types
616        let _mock_ocr = MockOcrProvider::new();
617        let _ocr_options = OcrOptions::default();
618        let _ocr_engine = OcrEngine::Tesseract;
619
620        // Test fragment types
621        let _fragment_type = FragmentType::Word;
622        let _image_preprocessing = ImagePreprocessing::default();
623    }
624
625    #[test]
626    fn test_text_utilities() {
627        // Test text utility functions
628        let text = "Hello world test";
629        let words = split_into_words(text);
630        assert!(!words.is_empty());
631        assert!(words.contains(&"Hello"));
632        assert!(words.contains(&"world"));
633
634        // Test text measurement (with mock font)
635        let font = Font::Helvetica;
636        let size = 12.0;
637        let width = measure_text(text, font, size);
638        assert!(width > 0.0);
639    }
640
641    #[test]
642    fn test_image_types() {
643        // Test image-related types
644        let _format = ImageFormat::Jpeg;
645        let _color_space = ColorSpace::DeviceRGB;
646
647        // Test that image creation doesn't panic
648        let image_data = vec![0u8; 100];
649        let _image = Image::from_jpeg_data(image_data);
650    }
651
652    #[test]
653    fn test_version_string_format() {
654        // Test that version string follows semantic versioning
655        let version_parts: Vec<&str> = VERSION.split('.').collect();
656        assert!(
657            version_parts.len() >= 2,
658            "Version should have at least major.minor format"
659        );
660
661        // Test that major and minor are numeric
662        assert!(
663            version_parts[0].parse::<u32>().is_ok(),
664            "Major version should be numeric"
665        );
666        assert!(
667            version_parts[1].parse::<u32>().is_ok(),
668            "Minor version should be numeric"
669        );
670
671        // Test that version is not empty
672        assert!(!VERSION.is_empty());
673        assert!(!VERSION.is_empty());
674    }
675}