Skip to main content

oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206pub mod pdfa;
207#[cfg(feature = "performance")]
208pub mod performance;
209pub mod pipeline;
210pub mod recovery;
211pub mod streaming;
212pub mod structure;
213pub mod templates;
214pub mod text;
215pub mod verification;
216pub mod viewer_preferences;
217pub mod writer;
218
219pub mod semantic;
220pub mod signatures;
221
222// Dashboard and reporting modules
223pub mod dashboard;
224
225// Re-export generation types
226pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
227pub use document::{Document, DocumentMetadata};
228pub use error::{OxidizePdfError, PdfError, Result};
229pub use geometry::{Point, Rectangle};
230pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
231pub use page::{Margins, Page};
232pub use page_lists::{ListStyle, ListType, PageLists};
233pub use page_tables::{PageTables, TableStyle};
234pub use text::{
235    measure_text,
236    split_into_words,
237    BulletStyle,
238    Font,
239    FontFamily,
240    FragmentType,
241    HeaderStyle,
242    ImagePreprocessing,
243    ListElement,
244    ListOptions,
245    MockOcrProvider,
246    OcrEngine,
247    OcrError,
248    OcrOptions,
249    OcrProcessingResult,
250    OcrProvider,
251    OcrResult,
252    OcrTextFragment,
253    // List exports
254    OrderedList,
255    OrderedListStyle,
256    // Table exports
257    Table,
258    TableCell,
259    TableOptions,
260    TextAlign,
261    TextContext,
262    TextFlowContext,
263    UnorderedList,
264};
265
266// Re-export forms types
267pub use forms::{
268    calculations::FieldValue,
269    field_actions::{
270        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
271        SpecialFormatType, ValidateActionType,
272    },
273    validation::{
274        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
275        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
276    },
277    BorderStyle, FieldType, TextField, Widget,
278};
279
280// Re-export font embedding types
281pub use text::fonts::embedding::{
282    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
283    FontEncoding, FontFlags, FontMetrics, FontType,
284};
285
286// Re-export font management types
287pub use text::font_manager::{CustomFont, FontManager};
288
289// Re-export parsing types
290pub use parser::{
291    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
292    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
293    PdfString,
294};
295
296// Re-export operations
297pub use operations::{
298    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, move_pdf_page, overlay_pdf,
299    reorder_pdf_pages, reverse_pdf_pages, rotate_pdf_pages, split_pdf, swap_pdf_pages,
300    ExtractImagesOptions, ExtractedImage, ImageExtractor, OverlayOptions, OverlayPosition,
301    ReorderOptions,
302};
303
304// Re-export dashboard types
305pub use dashboard::{
306    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
307    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
308};
309
310// Re-export memory optimization types
311pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
312
313// Re-export streaming types
314pub use streaming::{
315    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
316    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
317    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
318};
319
320// Re-export batch processing types
321pub use batch::{
322    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
323    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
324    ProgressCallback, ProgressInfo,
325};
326
327// Re-export recovery types
328pub use recovery::{
329    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
330    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
331    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
332};
333
334// Re-export structure types
335pub use structure::{
336    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
337    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
338};
339
340// Re-export action types
341pub use actions::{
342    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
343    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
344};
345
346// Re-export page label types
347pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
348
349// Re-export template types
350pub use templates::{
351    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
352};
353
354// Re-export semantic types for AI-Ready PDFs
355pub use semantic::{
356    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
357    RelationType, SemanticEntity, SemanticMarking,
358};
359
360// Re-export verification types
361pub use verification::comparators::{
362    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
363};
364pub use verification::compliance_report::{
365    format_report_markdown, generate_compliance_report, ComplianceReport,
366};
367pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
368pub use verification::validators::{
369    check_available_validators, validate_external, validate_with_qpdf,
370};
371pub use verification::{
372    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
373    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
374};
375
376// Re-export PDF/A compliance types
377pub use pdfa::{
378    PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
379    ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
380    ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
381};
382
383/// Current version of oxidize-pdf, taken from `CARGO_PKG_VERSION` when the crate is compiled
384pub const VERSION: &str = env!("CARGO_PKG_VERSION");
385
/// Supported PDF versions
///
/// Lists the PDF specification versions that are fully supported and the ones
/// for which support is planned.
///
/// The examples below illustrate crate-level workflows (scanned page analysis
/// with OCR, and font embedding); they do not use this module's constants.
///
/// ### Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
467
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root: they check that the re-exported types
    //! resolve through `use super::*`, that basic constructors can be called,
    //! and that the version constants hold their expected values.

    use super::*;

    /// A freshly constructed document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let page_count = Document::new().pages.len();
        assert_eq!(page_count, 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);

        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Every version from 1.0 through 1.7 must be listed as supported.
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
        expected_versions.iter().for_each(|version| {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(version),
                "Expected PDF version {version} to be supported"
            );
        });

        // ...and nothing beyond those eight.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // PDF 2.0 is the only planned version.
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
    }

    /// The metadata setters can be called on an empty document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The stored metadata is not read back here; this test only ensures the
        // setters do not panic and that no pages are created as a side effect.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom page constructors yield the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // A4 dimensions: 595.276 x 841.89 points (approximation)
        let page_a4 = Page::a4();
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        let page_letter = Page::letter();
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions are reported back unchanged
        let page_custom = Page::new(400.0, 600.0);
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        // Red, green, blue, black and white, in that order.
        let _colors = [
            Color::rgb(1.0, 0.0, 0.0),
            Color::rgb(0.0, 1.0, 0.0),
            Color::rgb(0.0, 0.0, 1.0),
            Color::rgb(0.0, 0.0, 0.0),
            Color::rgb(1.0, 1.0, 1.0),
        ];

        // Test CMYK color (if available)
        let _cmyk_test = Color::cmyk(1.0, 0.0, 0.0, 0.0);
    }

    /// The standard font and font-family variants are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let _fonts = [Font::Helvetica, Font::TimesRoman, Font::Courier];

        let _families = [
            FontFamily::Helvetica,
            FontFamily::Times,
            FontFamily::Courier,
        ];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let make_error = || PdfError::InvalidStructure("test error".to_string());

        // Constructing the error on its own must work.
        let _error_test = make_error();

        // The `Result` alias accepts both outcomes.
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(make_error());

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation, parsing and layout types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are re-exported and constructible.
    #[test]
    fn test_ocr_types() {
        // Provider, options and engine selection
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Fragment classification and preprocessing settings
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting and text measurement return usable results.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";

        // Splitting must yield the individual words.
        let words = split_into_words(text);
        assert!(!words.is_empty());
        for expected in ["Hello", "world"] {
            assert!(words.contains(&expected));
        }

        // Measuring non-empty text with a standard font gives a positive width.
        let width = measure_text(text, Font::Helvetica, 12.0);
        assert!(width > 0.0);
    }

    /// The image-related types are re-exported and the JPEG constructor can be called.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is 100 zero bytes, not a real JPEG; the returned value is
        // deliberately ignored, the call only has to complete without panicking.
        let _image = Image::from_jpeg_data(vec![0u8; 100]);
    }

    /// `VERSION` has at least `major.minor` components, both of them numeric.
    #[test]
    fn test_version_string_format() {
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Indexing is safe here: the length was checked just above.
        for (label, part) in [("Major", version_parts[0]), ("Minor", version_parts[1])] {
            assert!(
                part.parse::<u32>().is_ok(),
                "{label} version should be numeric"
            );
        }

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}