Skip to main content

oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206pub mod pdfa;
207#[cfg(feature = "performance")]
208pub mod performance;
209pub mod recovery;
210pub mod streaming;
211pub mod structure;
212pub mod templates;
213pub mod text;
214pub mod verification;
215pub mod viewer_preferences;
216pub mod writer;
217
218pub mod semantic;
219pub mod signatures;
220
221// Dashboard and reporting modules
222pub mod dashboard;
223
224// Re-export generation types
225pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
226pub use document::{Document, DocumentMetadata};
227pub use error::{OxidizePdfError, PdfError, Result};
228pub use geometry::{Point, Rectangle};
229pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
230pub use page::{Margins, Page};
231pub use page_lists::{ListStyle, ListType, PageLists};
232pub use page_tables::{PageTables, TableStyle};
233pub use text::{
234    measure_text,
235    split_into_words,
236    BulletStyle,
237    Font,
238    FontFamily,
239    FragmentType,
240    HeaderStyle,
241    ImagePreprocessing,
242    ListElement,
243    ListOptions,
244    MockOcrProvider,
245    OcrEngine,
246    OcrError,
247    OcrOptions,
248    OcrProcessingResult,
249    OcrProvider,
250    OcrResult,
251    OcrTextFragment,
252    // List exports
253    OrderedList,
254    OrderedListStyle,
255    // Table exports
256    Table,
257    TableCell,
258    TableOptions,
259    TextAlign,
260    TextContext,
261    TextFlowContext,
262    UnorderedList,
263};
264
265// Re-export forms types
266pub use forms::{
267    calculations::FieldValue,
268    field_actions::{
269        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
270        SpecialFormatType, ValidateActionType,
271    },
272    validation::{
273        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
274        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
275    },
276    BorderStyle, FieldType, TextField, Widget,
277};
278
279// Re-export font embedding types
280pub use text::fonts::embedding::{
281    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
282    FontEncoding, FontFlags, FontMetrics, FontType,
283};
284
285// Re-export font management types
286pub use text::font_manager::{CustomFont, FontManager};
287
288// Re-export parsing types
289pub use parser::{
290    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
291    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
292    PdfString,
293};
294
295// Re-export operations
296pub use operations::{
297    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
298    ExtractImagesOptions, ExtractedImage, ImageExtractor,
299};
300
301// Re-export dashboard types
302pub use dashboard::{
303    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
304    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
305};
306
307// Re-export memory optimization types
308pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
309
310// Re-export streaming types
311pub use streaming::{
312    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
313    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
314    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
315};
316
317// Re-export batch processing types
318pub use batch::{
319    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
320    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
321    ProgressCallback, ProgressInfo,
322};
323
324// Re-export recovery types
325pub use recovery::{
326    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
327    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
328    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
329};
330
331// Re-export structure types
332pub use structure::{
333    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
334    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
335};
336
337// Re-export action types
338pub use actions::{
339    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
340    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
341};
342
343// Re-export page label types
344pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
345
346// Re-export template types
347pub use templates::{
348    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
349};
350
351// Re-export semantic types for AI-Ready PDFs
352pub use semantic::{
353    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
354    RelationType, SemanticEntity, SemanticMarking,
355};
356
357// Re-export verification types
358pub use verification::comparators::{
359    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
360};
361pub use verification::compliance_report::{
362    format_report_markdown, generate_compliance_report, ComplianceReport,
363};
364pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
365pub use verification::validators::{
366    check_available_validators, validate_external, validate_with_qpdf,
367};
368pub use verification::{
369    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
370    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
371};
372
373// Re-export PDF/A compliance types
374pub use pdfa::{
375    PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
376    ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
377    ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
378};
379
/// Current version of oxidize-pdf.
///
/// The value is captured at compile time from the `CARGO_PKG_VERSION`
/// environment variable via `env!`, so it is a plain `&str` baked into the
/// binary rather than something looked up at runtime.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
382
/// Supported PDF versions
///
/// This module exposes two constant lists of PDF version strings:
/// `SUPPORTED_VERSIONS` (1.0 through 1.7) and `PLANNED_VERSIONS` (2.0).
///
/// The examples that follow illustrate crate-wide features (scanned page
/// analysis with OCR, and font embedding) rather than this module itself.
///
/// ### Scanned Page Analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
464
// Smoke tests for the crate root: they exercise the re-exported types and the
// version constants, mostly checking that construction compiles and does not
// panic.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_create_empty_document() {
        // A freshly created document holds no pages.
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    #[test]
    fn test_create_page() {
        // The dimensions passed to `Page::new` are reported back unchanged.
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages. The metadata values themselves
        // are not inspected here; this test only ensures the setters don't
        // panic.
        assert_eq!(doc.pages.len(), 0);
    }

    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color creation doesn't panic either
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test page margin and text alignment types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with the Helvetica font variant
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic; the returned value is
        // deliberately ignored because the bytes are all zeros, not a real
        // JPEG.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    #[test]
    fn test_version_string_format() {
        // Precondition for everything below: there is a version string at all.
        assert!(!VERSION.is_empty());

        // Test that version string has at least major.minor components
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}