oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
//! - **Pure Rust**: No C dependencies or external libraries (the Tesseract OCR provider is opt-in via the `ocr-tesseract` feature)
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod objects;
195pub mod operations;
196pub mod page;
197pub mod page_forms;
198pub mod page_labels;
199pub mod page_lists;
200pub mod page_tables;
201pub mod page_transitions;
202pub mod page_tree;
203pub mod parser;
204#[cfg(feature = "performance")]
205pub mod performance;
206pub mod recovery;
207pub mod streaming;
208pub mod structure;
209pub mod templates;
210pub mod text;
211pub mod verification;
212pub mod viewer_preferences;
213pub mod writer;
214
215pub mod semantic;
216
217// Dashboard and reporting modules
218pub mod dashboard;
219
220// Re-export generation types
221pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
222pub use document::{Document, DocumentMetadata};
223pub use error::{OxidizePdfError, PdfError, Result};
224pub use geometry::{Point, Rectangle};
225pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
226pub use page::{Margins, Page};
227pub use page_lists::{ListStyle, ListType, PageLists};
228pub use page_tables::{PageTables, TableStyle};
229pub use text::{
230    measure_text,
231    split_into_words,
232    BulletStyle,
233    Font,
234    FontFamily,
235    FragmentType,
236    HeaderStyle,
237    ImagePreprocessing,
238    ListElement,
239    ListOptions,
240    MockOcrProvider,
241    OcrEngine,
242    OcrError,
243    OcrOptions,
244    OcrProcessingResult,
245    OcrProvider,
246    OcrResult,
247    OcrTextFragment,
248    // List exports
249    OrderedList,
250    OrderedListStyle,
251    // Table exports
252    Table,
253    TableCell,
254    TableOptions,
255    TextAlign,
256    TextContext,
257    TextFlowContext,
258    UnorderedList,
259};
260
261// Re-export forms types
262pub use forms::{
263    calculations::FieldValue,
264    field_actions::{
265        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
266        SpecialFormatType, ValidateActionType,
267    },
268    validation::{
269        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
270        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
271    },
272    BorderStyle, FieldType, TextField, Widget,
273};
274
275// Re-export font embedding types
276pub use text::fonts::embedding::{
277    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
278    FontEncoding, FontFlags, FontMetrics, FontType,
279};
280
281// Re-export font management types
282pub use text::font_manager::{CustomFont, FontManager};
283
284// Re-export parsing types
285pub use parser::{
286    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
287    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
288    PdfString,
289};
290
291// Re-export operations
292pub use operations::{
293    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
294    ExtractImagesOptions, ExtractedImage, ImageExtractor,
295};
296
297// Re-export dashboard types
298pub use dashboard::{
299    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
300    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
301};
302
303// Re-export memory optimization types
304pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
305
306// Re-export streaming types
307pub use streaming::{
308    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
309    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
310    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
311};
312
313// Re-export batch processing types
314pub use batch::{
315    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
316    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
317    ProgressCallback, ProgressInfo,
318};
319
320// Re-export recovery types
321pub use recovery::{
322    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
323    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
324    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
325};
326
327// Re-export structure types
328pub use structure::{
329    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
330    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
331};
332
333// Re-export action types
334pub use actions::{
335    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
336    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
337};
338
339// Re-export page label types
340pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
341
342// Re-export template types
343pub use templates::{
344    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
345};
346
347// Re-export semantic types for AI-Ready PDFs
348pub use semantic::{
349    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
350    RelationType, SemanticEntity, SemanticMarking,
351};
352
353// Re-export verification types
354pub use verification::comparators::{
355    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
356};
357pub use verification::compliance_report::{
358    format_report_markdown, generate_compliance_report, ComplianceReport,
359};
360pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
361pub use verification::validators::{
362    check_available_validators, validate_external, validate_with_qpdf,
363};
364pub use verification::{
365    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
366    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
367};
368
/// Current version of oxidize-pdf, captured from `CARGO_PKG_VERSION` at compile time
370pub const VERSION: &str = env!("CARGO_PKG_VERSION");
371
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions this crate declares as fully
/// supported; `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Additional crate examples
///
/// The examples below exercise crate-wide functionality (page analysis, OCR and
/// font embedding) rather than anything defined in this module.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
453
#[cfg(test)]
mod tests {
    // Smoke tests for the crate-root re-exports and the version constants.
    // Most of these only check that a re-exported item is reachable from the
    // crate root and can be constructed without panicking.
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// Setting metadata on a document does not panic and does not add pages.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not read back here; this only
        // checks that the setters do not panic and leave the page list empty.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom page constructors produce the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are re-exported.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// Generation, parsing and layout types are all reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// OCR-related types are re-exported and constructible.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` and `measure_text` give sensible results for plain text.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with the Helvetica font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are re-exported and building an image does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The 100 zero bytes are not a real JPEG; the outcome is deliberately
        // ignored, this only checks that the call does not panic.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` has at least a numeric `major.minor` prefix.
    #[test]
    fn test_version_string_format() {
        // Test that version string follows semantic versioning
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}