oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
//! - **Pure Rust**: No C dependencies or external libraries in the core library (the Tesseract OCR provider is opt-in via the `ocr-tesseract` feature)
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206#[cfg(feature = "performance")]
207pub mod performance;
208pub mod recovery;
209pub mod streaming;
210pub mod structure;
211pub mod templates;
212pub mod text;
213pub mod verification;
214pub mod viewer_preferences;
215pub mod writer;
216
217pub mod semantic;
218
219// Dashboard and reporting modules
220pub mod dashboard;
221
222// Re-export generation types
223pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
224pub use document::{Document, DocumentMetadata};
225pub use error::{OxidizePdfError, PdfError, Result};
226pub use geometry::{Point, Rectangle};
227pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
228pub use page::{Margins, Page};
229pub use page_lists::{ListStyle, ListType, PageLists};
230pub use page_tables::{PageTables, TableStyle};
231pub use text::{
232    measure_text,
233    split_into_words,
234    BulletStyle,
235    Font,
236    FontFamily,
237    FragmentType,
238    HeaderStyle,
239    ImagePreprocessing,
240    ListElement,
241    ListOptions,
242    MockOcrProvider,
243    OcrEngine,
244    OcrError,
245    OcrOptions,
246    OcrProcessingResult,
247    OcrProvider,
248    OcrResult,
249    OcrTextFragment,
250    // List exports
251    OrderedList,
252    OrderedListStyle,
253    // Table exports
254    Table,
255    TableCell,
256    TableOptions,
257    TextAlign,
258    TextContext,
259    TextFlowContext,
260    UnorderedList,
261};
262
263// Re-export forms types
264pub use forms::{
265    calculations::FieldValue,
266    field_actions::{
267        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
268        SpecialFormatType, ValidateActionType,
269    },
270    validation::{
271        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
272        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
273    },
274    BorderStyle, FieldType, TextField, Widget,
275};
276
277// Re-export font embedding types
278pub use text::fonts::embedding::{
279    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
280    FontEncoding, FontFlags, FontMetrics, FontType,
281};
282
283// Re-export font management types
284pub use text::font_manager::{CustomFont, FontManager};
285
286// Re-export parsing types
287pub use parser::{
288    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
289    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
290    PdfString,
291};
292
293// Re-export operations
294pub use operations::{
295    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
296    ExtractImagesOptions, ExtractedImage, ImageExtractor,
297};
298
299// Re-export dashboard types
300pub use dashboard::{
301    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
302    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
303};
304
305// Re-export memory optimization types
306pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
307
308// Re-export streaming types
309pub use streaming::{
310    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
311    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
312    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
313};
314
315// Re-export batch processing types
316pub use batch::{
317    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
318    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
319    ProgressCallback, ProgressInfo,
320};
321
322// Re-export recovery types
323pub use recovery::{
324    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
325    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
326    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
327};
328
329// Re-export structure types
330pub use structure::{
331    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
332    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
333};
334
335// Re-export action types
336pub use actions::{
337    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
338    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
339};
340
341// Re-export page label types
342pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
343
344// Re-export template types
345pub use templates::{
346    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
347};
348
349// Re-export semantic types for AI-Ready PDFs
350pub use semantic::{
351    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
352    RelationType, SemanticEntity, SemanticMarking,
353};
354
355// Re-export verification types
356pub use verification::comparators::{
357    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
358};
359pub use verification::compliance_report::{
360    format_report_markdown, generate_compliance_report, ComplianceReport,
361};
362pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
363pub use verification::validators::{
364    check_available_validators, validate_external, validate_with_qpdf,
365};
366pub use verification::{
367    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
368    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
369};
370
371/// Current version of oxidize-pdf
372pub const VERSION: &str = env!("CARGO_PKG_VERSION");
373
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions this crate documents as fully
/// supported; `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Examples
///
/// The examples below exercise other parts of the crate rather than this
/// module. They are kept on this item so they continue to be built as
/// documentation tests, but are placed after the summary so that the rustdoc
/// summary line describes the module itself.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
455
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root.
    //!
    //! These tests check that the root-level re-exports resolve and that the
    //! basic constructors and constants can be used through them.

    use super::*;

    /// A freshly constructed document has no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The version constant is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Every expected PDF version must be listed as supported.
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // The table must not contain anything beyond those 8 entries.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Planned versions: only 2.0.
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not inspected here; this only
        // checks that the setters do not panic and do not add pages.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The preset and custom page constructors produce the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), so compare
        // with a tolerance rather than exactly.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Construction alone is the check; the values are not inspected.
        let _colors = [red, green, blue, black, white];

        // CMYK construction
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard font and font-family variants are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Construction alone is the check.
        let _fonts = [helvetica, times, courier];

        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation, parsing, and layout types are re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are re-exported and constructible.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Fragment and preprocessing types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting and text measurement work on a simple ASCII string.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Measuring non-empty text with a standard font yields a positive width.
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related types are re-exported and the JPEG constructor does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The return value is deliberately ignored: this only checks that the
        // call itself does not panic on arbitrary bytes.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Check emptiness first so a missing version fails with a clear cause
        // before any component is indexed.
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
663        assert!(!VERSION.is_empty());
664    }
665}