oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205#[cfg(feature = "performance")]
206pub mod performance;
207pub mod recovery;
208pub mod streaming;
209pub mod structure;
210pub mod templates;
211pub mod text;
212pub mod verification;
213pub mod viewer_preferences;
214pub mod writer;
215
216pub mod semantic;
217
218// Dashboard and reporting modules
219pub mod dashboard;
220
221// Re-export generation types
222pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
223pub use document::{Document, DocumentMetadata};
224pub use error::{OxidizePdfError, PdfError, Result};
225pub use geometry::{Point, Rectangle};
226pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
227pub use page::{Margins, Page};
228pub use page_lists::{ListStyle, ListType, PageLists};
229pub use page_tables::{PageTables, TableStyle};
230pub use text::{
231    measure_text,
232    split_into_words,
233    BulletStyle,
234    Font,
235    FontFamily,
236    FragmentType,
237    HeaderStyle,
238    ImagePreprocessing,
239    ListElement,
240    ListOptions,
241    MockOcrProvider,
242    OcrEngine,
243    OcrError,
244    OcrOptions,
245    OcrProcessingResult,
246    OcrProvider,
247    OcrResult,
248    OcrTextFragment,
249    // List exports
250    OrderedList,
251    OrderedListStyle,
252    // Table exports
253    Table,
254    TableCell,
255    TableOptions,
256    TextAlign,
257    TextContext,
258    TextFlowContext,
259    UnorderedList,
260};
261
262// Re-export forms types
263pub use forms::{
264    calculations::FieldValue,
265    field_actions::{
266        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
267        SpecialFormatType, ValidateActionType,
268    },
269    validation::{
270        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
271        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
272    },
273    BorderStyle, FieldType, TextField, Widget,
274};
275
276// Re-export font embedding types
277pub use text::fonts::embedding::{
278    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
279    FontEncoding, FontFlags, FontMetrics, FontType,
280};
281
282// Re-export font management types
283pub use text::font_manager::{CustomFont, FontManager};
284
285// Re-export parsing types
286pub use parser::{
287    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
288    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
289    PdfString,
290};
291
292// Re-export operations
293pub use operations::{
294    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
295    ExtractImagesOptions, ExtractedImage, ImageExtractor,
296};
297
298// Re-export dashboard types
299pub use dashboard::{
300    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
301    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
302};
303
304// Re-export memory optimization types
305pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
306
307// Re-export streaming types
308pub use streaming::{
309    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
310    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
311    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
312};
313
314// Re-export batch processing types
315pub use batch::{
316    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
317    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
318    ProgressCallback, ProgressInfo,
319};
320
321// Re-export recovery types
322pub use recovery::{
323    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
324    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
325    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
326};
327
328// Re-export structure types
329pub use structure::{
330    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
331    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
332};
333
334// Re-export action types
335pub use actions::{
336    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
337    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
338};
339
340// Re-export page label types
341pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
342
343// Re-export template types
344pub use templates::{
345    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
346};
347
348// Re-export semantic types for AI-Ready PDFs
349pub use semantic::{
350    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
351    RelationType, SemanticEntity, SemanticMarking,
352};
353
354// Re-export verification types
355pub use verification::comparators::{
356    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
357};
358pub use verification::compliance_report::{
359    format_report_markdown, generate_compliance_report, ComplianceReport,
360};
361pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
362pub use verification::validators::{
363    check_available_validators, validate_external, validate_with_qpdf,
364};
365pub use verification::{
366    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
367    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
368};
369
/// Current version of oxidize-pdf.
///
/// Resolved at compile time from the `CARGO_PKG_VERSION` environment variable via `env!`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
372
// NOTE(review): the two usage examples in the doc comment below are not specific to
// `pdf_version`; they are attached to this module only because they precede it in the file.
// Consider moving them to the crate-level documentation.
/// Supported PDF versions
///
/// # Examples
///
/// ### Scanned Page Analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    ///
    /// Listed in ascending order, one entry per minor version.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
454
#[cfg(test)]
mod tests {
    // Smoke tests for the crate-root re-exports and the version constants.
    // Most of these only check that the re-exported items resolve and can be
    // constructed without panicking.
    use super::*;

    /// A freshly created document has no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is non-empty and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned PDF version lists hold exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages.
        assert_eq!(doc.pages.len(), 0);
        // Note: the metadata values themselves are not asserted here because no
        // getters are used; this test only ensures the setters don't panic.
    }

    /// The A4, Letter and custom page constructors produce the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color (if available)
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard `Font` and `FontFamily` variants are re-exported and usable.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types resolve from the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test operation types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related re-exports can be constructed.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` yields the input words and `measure_text` returns a positive width.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement (with mock font)
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related re-exports resolve, and `Image::from_jpeg_data` does not panic
    /// on arbitrary bytes (its return value is deliberately ignored).
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` has at least a numeric `major.minor` prefix.
    #[test]
    fn test_version_string_format() {
        // Test that version string follows semantic versioning
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}