Skip to main content

oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod layout;
194pub mod memory;
195pub mod metadata;
196pub mod objects;
197pub mod operations;
198pub mod page;
199pub mod page_forms;
200pub mod page_labels;
201pub mod page_lists;
202pub mod page_tables;
203pub mod page_transitions;
204pub mod page_tree;
205pub mod parser;
206pub mod pdf_objects;
207pub mod pdfa;
208#[cfg(feature = "performance")]
209pub mod performance;
210pub mod pipeline;
211pub mod recovery;
212pub mod streaming;
213pub mod structure;
214pub mod templates;
215pub mod text;
216pub mod verification;
217pub mod viewer_preferences;
218pub mod writer;
219
220pub mod semantic;
221pub mod signatures;
222
223// Dashboard and reporting modules
224pub mod dashboard;
225
226// Re-export generation types
227pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
228pub use document::{Document, DocumentMetadata};
229pub use error::{OxidizePdfError, PdfError, Result};
230pub use geometry::{Point, Rectangle};
231pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
232pub use layout::{
233    centered_image_x, fit_image_dimensions, DocumentBuilder, FlowElement, FlowLayout, PageConfig,
234    RichText, TextSpan,
235};
236pub use page::{Margins, Page};
237pub use page_lists::{ListStyle, ListType, PageLists};
238pub use page_tables::{PageTables, TableStyle};
239pub use text::{
240    measure_text,
241    measure_text_block,
242    split_into_words,
243    BulletStyle,
244    Font,
245    FontFamily,
246    FragmentType,
247    HeaderStyle,
248    ImagePreprocessing,
249    ListElement,
250    ListOptions,
251    MockOcrProvider,
252    OcrEngine,
253    OcrError,
254    OcrOptions,
255    OcrProcessingResult,
256    OcrProvider,
257    OcrResult,
258    OcrTextFragment,
259    // List exports
260    OrderedList,
261    OrderedListStyle,
262    // Table exports
263    Table,
264    TableCell,
265    TableOptions,
266    TextAlign,
267    TextBlockMetrics,
268    TextContext,
269    TextFlowContext,
270    UnorderedList,
271};
272
273// Re-export forms types
274pub use forms::{
275    calculations::FieldValue,
276    field_actions::{
277        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
278        SpecialFormatType, ValidateActionType,
279    },
280    validation::{
281        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
282        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
283    },
284    BorderStyle, FieldType, TextField, Widget,
285};
286
287// Re-export font embedding types
288pub use text::fonts::embedding::{
289    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
290    FontEncoding, FontFlags, FontMetrics, FontType,
291};
292
293// Re-export font management types
294pub use text::font_manager::{CustomFont, FontManager};
295
296// Re-export parsing types
297pub use parser::{
298    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
299    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
300    PdfString,
301};
302
303// Re-export operations
304pub use operations::{
305    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, move_pdf_page, overlay_pdf,
306    reorder_pdf_pages, reverse_pdf_pages, rotate_pdf_pages, split_pdf, swap_pdf_pages,
307    ExtractImagesOptions, ExtractedImage, ImageExtractor, OverlayOptions, OverlayPosition,
308    ReorderOptions,
309};
310
311// Re-export dashboard types
312pub use dashboard::{
313    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
314    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
315};
316
317// Re-export memory optimization types
318pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
319
320// Re-export streaming types
321pub use streaming::{
322    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
323    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
324    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
325};
326
327// Re-export batch processing types
328pub use batch::{
329    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
330    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
331    ProgressCallback, ProgressInfo,
332};
333
334// Re-export recovery types
335pub use recovery::{
336    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
337    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
338    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
339};
340
341// Re-export structure types
342pub use structure::{
343    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
344    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
345};
346
347// Re-export action types
348pub use actions::{
349    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
350    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
351};
352
353// Re-export page label types
354pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
355
356// Re-export template types
357pub use templates::{
358    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
359};
360
361// Re-export semantic types for AI-Ready PDFs
362pub use semantic::{
363    BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
364    RelationType, SemanticEntity, SemanticMarking,
365};
366
367// Re-export verification types
368pub use verification::comparators::{
369    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
370};
371pub use verification::compliance_report::{
372    format_report_markdown, generate_compliance_report, ComplianceReport,
373};
374pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
375pub use verification::validators::{
376    check_available_validators, validate_external, validate_with_qpdf,
377};
378pub use verification::{
379    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
380    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
381};
382
383// Re-export PDF/A compliance types
384pub use pdfa::{
385    PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
386    ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
387    ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
388};
389
/// Current version of oxidize-pdf
///
/// Read from the `CARGO_PKG_VERSION` environment variable at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
392
/// Supported PDF versions
///
/// [`pdf_version::SUPPORTED_VERSIONS`] lists the PDF specification versions
/// that are handled today, and [`pdf_version::PLANNED_VERSIONS`] lists the
/// ones for which support is planned.
///
/// The sections below are general crate usage examples; they are not specific
/// to version handling.
///
/// ### Scanned Page Analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
474
// Smoke tests for the crate-root constants and re-exports: each test checks
// that a group of re-exported items is reachable via `use super::*` and can be
// constructed or called without panicking.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_create_empty_document() {
        // A freshly created document starts without pages.
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    #[test]
    fn test_create_page() {
        // Explicit dimensions are reported back unchanged.
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that nothing beyond the expected versions is advertised; the
        // count is derived from the list above so the two cannot drift apart.
        assert_eq!(
            pdf_version::SUPPORTED_VERSIONS.len(),
            expected_versions.len()
        );

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not panic and must not add any pages.
        // The metadata values themselves are not inspected here.
        assert_eq!(doc.pages.len(), 0);
    }

    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color construction
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test the crate's `Result` alias with both variants
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test page layout and text alignment types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with a standard font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, &font, size);
        assert!(width > 0.0);
    }

    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic. The zero-filled buffer is
        // not real JPEG data, so the returned value is deliberately discarded.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    #[test]
    fn test_version_string_format() {
        // Test that version is not empty before looking at its parts
        assert!(!VERSION.is_empty());

        // Test that the version string starts with a numeric `major.minor`
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}