oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod annotations;
180
181pub mod batch;
182pub mod charts;
183pub mod compression;
184pub mod coordinate_system;
185pub mod document;
186pub mod encryption;
187pub mod error;
188pub mod fonts;
189pub mod forms;
190pub mod geometry;
191pub mod graphics;
192pub mod memory;
193pub mod objects;
194pub mod operations;
195pub mod page;
196pub mod page_forms;
197pub mod page_labels;
198pub mod page_lists;
199pub mod page_tables;
200pub mod page_transitions;
201pub mod page_tree;
202pub mod parser;
203#[cfg(feature = "performance")]
204pub mod performance;
205pub mod recovery;
206pub mod streaming;
207pub mod structure;
208pub mod templates;
209pub mod text;
210pub mod verification;
211pub mod viewer_preferences;
212pub mod writer;
213
214pub mod semantic;
215
216// Dashboard and reporting modules
217pub mod dashboard;
218
219// Re-export generation types
220pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
221pub use document::{Document, DocumentMetadata};
222pub use error::{OxidizePdfError, PdfError, Result};
223pub use geometry::{Point, Rectangle};
224pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
225pub use page::{Margins, Page};
226pub use page_lists::{ListStyle, ListType, PageLists};
227pub use page_tables::{PageTables, TableStyle};
228pub use text::{
229    measure_text,
230    split_into_words,
231    BulletStyle,
232    Font,
233    FontFamily,
234    FragmentType,
235    HeaderStyle,
236    ImagePreprocessing,
237    ListElement,
238    ListOptions,
239    MockOcrProvider,
240    OcrEngine,
241    OcrError,
242    OcrOptions,
243    OcrProcessingResult,
244    OcrProvider,
245    OcrResult,
246    OcrTextFragment,
247    // List exports
248    OrderedList,
249    OrderedListStyle,
250    // Table exports
251    Table,
252    TableCell,
253    TableOptions,
254    TextAlign,
255    TextContext,
256    TextFlowContext,
257    UnorderedList,
258};
259
260// Re-export forms types
261pub use forms::{
262    calculations::FieldValue,
263    field_actions::{
264        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
265        SpecialFormatType, ValidateActionType,
266    },
267    validation::{
268        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
269        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
270    },
271    BorderStyle, FieldType, TextField, Widget,
272};
273
274// Re-export font embedding types
275pub use text::fonts::embedding::{
276    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
277    FontEncoding, FontFlags, FontMetrics, FontType,
278};
279
280// Re-export font management types
281pub use text::font_manager::{CustomFont, FontManager};
282
283// Re-export parsing types
284pub use parser::{
285    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
286    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
287    PdfString,
288};
289
290// Re-export operations
291pub use operations::{
292    extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
293    ExtractImagesOptions, ExtractedImage, ImageExtractor,
294};
295
296// Re-export dashboard types
297pub use dashboard::{
298    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
299    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
300};
301
302// Re-export memory optimization types
303pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
304
305// Re-export streaming types
306pub use streaming::{
307    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
308    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
309    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
310};
311
312// Re-export batch processing types
313pub use batch::{
314    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
315    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
316    ProgressCallback, ProgressInfo,
317};
318
319// Re-export recovery types
320pub use recovery::{
321    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
322    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
323    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
324};
325
326// Re-export structure types
327pub use structure::{
328    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
329    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
330};
331
332// Re-export action types
333pub use actions::{
334    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
335    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
336};
337
338// Re-export page label types
339pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
340
341// Re-export template types
342pub use templates::{
343    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
344};
345
346// Re-export verification types
347pub use verification::comparators::{
348    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
349};
350pub use verification::compliance_report::{
351    format_report_markdown, generate_compliance_report, ComplianceReport,
352};
353pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
354pub use verification::validators::{
355    check_available_validators, validate_external, validate_with_qpdf,
356};
357pub use verification::{
358    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
359    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
360};
361
362/// Current version of oxidize-pdf
363pub const VERSION: &str = env!("CARGO_PKG_VERSION");
364
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions that are fully supported and
/// `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Additional crate examples
///
/// The examples below are not specific to this module; they are kept in this
/// doc comment so that they continue to be compiled as doctests.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF versions that are fully supported (1.0 through 1.7), in ascending order.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF versions for which support is planned but not yet available.
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
446
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root: they exercise the re-exported types and
    //! constants through `use super::*` to catch broken or missing re-exports.

    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the width and height it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The version constant is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages. The metadata values themselves
        // are not read back here; this test only ensures the setters don't panic.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom page constructors produce the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), so compare
        // with a tolerance rather than exact equality.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color construction
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are reachable from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried by the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// Generation, parsing and layout types are all re-exported at the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// OCR-related types can be constructed via the crate-root re-exports.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting finds the expected words and text measurement yields a positive width.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement using Helvetica at 12 pt
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are exported and `Image::from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic; the 100 zero bytes are only
        // placeholder input and the returned value is deliberately ignored.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` has at least a numeric `major.minor` prefix.
    #[test]
    fn test_version_string_format() {
        // Test that version string follows semantic versioning
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric (indexing is safe: the length
        // was asserted above).
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}
oxidize_pdf/lib.rs

oxidize_pdf/
lib.rs