// oxidize_pdf/lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
//! - **Pure Rust**: No C dependencies or external libraries in the core library (Tesseract OCR is opt-in via the `ocr-tesseract` feature)
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//!     .set_font(Font::Helvetica, 24.0)
39//!     .at(50.0, 700.0)
40//!     .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//!     .circle(300.0, 400.0, 50.0)
46//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//!     let page = document.get_page(i)?;
72//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//!     println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//!   - [`parser::PdfDocument`] - High-level document interface
96//!   - [`parser::ParsedPage`] - Page representation with resources
97//!   - [`parser::ContentParser`] - Content stream parsing
98//!   - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//!     let operations = ContentParser::parse(&stream)?;
127//!     
128//!     for op in operations {
129//!         match op {
130//!             ContentOperation::ShowText(text) => {
131//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
132//!             }
133//!             ContentOperation::SetFont(name, size) => {
134//!                 println!("Font: {} at {} pt", name, size);
135//!             }
136//!             ContentOperation::MoveTo(x, y) => {
137//!                 println!("Move to ({}, {})", x, y);
138//!             }
139//!             _ => {} // Handle other operations
140//!         }
141//!     }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//!     // Check fonts
160//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//!         for (name, _) in &fonts.0 {
162//!             println!("Font resource: {}", name.as_str());
163//!         }
164//!     }
165//!     
166//!     // Check images/XObjects
167//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//!         for (name, _) in &xobjects.0 {
169//!             println!("XObject resource: {}", name.as_str());
170//!         }
171//!     }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod annotations;
180
181pub mod batch;
182pub mod charts;
183pub mod compression;
184pub mod coordinate_system;
185pub mod document;
186pub mod encryption;
187pub mod error;
188pub mod fonts;
189pub mod forms;
190pub mod geometry;
191pub mod graphics;
192pub mod memory;
193pub mod objects;
194pub mod operations;
195pub mod page;
196pub mod page_forms;
197pub mod page_labels;
198pub mod page_lists;
199pub mod page_tables;
200pub mod page_transitions;
201pub mod page_tree;
202pub mod parser;
203#[cfg(feature = "performance")]
204pub mod performance;
205pub mod recovery;
206pub mod streaming;
207pub mod structure;
208pub mod templates;
209pub mod text;
210pub mod verification;
211pub mod viewer_preferences;
212pub mod writer;
213
214pub mod semantic;
215
216// Dashboard and reporting modules
217pub mod dashboard;
218
219// Re-export generation types
220pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
221pub use document::{Document, DocumentMetadata};
222pub use error::{OxidizePdfError, PdfError, Result};
223pub use geometry::{Point, Rectangle};
224pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
225pub use page::{Margins, Page};
226pub use page_lists::{ListStyle, ListType, PageLists};
227pub use page_tables::{PageTables, TableStyle};
228pub use text::{
229    measure_text,
230    split_into_words,
231    BulletStyle,
232    Font,
233    FontFamily,
234    FragmentType,
235    HeaderStyle,
236    ImagePreprocessing,
237    ListElement,
238    ListOptions,
239    MockOcrProvider,
240    OcrEngine,
241    OcrError,
242    OcrOptions,
243    OcrProcessingResult,
244    OcrProvider,
245    OcrResult,
246    OcrTextFragment,
247    // List exports
248    OrderedList,
249    OrderedListStyle,
250    // Table exports
251    Table,
252    TableCell,
253    TableOptions,
254    TextAlign,
255    TextContext,
256    TextFlowContext,
257    UnorderedList,
258};
259
260// Re-export forms types
261pub use forms::{
262    calculations::FieldValue,
263    field_actions::{
264        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
265        SpecialFormatType, ValidateActionType,
266    },
267    validation::{
268        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
269        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
270    },
271    BorderStyle, FieldType, TextField, Widget,
272};
273
274// Re-export font embedding types
275pub use text::fonts::embedding::{
276    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
277    FontEncoding, FontFlags, FontMetrics, FontType,
278};
279
280// Re-export font management types
281pub use text::font_manager::{CustomFont, FontManager};
282
283// Re-export parsing types
284pub use parser::{
285    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
286    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
287    PdfString,
288};
289
290// Re-export operations
291pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
292
293// Re-export dashboard types
294pub use dashboard::{
295    Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
296    DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
297};
298
299// Re-export memory optimization types
300pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
301
302// Re-export streaming types
303pub use streaming::{
304    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
305    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
306    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
307};
308
309// Re-export batch processing types
310pub use batch::{
311    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
312    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
313    ProgressCallback, ProgressInfo,
314};
315
316// Re-export recovery types
317pub use recovery::{
318    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
319    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
320    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
321};
322
323// Re-export structure types
324pub use structure::{
325    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
326    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
327};
328
329// Re-export action types
330pub use actions::{
331    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
332    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
333};
334
335// Re-export page label types
336pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
337
338// Re-export template types
339pub use templates::{
340    Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
341};
342
343// Re-export verification types
344pub use verification::comparators::{
345    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
346};
347pub use verification::compliance_report::{
348    format_report_markdown, generate_compliance_report, ComplianceReport,
349};
350pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
351pub use verification::validators::{
352    check_available_validators, validate_external, validate_with_qpdf,
353};
354pub use verification::{
355    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
356    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
357};
358
359/// Current version of oxidize-pdf
360pub const VERSION: &str = env!("CARGO_PKG_VERSION");
361
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions that are fully supported and
/// `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// ```rust
/// use oxidize_pdf::pdf_version::{PLANNED_VERSIONS, SUPPORTED_VERSIONS};
///
/// assert!(SUPPORTED_VERSIONS.contains(&"1.7"));
/// assert!(PLANNED_VERSIONS.contains(&"2.0"));
/// ```
///
/// # Examples
///
/// The examples below illustrate other crate features (OCR and font
/// embedding); they are rendered as part of this module's documentation.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
443
#[cfg(test)]
mod tests {
    // Smoke tests for the crate root: they check that the re-exported types
    // and functions are reachable through `super::*` and that the version
    // constants hold the expected values.
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// A page built from explicit dimensions reports them back unchanged.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Every version from 1.0 through 1.7 must be present.
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // No extra entries beyond the eight expected ones.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Exactly one planned version: 2.0.
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The stored metadata values are not inspected here (this test uses no
        // getters); it only ensures the setters do not panic and that the
        // document still has no pages afterwards.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter, and custom page constructors yield the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), so compare
        // with a tolerance instead of exact equality.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Construction only; the values are not inspected.
        let _colors = [red, green, blue, black, white];

        // CMYK construction
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard font and font-family variants are exported and constructible.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Construction only; the values are not inspected.
        let _fonts = [helvetica, times, courier];

        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be created and used through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are exported and constructible.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Fragment and preprocessing types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting finds the expected words and text measurement is positive.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Measure the same text with a standard font.
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related types are exported and the JPEG constructor can be called.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is not real JPEG data; the returned value is discarded, so
        // this only checks that the call itself does not panic.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The version string is non-empty and starts with numeric `major.minor` parts.
    #[test]
    fn test_version_string_format() {
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Only the first two components are required to be numeric; anything
        // after them is not checked.
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}