oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
13//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
14//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
15//! - **Resource Access**: Work with fonts, images, and other PDF resources
16//! - **Pure Rust**: No C dependencies or external libraries
17//! - **100% Native**: Complete PDF implementation from scratch
18//!
19//! ## Quick Start
20//!
21//! ### Creating PDFs
22//!
23//! ```rust
24//! use oxidize_pdf::{Document, Page, Font, Color, Result};
25//!
26//! # fn main() -> Result<()> {
27//! // Create a new document
28//! let mut doc = Document::new();
29//! doc.set_title("My PDF");
30//!
31//! // Create a page
32//! let mut page = Page::a4();
33//!
34//! // Add text
35//! page.text()
36//!     .set_font(Font::Helvetica, 24.0)
37//!     .at(50.0, 700.0)
38//!     .write("Hello, PDF!")?;
39//!
40//! // Add graphics
41//! page.graphics()
42//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
43//!     .circle(300.0, 400.0, 50.0)
44//!     .fill();
45//!
46//! // Save the document
47//! doc.add_page(page);
48//! doc.save("output.pdf")?;
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ### Parsing PDFs
54//!
55//! ```rust,no_run
56//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
57//!
58//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
59//! // Open and parse a PDF
60//! let reader = PdfReader::open("document.pdf")?;
61//! let document = PdfDocument::new(reader);
62//!
63//! // Get document information
64//! println!("Pages: {}", document.page_count()?);
65//! println!("Version: {}", document.version()?);
66//!
67//! // Process pages
68//! for i in 0..document.page_count()? {
69//!     let page = document.get_page(i)?;
70//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
71//! }
72//!
73//! // Extract text
74//! let text_pages = document.extract_text()?;
75//! for (i, page_text) in text_pages.iter().enumerate() {
76//!     println!("Page {} text: {}", i+1, page_text.text);
77//! }
78//! # Ok(())
79//! # }
80//! ```
81//!
82//! ## Modules
83//!
84//! ### Generation Modules
85//! - [`document`] - PDF document creation and management
86//! - [`page`] - Page creation and layout
87//! - [`graphics`] - Vector graphics and images
88//! - [`text`] - Text rendering and flow
89//! - [`writer`] - Low-level PDF writing
90//!
91//! ### Parsing Modules
92//! - [`parser`] - Complete PDF parsing and reading
93//!   - [`parser::PdfDocument`] - High-level document interface
94//!   - [`parser::ParsedPage`] - Page representation with resources
95//!   - [`parser::ContentParser`] - Content stream parsing
96//!   - [`parser::PdfObject`] - Low-level PDF objects
97//!
98//! ### Manipulation Modules
99//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
100//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
101//! - [`text::extraction`] - Text extraction with positioning
102//!
103//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
107//!
108//! ## Examples
109//!
110//! ### Content Stream Processing
111//!
112//! ```rust,no_run
113//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
114//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
115//!
116//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
117//! let reader = PdfReader::open("document.pdf")?;
118//! let document = PdfDocument::new(reader);
119//! let page = document.get_page(0)?;
120//!
121//! // Get and parse content streams
122//! let streams = page.content_streams_with_document(&document)?;
123//! for stream in streams {
124//!     let operations = ContentParser::parse(&stream)?;
125//!     
126//!     for op in operations {
127//!         match op {
128//!             ContentOperation::ShowText(text) => {
129//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
130//!             }
131//!             ContentOperation::SetFont(name, size) => {
132//!                 println!("Font: {} at {} pt", name, size);
133//!             }
134//!             ContentOperation::MoveTo(x, y) => {
135//!                 println!("Move to ({}, {})", x, y);
136//!             }
137//!             _ => {} // Handle other operations
138//!         }
139//!     }
140//! }
141//! # Ok(())
142//! # }
143//! ```
144//!
145//! ### Resource Access
146//!
147//! ```rust,no_run
148//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
149//!
150//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
151//! let reader = PdfReader::open("document.pdf")?;
152//! let document = PdfDocument::new(reader);
153//! let page = document.get_page(0)?;
154//!
155//! // Access page resources
156//! if let Some(resources) = page.get_resources() {
157//!     // Check fonts
158//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
159//!         for (name, _) in &fonts.0 {
160//!             println!("Font resource: {}", name.as_str());
161//!         }
162//!     }
163//!     
164//!     // Check images/XObjects
165//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
166//!         for (name, _) in &xobjects.0 {
167//!             println!("XObject resource: {}", name.as_str());
168//!         }
169//!     }
170//! }
171//! # Ok(())
172//! # }
173//! ```
174
175pub mod actions;
176pub mod annotations;
177pub mod batch;
178pub mod compression;
179pub mod document;
180pub mod encryption;
181pub mod error;
182pub mod fonts;
183pub mod forms;
184pub mod geometry;
185pub mod graphics;
186pub mod memory;
187pub mod objects;
188pub mod operations;
189pub mod page;
190pub mod page_forms;
191pub mod page_labels;
192pub mod page_lists;
193pub mod page_tables;
194pub mod page_transitions;
195pub mod page_tree;
196pub mod parser;
197pub mod recovery;
198pub mod streaming;
199pub mod structure;
200pub mod text;
201pub mod verification;
202pub mod viewer_preferences;
203pub mod writer;
204
205#[cfg(feature = "semantic")]
206pub mod semantic;
207
208// Re-export generation types
209pub use document::{Document, DocumentMetadata};
210pub use error::{OxidizePdfError, PdfError, Result};
211pub use geometry::{Point, Rectangle};
212pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
213pub use page::{Margins, Page};
214pub use page_lists::{ListStyle, ListType, PageLists};
215pub use page_tables::{PageTables, TableStyle};
216pub use text::{
217    measure_text,
218    split_into_words,
219    BulletStyle,
220    Font,
221    FontFamily,
222    FragmentType,
223    HeaderStyle,
224    ImagePreprocessing,
225    ListElement,
226    ListOptions,
227    MockOcrProvider,
228    OcrEngine,
229    OcrError,
230    OcrOptions,
231    OcrProcessingResult,
232    OcrProvider,
233    OcrResult,
234    OcrTextFragment,
235    // List exports
236    OrderedList,
237    OrderedListStyle,
238    // Table exports
239    Table,
240    TableCell,
241    TableOptions,
242    TextAlign,
243    TextContext,
244    TextFlowContext,
245    UnorderedList,
246};
247
248// Re-export forms types
249pub use forms::{
250    calculations::FieldValue,
251    field_actions::{
252        ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
253        SpecialFormatType, ValidateActionType,
254    },
255    validation::{
256        DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
257        RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
258    },
259    BorderStyle, FieldType, TextField, Widget,
260};
261
262// Re-export font embedding types
263pub use text::fonts::embedding::{
264    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
265    FontEncoding, FontFlags, FontMetrics, FontType,
266};
267
268// Re-export font management types
269pub use text::font_manager::{CustomFont, FontManager};
270
271// Re-export parsing types
272pub use parser::{
273    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
274    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
275    PdfString,
276};
277
278// Re-export operations
279pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
280
281// Re-export memory optimization types
282pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
283
284// Re-export streaming types
285pub use streaming::{
286    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
287    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
288    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
289};
290
291// Re-export batch processing types
292pub use batch::{
293    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
294    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
295    ProgressCallback, ProgressInfo,
296};
297
298// Re-export recovery types
299pub use recovery::{
300    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
301    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
302    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
303};
304
305// Re-export structure types
306pub use structure::{
307    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
308    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
309};
310
311// Re-export action types
312pub use actions::{
313    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
314    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
315};
316
317// Re-export page label types
318pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
319
320// Re-export verification types
321pub use verification::comparators::{
322    compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
323};
324pub use verification::compliance_report::{
325    format_report_markdown, generate_compliance_report, ComplianceReport,
326};
327pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
328pub use verification::validators::{
329    check_available_validators, validate_external, validate_with_qpdf,
330};
331pub use verification::{
332    extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
333    ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
334};
335
/// Current version of oxidize-pdf
///
/// The value is captured at compile time from the `CARGO_PKG_VERSION`
/// environment variable via `env!`, so it always matches the version the
/// crate was built with.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
338
// NOTE(review): the OCR and font-embedding examples in the doc comment below do
// not concern PDF versions; they look like crate-level examples that ended up
// attached to this item. They are kept intact so they keep compiling as
// doctests - consider moving them into the crate-level `//!` documentation.
/// Supported PDF versions
///
/// The version strings are exposed as two constant lists:
/// `SUPPORTED_VERSIONS` (handled today) and `PLANNED_VERSIONS` (not yet
/// supported).
///
/// # Examples
///
/// ### Scanned page analysis and OCR example
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    ///
    /// One entry per version, written as `"major.minor"`, in ascending order.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

    /// PDF 2.0 support is planned
    ///
    /// Versions listed here are not part of `SUPPORTED_VERSIONS`.
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
420
421#[cfg(test)]
422mod tests {
423    use super::*;
424
425    #[test]
426    fn test_create_empty_document() {
427        let doc = Document::new();
428        assert_eq!(doc.pages.len(), 0);
429    }
430
431    #[test]
432    fn test_create_page() {
433        let page = Page::new(595.0, 842.0);
434        assert_eq!(page.width(), 595.0);
435        assert_eq!(page.height(), 842.0);
436    }
437
438    #[test]
439    fn test_version_info() {
440        assert!(!VERSION.is_empty());
441        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
442    }
443
444    #[test]
445    fn test_pdf_version_constants() {
446        // Test that all expected PDF versions are supported
447        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
448
449        for version in expected_versions {
450            assert!(
451                pdf_version::SUPPORTED_VERSIONS.contains(&version),
452                "Expected PDF version {version} to be supported"
453            );
454        }
455
456        // Test that we have exactly 8 supported versions
457        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);
458
459        // Test planned versions
460        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
461        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
462    }
463
464    #[test]
465    fn test_document_with_metadata() {
466        let mut doc = Document::new();
467        doc.set_title("Test Document");
468        doc.set_author("Test Author");
469        doc.set_subject("Test Subject");
470
471        // Verify metadata is set (checking internal state)
472        assert_eq!(doc.pages.len(), 0);
473        // Note: We can't directly test metadata without exposing getters
474        // This test ensures the methods don't panic
475    }
476
477    #[test]
478    fn test_page_creation_variants() {
479        // Test different page creation methods
480        let page_a4 = Page::a4();
481        let page_letter = Page::letter();
482        let page_custom = Page::new(400.0, 600.0);
483
484        // A4 dimensions: 595.276 x 841.89 points (approximation)
485        assert!((page_a4.width() - 595.0).abs() < 10.0);
486        assert!((page_a4.height() - 842.0).abs() < 10.0);
487
488        // Letter dimensions: 612 x 792 points
489        assert_eq!(page_letter.width(), 612.0);
490        assert_eq!(page_letter.height(), 792.0);
491
492        // Custom dimensions
493        assert_eq!(page_custom.width(), 400.0);
494        assert_eq!(page_custom.height(), 600.0);
495    }
496
497    #[test]
498    fn test_color_creation() {
499        let red = Color::rgb(1.0, 0.0, 0.0);
500        let green = Color::rgb(0.0, 1.0, 0.0);
501        let blue = Color::rgb(0.0, 0.0, 1.0);
502        let black = Color::rgb(0.0, 0.0, 0.0);
503        let white = Color::rgb(1.0, 1.0, 1.0);
504
505        // Test color creation doesn't panic
506        let _colors = [red, green, blue, black, white];
507
508        // Test CMYK color (if available)
509        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
510        let _cmyk_test = cyan;
511    }
512
513    #[test]
514    fn test_font_types() {
515        let helvetica = Font::Helvetica;
516        let times = Font::TimesRoman;
517        let courier = Font::Courier;
518
519        // Test font creation doesn't panic
520        let _fonts = [helvetica, times, courier];
521
522        // Test font family
523        let helvetica_family = FontFamily::Helvetica;
524        let times_family = FontFamily::Times;
525        let courier_family = FontFamily::Courier;
526
527        let _families = [helvetica_family, times_family, courier_family];
528    }
529
530    #[test]
531    fn test_error_types() {
532        // Test that error types can be created
533        let pdf_error = PdfError::InvalidStructure("test error".to_string());
534        let _error_test = pdf_error;
535
536        // Test result type
537        let ok_result: Result<i32> = Ok(42);
538        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));
539
540        assert!(ok_result.is_ok());
541        assert!(err_result.is_err());
542    }
543
544    #[test]
545    fn test_module_exports() {
546        // Test that all major types are properly exported
547        let _doc = Document::new();
548        let _page = Page::new(100.0, 100.0);
549        let _color = Color::rgb(0.5, 0.5, 0.5);
550        let _font = Font::Helvetica;
551
552        // Test parsing types
553        let _array = PdfArray::new();
554        let _dict = PdfDictionary::new();
555        let _name = PdfName::new("Test".to_string());
556        let _string = PdfString::new(b"Test".to_vec());
557
558        // Test operation types
559        let _margins = Margins {
560            top: 10.0,
561            right: 10.0,
562            bottom: 10.0,
563            left: 10.0,
564        };
565        let _align = TextAlign::Left;
566    }
567
568    #[test]
569    fn test_ocr_types() {
570        // Test OCR-related types
571        let _mock_ocr = MockOcrProvider::new();
572        let _ocr_options = OcrOptions::default();
573        let _ocr_engine = OcrEngine::Tesseract;
574
575        // Test fragment types
576        let _fragment_type = FragmentType::Word;
577        let _image_preprocessing = ImagePreprocessing::default();
578    }
579
580    #[test]
581    fn test_text_utilities() {
582        // Test text utility functions
583        let text = "Hello world test";
584        let words = split_into_words(text);
585        assert!(!words.is_empty());
586        assert!(words.contains(&"Hello"));
587        assert!(words.contains(&"world"));
588
589        // Test text measurement (with mock font)
590        let font = Font::Helvetica;
591        let size = 12.0;
592        let width = measure_text(text, font, size);
593        assert!(width > 0.0);
594    }
595
596    #[test]
597    fn test_image_types() {
598        // Test image-related types
599        let _format = ImageFormat::Jpeg;
600        let _color_space = ColorSpace::DeviceRGB;
601
602        // Test that image creation doesn't panic
603        let image_data = vec![0u8; 100];
604        let _image = Image::from_jpeg_data(image_data);
605    }
606
607    #[test]
608    fn test_version_string_format() {
609        // Test that version string follows semantic versioning
610        let version_parts: Vec<&str> = VERSION.split('.').collect();
611        assert!(
612            version_parts.len() >= 2,
613            "Version should have at least major.minor format"
614        );
615
616        // Test that major and minor are numeric
617        assert!(
618            version_parts[0].parse::<u32>().is_ok(),
619            "Major version should be numeric"
620        );
621        assert!(
622            version_parts[1].parse::<u32>().is_ok(),
623            "Minor version should be numeric"
624        );
625
626        // Test that version is not empty
627        assert!(!VERSION.is_empty());
628        assert!(!VERSION.is_empty());
629    }
630}