oxidize_pdf/
lib.rs

1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4//!
5//! ## Features
6//!
7//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
8//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
9//! - **PDF Operations**: Split, merge, rotate, and extract pages
10//! - **Text Extraction**: Extract text with position and formatting information
11//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
12//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
13//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
14//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
15//! - **Resource Access**: Work with fonts, images, and other PDF resources
16//! - **Pure Rust**: No C dependencies or external libraries
17//! - **100% Native**: Complete PDF implementation from scratch
18//!
19//! ## Quick Start
20//!
21//! ### Creating PDFs
22//!
23//! ```rust
24//! use oxidize_pdf::{Document, Page, Font, Color, Result};
25//!
26//! # fn main() -> Result<()> {
27//! // Create a new document
28//! let mut doc = Document::new();
29//! doc.set_title("My PDF");
30//!
31//! // Create a page
32//! let mut page = Page::a4();
33//!
34//! // Add text
35//! page.text()
36//!     .set_font(Font::Helvetica, 24.0)
37//!     .at(50.0, 700.0)
38//!     .write("Hello, PDF!")?;
39//!
40//! // Add graphics
41//! page.graphics()
42//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
43//!     .circle(300.0, 400.0, 50.0)
44//!     .fill();
45//!
46//! // Save the document
47//! doc.add_page(page);
48//! doc.save("output.pdf")?;
49//! # Ok(())
50//! # }
51//! ```
52//!
53//! ### Parsing PDFs
54//!
55//! ```rust,no_run
56//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
57//!
58//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
59//! // Open and parse a PDF
60//! let reader = PdfReader::open("document.pdf")?;
61//! let document = PdfDocument::new(reader);
62//!
63//! // Get document information
64//! println!("Pages: {}", document.page_count()?);
65//! println!("Version: {}", document.version()?);
66//!
67//! // Process pages
68//! for i in 0..document.page_count()? {
69//!     let page = document.get_page(i)?;
70//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
71//! }
72//!
73//! // Extract text
74//! let text_pages = document.extract_text()?;
75//! for (i, page_text) in text_pages.iter().enumerate() {
76//!     println!("Page {} text: {}", i+1, page_text.text);
77//! }
78//! # Ok(())
79//! # }
80//! ```
81//!
82//! ## Modules
83//!
84//! ### Generation Modules
85//! - [`document`] - PDF document creation and management
86//! - [`page`] - Page creation and layout
87//! - [`graphics`] - Vector graphics and images
88//! - [`text`] - Text rendering and flow
89//! - [`writer`] - Low-level PDF writing
90//!
91//! ### Parsing Modules
92//! - [`parser`] - Complete PDF parsing and reading
93//!   - [`parser::PdfDocument`] - High-level document interface
94//!   - [`parser::ParsedPage`] - Page representation with resources
95//!   - [`parser::ContentParser`] - Content stream parsing
96//!   - [`parser::PdfObject`] - Low-level PDF objects
97//!
98//! ### Manipulation Modules
99//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
100//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
101//! - [`text::extraction`] - Text extraction with positioning
102//!
103//! ### OCR Modules (v0.1.3+)
104//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
105//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
107//!
108//! ## Examples
109//!
110//! ### Content Stream Processing
111//!
112//! ```rust,no_run
113//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
114//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
115//!
116//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
117//! let reader = PdfReader::open("document.pdf")?;
118//! let document = PdfDocument::new(reader);
119//! let page = document.get_page(0)?;
120//!
121//! // Get and parse content streams
122//! let streams = page.content_streams_with_document(&document)?;
123//! for stream in streams {
124//!     let operations = ContentParser::parse(&stream)?;
125//!     
126//!     for op in operations {
127//!         match op {
128//!             ContentOperation::ShowText(text) => {
129//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
130//!             }
131//!             ContentOperation::SetFont(name, size) => {
132//!                 println!("Font: {} at {} pt", name, size);
133//!             }
134//!             ContentOperation::MoveTo(x, y) => {
135//!                 println!("Move to ({}, {})", x, y);
136//!             }
137//!             _ => {} // Handle other operations
138//!         }
139//!     }
140//! }
141//! # Ok(())
142//! # }
143//! ```
144//!
145//! ### Resource Access
146//!
147//! ```rust,no_run
148//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
149//!
150//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
151//! let reader = PdfReader::open("document.pdf")?;
152//! let document = PdfDocument::new(reader);
153//! let page = document.get_page(0)?;
154//!
155//! // Access page resources
156//! if let Some(resources) = page.get_resources() {
157//!     // Check fonts
158//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
159//!         for (name, _) in &fonts.0 {
160//!             println!("Font resource: {}", name.as_str());
161//!         }
162//!     }
163//!     
164//!     // Check images/XObjects
165//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
166//!         for (name, _) in &xobjects.0 {
167//!             println!("XObject resource: {}", name.as_str());
168//!         }
169//!     }
170//! }
171//! # Ok(())
172//! # }
173//! ```
174
175pub mod actions;
176pub mod annotations;
177pub mod batch;
178pub mod compression;
179pub mod document;
180pub mod encryption;
181pub mod error;
182pub mod fonts;
183pub mod forms;
184pub mod geometry;
185pub mod graphics;
186pub mod memory;
187pub mod objects;
188pub mod operations;
189pub mod page;
190pub mod page_forms;
191pub mod page_labels;
192pub mod page_lists;
193pub mod page_tables;
194pub mod parser;
195pub mod recovery;
196pub mod streaming;
197pub mod structure;
198pub mod text;
199pub mod writer;
200
201#[cfg(feature = "semantic")]
202pub mod semantic;
203
204// Re-export generation types
205pub use document::{Document, DocumentMetadata};
206pub use error::{OxidizePdfError, PdfError, Result};
207pub use geometry::{Point, Rectangle};
208pub use graphics::{Color, GraphicsContext, Image, ImageColorSpace, ImageFormat};
209pub use page::{Margins, Page};
210pub use page_lists::{ListStyle, ListType, PageLists};
211pub use page_tables::{PageTables, TableStyle};
212pub use text::{
213    measure_text,
214    split_into_words,
215    AdvancedTable,
216    AdvancedTableCell,
217    AdvancedTableOptions,
218    AlternatingRowColors,
219    BorderLine,
220    BorderStyle as TableBorderStyle,
221    BulletStyle,
222    CellContent,
223    CellPadding,
224    ColumnDefinition,
225    ColumnWidth,
226    Font,
227    FontFamily,
228    FragmentType,
229    HeaderStyle,
230    ImagePreprocessing,
231    LineStyle,
232    ListElement,
233    ListOptions,
234    MockOcrProvider,
235    OcrEngine,
236    OcrError,
237    OcrOptions,
238    OcrProcessingResult,
239    OcrProvider,
240    OcrResult,
241    OcrTextFragment,
242    // List exports
243    OrderedList,
244    OrderedListStyle,
245    // Table exports
246    Table,
247    TableCell,
248    TableOptions,
249    TableRow,
250    TextAlign,
251    TextContext,
252    TextFlowContext,
253    UnorderedList,
254    VerticalAlign,
255};
256
257// Re-export font embedding types
258pub use text::fonts::embedding::{
259    EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
260    FontEncoding, FontFlags, FontMetrics, FontType,
261};
262
263// Re-export parsing types
264pub use parser::{
265    ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
266    ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
267    PdfString,
268};
269
270// Re-export operations
271pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
272
273// Re-export memory optimization types
274pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
275
276// Re-export streaming types
277pub use streaming::{
278    process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
279    IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
280    StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
281};
282
283// Re-export batch processing types
284pub use batch::{
285    batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
286    BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
287    ProgressCallback, ProgressInfo,
288};
289
290// Re-export recovery types
291pub use recovery::{
292    analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
293    CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
294    RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
295};
296
297// Re-export structure types
298pub use structure::{
299    Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
300    OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
301};
302
303// Re-export action types
304pub use actions::{
305    Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
306    RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
307};
308
309// Re-export page label types
310pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
311
312/// Current version of oxidize-pdf, read at compile time from the `CARGO_PKG_VERSION` environment variable.
313pub const VERSION: &str = env!("CARGO_PKG_VERSION");
314
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions this crate declares as
/// supported, and `PLANNED_VERSIONS` lists versions whose support is planned.
///
/// The examples below are general crate usage examples that are kept with
/// this module's documentation.
///
/// ### Scanned Page Analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
396
/// Smoke tests for the crate root: they exercise the re-exported types and
/// the constants defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document has no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// A page reports the dimensions it was created with.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {} to be supported",
                version
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not inspected here (no getters
        // are used); this only checks that setting metadata does not add
        // pages and that the setters do not panic.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter, and custom constructors produce the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color (if available)
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are exported.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried by the crate `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test operation types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are exported and constructible.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` and `measure_text` return usable results.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with a built-in font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related types are exported and callable.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ImageColorSpace::DeviceRGB;

        // Only checks that the call does not panic; the returned value is
        // not inspected.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` has at least a numeric `major.minor` prefix.
    #[test]
    fn test_version_string_format() {
        // Test that the version string has at least major.minor components
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}