oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//! let operations = ContentParser::parse(&stream)?;
127//!
128//! for op in operations {
129//! match op {
130//! ContentOperation::ShowText(text) => {
131//! println!("Text: {:?}", String::from_utf8_lossy(&text));
132//! }
133//! ContentOperation::SetFont(name, size) => {
134//! println!("Font: {} at {} pt", name, size);
135//! }
136//! ContentOperation::MoveTo(x, y) => {
137//! println!("Move to ({}, {})", x, y);
138//! }
139//! _ => {} // Handle other operations
140//! }
141//! }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//! // Check fonts
160//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//! for (name, _) in &fonts.0 {
162//! println!("Font resource: {}", name.as_str());
163//! }
164//! }
165//!
166//! // Check images/XObjects
167//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//! for (name, _) in &xobjects.0 {
169//! println!("XObject resource: {}", name.as_str());
170//! }
171//! }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod annotations;
180
181pub mod batch;
182pub mod charts;
183pub mod compression;
184pub mod coordinate_system;
185pub mod document;
186pub mod encryption;
187pub mod error;
188pub mod fonts;
189pub mod forms;
190pub mod geometry;
191pub mod graphics;
192pub mod memory;
193pub mod objects;
194pub mod operations;
195pub mod page;
196pub mod page_forms;
197pub mod page_labels;
198pub mod page_lists;
199pub mod page_tables;
200pub mod page_transitions;
201pub mod page_tree;
202pub mod parser;
203#[cfg(feature = "performance")]
204pub mod performance;
205pub mod recovery;
206pub mod streaming;
207pub mod structure;
208pub mod templates;
209pub mod text;
210pub mod verification;
211pub mod viewer_preferences;
212pub mod writer;
213
214pub mod semantic;
215
216// Dashboard and reporting modules
217pub mod dashboard;
218
219// Re-export generation types
220pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
221pub use document::{Document, DocumentMetadata};
222pub use error::{OxidizePdfError, PdfError, Result};
223pub use geometry::{Point, Rectangle};
224pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
225pub use page::{Margins, Page};
226pub use page_lists::{ListStyle, ListType, PageLists};
227pub use page_tables::{PageTables, TableStyle};
228pub use text::{
229 measure_text,
230 split_into_words,
231 BulletStyle,
232 Font,
233 FontFamily,
234 FragmentType,
235 HeaderStyle,
236 ImagePreprocessing,
237 ListElement,
238 ListOptions,
239 MockOcrProvider,
240 OcrEngine,
241 OcrError,
242 OcrOptions,
243 OcrProcessingResult,
244 OcrProvider,
245 OcrResult,
246 OcrTextFragment,
247 // List exports
248 OrderedList,
249 OrderedListStyle,
250 // Table exports
251 Table,
252 TableCell,
253 TableOptions,
254 TextAlign,
255 TextContext,
256 TextFlowContext,
257 UnorderedList,
258};
259
260// Re-export forms types
261pub use forms::{
262 calculations::FieldValue,
263 field_actions::{
264 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
265 SpecialFormatType, ValidateActionType,
266 },
267 validation::{
268 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
269 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
270 },
271 BorderStyle, FieldType, TextField, Widget,
272};
273
274// Re-export font embedding types
275pub use text::fonts::embedding::{
276 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
277 FontEncoding, FontFlags, FontMetrics, FontType,
278};
279
280// Re-export font management types
281pub use text::font_manager::{CustomFont, FontManager};
282
283// Re-export parsing types
284pub use parser::{
285 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
286 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
287 PdfString,
288};
289
290// Re-export operations
291pub use operations::{
292 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
293 ExtractImagesOptions, ExtractedImage, ImageExtractor,
294};
295
296// Re-export dashboard types
297pub use dashboard::{
298 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
299 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
300};
301
302// Re-export memory optimization types
303pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
304
305// Re-export streaming types
306pub use streaming::{
307 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
308 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
309 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
310};
311
312// Re-export batch processing types
313pub use batch::{
314 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
315 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
316 ProgressCallback, ProgressInfo,
317};
318
319// Re-export recovery types
320pub use recovery::{
321 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
322 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
323 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
324};
325
326// Re-export structure types
327pub use structure::{
328 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
329 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
330};
331
332// Re-export action types
333pub use actions::{
334 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
335 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
336};
337
338// Re-export page label types
339pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
340
341// Re-export template types
342pub use templates::{
343 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
344};
345
346// Re-export verification types
347pub use verification::comparators::{
348 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
349};
350pub use verification::compliance_report::{
351 format_report_markdown, generate_compliance_report, ComplianceReport,
352};
353pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
354pub use verification::validators::{
355 check_available_validators, validate_external, validate_with_qpdf,
356};
357pub use verification::{
358 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
359 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
360};
361
/// Current version of oxidize-pdf
///
/// Resolved at compile time through `env!` from the `CARGO_PKG_VERSION`
/// environment variable, which Cargo sets to the package version declared in
/// `Cargo.toml`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
364
/// Supported PDF versions
///
/// Exposes the PDF specification versions this crate declares as supported
/// ([`pdf_version::SUPPORTED_VERSIONS`]) and those that are planned
/// ([`pdf_version::PLANNED_VERSIONS`]).
///
/// # Examples
///
/// The examples below exercise other parts of the crate (page analysis with
/// OCR, and font embedding). They are attached to this item, so they are kept
/// here as `no_run` doctests.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
446
#[cfg(test)]
mod tests {
    // Smoke tests for the crate root: they check that the re-exported types and
    // functions are reachable through `super::*` and that the version constants
    // hold the expected values.
    use super::*;

    // A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    // A page reports back the exact dimensions it was created with.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    // The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // This test does not read the metadata back; it only ensures the
        // setters do not panic and that setting metadata adds no pages.
        assert_eq!(doc.pages.len(), 0);
    }

    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color (if available)
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test operation types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement (with mock font)
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic; the returned value is
        // deliberately ignored because the buffer is not a real JPEG.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    #[test]
    fn test_version_string_format() {
        // Test that the version string has at least `major.minor` components
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );

        // Test that version is not empty
        assert!(!VERSION.is_empty());
    }
}