oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//! let operations = ContentParser::parse(&stream)?;
127//!
128//! for op in operations {
129//! match op {
130//! ContentOperation::ShowText(text) => {
131//! println!("Text: {:?}", String::from_utf8_lossy(&text));
132//! }
133//! ContentOperation::SetFont(name, size) => {
134//! println!("Font: {} at {} pt", name, size);
135//! }
136//! ContentOperation::MoveTo(x, y) => {
137//! println!("Move to ({}, {})", x, y);
138//! }
139//! _ => {} // Handle other operations
140//! }
141//! }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//! // Check fonts
160//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//! for (name, _) in &fonts.0 {
162//! println!("Font resource: {}", name.as_str());
163//! }
164//! }
165//!
166//! // Check images/XObjects
167//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//! for (name, _) in &xobjects.0 {
169//! println!("XObject resource: {}", name.as_str());
170//! }
171//! }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod objects;
195pub mod operations;
196pub mod page;
197pub mod page_forms;
198pub mod page_labels;
199pub mod page_lists;
200pub mod page_tables;
201pub mod page_transitions;
202pub mod page_tree;
203pub mod parser;
204#[cfg(feature = "performance")]
205pub mod performance;
206pub mod recovery;
207pub mod streaming;
208pub mod structure;
209pub mod templates;
210pub mod text;
211pub mod verification;
212pub mod viewer_preferences;
213pub mod writer;
214
215pub mod semantic;
216
217// Dashboard and reporting modules
218pub mod dashboard;
219
220// Re-export generation types
221pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
222pub use document::{Document, DocumentMetadata};
223pub use error::{OxidizePdfError, PdfError, Result};
224pub use geometry::{Point, Rectangle};
225pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
226pub use page::{Margins, Page};
227pub use page_lists::{ListStyle, ListType, PageLists};
228pub use page_tables::{PageTables, TableStyle};
229pub use text::{
230 measure_text,
231 split_into_words,
232 BulletStyle,
233 Font,
234 FontFamily,
235 FragmentType,
236 HeaderStyle,
237 ImagePreprocessing,
238 ListElement,
239 ListOptions,
240 MockOcrProvider,
241 OcrEngine,
242 OcrError,
243 OcrOptions,
244 OcrProcessingResult,
245 OcrProvider,
246 OcrResult,
247 OcrTextFragment,
248 // List exports
249 OrderedList,
250 OrderedListStyle,
251 // Table exports
252 Table,
253 TableCell,
254 TableOptions,
255 TextAlign,
256 TextContext,
257 TextFlowContext,
258 UnorderedList,
259};
260
261// Re-export forms types
262pub use forms::{
263 calculations::FieldValue,
264 field_actions::{
265 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
266 SpecialFormatType, ValidateActionType,
267 },
268 validation::{
269 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
270 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
271 },
272 BorderStyle, FieldType, TextField, Widget,
273};
274
275// Re-export font embedding types
276pub use text::fonts::embedding::{
277 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
278 FontEncoding, FontFlags, FontMetrics, FontType,
279};
280
281// Re-export font management types
282pub use text::font_manager::{CustomFont, FontManager};
283
284// Re-export parsing types
285pub use parser::{
286 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
287 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
288 PdfString,
289};
290
291// Re-export operations
292pub use operations::{
293 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
294 ExtractImagesOptions, ExtractedImage, ImageExtractor,
295};
296
297// Re-export dashboard types
298pub use dashboard::{
299 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
300 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
301};
302
303// Re-export memory optimization types
304pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
305
306// Re-export streaming types
307pub use streaming::{
308 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
309 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
310 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
311};
312
313// Re-export batch processing types
314pub use batch::{
315 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
316 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
317 ProgressCallback, ProgressInfo,
318};
319
320// Re-export recovery types
321pub use recovery::{
322 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
323 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
324 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
325};
326
327// Re-export structure types
328pub use structure::{
329 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
330 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
331};
332
333// Re-export action types
334pub use actions::{
335 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
336 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
337};
338
339// Re-export page label types
340pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
341
342// Re-export template types
343pub use templates::{
344 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
345};
346
347// Re-export semantic types for AI-Ready PDFs
348pub use semantic::{
349 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
350 RelationType, SemanticEntity, SemanticMarking,
351};
352
353// Re-export verification types
354pub use verification::comparators::{
355 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
356};
357pub use verification::compliance_report::{
358 format_report_markdown, generate_compliance_report, ComplianceReport,
359};
360pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
361pub use verification::validators::{
362 check_available_validators, validate_external, validate_with_qpdf,
363};
364pub use verification::{
365 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
366 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
367};
368
/// Current version of oxidize-pdf.
///
/// Read at compile time from the `CARGO_PKG_VERSION` environment variable,
/// which Cargo sets from the version declared in the package manifest.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
371
// NOTE(review): the two examples in the doc comment below describe crate-wide
// features (OCR on scanned pages, font embedding), not this module. They look
// like a continuation of the crate-level "Examples" section; consider moving
// them into the `//!` crate docs. They are kept here so that they are still
// compiled as doctests.

/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions that are fully supported and
/// `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Examples
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF versions that are fully supported: 1.0 through 1.7.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF versions for which support is planned: 2.0.
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
453
/// Smoke tests for the crate root: they exercise the re-exported types and
/// the constants defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` stores the requested width and height.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The version constant is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Test that we have exactly 8 supported versions
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on an empty document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // Setting metadata must not add pages.
        assert_eq!(doc.pages.len(), 0);
        // Note: the metadata values themselves are not asserted here because
        // this test does not use any getters; it only ensures the setters
        // don't panic.
    }

    /// The page constructors produce the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are exported.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be created and carried by the crate `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are reachable from the crate root.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting and text measurement return sensible results.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with one of the built-in fonts
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related types are exported and creating an image from
    /// arbitrary bytes does not panic.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // Test that image creation doesn't panic; the result is deliberately
        // ignored because the bytes below are not a real JPEG.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The version string is non-empty and starts with numeric
    /// `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Test that version is not empty
        assert!(!VERSION.is_empty());

        // Test that the version has at least two dot-separated components
        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Test that major and minor are numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
663}