oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//! let operations = ContentParser::parse(&stream)?;
127//!
128//! for op in operations {
129//! match op {
130//! ContentOperation::ShowText(text) => {
131//! println!("Text: {:?}", String::from_utf8_lossy(&text));
132//! }
133//! ContentOperation::SetFont(name, size) => {
134//! println!("Font: {} at {} pt", name, size);
135//! }
136//! ContentOperation::MoveTo(x, y) => {
137//! println!("Move to ({}, {})", x, y);
138//! }
139//! _ => {} // Handle other operations
140//! }
141//! }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//! // Check fonts
160//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//! for (name, _) in &fonts.0 {
162//! println!("Font resource: {}", name.as_str());
163//! }
164//! }
165//!
166//! // Check images/XObjects
167//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//! for (name, _) in &xobjects.0 {
169//! println!("XObject resource: {}", name.as_str());
170//! }
171//! }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205pub mod pdf_objects;
206#[cfg(feature = "performance")]
207pub mod performance;
208pub mod recovery;
209pub mod streaming;
210pub mod structure;
211pub mod templates;
212pub mod text;
213pub mod verification;
214pub mod viewer_preferences;
215pub mod writer;
216
217pub mod semantic;
218
219// Dashboard and reporting modules
220pub mod dashboard;
221
222// Re-export generation types
223pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
224pub use document::{Document, DocumentMetadata};
225pub use error::{OxidizePdfError, PdfError, Result};
226pub use geometry::{Point, Rectangle};
227pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
228pub use page::{Margins, Page};
229pub use page_lists::{ListStyle, ListType, PageLists};
230pub use page_tables::{PageTables, TableStyle};
231pub use text::{
232 measure_text,
233 split_into_words,
234 BulletStyle,
235 Font,
236 FontFamily,
237 FragmentType,
238 HeaderStyle,
239 ImagePreprocessing,
240 ListElement,
241 ListOptions,
242 MockOcrProvider,
243 OcrEngine,
244 OcrError,
245 OcrOptions,
246 OcrProcessingResult,
247 OcrProvider,
248 OcrResult,
249 OcrTextFragment,
250 // List exports
251 OrderedList,
252 OrderedListStyle,
253 // Table exports
254 Table,
255 TableCell,
256 TableOptions,
257 TextAlign,
258 TextContext,
259 TextFlowContext,
260 UnorderedList,
261};
262
263// Re-export forms types
264pub use forms::{
265 calculations::FieldValue,
266 field_actions::{
267 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
268 SpecialFormatType, ValidateActionType,
269 },
270 validation::{
271 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
272 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
273 },
274 BorderStyle, FieldType, TextField, Widget,
275};
276
277// Re-export font embedding types
278pub use text::fonts::embedding::{
279 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
280 FontEncoding, FontFlags, FontMetrics, FontType,
281};
282
283// Re-export font management types
284pub use text::font_manager::{CustomFont, FontManager};
285
286// Re-export parsing types
287pub use parser::{
288 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
289 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
290 PdfString,
291};
292
293// Re-export operations
294pub use operations::{
295 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
296 ExtractImagesOptions, ExtractedImage, ImageExtractor,
297};
298
299// Re-export dashboard types
300pub use dashboard::{
301 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
302 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
303};
304
305// Re-export memory optimization types
306pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
307
308// Re-export streaming types
309pub use streaming::{
310 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
311 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
312 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
313};
314
315// Re-export batch processing types
316pub use batch::{
317 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
318 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
319 ProgressCallback, ProgressInfo,
320};
321
322// Re-export recovery types
323pub use recovery::{
324 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
325 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
326 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
327};
328
329// Re-export structure types
330pub use structure::{
331 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
332 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
333};
334
335// Re-export action types
336pub use actions::{
337 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
338 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
339};
340
341// Re-export page label types
342pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
343
344// Re-export template types
345pub use templates::{
346 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
347};
348
349// Re-export semantic types for AI-Ready PDFs
350pub use semantic::{
351 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
352 RelationType, SemanticEntity, SemanticMarking,
353};
354
355// Re-export verification types
356pub use verification::comparators::{
357 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
358};
359pub use verification::compliance_report::{
360 format_report_markdown, generate_compliance_report, ComplianceReport,
361};
362pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
363pub use verification::validators::{
364 check_available_validators, validate_external, validate_with_qpdf,
365};
366pub use verification::{
367 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
368 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
369};
370
/// Current version of oxidize-pdf.
///
/// The value is captured at compile time by `env!` from the `CARGO_PKG_VERSION`
/// environment variable, which Cargo sets to the package version when building.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
373
// NOTE(review): the two examples below are not specific to `pdf_version`; they
// presumably belong with the crate-level examples — consider relocating them.
/// Supported PDF versions.
///
/// `SUPPORTED_VERSIONS` lists the PDF versions documented as fully supported and
/// `PLANNED_VERSIONS` lists the versions for which support is planned.
///
/// # Crate-level examples
///
/// The examples in this section cover crate-wide workflows rather than this
/// module; they are marked `no_run`, so they are compiled as doctests but not
/// executed.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                // Create font subset
///     compress_font_streams: true, // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF versions 1.0 through 1.7, which are fully supported.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF versions for which support is planned (currently only 2.0).
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
455
// Smoke tests for the crate root: they exercise the re-exported types and the
// constants declared in this file, mainly to confirm that the public surface
// is reachable and that basic constructors do not panic.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The version constant is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Every expected PDF version must be listed as supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // ...and nothing beyond those 8 entries
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The metadata values themselves are not inspected here (no getters are
        // used); this only confirms the setters run and no page was added.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter, and custom constructors produce the expected page sizes.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colour constructors can be called without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Construction must not panic
        let _colors = [red, green, blue, black, white];

        // CMYK constructor
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard font and font-family variants are exported and constructible.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Construction must not panic
        let _fonts = [helvetica, times, courier];

        // Font families
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Error values can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Result alias works for both variants
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are exported and constructible.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Fragment and preprocessing types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting and text measurement return sensible results for simple input.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Measuring non-empty text with a standard font yields a positive width
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are exported and `from_jpeg_data` does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The bytes are not a real JPEG; only the absence of a panic is checked,
        // the returned value is deliberately ignored.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with a numeric `major.minor` prefix.
    #[test]
    fn test_version_string_format() {
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Only the first two components are required to be numeric
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
665}