oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//! let operations = ContentParser::parse(&stream)?;
127//!
128//! for op in operations {
129//! match op {
130//! ContentOperation::ShowText(text) => {
131//! println!("Text: {:?}", String::from_utf8_lossy(&text));
132//! }
133//! ContentOperation::SetFont(name, size) => {
134//! println!("Font: {} at {} pt", name, size);
135//! }
136//! ContentOperation::MoveTo(x, y) => {
137//! println!("Move to ({}, {})", x, y);
138//! }
139//! _ => {} // Handle other operations
140//! }
141//! }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//! // Check fonts
160//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//! for (name, _) in &fonts.0 {
162//! println!("Font resource: {}", name.as_str());
163//! }
164//! }
165//!
166//! // Check images/XObjects
167//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//! for (name, _) in &xobjects.0 {
169//! println!("XObject resource: {}", name.as_str());
170//! }
171//! }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod layout;
194pub mod memory;
195pub mod metadata;
196pub mod objects;
197pub mod operations;
198pub mod page;
199pub mod page_forms;
200pub mod page_labels;
201pub mod page_lists;
202pub mod page_tables;
203pub mod page_transitions;
204pub mod page_tree;
205pub mod parser;
206pub mod pdf_objects;
207pub mod pdfa;
208#[cfg(feature = "performance")]
209pub mod performance;
210pub mod pipeline;
211pub mod recovery;
212pub mod streaming;
213pub mod structure;
214pub mod templates;
215pub mod text;
216pub mod verification;
217pub mod viewer_preferences;
218pub mod writer;
219
220pub mod semantic;
221pub mod signatures;
222
223// Dashboard and reporting modules
224pub mod dashboard;
225
226// Re-export generation types
227pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
228pub use document::{Document, DocumentMetadata};
229pub use error::{OxidizePdfError, PdfError, Result};
230pub use geometry::{Point, Rectangle};
231pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
232pub use layout::{
233 centered_image_x, fit_image_dimensions, DocumentBuilder, FlowElement, FlowLayout, PageConfig,
234 RichText, TextSpan,
235};
236pub use page::{Margins, Page};
237pub use page_lists::{ListStyle, ListType, PageLists};
238pub use page_tables::{PageTables, TableStyle};
239pub use text::{
240 measure_text,
241 measure_text_block,
242 split_into_words,
243 BulletStyle,
244 Font,
245 FontFamily,
246 FragmentType,
247 HeaderStyle,
248 ImagePreprocessing,
249 ListElement,
250 ListOptions,
251 MockOcrProvider,
252 OcrEngine,
253 OcrError,
254 OcrOptions,
255 OcrProcessingResult,
256 OcrProvider,
257 OcrResult,
258 OcrTextFragment,
259 // List exports
260 OrderedList,
261 OrderedListStyle,
262 // Table exports
263 Table,
264 TableCell,
265 TableOptions,
266 TextAlign,
267 TextBlockMetrics,
268 TextContext,
269 TextFlowContext,
270 UnorderedList,
271};
272
273// Re-export forms types
274pub use forms::{
275 calculations::FieldValue,
276 field_actions::{
277 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
278 SpecialFormatType, ValidateActionType,
279 },
280 validation::{
281 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
282 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
283 },
284 BorderStyle, FieldType, TextField, Widget,
285};
286
287// Re-export font embedding types
288pub use text::fonts::embedding::{
289 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
290 FontEncoding, FontFlags, FontMetrics, FontType,
291};
292
293// Re-export font management types
294pub use text::font_manager::{CustomFont, FontManager};
295
296// Re-export parsing types
297pub use parser::{
298 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
299 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
300 PdfString,
301};
302
303// Re-export operations
304pub use operations::{
305 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, move_pdf_page, overlay_pdf,
306 reorder_pdf_pages, reverse_pdf_pages, rotate_pdf_pages, split_pdf, swap_pdf_pages,
307 ExtractImagesOptions, ExtractedImage, ImageExtractor, OverlayOptions, OverlayPosition,
308 ReorderOptions,
309};
310
311// Re-export dashboard types
312pub use dashboard::{
313 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
314 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
315};
316
317// Re-export memory optimization types
318pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
319
320// Re-export streaming types
321pub use streaming::{
322 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
323 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
324 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
325};
326
327// Re-export batch processing types
328pub use batch::{
329 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
330 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
331 ProgressCallback, ProgressInfo,
332};
333
334// Re-export recovery types
335pub use recovery::{
336 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
337 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
338 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
339};
340
341// Re-export structure types
342pub use structure::{
343 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
344 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
345};
346
347// Re-export action types
348pub use actions::{
349 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
350 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
351};
352
353// Re-export page label types
354pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
355
356// Re-export template types
357pub use templates::{
358 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
359};
360
361// Re-export semantic types for AI-Ready PDFs
362pub use semantic::{
363 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
364 RelationType, SemanticEntity, SemanticMarking,
365};
366
367// Re-export verification types
368pub use verification::comparators::{
369 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
370};
371pub use verification::compliance_report::{
372 format_report_markdown, generate_compliance_report, ComplianceReport,
373};
374pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
375pub use verification::validators::{
376 check_available_validators, validate_external, validate_with_qpdf,
377};
378pub use verification::{
379 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
380 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
381};
382
383// Re-export PDF/A compliance types
384pub use pdfa::{
385 PdfAConformance, PdfAError, PdfALevel, PdfAResult, PdfAValidator,
386 ValidationError as PdfAValidationError, ValidationResult as PdfAValidationResult,
387 ValidationWarning as PdfAValidationWarning, XmpMetadata, XmpPdfAIdentifier,
388};
389
390/// Current version of oxidize-pdf
391pub const VERSION: &str = env!("CARGO_PKG_VERSION");
392
393/// Scanned page analysis and OCR example
394///
395/// ```rust,no_run
396/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
397/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
398/// use oxidize_pdf::parser::PdfReader;
399///
400/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
401/// let document = PdfReader::open_document("scanned.pdf")?;
402/// let analyzer = PageContentAnalyzer::new(document);
403///
404/// // Analyze pages for scanned content
405/// let analyses = analyzer.analyze_document()?;
406/// for analysis in analyses {
407/// match analysis.page_type {
408/// PageType::Scanned => {
409/// println!("Page {} is scanned - applying OCR", analysis.page_number);
410///
411/// // Process with OCR
412/// let ocr_provider = MockOcrProvider::new();
413/// let ocr_result = analyzer.extract_text_from_scanned_page(
414/// analysis.page_number,
415/// &ocr_provider
416/// )?;
417///
418/// println!("OCR extracted: {}", ocr_result.text);
419/// println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
420/// }
421/// PageType::Text => println!("Page {} has vector text", analysis.page_number),
422/// PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
423/// }
424/// }
425/// # Ok(())
426/// # }
427/// ```
428///
429/// ### Font Embedding
430///
431/// ```rust,no_run
432/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
433/// use std::collections::HashSet;
434///
435/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
436/// // Create font embedder
437/// let mut embedder = FontEmbedder::new();
438///
439/// // Define used glyphs (example with basic ASCII)
440/// let mut used_glyphs = HashSet::new();
441/// used_glyphs.insert(65); // 'A'
442/// used_glyphs.insert(66); // 'B'
443/// used_glyphs.insert(67); // 'C'
444///
445/// // Configure embedding options
446/// let options = EmbeddingOptions {
447/// subset: true, // Create font subset
448/// compress_font_streams: true, // Compress font data
449/// ..Default::default()
450/// };
451///
452/// // Load font data (example - you'd load actual TrueType data)
453/// let font_data = std::fs::read("path/to/font.ttf")?;
454///
455/// // Embed the font
456/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
457/// println!("Embedded font as: {}", font_name);
458///
459/// // Generate PDF dictionary for the embedded font
460/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
461/// println!("Font dictionary generated successfully");
462/// # Ok(())
463/// # }
464/// ```
465///
466/// Supported PDF versions
pub mod pdf_version {
    /// PDF specification versions that are fully supported (1.0 through 1.7).
    pub const SUPPORTED_VERSIONS: &[&str] = &[
        "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7",
    ];

    /// PDF specification versions for which support is planned.
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
474
#[cfg(test)]
mod tests {
    // Smoke tests for the crate root: they exercise the items re-exported from
    // this file and the version constants declared in it.
    use super::*;

    /// A freshly constructed document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back exactly the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version tables contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Every expected PDF version must be present in the supported list.
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // ...and nothing beyond those 8 entries.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Planned versions: only 2.0.
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // This test does NOT verify the stored metadata values: it only checks
        // that the setters run without panicking and leave the page list
        // untouched. Asserting on the values themselves would need metadata
        // getters, which are not used here.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom page constructors yield the expected sizes.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), hence the tolerance.
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points.
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions are returned unchanged.
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed; only absence of a panic is checked.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Collecting them into an array also confirms they share one type.
        let _colors = [red, green, blue, black, white];

        // CMYK constructor.
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used below are exported and constructible.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        let _fonts = [helvetica, times, courier];

        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried through the crate `Result` alias.
    #[test]
    fn test_error_types() {
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types.
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types.
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types.
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related re-exports can be constructed.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` and `measure_text` produce sensible results for plain text.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Measuring non-empty text must give a positive width.
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, &font, size);
        assert!(width > 0.0);
    }

    /// Image-related re-exports are usable and JPEG construction does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is 100 zero bytes, not a real JPEG; the return value is
        // deliberately ignored because this test only guards against a panic.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` is non-empty and starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        // Check emptiness first (and only once) so an empty version is reported
        // as such, before the component checks index into the split result.
        assert!(!VERSION.is_empty(), "Version should not be empty");

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Indexing is safe here: the length check above guarantees two parts.
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}