oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
18//! - **Pure Rust**: No C dependencies or external libraries
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
//! page.text()
//!     .set_font(Font::Helvetica, 24.0)
//!     .at(50.0, 700.0)
//!     .write("Hello, PDF!")?;
//!
//! // Add graphics
//! page.graphics()
//!     .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
//!     .circle(300.0, 400.0, 50.0)
//!     .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
//! for i in 0..document.page_count()? {
//!     let page = document.get_page(i)?;
//!     println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
//! }
//!
//! // Extract text
//! let text_pages = document.extract_text()?;
//! for (i, page_text) in text_pages.iter().enumerate() {
//!     println!("Page {} text: {}", i+1, page_text.text);
//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
//! let streams = page.content_streams_with_document(&document)?;
//! for stream in streams {
//!     let operations = ContentParser::parse(&stream)?;
//!
//!     for op in operations {
//!         match op {
//!             ContentOperation::ShowText(text) => {
//!                 println!("Text: {:?}", String::from_utf8_lossy(&text));
//!             }
//!             ContentOperation::SetFont(name, size) => {
//!                 println!("Font: {} at {} pt", name, size);
//!             }
//!             ContentOperation::MoveTo(x, y) => {
//!                 println!("Move to ({}, {})", x, y);
//!             }
//!             _ => {} // Handle other operations
//!         }
//!     }
//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
//! if let Some(resources) = page.get_resources() {
//!     // Check fonts
//!     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
//!         for (name, _) in &fonts.0 {
//!             println!("Font resource: {}", name.as_str());
//!         }
//!     }
//!
//!     // Check images/XObjects
//!     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
//!         for (name, _) in &xobjects.0 {
//!             println!("XObject resource: {}", name.as_str());
//!         }
//!     }
//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod annotations;
180
181pub mod batch;
182pub mod charts;
183pub mod compression;
184pub mod coordinate_system;
185pub mod document;
186pub mod encryption;
187pub mod error;
188pub mod fonts;
189pub mod forms;
190pub mod geometry;
191pub mod graphics;
192pub mod memory;
193pub mod objects;
194pub mod operations;
195pub mod page;
196pub mod page_forms;
197pub mod page_labels;
198pub mod page_lists;
199pub mod page_tables;
200pub mod page_transitions;
201pub mod page_tree;
202pub mod parser;
203#[cfg(feature = "performance")]
204pub mod performance;
205pub mod recovery;
206pub mod streaming;
207pub mod structure;
208pub mod templates;
209pub mod text;
210pub mod verification;
211pub mod viewer_preferences;
212pub mod writer;
213
214pub mod semantic;
215
216// Dashboard and reporting modules
217pub mod dashboard;
218
219// Re-export generation types
220pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
221pub use document::{Document, DocumentMetadata};
222pub use error::{OxidizePdfError, PdfError, Result};
223pub use geometry::{Point, Rectangle};
224pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
225pub use page::{Margins, Page};
226pub use page_lists::{ListStyle, ListType, PageLists};
227pub use page_tables::{PageTables, TableStyle};
228pub use text::{
229 measure_text,
230 split_into_words,
231 BulletStyle,
232 Font,
233 FontFamily,
234 FragmentType,
235 HeaderStyle,
236 ImagePreprocessing,
237 ListElement,
238 ListOptions,
239 MockOcrProvider,
240 OcrEngine,
241 OcrError,
242 OcrOptions,
243 OcrProcessingResult,
244 OcrProvider,
245 OcrResult,
246 OcrTextFragment,
247 // List exports
248 OrderedList,
249 OrderedListStyle,
250 // Table exports
251 Table,
252 TableCell,
253 TableOptions,
254 TextAlign,
255 TextContext,
256 TextFlowContext,
257 UnorderedList,
258};
259
260// Re-export forms types
261pub use forms::{
262 calculations::FieldValue,
263 field_actions::{
264 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
265 SpecialFormatType, ValidateActionType,
266 },
267 validation::{
268 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
269 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
270 },
271 BorderStyle, FieldType, TextField, Widget,
272};
273
274// Re-export font embedding types
275pub use text::fonts::embedding::{
276 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
277 FontEncoding, FontFlags, FontMetrics, FontType,
278};
279
280// Re-export font management types
281pub use text::font_manager::{CustomFont, FontManager};
282
283// Re-export parsing types
284pub use parser::{
285 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
286 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
287 PdfString,
288};
289
290// Re-export operations
291pub use operations::{merge_pdfs, rotate_pdf_pages, split_pdf};
292
293// Re-export dashboard types
294pub use dashboard::{
295 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
296 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
297};
298
299// Re-export memory optimization types
300pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
301
302// Re-export streaming types
303pub use streaming::{
304 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
305 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
306 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
307};
308
309// Re-export batch processing types
310pub use batch::{
311 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
312 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
313 ProgressCallback, ProgressInfo,
314};
315
316// Re-export recovery types
317pub use recovery::{
318 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
319 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
320 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
321};
322
323// Re-export structure types
324pub use structure::{
325 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
326 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
327};
328
329// Re-export action types
330pub use actions::{
331 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
332 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
333};
334
335// Re-export page label types
336pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
337
338// Re-export template types
339pub use templates::{
340 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
341};
342
343// Re-export verification types
344pub use verification::comparators::{
345 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
346};
347pub use verification::compliance_report::{
348 format_report_markdown, generate_compliance_report, ComplianceReport,
349};
350pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
351pub use verification::validators::{
352 check_available_validators, validate_external, validate_with_qpdf,
353};
354pub use verification::{
355 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
356 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
357};
358
/// Current version of oxidize-pdf.
///
/// The value is captured at compile time from the `CARGO_PKG_VERSION`
/// environment variable via the `env!` macro, so it reflects the package
/// version Cargo was building when this crate was compiled.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
361
/// Supported PDF versions.
///
/// This module exposes two constant lists of PDF version strings:
/// [`SUPPORTED_VERSIONS`](crate::pdf_version::SUPPORTED_VERSIONS) and
/// [`PLANNED_VERSIONS`](crate::pdf_version::PLANNED_VERSIONS).
///
/// # Examples
///
/// NOTE(review): the two examples below show general crate workflows (OCR on
/// scanned pages and font embedding) and do not use the constants of this
/// module; consider moving them into the crate-level documentation.
///
/// ## Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ## Font embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                    // Create font subset
///     compress_font_streams: true,     // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported.
    ///
    /// The entries are plain version strings, listed from `"1.0"` to `"1.7"`.
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned.
    ///
    /// Versions listed here are not part of [`SUPPORTED_VERSIONS`].
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
443
// Smoke tests for the crate root: they check that the re-exported types and
// functions are reachable through `super::*` and that the version constants
// hold the expected values.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` reports back the dimensions it was given.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The version constant is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported/planned version lists contain exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Test that all expected PDF versions are supported
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // Together with the loop above, the length check rules out extra entries.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Test planned versions
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on a new document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The stored metadata values are NOT inspected here; this only checks
        // that the setters do not panic and do not add pages as a side effect.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The page constructors produce pages with the expected dimensions.
    #[test]
    fn test_page_creation_variants() {
        // Test different page creation methods
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation), hence the tolerance
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// The RGB and CMYK color constructors can be called without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Test color creation doesn't panic
        let _colors = [red, green, blue, black, white];

        // Test CMYK color
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The `Font` and `FontFamily` variants used here are exported from the crate root.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        // Test font creation doesn't panic
        let _fonts = [helvetica, times, courier];

        // Test font family
        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// `PdfError` values can be built and carried by the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        // Test that error types can be created
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        // Test result type
        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation and parsing types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Test that all major types are properly exported
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Test parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Test layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related re-exports can be constructed.
    #[test]
    fn test_ocr_types() {
        // Test OCR-related types
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        // Test fragment types
        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// `split_into_words` and `measure_text` work on a simple ASCII sentence.
    #[test]
    fn test_text_utilities() {
        // Test text utility functions
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Test text measurement with the built-in Helvetica font
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// The image-related re-exports are reachable and usable.
    #[test]
    fn test_image_types() {
        // Test image-related types
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is not a real JPEG; the return value is deliberately
        // ignored, the call only has to return without panicking.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// `VERSION` starts with numeric `major.minor` components.
    ///
    /// Only the first two dot-separated components are checked; anything after
    /// them (patch, pre-release tags) is not validated by this test.
    #[test]
    fn test_version_string_format() {
        // An empty version string would make the component checks meaningless.
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Indexing below is safe: the length was asserted above.
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}
653}