oxidize_pdf/lib.rs
1//! # oxidize-pdf
2//!
3//! A comprehensive, pure Rust PDF library for generation, parsing, and manipulation with zero external PDF dependencies.
4
5#![allow(clippy::all)]
6//!
7//! ## Features
8//!
9//! - **PDF Generation**: Create multi-page documents with text, graphics, and images
10//! - **PDF Parsing**: Complete parser supporting rendering and content extraction
11//! - **PDF Operations**: Split, merge, rotate, and extract pages
12//! - **Text Extraction**: Extract text with position and formatting information
13//! - **Image Extraction**: Extract images in JPEG, PNG, and TIFF formats
14//! - **Font Embedding**: TrueType and OpenType font embedding with subsetting support (v1.1.6+)
15//! - **Page Analysis**: Detect scanned vs text content with intelligent classification
16//! - **OCR Integration**: Pluggable OCR support with Tesseract for processing scanned documents (v0.1.3+)
17//! - **Resource Access**: Work with fonts, images, and other PDF resources
//! - **Pure Rust**: No C dependencies or external libraries in the core library (the optional `ocr-tesseract` feature integrates the external Tesseract engine)
19//! - **100% Native**: Complete PDF implementation from scratch
20//!
21//! ## Quick Start
22//!
23//! ### Creating PDFs
24//!
25//! ```rust
26//! use oxidize_pdf::{Document, Page, Font, Color, Result};
27//!
28//! # fn main() -> Result<()> {
29//! // Create a new document
30//! let mut doc = Document::new();
31//! doc.set_title("My PDF");
32//!
33//! // Create a page
34//! let mut page = Page::a4();
35//!
36//! // Add text
37//! page.text()
38//! .set_font(Font::Helvetica, 24.0)
39//! .at(50.0, 700.0)
40//! .write("Hello, PDF!")?;
41//!
42//! // Add graphics
43//! page.graphics()
44//! .set_fill_color(Color::rgb(0.0, 0.5, 1.0))
45//! .circle(300.0, 400.0, 50.0)
46//! .fill();
47//!
48//! // Save the document
49//! doc.add_page(page);
50//! doc.save("output.pdf")?;
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Parsing PDFs
56//!
57//! ```rust,no_run
58//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
59//!
60//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
61//! // Open and parse a PDF
62//! let reader = PdfReader::open("document.pdf")?;
63//! let document = PdfDocument::new(reader);
64//!
65//! // Get document information
66//! println!("Pages: {}", document.page_count()?);
67//! println!("Version: {}", document.version()?);
68//!
69//! // Process pages
70//! for i in 0..document.page_count()? {
71//! let page = document.get_page(i)?;
72//! println!("Page {} size: {}x{} points", i+1, page.width(), page.height());
73//! }
74//!
75//! // Extract text
76//! let text_pages = document.extract_text()?;
77//! for (i, page_text) in text_pages.iter().enumerate() {
78//! println!("Page {} text: {}", i+1, page_text.text);
79//! }
80//! # Ok(())
81//! # }
82//! ```
83//!
84//! ## Modules
85//!
86//! ### Generation Modules
87//! - [`document`] - PDF document creation and management
88//! - [`page`] - Page creation and layout
89//! - [`graphics`] - Vector graphics and images
90//! - [`text`] - Text rendering and flow
91//! - [`writer`] - Low-level PDF writing
92//!
93//! ### Parsing Modules
94//! - [`parser`] - Complete PDF parsing and reading
95//! - [`parser::PdfDocument`] - High-level document interface
96//! - [`parser::ParsedPage`] - Page representation with resources
97//! - [`parser::ContentParser`] - Content stream parsing
98//! - [`parser::PdfObject`] - Low-level PDF objects
99//!
100//! ### Manipulation Modules
101//! - [`operations`] - PDF manipulation (split, merge, rotate, extract images)
102//! - [`operations::page_analysis`] - Page content analysis and scanned page detection
103//! - [`text::extraction`] - Text extraction with positioning
104//!
105//! ### OCR Modules (v0.1.3+)
//! - [`text::ocr`] - OCR trait system, types, and OCR integration for scanned documents
//! - [`text::tesseract_provider`] - Tesseract OCR provider (requires `ocr-tesseract` feature)
109//!
110//! ## Examples
111//!
112//! ### Content Stream Processing
113//!
114//! ```rust,no_run
115//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
116//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
117//!
118//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
119//! let reader = PdfReader::open("document.pdf")?;
120//! let document = PdfDocument::new(reader);
121//! let page = document.get_page(0)?;
122//!
123//! // Get and parse content streams
124//! let streams = page.content_streams_with_document(&document)?;
125//! for stream in streams {
126//! let operations = ContentParser::parse(&stream)?;
127//!
128//! for op in operations {
129//! match op {
130//! ContentOperation::ShowText(text) => {
131//! println!("Text: {:?}", String::from_utf8_lossy(&text));
132//! }
133//! ContentOperation::SetFont(name, size) => {
134//! println!("Font: {} at {} pt", name, size);
135//! }
136//! ContentOperation::MoveTo(x, y) => {
137//! println!("Move to ({}, {})", x, y);
138//! }
139//! _ => {} // Handle other operations
140//! }
141//! }
142//! }
143//! # Ok(())
144//! # }
145//! ```
146//!
147//! ### Resource Access
148//!
149//! ```rust,no_run
150//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
151//!
152//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
153//! let reader = PdfReader::open("document.pdf")?;
154//! let document = PdfDocument::new(reader);
155//! let page = document.get_page(0)?;
156//!
157//! // Access page resources
158//! if let Some(resources) = page.get_resources() {
159//! // Check fonts
160//! if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
161//! for (name, _) in &fonts.0 {
162//! println!("Font resource: {}", name.as_str());
163//! }
164//! }
165//!
166//! // Check images/XObjects
167//! if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
168//! for (name, _) in &xobjects.0 {
169//! println!("XObject resource: {}", name.as_str());
170//! }
171//! }
172//! }
173//! # Ok(())
174//! # }
175//! ```
176
177pub mod actions;
178pub mod advanced_tables;
179pub mod ai;
180pub mod annotations;
181
182pub mod batch;
183pub mod charts;
184pub mod compression;
185pub mod coordinate_system;
186pub mod document;
187pub mod encryption;
188pub mod error;
189pub mod fonts;
190pub mod forms;
191pub mod geometry;
192pub mod graphics;
193pub mod memory;
194pub mod metadata;
195pub mod objects;
196pub mod operations;
197pub mod page;
198pub mod page_forms;
199pub mod page_labels;
200pub mod page_lists;
201pub mod page_tables;
202pub mod page_transitions;
203pub mod page_tree;
204pub mod parser;
205#[cfg(feature = "performance")]
206pub mod performance;
207pub mod recovery;
208pub mod streaming;
209pub mod structure;
210pub mod templates;
211pub mod text;
212pub mod verification;
213pub mod viewer_preferences;
214pub mod writer;
215
216pub mod semantic;
217
218// Dashboard and reporting modules
219pub mod dashboard;
220
221// Re-export generation types
222pub use coordinate_system::{CoordinateSystem, RenderContext, TransformMatrix};
223pub use document::{Document, DocumentMetadata};
224pub use error::{OxidizePdfError, PdfError, Result};
225pub use geometry::{Point, Rectangle};
226pub use graphics::{Color, ColorSpace, GraphicsContext, Image, ImageFormat, MaskType};
227pub use page::{Margins, Page};
228pub use page_lists::{ListStyle, ListType, PageLists};
229pub use page_tables::{PageTables, TableStyle};
230pub use text::{
231 measure_text,
232 split_into_words,
233 BulletStyle,
234 Font,
235 FontFamily,
236 FragmentType,
237 HeaderStyle,
238 ImagePreprocessing,
239 ListElement,
240 ListOptions,
241 MockOcrProvider,
242 OcrEngine,
243 OcrError,
244 OcrOptions,
245 OcrProcessingResult,
246 OcrProvider,
247 OcrResult,
248 OcrTextFragment,
249 // List exports
250 OrderedList,
251 OrderedListStyle,
252 // Table exports
253 Table,
254 TableCell,
255 TableOptions,
256 TextAlign,
257 TextContext,
258 TextFlowContext,
259 UnorderedList,
260};
261
262// Re-export forms types
263pub use forms::{
264 calculations::FieldValue,
265 field_actions::{
266 ActionSettings, FieldAction, FieldActionSystem, FieldActions, FormatActionType,
267 SpecialFormatType, ValidateActionType,
268 },
269 validation::{
270 DateFormat, FieldValidator, FormValidationSystem, FormatMask, PhoneCountry,
271 RequiredFieldInfo, RequirementCondition, TimeFormat, ValidationRule, ValidationSettings,
272 },
273 BorderStyle, FieldType, TextField, Widget,
274};
275
276// Re-export font embedding types
277pub use text::fonts::embedding::{
278 EmbeddedFontData, EmbeddingOptions, EncodingDifference, FontDescriptor, FontEmbedder,
279 FontEncoding, FontFlags, FontMetrics, FontType,
280};
281
282// Re-export font management types
283pub use text::font_manager::{CustomFont, FontManager};
284
285// Re-export parsing types
286pub use parser::{
287 ContentOperation, ContentParser, DocumentMetadata as ParsedDocumentMetadata, ParseOptions,
288 ParsedPage, PdfArray, PdfDictionary, PdfDocument, PdfName, PdfObject, PdfReader, PdfStream,
289 PdfString,
290};
291
292// Re-export operations
293pub use operations::{
294 extract_images_from_pages, extract_images_from_pdf, merge_pdfs, rotate_pdf_pages, split_pdf,
295 ExtractImagesOptions, ExtractedImage, ImageExtractor,
296};
297
298// Re-export dashboard types
299pub use dashboard::{
300 Dashboard, DashboardBuilder, DashboardComponent, DashboardConfig, DashboardLayout,
301 DashboardTheme, HeatMap, KpiCard, PivotTable, ScatterPlot, TreeMap, Typography,
302};
303
304// Re-export memory optimization types
305pub use memory::{LazyDocument, MemoryOptions, StreamProcessor, StreamingOptions};
306
307// Re-export streaming types
308pub use streaming::{
309 process_in_chunks, stream_text, ChunkOptions, ChunkProcessor, ChunkType, ContentChunk,
310 IncrementalParser, ParseEvent, StreamingDocument, StreamingOptions as StreamOptions,
311 StreamingPage, TextChunk, TextStreamOptions, TextStreamer,
312};
313
314// Re-export batch processing types
315pub use batch::{
316 batch_merge_pdfs, batch_process_files, batch_split_pdfs, BatchJob, BatchOptions,
317 BatchProcessor, BatchProgress, BatchResult, BatchSummary, JobResult, JobStatus, JobType,
318 ProgressCallback, ProgressInfo,
319};
320
321// Re-export recovery types
322pub use recovery::{
323 analyze_corruption, detect_corruption, quick_recover, repair_document, validate_pdf,
324 CorruptionReport, CorruptionType, ObjectScanner, PartialRecovery, PdfRecovery, RecoveredPage,
325 RecoveryOptions, RepairResult, RepairStrategy, ScanResult, ValidationError, ValidationResult,
326};
327
328// Re-export structure types
329pub use structure::{
330 Destination, DestinationType, NameTree, NameTreeNode, NamedDestinations, OutlineBuilder,
331 OutlineItem, OutlineTree, PageDestination, PageTree, PageTreeBuilder, PageTreeNode,
332};
333
334// Re-export action types
335pub use actions::{
336 Action, ActionDictionary, ActionType, GoToAction, LaunchAction, LaunchParameters, NamedAction,
337 RemoteGoToAction, StandardNamedAction, UriAction, UriActionFlags,
338};
339
340// Re-export page label types
341pub use page_labels::{PageLabel, PageLabelBuilder, PageLabelRange, PageLabelStyle, PageLabelTree};
342
343// Re-export template types
344pub use templates::{
345 Template, TemplateContext, TemplateError, TemplateRenderer, TemplateResult, TemplateValue,
346};
347
348// Re-export semantic types for AI-Ready PDFs
349pub use semantic::{
350 BoundingBox, Entity, EntityMap, EntityMetadata, EntityRelation, EntityType, ExportFormat,
351 RelationType, SemanticEntity, SemanticMarking,
352};
353
354// Re-export verification types
355pub use verification::comparators::{
356 compare_pdfs, ComparisonResult, DifferenceSeverity, PdfDifference,
357};
358pub use verification::compliance_report::{
359 format_report_markdown, generate_compliance_report, ComplianceReport,
360};
361pub use verification::iso_matrix::{load_default_matrix, load_matrix, ComplianceStats, IsoMatrix};
362pub use verification::validators::{
363 check_available_validators, validate_external, validate_with_qpdf,
364};
365pub use verification::{
366 extract_pdf_differences, pdfs_structurally_equivalent, verify_iso_requirement,
367 ExternalValidationResult, IsoRequirement, VerificationLevel, VerificationResult,
368};
369
370/// Current version of oxidize-pdf
371pub const VERSION: &str = env!("CARGO_PKG_VERSION");
372
/// Supported PDF versions.
///
/// [`pdf_version::SUPPORTED_VERSIONS`] lists the PDF versions declared as
/// fully supported, and [`pdf_version::PLANNED_VERSIONS`] lists the versions
/// for which support is planned.
///
/// The usage examples that follow are attached to this item's documentation;
/// they illustrate other parts of the crate (page analysis with OCR, and font
/// embedding) rather than the version constants themselves.
///
/// ### Scanned page analysis and OCR
///
/// ```rust,no_run
/// use oxidize_pdf::operations::page_analysis::{PageContentAnalyzer, PageType};
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
/// use oxidize_pdf::parser::PdfReader;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let document = PdfReader::open_document("scanned.pdf")?;
/// let analyzer = PageContentAnalyzer::new(document);
///
/// // Analyze pages for scanned content
/// let analyses = analyzer.analyze_document()?;
/// for analysis in analyses {
///     match analysis.page_type {
///         PageType::Scanned => {
///             println!("Page {} is scanned - applying OCR", analysis.page_number);
///
///             // Process with OCR
///             let ocr_provider = MockOcrProvider::new();
///             let ocr_result = analyzer.extract_text_from_scanned_page(
///                 analysis.page_number,
///                 &ocr_provider
///             )?;
///
///             println!("OCR extracted: {}", ocr_result.text);
///             println!("Confidence: {:.1}%", ocr_result.confidence * 100.0);
///         }
///         PageType::Text => println!("Page {} has vector text", analysis.page_number),
///         PageType::Mixed => println!("Page {} has mixed content", analysis.page_number),
///     }
/// }
/// # Ok(())
/// # }
/// ```
///
/// ### Font Embedding
///
/// ```rust,no_run
/// use oxidize_pdf::{FontEmbedder, EmbeddingOptions, Document, Page, Font};
/// use std::collections::HashSet;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Create font embedder
/// let mut embedder = FontEmbedder::new();
///
/// // Define used glyphs (example with basic ASCII)
/// let mut used_glyphs = HashSet::new();
/// used_glyphs.insert(65); // 'A'
/// used_glyphs.insert(66); // 'B'
/// used_glyphs.insert(67); // 'C'
///
/// // Configure embedding options
/// let options = EmbeddingOptions {
///     subset: true,                // Create font subset
///     compress_font_streams: true, // Compress font data
///     ..Default::default()
/// };
///
/// // Load font data (example - you'd load actual TrueType data)
/// let font_data = std::fs::read("path/to/font.ttf")?;
///
/// // Embed the font
/// let font_name = embedder.embed_truetype_font(&font_data, &used_glyphs, &options)?;
/// println!("Embedded font as: {}", font_name);
///
/// // Generate PDF dictionary for the embedded font
/// let font_dict = embedder.generate_font_dictionary(&font_name)?;
/// println!("Font dictionary generated successfully");
/// # Ok(())
/// # }
/// ```
pub mod pdf_version {
    /// PDF 1.0 - 1.7 are fully supported
    pub const SUPPORTED_VERSIONS: &[&str] =
        &["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];
    /// PDF 2.0 support is planned
    pub const PLANNED_VERSIONS: &[&str] = &["2.0"];
}
454
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root: they exercise the re-exported types,
    //! the version constants, and a few basic constructors.

    use super::*;

    /// A freshly created document contains no pages.
    #[test]
    fn test_create_empty_document() {
        let doc = Document::new();
        assert_eq!(doc.pages.len(), 0);
    }

    /// `Page::new` stores the requested width and height.
    #[test]
    fn test_create_page() {
        let page = Page::new(595.0, 842.0);
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    /// The crate version is populated and PDF 1.7 is listed as supported.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(pdf_version::SUPPORTED_VERSIONS.contains(&"1.7"));
    }

    /// The supported and planned version lists hold exactly the expected entries.
    #[test]
    fn test_pdf_version_constants() {
        // Every expected PDF version must be listed as supported.
        let expected_versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7"];

        for version in expected_versions {
            assert!(
                pdf_version::SUPPORTED_VERSIONS.contains(&version),
                "Expected PDF version {version} to be supported"
            );
        }

        // There are exactly 8 supported versions.
        assert_eq!(pdf_version::SUPPORTED_VERSIONS.len(), 8);

        // Planned versions contain only 2.0.
        assert!(pdf_version::PLANNED_VERSIONS.contains(&"2.0"));
        assert_eq!(pdf_version::PLANNED_VERSIONS.len(), 1);
    }

    /// The metadata setters can be called on an empty document without panicking.
    #[test]
    fn test_document_with_metadata() {
        let mut doc = Document::new();
        doc.set_title("Test Document");
        doc.set_author("Test Author");
        doc.set_subject("Test Subject");

        // The stored metadata is not inspected here; this test only checks
        // that the setters do not panic and that no page was added.
        assert_eq!(doc.pages.len(), 0);
    }

    /// The A4, Letter and custom page constructors report sensible dimensions.
    #[test]
    fn test_page_creation_variants() {
        let page_a4 = Page::a4();
        let page_letter = Page::letter();
        let page_custom = Page::new(400.0, 600.0);

        // A4 dimensions: 595.276 x 841.89 points (approximation)
        assert!((page_a4.width() - 595.0).abs() < 10.0);
        assert!((page_a4.height() - 842.0).abs() < 10.0);

        // Letter dimensions: 612 x 792 points
        assert_eq!(page_letter.width(), 612.0);
        assert_eq!(page_letter.height(), 792.0);

        // Custom dimensions
        assert_eq!(page_custom.width(), 400.0);
        assert_eq!(page_custom.height(), 600.0);
    }

    /// RGB and CMYK colors can be constructed without panicking.
    #[test]
    fn test_color_creation() {
        let red = Color::rgb(1.0, 0.0, 0.0);
        let green = Color::rgb(0.0, 1.0, 0.0);
        let blue = Color::rgb(0.0, 0.0, 1.0);
        let black = Color::rgb(0.0, 0.0, 0.0);
        let white = Color::rgb(1.0, 1.0, 1.0);

        // Construction alone is the check: none of the calls may panic.
        let _colors = [red, green, blue, black, white];

        // CMYK construction
        let cyan = Color::cmyk(1.0, 0.0, 0.0, 0.0);
        let _cmyk_test = cyan;
    }

    /// The standard font and font-family variants are exported.
    #[test]
    fn test_font_types() {
        let helvetica = Font::Helvetica;
        let times = Font::TimesRoman;
        let courier = Font::Courier;

        let _fonts = [helvetica, times, courier];

        let helvetica_family = FontFamily::Helvetica;
        let times_family = FontFamily::Times;
        let courier_family = FontFamily::Courier;

        let _families = [helvetica_family, times_family, courier_family];
    }

    /// Error values can be built and carried through the crate's `Result` alias.
    #[test]
    fn test_error_types() {
        let pdf_error = PdfError::InvalidStructure("test error".to_string());
        let _error_test = pdf_error;

        let ok_result: Result<i32> = Ok(42);
        let err_result: Result<i32> = Err(PdfError::InvalidStructure("test error".to_string()));

        assert!(ok_result.is_ok());
        assert!(err_result.is_err());
    }

    /// The major generation, parsing and layout types are reachable from the crate root.
    #[test]
    fn test_module_exports() {
        // Generation types
        let _doc = Document::new();
        let _page = Page::new(100.0, 100.0);
        let _color = Color::rgb(0.5, 0.5, 0.5);
        let _font = Font::Helvetica;

        // Parsing types
        let _array = PdfArray::new();
        let _dict = PdfDictionary::new();
        let _name = PdfName::new("Test".to_string());
        let _string = PdfString::new(b"Test".to_vec());

        // Layout types
        let _margins = Margins {
            top: 10.0,
            right: 10.0,
            bottom: 10.0,
            left: 10.0,
        };
        let _align = TextAlign::Left;
    }

    /// The OCR-related types are exported and constructible.
    #[test]
    fn test_ocr_types() {
        let _mock_ocr = MockOcrProvider::new();
        let _ocr_options = OcrOptions::default();
        let _ocr_engine = OcrEngine::Tesseract;

        let _fragment_type = FragmentType::Word;
        let _image_preprocessing = ImagePreprocessing::default();
    }

    /// Word splitting and text measurement return non-trivial results.
    #[test]
    fn test_text_utilities() {
        let text = "Hello world test";
        let words = split_into_words(text);
        assert!(!words.is_empty());
        assert!(words.contains(&"Hello"));
        assert!(words.contains(&"world"));

        // Measuring with a standard font must yield a positive width.
        let font = Font::Helvetica;
        let size = 12.0;
        let width = measure_text(text, font, size);
        assert!(width > 0.0);
    }

    /// Image-related types are exported and the JPEG constructor does not panic.
    #[test]
    fn test_image_types() {
        let _format = ImageFormat::Jpeg;
        let _color_space = ColorSpace::DeviceRGB;

        // The buffer is only zero bytes; the returned value is deliberately
        // discarded, so this only checks that the call itself does not panic.
        let image_data = vec![0u8; 100];
        let _image = Image::from_jpeg_data(image_data);
    }

    /// The version string starts with numeric `major.minor` components.
    #[test]
    fn test_version_string_format() {
        assert!(!VERSION.is_empty());

        let version_parts: Vec<&str> = VERSION.split('.').collect();
        assert!(
            version_parts.len() >= 2,
            "Version should have at least major.minor format"
        );

        // Indexing is safe here: the length was asserted just above.
        assert!(
            version_parts[0].parse::<u32>().is_ok(),
            "Major version should be numeric"
        );
        assert!(
            version_parts[1].parse::<u32>().is_ok(),
            "Minor version should be numeric"
        );
    }
}