oxidize_pdf/text/ocr/mod.rs
1//! OCR (Optical Character Recognition) support for PDF processing
2//!
3//! This module provides a flexible, pluggable architecture for integrating OCR capabilities
4//! into PDF processing workflows. It's designed to work seamlessly with the page analysis
5//! module to process scanned pages and extract text from images.
6//!
7//! # Architecture
8//!
9//! The OCR system uses a trait-based approach that allows for multiple OCR providers:
10//!
11//! - **OcrProvider trait**: Generic interface for OCR engines
12//! - **Pluggable implementations**: Support for local (Tesseract) and cloud (Azure, AWS) providers
13//! - **Result standardization**: Consistent output format regardless of provider
14//! - **Error handling**: Comprehensive error types for OCR operations
15//!
16//! # Usage
17//!
18//! ## Basic OCR Processing
19//!
20//! ```rust
21//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
22//! use oxidize_pdf::graphics::ImageFormat;
23//!
24//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
25//! let provider = MockOcrProvider::new();
26//! let options = OcrOptions::default();
27//!
28//! // Process image data directly - Mock JPEG data
29//! let image_data = vec![
30//! 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
31//! 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9
32//! ];
33//! let result = provider.process_image(&image_data, &options)?;
34//!
35//! println!("Extracted text: {}", result.text);
36//! println!("Confidence: {:.2}%", result.confidence * 100.0);
37//!
38//! for fragment in result.fragments {
39//! println!("Fragment: '{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
40//! }
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! ## Integration with Page Analysis
46//!
47//! ```rust,no_run
48//! use oxidize_pdf::operations::page_analysis::PageContentAnalyzer;
49//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
50//! use oxidize_pdf::parser::PdfReader;
51//!
52//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
53//! let document = PdfReader::open_document("scanned.pdf")?;
54//! let analyzer = PageContentAnalyzer::new(document);
55//! let provider = MockOcrProvider::new();
56//!
57//! // Find scanned pages
58//! let scanned_pages = analyzer.find_scanned_pages()?;
59//!
60//! for page_num in scanned_pages {
61//! let analysis = analyzer.analyze_page(page_num)?;
62//! if analysis.is_scanned() {
63//! println!("Processing scanned page {}", page_num);
64//! // OCR processing would happen here
65//! }
66//! }
67//! # Ok(())
68//! # }
69//! ```
70
71use crate::graphics::ImageFormat;
72use crate::operations::page_analysis::ContentAnalysis;
73use std::fmt;
74
/// Convenience alias for results of OCR operations, using [`OcrError`] as the error type.
pub type OcrResult<T> = Result<T, OcrError>;
77
/// Errors that can occur during OCR processing.
///
/// `Display` messages are derived via `thiserror`'s `#[error]` attributes, so
/// each variant carries a human-readable description of the failure. Cloud-only
/// concerns (network, authentication, rate limiting) have dedicated variants.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// OCR provider is not available or not configured
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),

    /// Unsupported image format for OCR processing
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),

    /// Invalid or corrupted image data
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),

    /// OCR processing failed
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),

    /// Network error when using cloud OCR providers
    #[error("Network error: {0}")]
    NetworkError(String),

    /// API key or authentication error (cloud providers)
    #[error("Authentication error: {0}")]
    AuthenticationError(String),

    /// Rate limiting or quota exceeded (cloud providers)
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// OCR provider returned low confidence results
    #[error("Low confidence results: {0}")]
    LowConfidence(String),

    /// Generic IO error (converted automatically from `std::io::Error`)
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    Configuration(String),
}
121
/// A rectangular region for selective OCR processing
///
/// Coordinates are in pixels with the origin at the top-left of the image.
#[derive(Debug, Clone, PartialEq)]
pub struct OcrRegion {
    /// X coordinate of the top-left corner (pixels)
    pub x: u32,

    /// Y coordinate of the top-left corner (pixels)
    pub y: u32,

    /// Width of the region (pixels)
    pub width: u32,

    /// Height of the region (pixels)
    pub height: u32,

    /// Optional label for this region (e.g., "header", "table", "paragraph")
    pub label: Option<String>,
}

impl OcrRegion {
    /// Create a new OCR region without a label.
    pub fn new(x: u32, y: u32, width: u32, height: u32) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: None,
        }
    }

    /// Create a new OCR region with a descriptive label.
    pub fn with_label(x: u32, y: u32, width: u32, height: u32, label: impl Into<String>) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: Some(label.into()),
        }
    }

    /// Check if this region contains a point.
    ///
    /// Arithmetic is widened to `u64` so a region whose far edge exceeds
    /// `u32::MAX` (e.g. `x + width` overflowing) is handled correctly instead
    /// of panicking in debug builds or wrapping in release builds.
    pub fn contains_point(&self, x: u32, y: u32) -> bool {
        let (px, py) = (u64::from(x), u64::from(y));
        px >= u64::from(self.x)
            && px < u64::from(self.x) + u64::from(self.width)
            && py >= u64::from(self.y)
            && py < u64::from(self.y) + u64::from(self.height)
    }

    /// Check if this region overlaps with another region.
    ///
    /// Uses `u64` arithmetic for the same overflow-safety reason as
    /// [`OcrRegion::contains_point`]. Regions that merely touch edges do not
    /// count as overlapping.
    pub fn overlaps_with(&self, other: &OcrRegion) -> bool {
        let (ax1, ay1) = (u64::from(self.x), u64::from(self.y));
        let (ax2, ay2) = (ax1 + u64::from(self.width), ay1 + u64::from(self.height));
        let (bx1, by1) = (u64::from(other.x), u64::from(other.y));
        let (bx2, by2) = (bx1 + u64::from(other.width), by1 + u64::from(other.height));
        !(ax2 <= bx1 || bx2 <= ax1 || ay2 <= by1 || by2 <= ay1)
    }
}
177
/// OCR processing options and configuration
///
/// Passed to [`OcrProvider::process_image`] and related methods to control
/// language, confidence filtering, preprocessing, timeouts, and region selection.
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Target language for OCR (ISO 639-1 code, e.g., "en", "es", "fr")
    pub language: String,

    /// Minimum confidence threshold (0.0 to 1.0); enforcement is provider-specific
    pub min_confidence: f64,

    /// Whether to preserve text layout and positioning
    pub preserve_layout: bool,

    /// Image preprocessing options applied before recognition
    pub preprocessing: ImagePreprocessing,

    /// OCR engine specific options, passed through to the provider as key/value pairs
    pub engine_options: std::collections::HashMap<String, String>,

    /// Timeout for OCR operations (in seconds)
    pub timeout_seconds: u32,

    /// Specific regions to process (None = process entire image)
    pub regions: Option<Vec<OcrRegion>>,

    /// Whether to save extracted images for debug purposes
    pub debug_output: bool,
}
205
206impl Default for OcrOptions {
207 fn default() -> Self {
208 Self {
209 language: "en".to_string(),
210 min_confidence: 0.6,
211 preserve_layout: true,
212 preprocessing: ImagePreprocessing::default(),
213 engine_options: std::collections::HashMap::new(),
214 timeout_seconds: 60, // Aumentado para documentos complejos
215 regions: None,
216 debug_output: false,
217 }
218 }
219}
220
/// Image preprocessing options for OCR
///
/// Controls the cleanup steps applied to an image before text recognition.
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Whether to apply image denoising
    pub denoise: bool,

    /// Whether to apply image deskewing
    pub deskew: bool,

    /// Whether to enhance contrast
    pub enhance_contrast: bool,

    /// Whether to apply image sharpening
    pub sharpen: bool,

    /// Scale factor for image resizing (1.0 = no scaling)
    pub scale_factor: f64,
}

impl Default for ImagePreprocessing {
    /// Defaults favor recognition accuracy: denoising, deskewing, and contrast
    /// enhancement are on; sharpening is off and the image keeps its original scale.
    fn default() -> Self {
        ImagePreprocessing {
            denoise: true,
            deskew: true,
            enhance_contrast: true,
            sharpen: false,
            scale_factor: 1.0,
        }
    }
}
251
/// Word-level confidence information for detailed OCR analysis
#[derive(Debug, Clone)]
pub struct WordConfidence {
    /// The word text
    pub word: String,

    /// Confidence score for this specific word (0.0 to 1.0)
    pub confidence: f64,

    /// X position of the word within the fragment (relative to fragment start)
    pub x_offset: f64,

    /// Width of the word in points
    pub width: f64,

    /// Optional character-level confidences (for ultimate granularity)
    pub character_confidences: Option<Vec<CharacterConfidence>>,
}

impl WordConfidence {
    /// Create a new word confidence without character-level detail.
    pub fn new(word: String, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: None,
        }
    }

    /// Create a word confidence with character-level details.
    pub fn with_characters(
        word: String,
        confidence: f64,
        x_offset: f64,
        width: f64,
        character_confidences: Vec<CharacterConfidence>,
    ) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: Some(character_confidences),
        }
    }

    /// Get the average character confidence if character data is available.
    ///
    /// An empty character list yields `Some(0.0)` — consistent with
    /// `OcrTextFragment::average_word_confidence` — instead of the `NaN`
    /// a plain 0/0 division would produce.
    pub fn average_character_confidence(&self) -> Option<f64> {
        self.character_confidences.as_ref().map(|chars| {
            if chars.is_empty() {
                return 0.0;
            }
            let sum: f64 = chars.iter().map(|c| c.confidence).sum();
            sum / chars.len() as f64
        })
    }

    /// Check if this word has low confidence (below threshold)
    pub fn is_low_confidence(&self, threshold: f64) -> bool {
        self.confidence < threshold
    }
}

/// Character-level confidence information for ultimate OCR granularity
#[derive(Debug, Clone)]
pub struct CharacterConfidence {
    /// The character
    pub character: char,

    /// Confidence score for this character (0.0 to 1.0)
    pub confidence: f64,

    /// X position relative to word start
    pub x_offset: f64,

    /// Character width in points
    pub width: f64,
}

impl CharacterConfidence {
    /// Create a new character confidence
    pub fn new(character: char, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            character,
            confidence,
            x_offset,
            width,
        }
    }
}
341
/// Candidate for OCR post-processing correction
///
/// Produced by `OcrTextFragment::get_correction_candidates`; the
/// `suggested_corrections` list starts empty and is filled in by
/// `OcrPostProcessor::process_fragment`.
#[derive(Debug, Clone)]
pub struct CorrectionCandidate {
    /// The original word with low confidence or errors
    pub word: String,

    /// Original confidence score (0.0 to 1.0) reported by the OCR engine
    pub confidence: f64,

    /// Word index within the text fragment it came from
    pub position_in_fragment: usize,

    /// Suggested corrections ranked by likelihood (best first)
    pub suggested_corrections: Vec<CorrectionSuggestion>,

    /// Reason why this word needs correction
    pub correction_reason: CorrectionReason,
}
360
/// A suggested correction for an OCR error
#[derive(Debug, Clone)]
pub struct CorrectionSuggestion {
    /// The corrected word
    pub corrected_word: String,

    /// Confidence in this correction (0.0 to 1.0); used to rank suggestions
    pub correction_confidence: f64,

    /// Type of correction applied
    pub correction_type: CorrectionType,

    /// Human-readable explanation of why this correction was suggested
    pub explanation: Option<String>,
}
376
/// Reasons why a word might need correction
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorrectionReason {
    /// Word has low OCR confidence (below the post-processor's threshold)
    LowConfidence,

    /// Word contains common OCR confusion patterns (e.g. "rn" vs "m")
    ConfusionPattern,

    /// Word not found in the configured dictionary
    NotInDictionary,

    /// Word doesn't fit the surrounding context
    ContextualError,

    /// Word has suspicious character combinations
    SuspiciousPattern,
}
395
/// Types of corrections that can be applied
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CorrectionType {
    /// Character substitution (e.g., "0" -> "O")
    CharacterSubstitution,

    /// Dictionary lookup and replacement
    DictionaryCorrection,

    /// Contextual correction based on surrounding words
    ContextualCorrection,

    /// Pattern-based correction (e.g., "rn" -> "m")
    PatternCorrection,

    /// Manual review suggested
    ManualReview,
}
414
/// OCR post-processor for automatic text correction
///
/// Holds the correction tables and thresholds used to turn low-confidence OCR
/// words into ranked [`CorrectionSuggestion`]s. `OcrPostProcessor::new` seeds
/// the tables with common OCR confusions.
#[derive(Debug, Clone)]
pub struct OcrPostProcessor {
    /// Common OCR character confusions (character -> plausible alternatives)
    pub character_corrections: std::collections::HashMap<char, Vec<char>>,

    /// Dictionary of valid words (optional; enables dictionary-based corrections)
    pub dictionary: Option<std::collections::HashSet<String>>,

    /// Common pattern corrections (substring -> replacement)
    pub pattern_corrections: std::collections::HashMap<String, String>,

    /// Words with confidence below this threshold are considered for correction
    pub correction_threshold: f64,

    /// Maximum edit distance for dictionary-based corrections
    pub max_edit_distance: usize,
}
433
434impl OcrPostProcessor {
435 /// Create a new post-processor with common OCR corrections
436 pub fn new() -> Self {
437 let mut character_corrections = std::collections::HashMap::new();
438
439 // Common OCR character confusions
440 character_corrections.insert('0', vec!['O', 'o', 'Q']);
441 character_corrections.insert('O', vec!['0', 'Q', 'o']);
442 character_corrections.insert('1', vec!['l', 'I', '|']);
443 character_corrections.insert('l', vec!['1', 'I', '|']);
444 character_corrections.insert('I', vec!['1', 'l', '|']);
445 character_corrections.insert('S', vec!['5', '$']);
446 character_corrections.insert('5', vec!['S', '$']);
447 character_corrections.insert('2', vec!['Z', 'z']);
448 character_corrections.insert('Z', vec!['2', 'z']);
449
450 let mut pattern_corrections = std::collections::HashMap::new();
451 pattern_corrections.insert("rn".to_string(), "m".to_string());
452 pattern_corrections.insert("cl".to_string(), "d".to_string());
453 pattern_corrections.insert("fi".to_string(), "fi".to_string()); // ligature
454 pattern_corrections.insert("fl".to_string(), "fl".to_string()); // ligature
455
456 Self {
457 character_corrections,
458 dictionary: None,
459 pattern_corrections,
460 correction_threshold: 0.7,
461 max_edit_distance: 2,
462 }
463 }
464
465 /// Add a dictionary for word validation
466 pub fn with_dictionary(mut self, dictionary: std::collections::HashSet<String>) -> Self {
467 self.dictionary = Some(dictionary);
468 self
469 }
470
471 /// Process a fragment and suggest corrections
472 pub fn process_fragment(&self, fragment: &OcrTextFragment) -> Vec<CorrectionCandidate> {
473 let mut candidates = fragment.get_correction_candidates(self.correction_threshold);
474
475 // Enhance candidates with suggestions
476 for candidate in &mut candidates {
477 candidate.suggested_corrections = self.generate_suggestions(&candidate.word);
478 }
479
480 candidates
481 }
482
483 /// Generate correction suggestions for a word
484 pub fn generate_suggestions(&self, word: &str) -> Vec<CorrectionSuggestion> {
485 let mut suggestions = Vec::new();
486
487 // Character substitution corrections
488 suggestions.extend(self.character_substitution_corrections(word));
489
490 // Pattern-based corrections
491 suggestions.extend(self.pattern_corrections(word));
492
493 // Dictionary corrections (if available)
494 if let Some(dict) = &self.dictionary {
495 suggestions.extend(self.dictionary_corrections(word, dict));
496 }
497
498 // Sort by confidence and limit results
499 suggestions.sort_by(|a, b| b.correction_confidence.total_cmp(&a.correction_confidence));
500 suggestions.truncate(5); // Limit to top 5 suggestions
501
502 suggestions
503 }
504
505 /// Generate character substitution corrections
506 fn character_substitution_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
507 let mut suggestions = Vec::new();
508 let chars: Vec<char> = word.chars().collect();
509
510 for (i, &ch) in chars.iter().enumerate() {
511 if let Some(alternatives) = self.character_corrections.get(&ch) {
512 for &alt_ch in alternatives {
513 let mut corrected_chars = chars.clone();
514 corrected_chars[i] = alt_ch;
515 let corrected_word: String = corrected_chars.into_iter().collect();
516
517 suggestions.push(CorrectionSuggestion {
518 corrected_word,
519 correction_confidence: 0.8,
520 correction_type: CorrectionType::CharacterSubstitution,
521 explanation: Some(format!("'{}' -> '{}' substitution", ch, alt_ch)),
522 });
523 }
524 }
525 }
526
527 suggestions
528 }
529
530 /// Generate pattern-based corrections
531 fn pattern_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
532 let mut suggestions = Vec::new();
533
534 for (pattern, replacement) in &self.pattern_corrections {
535 if word.contains(pattern) {
536 let corrected_word = word.replace(pattern, replacement);
537 suggestions.push(CorrectionSuggestion {
538 corrected_word,
539 correction_confidence: 0.85,
540 correction_type: CorrectionType::PatternCorrection,
541 explanation: Some(format!(
542 "Pattern '{}' -> '{}' correction",
543 pattern, replacement
544 )),
545 });
546 }
547 }
548
549 suggestions
550 }
551
552 /// Generate dictionary-based corrections
553 fn dictionary_corrections(
554 &self,
555 word: &str,
556 dictionary: &std::collections::HashSet<String>,
557 ) -> Vec<CorrectionSuggestion> {
558 let mut suggestions = Vec::new();
559
560 // Check if word is already valid
561 if dictionary.contains(word) {
562 return suggestions;
563 }
564
565 // Find similar words using simple edit distance
566 for dict_word in dictionary {
567 if self.edit_distance(word, dict_word) <= self.max_edit_distance {
568 let confidence = 1.0
569 - (self.edit_distance(word, dict_word) as f64
570 / word.len().max(dict_word.len()) as f64);
571 suggestions.push(CorrectionSuggestion {
572 corrected_word: dict_word.clone(),
573 correction_confidence: confidence * 0.9, // Slightly lower than pattern corrections
574 correction_type: CorrectionType::DictionaryCorrection,
575 explanation: Some(format!(
576 "Dictionary match with edit distance {}",
577 self.edit_distance(word, dict_word)
578 )),
579 });
580 }
581 }
582
583 suggestions
584 }
585
586 /// Calculate simple edit distance (Levenshtein distance)
587 fn edit_distance(&self, s1: &str, s2: &str) -> usize {
588 let len1 = s1.len();
589 let len2 = s2.len();
590
591 let mut dp = vec![vec![0; len2 + 1]; len1 + 1];
592
593 #[allow(clippy::needless_range_loop)]
594 for i in 0..=len1 {
595 dp[i][0] = i;
596 }
597 for j in 0..=len2 {
598 dp[0][j] = j;
599 }
600
601 let s1_chars: Vec<char> = s1.chars().collect();
602 let s2_chars: Vec<char> = s2.chars().collect();
603
604 for i in 1..=len1 {
605 for j in 1..=len2 {
606 if s1_chars[i - 1] == s2_chars[j - 1] {
607 dp[i][j] = dp[i - 1][j - 1];
608 } else {
609 dp[i][j] = 1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1]);
610 }
611 }
612 }
613
614 dp[len1][len2]
615 }
616}
617
618impl Default for OcrPostProcessor {
619 fn default() -> Self {
620 Self::new()
621 }
622}
623
/// Text fragment extracted by OCR with position and confidence information
///
/// Positions and sizes are expressed in page coordinates (points).
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The extracted text content
    pub text: String,

    /// X position in page coordinates (points)
    pub x: f64,

    /// Y position in page coordinates (points)
    pub y: f64,

    /// Width of the text fragment (points)
    pub width: f64,

    /// Height of the text fragment (points)
    pub height: f64,

    /// Confidence score for this fragment (0.0 to 1.0)
    pub confidence: f64,

    /// Word-level confidence scores (optional, for advanced OCR engines)
    pub word_confidences: Option<Vec<WordConfidence>>,

    /// Font size estimation (points)
    pub font_size: f64,

    /// Whether this fragment is part of a word or line
    pub fragment_type: FragmentType,
}
654
655impl OcrTextFragment {
656 /// Create a new OCR text fragment
657 #[allow(clippy::too_many_arguments)]
658 pub fn new(
659 text: String,
660 x: f64,
661 y: f64,
662 width: f64,
663 height: f64,
664 confidence: f64,
665 font_size: f64,
666 fragment_type: FragmentType,
667 ) -> Self {
668 Self {
669 text,
670 x,
671 y,
672 width,
673 height,
674 confidence,
675 word_confidences: None,
676 font_size,
677 fragment_type,
678 }
679 }
680
681 /// Create a fragment with word-level confidence scores
682 #[allow(clippy::too_many_arguments)]
683 pub fn with_word_confidences(
684 text: String,
685 x: f64,
686 y: f64,
687 width: f64,
688 height: f64,
689 confidence: f64,
690 font_size: f64,
691 fragment_type: FragmentType,
692 word_confidences: Vec<WordConfidence>,
693 ) -> Self {
694 Self {
695 text,
696 x,
697 y,
698 width,
699 height,
700 confidence,
701 word_confidences: Some(word_confidences),
702 font_size,
703 fragment_type,
704 }
705 }
706
707 /// Get words with confidence below the threshold
708 pub fn get_low_confidence_words(&self, threshold: f64) -> Vec<&WordConfidence> {
709 self.word_confidences
710 .as_ref()
711 .map(|words| words.iter().filter(|w| w.confidence < threshold).collect())
712 .unwrap_or_default()
713 }
714
715 /// Get the average word confidence if available
716 pub fn average_word_confidence(&self) -> Option<f64> {
717 self.word_confidences.as_ref().map(|words| {
718 if words.is_empty() {
719 return 0.0;
720 }
721 let sum: f64 = words.iter().map(|w| w.confidence).sum();
722 sum / words.len() as f64
723 })
724 }
725
726 /// Get words sorted by confidence (lowest first)
727 pub fn words_by_confidence(&self) -> Vec<&WordConfidence> {
728 self.word_confidences
729 .as_ref()
730 .map(|words| {
731 let mut sorted_words: Vec<_> = words.iter().collect();
732 sorted_words.sort_by(|a, b| a.confidence.total_cmp(&b.confidence));
733 sorted_words
734 })
735 .unwrap_or_default()
736 }
737
738 /// Check if this fragment has any low-confidence words
739 pub fn has_low_confidence_words(&self, threshold: f64) -> bool {
740 self.word_confidences
741 .as_ref()
742 .map(|words| words.iter().any(|w| w.confidence < threshold))
743 .unwrap_or(false)
744 }
745
746 /// Get words that are candidates for correction (low confidence + patterns)
747 pub fn get_correction_candidates(&self, threshold: f64) -> Vec<CorrectionCandidate> {
748 self.word_confidences
749 .as_ref()
750 .map(|words| {
751 words
752 .iter()
753 .enumerate()
754 .filter(|(_, w)| w.confidence < threshold)
755 .map(|(index, word)| CorrectionCandidate {
756 word: word.word.clone(),
757 confidence: word.confidence,
758 position_in_fragment: index,
759 suggested_corrections: vec![], // Will be filled by post-processor
760 correction_reason: CorrectionReason::LowConfidence,
761 })
762 .collect()
763 })
764 .unwrap_or_default()
765 }
766
767 /// Generate a confidence report for this fragment
768 pub fn confidence_report(&self) -> String {
769 let mut report = format!(
770 "Fragment confidence: {:.1}% - \"{}\"\n",
771 self.confidence * 100.0,
772 self.text.trim()
773 );
774
775 if let Some(words) = &self.word_confidences {
776 report.push_str(&format!(
777 " Word-level breakdown ({} words):\n",
778 words.len()
779 ));
780 for (i, word) in words.iter().enumerate() {
781 report.push_str(&format!(
782 " {}: \"{}\" - {:.1}%\n",
783 i + 1,
784 word.word,
785 word.confidence * 100.0
786 ));
787
788 if let Some(chars) = &word.character_confidences {
789 report.push_str(" Characters: ");
790 for ch in chars {
791 report.push_str(&format!(
792 "'{}'({:.0}%) ",
793 ch.character,
794 ch.confidence * 100.0
795 ));
796 }
797 report.push('\n');
798 }
799 }
800 } else {
801 report.push_str(" (No word-level data available)\n");
802 }
803
804 report
805 }
806}
807
/// Granularity of a text fragment produced by OCR
///
/// Indicates whether an [`OcrTextFragment`] represents a single character,
/// a word, a full line, or a paragraph.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// Individual character
    Character,
    /// Complete word
    Word,
    /// Text line
    Line,
    /// Paragraph
    Paragraph,
}
820
/// Complete result of OCR processing
///
/// Returned by [`OcrProvider::process_image`] and related methods.
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// The complete extracted text
    pub text: String,

    /// Overall confidence score (0.0 to 1.0)
    pub confidence: f64,

    /// Individual text fragments with position information
    pub fragments: Vec<OcrTextFragment>,

    /// Processing time in milliseconds, as reported by the provider
    pub processing_time_ms: u64,

    /// OCR engine used for processing
    pub engine_name: String,

    /// Language detected/used
    pub language: String,

    /// Region that was processed (None if entire image was processed)
    pub processed_region: Option<OcrRegion>,

    /// Image dimensions that were processed
    // NOTE(review): presumably (width, height) in pixels — confirm with providers
    pub image_dimensions: (u32, u32),
}
848
849impl OcrProcessingResult {
850 /// Create a new OCR processing result
851 pub fn new(
852 text: String,
853 confidence: f64,
854 fragments: Vec<OcrTextFragment>,
855 processing_time_ms: u64,
856 engine_name: String,
857 language: String,
858 image_dimensions: (u32, u32),
859 ) -> Self {
860 Self {
861 text,
862 confidence,
863 fragments,
864 processing_time_ms,
865 engine_name,
866 language,
867 processed_region: None,
868 image_dimensions,
869 }
870 }
871
872 /// Create a new OCR processing result for a specific region
873 #[allow(clippy::too_many_arguments)]
874 pub fn with_region(
875 text: String,
876 confidence: f64,
877 fragments: Vec<OcrTextFragment>,
878 processing_time_ms: u64,
879 engine_name: String,
880 language: String,
881 image_dimensions: (u32, u32),
882 region: OcrRegion,
883 ) -> Self {
884 Self {
885 text,
886 confidence,
887 fragments,
888 processing_time_ms,
889 engine_name,
890 language,
891 processed_region: Some(region),
892 image_dimensions,
893 }
894 }
895
896 /// Filter fragments by minimum confidence
897 pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
898 self.fragments
899 .iter()
900 .filter(|fragment| fragment.confidence >= min_confidence)
901 .collect()
902 }
903
904 /// Get text fragments within a specific region
905 pub fn fragments_in_region(
906 &self,
907 x: f64,
908 y: f64,
909 width: f64,
910 height: f64,
911 ) -> Vec<&OcrTextFragment> {
912 self.fragments
913 .iter()
914 .filter(|fragment| {
915 fragment.x >= x
916 && fragment.y >= y
917 && fragment.x + fragment.width <= x + width
918 && fragment.y + fragment.height <= y + height
919 })
920 .collect()
921 }
922
923 /// Get fragments of a specific type
924 pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
925 self.fragments
926 .iter()
927 .filter(|fragment| fragment.fragment_type == fragment_type)
928 .collect()
929 }
930
931 /// Calculate average confidence for all fragments
932 pub fn average_confidence(&self) -> f64 {
933 if self.fragments.is_empty() {
934 return 0.0;
935 }
936
937 let sum: f64 = self.fragments.iter().map(|f| f.confidence).sum();
938 sum / self.fragments.len() as f64
939 }
940}
941
/// Supported OCR engines
///
/// Identifies which backend produced (or should produce) OCR results; see
/// [`OcrEngine::supports_format`] for per-engine image format support.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// Mock OCR provider for testing
    Mock,
    /// Tesseract OCR (local processing)
    Tesseract,
    /// Azure Computer Vision OCR
    Azure,
    /// AWS Textract
    Aws,
    /// Google Cloud Vision OCR
    GoogleCloud,
}
956
957impl OcrEngine {
958 /// Get the name of the OCR engine
959 pub fn name(&self) -> &'static str {
960 match self {
961 OcrEngine::Mock => "Mock OCR",
962 OcrEngine::Tesseract => "Tesseract",
963 OcrEngine::Azure => "Azure Computer Vision",
964 OcrEngine::Aws => "AWS Textract",
965 OcrEngine::GoogleCloud => "Google Cloud Vision",
966 }
967 }
968
969 /// Check if this engine supports the given image format
970 pub fn supports_format(&self, format: ImageFormat) -> bool {
971 match self {
972 OcrEngine::Mock => true, // Mock supports all formats
973 OcrEngine::Tesseract => matches!(
974 format,
975 ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
976 ),
977 OcrEngine::Azure => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
978 OcrEngine::Aws => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
979 OcrEngine::GoogleCloud => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
980 }
981 }
982}
983
984impl fmt::Display for OcrEngine {
985 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
986 write!(f, "{}", self.name())
987 }
988}
989
990/// Trait for OCR providers
991///
992/// This trait defines the interface that all OCR providers must implement.
993/// It provides methods for processing images and extracting text with position information.
994///
995/// # Implementation Notes
996///
997/// - Implementations should handle errors gracefully and return meaningful error messages
998/// - The `process_image` method is the core functionality that all providers must implement
999/// - The `process_page` method is a convenience method for working with page analysis results
1000/// - Providers should validate image formats and reject unsupported formats
1001///
1002/// # Examples
1003///
1004/// ```rust
1005/// use oxidize_pdf::text::{OcrProvider, OcrOptions, OcrProcessingResult, OcrError, OcrEngine};
1006/// use oxidize_pdf::graphics::ImageFormat;
1007///
1008/// struct MyOcrProvider;
1009///
1010/// impl OcrProvider for MyOcrProvider {
1011/// fn process_image(&self, image_data: &[u8], options: &OcrOptions) -> Result<OcrProcessingResult, OcrError> {
1012/// // Implementation here
1013/// # Ok(OcrProcessingResult {
1014/// # text: "Sample text".to_string(),
1015/// # confidence: 0.95,
1016/// # fragments: vec![],
1017/// # processing_time_ms: 100,
1018/// # engine_name: "MyOCR".to_string(),
1019/// # language: "en".to_string(),
1020/// # image_dimensions: (800, 600),
1021/// # processed_region: None,
1022/// # })
1023/// }
1024///
1025/// fn supported_formats(&self) -> Vec<ImageFormat> {
1026/// vec![ImageFormat::Jpeg, ImageFormat::Png]
1027/// }
1028///
1029/// fn engine_name(&self) -> &str {
1030/// "MyOCR"
1031/// }
1032///
1033/// fn engine_type(&self) -> OcrEngine {
1034/// OcrEngine::Mock
1035/// }
1036/// }
1037/// ```
1038pub trait OcrProvider: Send + Sync {
1039 /// Process an image and extract text using OCR
1040 ///
1041 /// This is the core method that all OCR providers must implement.
1042 /// It takes image data as bytes and returns structured text results.
1043 ///
1044 /// # Arguments
1045 ///
1046 /// * `image_data` - Raw image bytes (JPEG, PNG, or TIFF)
1047 /// * `options` - OCR processing options and configuration
1048 ///
1049 /// # Returns
1050 ///
1051 /// A `Result` containing the OCR results with text, confidence, and positioning information.
1052 ///
1053 /// # Errors
1054 ///
1055 /// Returns an error if:
1056 /// - The image format is not supported
1057 /// - The image data is corrupted or invalid
1058 /// - OCR processing fails
1059 /// - Network errors occur (for cloud providers)
1060 /// - Authentication fails (for cloud providers)
1061 fn process_image(
1062 &self,
1063 image_data: &[u8],
1064 options: &OcrOptions,
1065 ) -> OcrResult<OcrProcessingResult>;
1066
1067 /// Process a scanned page using content analysis information
1068 ///
1069 /// This method provides a higher-level interface that works with page analysis results.
1070 /// It's particularly useful when integrating with the page analysis module.
1071 ///
1072 /// # Arguments
1073 ///
1074 /// * `page_analysis` - Results from page content analysis
1075 /// * `page_data` - Raw page data or image data
1076 /// * `options` - OCR processing options
1077 ///
1078 /// # Returns
1079 ///
1080 /// OCR results optimized for the specific page content type.
1081 ///
1082 /// # Default Implementation
1083 ///
1084 /// The default implementation simply calls `process_image` with the page data.
1085 /// Providers can override this to provide specialized handling based on page analysis.
1086 fn process_page(
1087 &self,
1088 _page_analysis: &ContentAnalysis,
1089 page_data: &[u8],
1090 options: &OcrOptions,
1091 ) -> OcrResult<OcrProcessingResult> {
1092 self.process_image(page_data, options)
1093 }
1094
1095 /// Process multiple images with region information
1096 ///
1097 /// This method allows for selective OCR processing where each image corresponds
1098 /// to a specific region. This is useful for:
1099 /// - Processing pre-cropped regions of a document
1100 /// - Batch processing of multiple regions with different OCR settings
1101 /// - Optimizing performance by avoiding full-image processing
1102 ///
1103 /// # Arguments
1104 ///
1105 /// * `image_region_pairs` - Vector of (image_data, region) pairs
1106 /// * `options` - OCR processing options (applies to all regions)
1107 ///
1108 /// # Returns
1109 ///
1110 /// A vector of `OcrProcessingResult`, one for each processed region.
1111 /// The order matches the input pairs vector.
1112 ///
1113 /// # Default Implementation
1114 ///
1115 /// The default implementation processes each image separately and sets
1116 /// the region information in the result.
1117 fn process_image_regions(
1118 &self,
1119 image_region_pairs: &[(&[u8], &OcrRegion)],
1120 options: &OcrOptions,
1121 ) -> OcrResult<Vec<OcrProcessingResult>> {
1122 let mut results = Vec::with_capacity(image_region_pairs.len());
1123
1124 for (image_data, region) in image_region_pairs {
1125 let mut result = self.process_image(image_data, options)?;
1126
1127 // Adjust fragment coordinates to match original image coordinates
1128 // (assuming the input image_data is already cropped to the region)
1129 for fragment in &mut result.fragments {
1130 fragment.x += region.x as f64;
1131 fragment.y += region.y as f64;
1132 }
1133
1134 result.processed_region = Some((*region).clone());
1135 results.push(result);
1136 }
1137
1138 Ok(results)
1139 }
1140
    /// Get the list of supported image formats
    ///
    /// # Returns
    ///
    /// A vector of `ImageFormat` values that this provider can process.
    fn supported_formats(&self) -> Vec<ImageFormat>;

    /// Get the human-readable name of this OCR provider
    ///
    /// # Returns
    ///
    /// A string identifying this provider (e.g., "Tesseract", "Azure OCR").
    fn engine_name(&self) -> &str;

    /// Get the engine type for this provider
    ///
    /// # Returns
    ///
    /// The `OcrEngine` enum value corresponding to this provider.
    fn engine_type(&self) -> OcrEngine;
1161
1162 /// Check if this provider supports the given image format
1163 ///
1164 /// # Arguments
1165 ///
1166 /// * `format` - The image format to check
1167 ///
1168 /// # Returns
1169 ///
1170 /// `true` if the format is supported, `false` otherwise.
1171 fn supports_format(&self, format: ImageFormat) -> bool {
1172 self.supported_formats().contains(&format)
1173 }
1174
1175 /// Validate image data before processing
1176 ///
1177 /// This method can be used to perform basic validation of image data
1178 /// before attempting OCR processing.
1179 ///
1180 /// # Arguments
1181 ///
1182 /// * `image_data` - Raw image bytes to validate
1183 ///
1184 /// # Returns
1185 ///
1186 /// `Ok(())` if the image data is valid, `Err(OcrError)` otherwise.
1187 ///
1188 /// # Default Implementation
1189 ///
1190 /// The default implementation performs basic format detection based on magic bytes.
1191 fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
1192 if image_data.len() < 8 {
1193 return Err(OcrError::InvalidImageData(
1194 "Image data too short".to_string(),
1195 ));
1196 }
1197
1198 // Check for common image format signatures
1199 let format = if image_data.starts_with(b"\xFF\xD8\xFF") {
1200 ImageFormat::Jpeg
1201 } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
1202 ImageFormat::Png
1203 } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
1204 ImageFormat::Tiff
1205 } else {
1206 return Err(OcrError::InvalidImageData(
1207 "Unrecognized image format".to_string(),
1208 ));
1209 };
1210
1211 if !self.supports_format(format) {
1212 return Err(OcrError::UnsupportedImageFormat(format));
1213 }
1214
1215 Ok(())
1216 }
1217}
1218
/// Mock OCR provider for testing and development
///
/// This provider simulates OCR processing without actually performing text recognition.
/// It's useful for testing OCR workflows and developing OCR-dependent functionality.
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// let provider = MockOcrProvider::new();
/// let options = OcrOptions::default();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46]; // Mock JPEG data
///
/// let result = provider.process_image(&image_data, &options).unwrap();
/// assert!(result.text.contains("Mock OCR"));
/// ```
#[derive(Clone)]
pub struct MockOcrProvider {
    /// Mock confidence level to return (expected range 0.0 - 1.0)
    confidence: f64,
    /// Mock text returned for every processed image
    mock_text: String,
    /// Simulated processing delay (milliseconds) applied to each call
    processing_delay_ms: u64,
}
1245
1246impl MockOcrProvider {
1247 /// Create a new mock OCR provider with default settings
1248 pub fn new() -> Self {
1249 Self {
1250 confidence: 0.85,
1251 mock_text: "Mock OCR extracted text from scanned image".to_string(),
1252 processing_delay_ms: 100,
1253 }
1254 }
1255
1256 /// Create a mock provider with custom text and confidence
1257 pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
1258 Self {
1259 confidence,
1260 mock_text: text,
1261 processing_delay_ms: 100,
1262 }
1263 }
1264
1265 /// Set the mock text to return
1266 pub fn set_mock_text(&mut self, text: String) {
1267 self.mock_text = text;
1268 }
1269
1270 /// Set the confidence level to return
1271 pub fn set_confidence(&mut self, confidence: f64) {
1272 self.confidence = confidence.clamp(0.0, 1.0);
1273 }
1274
1275 /// Set the simulated processing delay
1276 pub fn set_processing_delay(&mut self, delay_ms: u64) {
1277 self.processing_delay_ms = delay_ms;
1278 }
1279}
1280
1281impl Default for MockOcrProvider {
1282 fn default() -> Self {
1283 Self::new()
1284 }
1285}
1286
1287impl OcrProvider for MockOcrProvider {
1288 fn process_image(
1289 &self,
1290 image_data: &[u8],
1291 options: &OcrOptions,
1292 ) -> OcrResult<OcrProcessingResult> {
1293 // Validate image data
1294 self.validate_image_data(image_data)?;
1295
1296 // Simulate processing time
1297 std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));
1298
1299 // Create mock text fragments
1300 let fragments = vec![
1301 OcrTextFragment {
1302 text: self.mock_text.clone(),
1303 x: 50.0,
1304 y: 700.0,
1305 width: 200.0,
1306 height: 20.0,
1307 confidence: self.confidence,
1308 word_confidences: None,
1309 font_size: 12.0,
1310 fragment_type: FragmentType::Line,
1311 },
1312 OcrTextFragment {
1313 text: "Additional mock text".to_string(),
1314 x: 50.0,
1315 y: 680.0,
1316 width: 150.0,
1317 height: 20.0,
1318 confidence: self.confidence * 0.9,
1319 word_confidences: None,
1320 font_size: 12.0,
1321 fragment_type: FragmentType::Line,
1322 },
1323 ];
1324
1325 Ok(OcrProcessingResult {
1326 text: format!("{}\nAdditional mock text", self.mock_text),
1327 confidence: self.confidence,
1328 fragments,
1329 processing_time_ms: self.processing_delay_ms,
1330 engine_name: "Mock OCR".to_string(),
1331 language: options.language.clone(),
1332 processed_region: None,
1333 image_dimensions: (800, 600), // Mock dimensions
1334 })
1335 }
1336
1337 fn supported_formats(&self) -> Vec<ImageFormat> {
1338 vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
1339 }
1340
1341 fn engine_name(&self) -> &str {
1342 "Mock OCR"
1343 }
1344
1345 fn engine_type(&self) -> OcrEngine {
1346 OcrEngine::Mock
1347 }
1348}
1349
1350#[cfg(test)]
1351mod tests;
1352
1353#[cfg(test)]
1354mod postprocessor_tests;
1355
1356#[cfg(test)]
1357mod rigorous_tests;