oxidize_pdf/text/ocr/mod.rs
//! OCR (Optical Character Recognition) support for PDF processing
//!
//! This module provides a flexible, pluggable architecture for integrating OCR capabilities
//! into PDF processing workflows. It's designed to work seamlessly with the page analysis
//! module to process scanned pages and extract text from images.
//!
//! # Architecture
//!
//! The OCR system uses a trait-based approach that allows for multiple OCR providers:
//!
//! - **OcrProvider trait**: Generic interface for OCR engines
//! - **Pluggable implementations**: Support for local (Tesseract) and cloud (Azure, AWS) providers
//! - **Result standardization**: Consistent output format regardless of provider
//! - **Error handling**: Comprehensive error types for OCR operations
//!
//! # Usage
//!
//! ## Basic OCR Processing
//!
//! ```rust
//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
//! use oxidize_pdf::graphics::ImageFormat;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let provider = MockOcrProvider::new();
//! let options = OcrOptions::default();
//!
//! // Process image data directly - Mock JPEG data
//! let image_data = vec![
//!     0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
//!     0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9,
//! ];
//! let result = provider.process_image(&image_data, &options)?;
//!
//! println!("Extracted text: {}", result.text);
//! println!("Confidence: {:.2}%", result.confidence * 100.0);
//!
//! for fragment in result.fragments {
//!     println!("Fragment: '{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! ## Integration with Page Analysis
//!
//! ```rust,no_run
//! use oxidize_pdf::operations::page_analysis::PageContentAnalyzer;
//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
//! use oxidize_pdf::parser::PdfReader;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let document = PdfReader::open_document("scanned.pdf")?;
//! let analyzer = PageContentAnalyzer::new(document);
//! let provider = MockOcrProvider::new();
//!
//! // Find scanned pages
//! let scanned_pages = analyzer.find_scanned_pages()?;
//!
//! for page_num in scanned_pages {
//!     let analysis = analyzer.analyze_page(page_num)?;
//!     if analysis.is_scanned() {
//!         println!("Processing scanned page {}", page_num);
//!         // OCR processing would happen here
//!     }
//! }
//! # Ok(())
//! # }
//! ```

use crate::graphics::ImageFormat;
use crate::operations::page_analysis::ContentAnalysis;
use std::fmt;

/// Result type for OCR operations
pub type OcrResult<T> = Result<T, OcrError>;

/// Errors that can occur during OCR processing
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// OCR provider is not available or not configured
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),

    /// Unsupported image format for OCR processing
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),

    /// Invalid or corrupted image data
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),

    /// OCR processing failed
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),

    /// Network error when using cloud OCR providers
    #[error("Network error: {0}")]
    NetworkError(String),

    /// API key or authentication error
    #[error("Authentication error: {0}")]
    AuthenticationError(String),

    /// Rate limiting or quota exceeded
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// OCR provider returned low confidence results
    #[error("Low confidence results: {0}")]
    LowConfidence(String),

    /// Generic IO error
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    Configuration(String),
}

/// A rectangular region for selective OCR processing
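///
/// # Example
///
/// A minimal usage sketch (this assumes `OcrRegion` is re-exported from
/// `oxidize_pdf::text` like the other types shown in the module examples):
///
/// ```rust
/// use oxidize_pdf::text::OcrRegion;
///
/// let header = OcrRegion::with_label(0, 0, 600, 100, "header");
/// let body = OcrRegion::new(0, 80, 600, 700);
///
/// assert!(header.contains_point(10, 10));
/// assert!(!header.contains_point(10, 100)); // bottom edge is exclusive
/// assert!(header.overlaps_with(&body)); // rows 80..100 are shared
/// ```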
#[derive(Debug, Clone, PartialEq)]
pub struct OcrRegion {
    /// X coordinate of the top-left corner (pixels)
    pub x: u32,

    /// Y coordinate of the top-left corner (pixels)
    pub y: u32,

    /// Width of the region (pixels)
    pub width: u32,

    /// Height of the region (pixels)
    pub height: u32,

    /// Optional label for this region (e.g., "header", "table", "paragraph")
    pub label: Option<String>,
}

impl OcrRegion {
    /// Create a new OCR region
    pub fn new(x: u32, y: u32, width: u32, height: u32) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: None,
        }
    }

    /// Create a new OCR region with a label
    pub fn with_label(x: u32, y: u32, width: u32, height: u32, label: impl Into<String>) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: Some(label.into()),
        }
    }

    /// Check if this region contains a point
    pub fn contains_point(&self, x: u32, y: u32) -> bool {
        x >= self.x && x < self.x + self.width && y >= self.y && y < self.y + self.height
    }

    /// Check if this region overlaps with another region
    pub fn overlaps_with(&self, other: &OcrRegion) -> bool {
        !(self.x + self.width <= other.x
            || other.x + other.width <= self.x
            || self.y + self.height <= other.y
            || other.y + other.height <= self.y)
    }
}

/// OCR processing options and configuration
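///
/// # Example
///
/// A minimal configuration sketch; unspecified fields keep their defaults:
///
/// ```rust
/// use oxidize_pdf::text::OcrOptions;
///
/// let options = OcrOptions {
///     language: "es".to_string(), // Spanish
///     min_confidence: 0.8,        // discard low-confidence fragments
///     ..Default::default()
/// };
/// assert!(options.preserve_layout); // default still applies
/// ```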
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Target language for OCR (ISO 639-1 code, e.g., "en", "es", "fr")
    pub language: String,

    /// Minimum confidence threshold (0.0 to 1.0)
    pub min_confidence: f64,

    /// Whether to preserve text layout and positioning
    pub preserve_layout: bool,

    /// Image preprocessing options
    pub preprocessing: ImagePreprocessing,

    /// OCR engine specific options
    pub engine_options: std::collections::HashMap<String, String>,

    /// Timeout for OCR operations (in seconds)
    pub timeout_seconds: u32,

    /// Specific regions to process (None = process entire image)
    pub regions: Option<Vec<OcrRegion>>,

    /// Whether to save extracted images for debug purposes
    pub debug_output: bool,
}

impl Default for OcrOptions {
    fn default() -> Self {
        Self {
            language: "en".to_string(),
            min_confidence: 0.6,
            preserve_layout: true,
            preprocessing: ImagePreprocessing::default(),
            engine_options: std::collections::HashMap::new(),
            timeout_seconds: 60, // Increased for complex documents
            regions: None,
            debug_output: false,
        }
    }
}

/// Image preprocessing options for OCR
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Whether to apply image denoising
    pub denoise: bool,

    /// Whether to apply image deskewing
    pub deskew: bool,

    /// Whether to enhance contrast
    pub enhance_contrast: bool,

    /// Whether to apply image sharpening
    pub sharpen: bool,

    /// Scale factor for image resizing (1.0 = no scaling)
    pub scale_factor: f64,
}

impl Default for ImagePreprocessing {
    fn default() -> Self {
        Self {
            denoise: true,
            deskew: true,
            enhance_contrast: true,
            sharpen: false,
            scale_factor: 1.0,
        }
    }
}

/// Word-level confidence information for detailed OCR analysis
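///
/// # Example
///
/// A short sketch (this assumes `WordConfidence` is re-exported from
/// `oxidize_pdf::text` like the other types in this module):
///
/// ```rust
/// use oxidize_pdf::text::WordConfidence;
///
/// let word = WordConfidence::new("invoice".to_string(), 0.55, 0.0, 48.0);
/// assert!(word.is_low_confidence(0.6));
/// assert!(word.average_character_confidence().is_none()); // no char-level data
/// ```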
#[derive(Debug, Clone)]
pub struct WordConfidence {
    /// The word text
    pub word: String,

    /// Confidence score for this specific word (0.0 to 1.0)
    pub confidence: f64,

    /// X position of the word within the fragment (relative to fragment start)
    pub x_offset: f64,

    /// Width of the word in points
    pub width: f64,

    /// Optional character-level confidences (finest granularity)
    pub character_confidences: Option<Vec<CharacterConfidence>>,
}

impl WordConfidence {
    /// Create a new word confidence
    pub fn new(word: String, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: None,
        }
    }

    /// Create a word confidence with character-level details
    pub fn with_characters(
        word: String,
        confidence: f64,
        x_offset: f64,
        width: f64,
        character_confidences: Vec<CharacterConfidence>,
    ) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: Some(character_confidences),
        }
    }

    /// Get the average character confidence if available
    pub fn average_character_confidence(&self) -> Option<f64> {
        self.character_confidences.as_ref().map(|chars| {
            if chars.is_empty() {
                return 0.0;
            }
            let sum: f64 = chars.iter().map(|c| c.confidence).sum();
            sum / chars.len() as f64
        })
    }

    /// Check if this word has low confidence (below threshold)
    pub fn is_low_confidence(&self, threshold: f64) -> bool {
        self.confidence < threshold
    }
}

/// Character-level confidence information for the finest OCR granularity
#[derive(Debug, Clone)]
pub struct CharacterConfidence {
    /// The character
    pub character: char,

    /// Confidence score for this character (0.0 to 1.0)
    pub confidence: f64,

    /// X position relative to word start
    pub x_offset: f64,

    /// Character width in points
    pub width: f64,
}

impl CharacterConfidence {
    /// Create a new character confidence
    pub fn new(character: char, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            character,
            confidence,
            x_offset,
            width,
        }
    }
}

/// Candidate for OCR post-processing correction
#[derive(Debug, Clone)]
pub struct CorrectionCandidate {
    /// The original word with low confidence or errors
    pub word: String,

    /// Original confidence score
    pub confidence: f64,

    /// Position within the text fragment
    pub position_in_fragment: usize,

    /// Suggested corrections ranked by likelihood
    pub suggested_corrections: Vec<CorrectionSuggestion>,

    /// Reason why this word needs correction
    pub correction_reason: CorrectionReason,
}

/// A suggested correction for an OCR error
#[derive(Debug, Clone)]
pub struct CorrectionSuggestion {
    /// The corrected word
    pub corrected_word: String,

    /// Confidence in this correction (0.0 to 1.0)
    pub correction_confidence: f64,

    /// Type of correction applied
    pub correction_type: CorrectionType,

    /// Explanation of why this correction was suggested
    pub explanation: Option<String>,
}

/// Reasons why a word might need correction
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorrectionReason {
    /// Word has low OCR confidence
    LowConfidence,

    /// Word contains common OCR confusion patterns
    ConfusionPattern,

    /// Word not found in dictionary
    NotInDictionary,

    /// Word doesn't fit context
    ContextualError,

    /// Word has suspicious character combinations
    SuspiciousPattern,
}

/// Types of corrections that can be applied
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CorrectionType {
    /// Character substitution (e.g., "0" -> "O")
    CharacterSubstitution,

    /// Dictionary lookup and replacement
    DictionaryCorrection,

    /// Contextual correction based on surrounding words
    ContextualCorrection,

    /// Pattern-based correction (e.g., "rn" -> "m")
    PatternCorrection,

    /// Manual review suggested
    ManualReview,
}

/// OCR post-processor for automatic text correction
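///
/// # Example
///
/// A minimal sketch of suggestion generation (this assumes `OcrPostProcessor`
/// is re-exported from `oxidize_pdf::text` like the other types in this module):
///
/// ```rust
/// use oxidize_pdf::text::OcrPostProcessor;
///
/// let processor = OcrPostProcessor::new();
///
/// // "c0de" contains the common 0 -> O confusion, so a character
/// // substitution suggestion should be among the candidates.
/// let suggestions = processor.generate_suggestions("c0de");
/// assert!(suggestions.iter().any(|s| s.corrected_word == "cOde"));
/// ```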
#[derive(Debug, Clone)]
pub struct OcrPostProcessor {
    /// Common OCR character confusions
    pub character_corrections: std::collections::HashMap<char, Vec<char>>,

    /// Dictionary of valid words (optional)
    pub dictionary: Option<std::collections::HashSet<String>>,

    /// Common pattern corrections
    pub pattern_corrections: std::collections::HashMap<String, String>,

    /// Confidence threshold for correction
    pub correction_threshold: f64,

    /// Maximum edit distance for corrections
    pub max_edit_distance: usize,
}

impl OcrPostProcessor {
    /// Create a new post-processor with common OCR corrections
    pub fn new() -> Self {
        let mut character_corrections = std::collections::HashMap::new();

        // Common OCR character confusions
        character_corrections.insert('0', vec!['O', 'o', 'Q']);
        character_corrections.insert('O', vec!['0', 'Q', 'o']);
        character_corrections.insert('1', vec!['l', 'I', '|']);
        character_corrections.insert('l', vec!['1', 'I', '|']);
        character_corrections.insert('I', vec!['1', 'l', '|']);
        character_corrections.insert('S', vec!['5', '$']);
        character_corrections.insert('5', vec!['S', '$']);
        character_corrections.insert('2', vec!['Z', 'z']);
        character_corrections.insert('Z', vec!['2', 'z']);

        let mut pattern_corrections = std::collections::HashMap::new();
        pattern_corrections.insert("rn".to_string(), "m".to_string());
        pattern_corrections.insert("cl".to_string(), "d".to_string());
        // Unicode ligatures normalized to their ASCII letter pairs
        pattern_corrections.insert("\u{FB01}".to_string(), "fi".to_string()); // "fi" ligature
        pattern_corrections.insert("\u{FB02}".to_string(), "fl".to_string()); // "fl" ligature

        Self {
            character_corrections,
            dictionary: None,
            pattern_corrections,
            correction_threshold: 0.7,
            max_edit_distance: 2,
        }
    }

    /// Add a dictionary for word validation
    pub fn with_dictionary(mut self, dictionary: std::collections::HashSet<String>) -> Self {
        self.dictionary = Some(dictionary);
        self
    }

    /// Process a fragment and suggest corrections
    pub fn process_fragment(&self, fragment: &OcrTextFragment) -> Vec<CorrectionCandidate> {
        let mut candidates = fragment.get_correction_candidates(self.correction_threshold);

        // Enhance candidates with suggestions
        for candidate in &mut candidates {
            candidate.suggested_corrections = self.generate_suggestions(&candidate.word);
        }

        candidates
    }

    /// Generate correction suggestions for a word
    pub fn generate_suggestions(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();

        // Character substitution corrections
        suggestions.extend(self.character_substitution_corrections(word));

        // Pattern-based corrections
        suggestions.extend(self.pattern_corrections(word));

        // Dictionary corrections (if available)
        if let Some(dict) = &self.dictionary {
            suggestions.extend(self.dictionary_corrections(word, dict));
        }

        // Sort by confidence and limit results
        suggestions.sort_by(|a, b| {
            b.correction_confidence
                .partial_cmp(&a.correction_confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        suggestions.truncate(5); // Limit to top 5 suggestions

        suggestions
    }

    /// Generate character substitution corrections
    fn character_substitution_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();
        let chars: Vec<char> = word.chars().collect();

        for (i, &ch) in chars.iter().enumerate() {
            if let Some(alternatives) = self.character_corrections.get(&ch) {
                for &alt_ch in alternatives {
                    let mut corrected_chars = chars.clone();
                    corrected_chars[i] = alt_ch;
                    let corrected_word: String = corrected_chars.into_iter().collect();

                    suggestions.push(CorrectionSuggestion {
                        corrected_word,
                        correction_confidence: 0.8,
                        correction_type: CorrectionType::CharacterSubstitution,
                        explanation: Some(format!("'{}' -> '{}' substitution", ch, alt_ch)),
                    });
                }
            }
        }

        suggestions
    }

    /// Generate pattern-based corrections
    fn pattern_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();

        for (pattern, replacement) in &self.pattern_corrections {
            if word.contains(pattern) {
                let corrected_word = word.replace(pattern, replacement);
                suggestions.push(CorrectionSuggestion {
                    corrected_word,
                    correction_confidence: 0.85,
                    correction_type: CorrectionType::PatternCorrection,
                    explanation: Some(format!(
                        "Pattern '{}' -> '{}' correction",
                        pattern, replacement
                    )),
                });
            }
        }

        suggestions
    }

    /// Generate dictionary-based corrections
    fn dictionary_corrections(
        &self,
        word: &str,
        dictionary: &std::collections::HashSet<String>,
    ) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();

        // Words already in the dictionary need no correction
        if dictionary.contains(word) {
            return suggestions;
        }

        // Find similar words within the configured edit distance
        for dict_word in dictionary {
            let distance = self.edit_distance(word, dict_word);
            if distance <= self.max_edit_distance {
                let max_len = word.chars().count().max(dict_word.chars().count());
                let confidence = 1.0 - (distance as f64 / max_len as f64);
                suggestions.push(CorrectionSuggestion {
                    corrected_word: dict_word.clone(),
                    correction_confidence: confidence * 0.9, // Slightly lower than pattern corrections
                    correction_type: CorrectionType::DictionaryCorrection,
                    explanation: Some(format!(
                        "Dictionary match with edit distance {}",
                        distance
                    )),
                });
            }
        }

        suggestions
    }

    /// Calculate the Levenshtein edit distance between two strings
    fn edit_distance(&self, s1: &str, s2: &str) -> usize {
        // Compare by characters rather than bytes so multi-byte UTF-8 input
        // indexes correctly into the DP table
        let s1_chars: Vec<char> = s1.chars().collect();
        let s2_chars: Vec<char> = s2.chars().collect();
        let len1 = s1_chars.len();
        let len2 = s2_chars.len();

        // dp[i][j] = distance between the first i chars of s1 and first j chars of s2
        let mut dp = vec![vec![0; len2 + 1]; len1 + 1];

        #[allow(clippy::needless_range_loop)]
        for i in 0..=len1 {
            dp[i][0] = i;
        }
        for j in 0..=len2 {
            dp[0][j] = j;
        }

        for i in 1..=len1 {
            for j in 1..=len2 {
                if s1_chars[i - 1] == s2_chars[j - 1] {
                    dp[i][j] = dp[i - 1][j - 1];
                } else {
                    dp[i][j] = 1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1]);
                }
            }
        }

        dp[len1][len2]
    }
}

impl Default for OcrPostProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// Text fragment extracted by OCR with position and confidence information
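///
/// # Example
///
/// A short sketch of word-level confidence inspection (this assumes these
/// types are re-exported from `oxidize_pdf::text` like the rest of the module):
///
/// ```rust
/// use oxidize_pdf::text::{FragmentType, OcrTextFragment, WordConfidence};
///
/// let fragment = OcrTextFragment::with_word_confidences(
///     "Total 42".to_string(),
///     50.0, 700.0, 80.0, 14.0, // x, y, width, height (points)
///     0.9, 12.0, FragmentType::Line,
///     vec![
///         WordConfidence::new("Total".to_string(), 0.95, 0.0, 40.0),
///         WordConfidence::new("42".to_string(), 0.40, 45.0, 20.0),
///     ],
/// );
///
/// assert!(fragment.has_low_confidence_words(0.6));
/// assert_eq!(fragment.get_low_confidence_words(0.6).len(), 1);
/// ```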
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The extracted text content
    pub text: String,

    /// X position in page coordinates (points)
    pub x: f64,

    /// Y position in page coordinates (points)
    pub y: f64,

    /// Width of the text fragment (points)
    pub width: f64,

    /// Height of the text fragment (points)
    pub height: f64,

    /// Confidence score for this fragment (0.0 to 1.0)
    pub confidence: f64,

    /// Word-level confidence scores (optional, for advanced OCR engines)
    pub word_confidences: Option<Vec<WordConfidence>>,

    /// Font size estimation (points)
    pub font_size: f64,

    /// Whether this fragment is a character, word, line, or paragraph
    pub fragment_type: FragmentType,
}

impl OcrTextFragment {
    /// Create a new OCR text fragment
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        text: String,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        confidence: f64,
        font_size: f64,
        fragment_type: FragmentType,
    ) -> Self {
        Self {
            text,
            x,
            y,
            width,
            height,
            confidence,
            word_confidences: None,
            font_size,
            fragment_type,
        }
    }

    /// Create a fragment with word-level confidence scores
    #[allow(clippy::too_many_arguments)]
    pub fn with_word_confidences(
        text: String,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        confidence: f64,
        font_size: f64,
        fragment_type: FragmentType,
        word_confidences: Vec<WordConfidence>,
    ) -> Self {
        Self {
            text,
            x,
            y,
            width,
            height,
            confidence,
            word_confidences: Some(word_confidences),
            font_size,
            fragment_type,
        }
    }

    /// Get words with confidence below the threshold
    pub fn get_low_confidence_words(&self, threshold: f64) -> Vec<&WordConfidence> {
        self.word_confidences
            .as_ref()
            .map(|words| words.iter().filter(|w| w.confidence < threshold).collect())
            .unwrap_or_default()
    }

    /// Get the average word confidence if available
    pub fn average_word_confidence(&self) -> Option<f64> {
        self.word_confidences.as_ref().map(|words| {
            if words.is_empty() {
                return 0.0;
            }
            let sum: f64 = words.iter().map(|w| w.confidence).sum();
            sum / words.len() as f64
        })
    }

    /// Get words sorted by confidence (lowest first)
    pub fn words_by_confidence(&self) -> Vec<&WordConfidence> {
        self.word_confidences
            .as_ref()
            .map(|words| {
                let mut sorted_words: Vec<_> = words.iter().collect();
                sorted_words.sort_by(|a, b| {
                    a.confidence
                        .partial_cmp(&b.confidence)
                        .unwrap_or(std::cmp::Ordering::Equal)
                });
                sorted_words
            })
            .unwrap_or_default()
    }

    /// Check if this fragment has any low-confidence words
    pub fn has_low_confidence_words(&self, threshold: f64) -> bool {
        self.word_confidences
            .as_ref()
            .map(|words| words.iter().any(|w| w.confidence < threshold))
            .unwrap_or(false)
    }

    /// Get words that are candidates for correction (low confidence + patterns)
    pub fn get_correction_candidates(&self, threshold: f64) -> Vec<CorrectionCandidate> {
        self.word_confidences
            .as_ref()
            .map(|words| {
                words
                    .iter()
                    .enumerate()
                    .filter(|(_, w)| w.confidence < threshold)
                    .map(|(index, word)| CorrectionCandidate {
                        word: word.word.clone(),
                        confidence: word.confidence,
                        position_in_fragment: index,
                        suggested_corrections: vec![], // Will be filled by post-processor
                        correction_reason: CorrectionReason::LowConfidence,
                    })
                    .collect()
            })
            .unwrap_or_default()
    }

    /// Generate a confidence report for this fragment
    pub fn confidence_report(&self) -> String {
        let mut report = format!(
            "Fragment confidence: {:.1}% - \"{}\"\n",
            self.confidence * 100.0,
            self.text.trim()
        );

        if let Some(words) = &self.word_confidences {
            report.push_str(&format!(
                "  Word-level breakdown ({} words):\n",
                words.len()
            ));
            for (i, word) in words.iter().enumerate() {
                report.push_str(&format!(
                    "    {}: \"{}\" - {:.1}%\n",
                    i + 1,
                    word.word,
                    word.confidence * 100.0
                ));

                if let Some(chars) = &word.character_confidences {
                    report.push_str("      Characters: ");
                    for ch in chars {
                        report.push_str(&format!(
                            "'{}'({:.0}%) ",
                            ch.character,
                            ch.confidence * 100.0
                        ));
                    }
                    report.push('\n');
                }
            }
        } else {
            report.push_str("  (No word-level data available)\n");
        }

        report
    }
}

/// Type of text fragment
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// Individual character
    Character,
    /// Complete word
    Word,
    /// Text line
    Line,
    /// Paragraph
    Paragraph,
}

/// Complete result of OCR processing
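///
/// # Example
///
/// A minimal filtering sketch using the mock provider:
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let provider = MockOcrProvider::new();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
///
/// let result = provider.process_image(&image_data, &OcrOptions::default())?;
///
/// // Keep only fragments the engine is reasonably sure about
/// let confident = result.filter_by_confidence(0.8);
/// assert!(confident.len() <= result.fragments.len());
/// # Ok(())
/// # }
/// ```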
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// The complete extracted text
    pub text: String,

    /// Overall confidence score (0.0 to 1.0)
    pub confidence: f64,

    /// Individual text fragments with position information
    pub fragments: Vec<OcrTextFragment>,

    /// Processing time in milliseconds
    pub processing_time_ms: u64,

    /// OCR engine used for processing
    pub engine_name: String,

    /// Language detected/used
    pub language: String,

    /// Region that was processed (None if the entire image was processed)
    pub processed_region: Option<OcrRegion>,

    /// Dimensions of the processed image (width, height) in pixels
    pub image_dimensions: (u32, u32),
}

impl OcrProcessingResult {
    /// Create a new OCR processing result
    pub fn new(
        text: String,
        confidence: f64,
        fragments: Vec<OcrTextFragment>,
        processing_time_ms: u64,
        engine_name: String,
        language: String,
        image_dimensions: (u32, u32),
    ) -> Self {
        Self {
            text,
            confidence,
            fragments,
            processing_time_ms,
            engine_name,
            language,
            processed_region: None,
            image_dimensions,
        }
    }

    /// Create a new OCR processing result for a specific region
    #[allow(clippy::too_many_arguments)]
    pub fn with_region(
        text: String,
        confidence: f64,
        fragments: Vec<OcrTextFragment>,
        processing_time_ms: u64,
        engine_name: String,
        language: String,
        image_dimensions: (u32, u32),
        region: OcrRegion,
    ) -> Self {
        Self {
            text,
            confidence,
            fragments,
            processing_time_ms,
            engine_name,
            language,
            processed_region: Some(region),
            image_dimensions,
        }
    }

    /// Filter fragments by minimum confidence
    pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| fragment.confidence >= min_confidence)
            .collect()
    }

    /// Get text fragments fully contained within a specific region
    pub fn fragments_in_region(
        &self,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
    ) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| {
                fragment.x >= x
                    && fragment.y >= y
                    && fragment.x + fragment.width <= x + width
                    && fragment.y + fragment.height <= y + height
            })
            .collect()
    }

    /// Get fragments of a specific type
    pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| fragment.fragment_type == fragment_type)
            .collect()
    }

    /// Calculate the average confidence across all fragments
    pub fn average_confidence(&self) -> f64 {
        if self.fragments.is_empty() {
            return 0.0;
        }

        let sum: f64 = self.fragments.iter().map(|f| f.confidence).sum();
        sum / self.fragments.len() as f64
    }
}

/// Supported OCR engines
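///
/// # Example
///
/// A short sketch of capability checks:
///
/// ```rust
/// use oxidize_pdf::text::OcrEngine;
/// use oxidize_pdf::graphics::ImageFormat;
///
/// assert_eq!(OcrEngine::Tesseract.name(), "Tesseract");
/// assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Tiff));
/// assert!(!OcrEngine::Azure.supports_format(ImageFormat::Tiff));
/// ```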
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// Mock OCR provider for testing
    Mock,
    /// Tesseract OCR (local processing)
    Tesseract,
    /// Azure Computer Vision OCR
    Azure,
    /// AWS Textract
    Aws,
    /// Google Cloud Vision OCR
    GoogleCloud,
}

impl OcrEngine {
    /// Get the name of the OCR engine
    pub fn name(&self) -> &'static str {
        match self {
            OcrEngine::Mock => "Mock OCR",
            OcrEngine::Tesseract => "Tesseract",
            OcrEngine::Azure => "Azure Computer Vision",
            OcrEngine::Aws => "AWS Textract",
            OcrEngine::GoogleCloud => "Google Cloud Vision",
        }
    }

    /// Check if this engine supports the given image format
    pub fn supports_format(&self, format: ImageFormat) -> bool {
        match self {
            OcrEngine::Mock => true, // Mock supports all formats
            OcrEngine::Tesseract => matches!(
                format,
                ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
            ),
            OcrEngine::Azure => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
            OcrEngine::Aws => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
            OcrEngine::GoogleCloud => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
        }
    }
}

impl fmt::Display for OcrEngine {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.name())
    }
}

/// Trait for OCR providers
///
/// This trait defines the interface that all OCR providers must implement.
/// It provides methods for processing images and extracting text with position information.
///
/// # Implementation Notes
///
/// - Implementations should handle errors gracefully and return meaningful error messages
/// - The `process_image` method is the core functionality that all providers must implement
/// - The `process_page` method is a convenience method for working with page analysis results
/// - Providers should validate image formats and reject unsupported formats
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{OcrProvider, OcrOptions, OcrProcessingResult, OcrError, OcrEngine};
/// use oxidize_pdf::graphics::ImageFormat;
///
/// struct MyOcrProvider;
///
/// impl OcrProvider for MyOcrProvider {
///     fn process_image(&self, image_data: &[u8], options: &OcrOptions) -> Result<OcrProcessingResult, OcrError> {
///         // Implementation here
///         # Ok(OcrProcessingResult {
///         #     text: "Sample text".to_string(),
///         #     confidence: 0.95,
///         #     fragments: vec![],
///         #     processing_time_ms: 100,
///         #     engine_name: "MyOCR".to_string(),
///         #     language: "en".to_string(),
///         #     image_dimensions: (800, 600),
///         #     processed_region: None,
///         # })
///     }
///
///     fn supported_formats(&self) -> Vec<ImageFormat> {
///         vec![ImageFormat::Jpeg, ImageFormat::Png]
///     }
///
///     fn engine_name(&self) -> &str {
///         "MyOCR"
///     }
///
///     fn engine_type(&self) -> OcrEngine {
///         OcrEngine::Mock
///     }
/// }
/// ```
pub trait OcrProvider: Send + Sync {
    /// Process an image and extract text using OCR
    ///
    /// This is the core method that all OCR providers must implement.
    /// It takes image data as bytes and returns structured text results.
    ///
    /// # Arguments
    ///
    /// * `image_data` - Raw image bytes (JPEG, PNG, or TIFF)
    /// * `options` - OCR processing options and configuration
    ///
    /// # Returns
    ///
    /// A `Result` containing the OCR results with text, confidence, and positioning information.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The image format is not supported
    /// - The image data is corrupted or invalid
    /// - OCR processing fails
    /// - Network errors occur (for cloud providers)
    /// - Authentication fails (for cloud providers)
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult>;

    /// Process a scanned page using content analysis information
    ///
    /// This method provides a higher-level interface that works with page analysis results.
    /// It's particularly useful when integrating with the page analysis module.
    ///
    /// # Arguments
    ///
    /// * `page_analysis` - Results from page content analysis
    /// * `page_data` - Raw page data or image data
    /// * `options` - OCR processing options
    ///
    /// # Returns
    ///
    /// OCR results optimized for the specific page content type.
    ///
    /// # Default Implementation
    ///
    /// The default implementation simply calls `process_image` with the page data.
    /// Providers can override this to provide specialized handling based on page analysis.
    fn process_page(
        &self,
        _page_analysis: &ContentAnalysis,
        page_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        self.process_image(page_data, options)
    }

    /// Process multiple images with region information
    ///
    /// This method allows for selective OCR processing where each image corresponds
    /// to a specific region. This is useful for:
    /// - Processing pre-cropped regions of a document
    /// - Batch processing of multiple regions with different OCR settings
    /// - Optimizing performance by avoiding full-image processing
    ///
    /// # Arguments
    ///
    /// * `image_region_pairs` - Vector of (image_data, region) pairs
    /// * `options` - OCR processing options (applies to all regions)
    ///
    /// # Returns
    ///
    /// A vector of `OcrProcessingResult`, one for each processed region.
    /// The order matches the input pairs vector.
    ///
    /// # Default Implementation
    ///
    /// The default implementation processes each image separately and sets
    /// the region information in the result.
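    ///
    /// # Example
    ///
    /// A minimal sketch using the mock provider (this assumes `OcrRegion` is
    /// re-exported from `oxidize_pdf::text` like the other types here):
    ///
    /// ```rust
    /// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider, OcrRegion};
    ///
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let provider = MockOcrProvider::new();
    /// let jpeg = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
    /// let region = OcrRegion::new(100, 200, 300, 50);
    ///
    /// let results =
    ///     provider.process_image_regions(&[(&jpeg[..], &region)], &OcrOptions::default())?;
    /// assert_eq!(results.len(), 1);
    /// // Fragment coordinates are shifted by the region offset
    /// assert_eq!(results[0].processed_region.as_ref().map(|r| r.x), Some(100));
    /// # Ok(())
    /// # }
    /// ```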
    fn process_image_regions(
        &self,
        image_region_pairs: &[(&[u8], &OcrRegion)],
        options: &OcrOptions,
    ) -> OcrResult<Vec<OcrProcessingResult>> {
        let mut results = Vec::with_capacity(image_region_pairs.len());

        for (image_data, region) in image_region_pairs {
            let mut result = self.process_image(image_data, options)?;

            // Adjust fragment coordinates to match original image coordinates
            // (assuming the input image_data is already cropped to the region)
            for fragment in &mut result.fragments {
                fragment.x += region.x as f64;
                fragment.y += region.y as f64;
            }

            result.processed_region = Some((*region).clone());
            results.push(result);
        }

        Ok(results)
    }

    /// Get the list of supported image formats
    ///
    /// # Returns
    ///
    /// A vector of `ImageFormat` values that this provider can process.
    fn supported_formats(&self) -> Vec<ImageFormat>;

    /// Get the name of this OCR provider
    ///
    /// # Returns
    ///
    /// A string identifying this provider (e.g., "Tesseract", "Azure OCR").
    fn engine_name(&self) -> &str;

    /// Get the engine type for this provider
    ///
    /// # Returns
    ///
    /// The `OcrEngine` enum value corresponding to this provider.
    fn engine_type(&self) -> OcrEngine;

    /// Check if this provider supports the given image format
    ///
    /// # Arguments
    ///
    /// * `format` - The image format to check
    ///
    /// # Returns
    ///
    /// `true` if the format is supported, `false` otherwise.
    fn supports_format(&self, format: ImageFormat) -> bool {
        self.supported_formats().contains(&format)
    }

    /// Validate image data before processing
    ///
    /// This method can be used to perform basic validation of image data
    /// before attempting OCR processing.
    ///
    /// # Arguments
    ///
    /// * `image_data` - Raw image bytes to validate
    ///
    /// # Returns
    ///
    /// `Ok(())` if the image data is valid, `Err(OcrError)` otherwise.
    ///
    /// # Default Implementation
    ///
    /// The default implementation performs basic format detection based on magic bytes.
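    ///
    /// # Example
    ///
    /// A short sketch using the mock provider:
    ///
    /// ```rust
    /// use oxidize_pdf::text::{MockOcrProvider, OcrProvider};
    ///
    /// let provider = MockOcrProvider::new();
    ///
    /// // JPEG magic bytes pass validation; unrecognized bytes do not
    /// assert!(provider.validate_image_data(&[0xFF, 0xD8, 0xFF, 0xE0, 0, 0, 0, 0]).is_ok());
    /// assert!(provider.validate_image_data(&[0x00; 8]).is_err());
    /// ```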
    fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
        if image_data.len() < 8 {
            return Err(OcrError::InvalidImageData(
                "Image data too short".to_string(),
            ));
        }

        // Check for common image format signatures
        let format = if image_data.starts_with(b"\xFF\xD8\xFF") {
            ImageFormat::Jpeg
        } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
            ImageFormat::Png
        } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
            ImageFormat::Tiff
        } else {
            return Err(OcrError::InvalidImageData(
                "Unrecognized image format".to_string(),
            ));
        };

        if !self.supports_format(format) {
            return Err(OcrError::UnsupportedImageFormat(format));
        }

        Ok(())
    }
}

/// Mock OCR provider for testing and development
///
/// This provider simulates OCR processing without actually performing text recognition.
/// It's useful for testing OCR workflows and developing OCR-dependent functionality.
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// let provider = MockOcrProvider::new();
/// let options = OcrOptions::default();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46]; // Mock JPEG data
///
/// let result = provider.process_image(&image_data, &options).unwrap();
/// assert!(result.text.contains("Mock OCR"));
/// ```
#[derive(Clone)]
pub struct MockOcrProvider {
    /// Mock confidence level to return
    confidence: f64,
    /// Mock text to return
    mock_text: String,
    /// Simulated processing delay (milliseconds)
    processing_delay_ms: u64,
}

impl MockOcrProvider {
    /// Create a new mock OCR provider with default settings
    pub fn new() -> Self {
        Self {
            confidence: 0.85,
            mock_text: "Mock OCR extracted text from scanned image".to_string(),
            processing_delay_ms: 100,
        }
    }

    /// Create a mock provider with custom text and confidence
    pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
        Self {
            confidence,
            mock_text: text,
            processing_delay_ms: 100,
        }
    }

    /// Set the mock text to return
    pub fn set_mock_text(&mut self, text: String) {
        self.mock_text = text;
    }

    /// Set the confidence level to return
    pub fn set_confidence(&mut self, confidence: f64) {
        self.confidence = confidence.clamp(0.0, 1.0);
    }

    /// Set the simulated processing delay
    pub fn set_processing_delay(&mut self, delay_ms: u64) {
        self.processing_delay_ms = delay_ms;
    }
}

impl Default for MockOcrProvider {
    fn default() -> Self {
        Self::new()
    }
}

impl OcrProvider for MockOcrProvider {
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        // Validate image data
        self.validate_image_data(image_data)?;

        // Simulate processing time
        std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));

        // Create mock text fragments
        let fragments = vec![
            OcrTextFragment {
                text: self.mock_text.clone(),
                x: 50.0,
                y: 700.0,
                width: 200.0,
                height: 20.0,
                confidence: self.confidence,
                word_confidences: None,
                font_size: 12.0,
                fragment_type: FragmentType::Line,
            },
            OcrTextFragment {
                text: "Additional mock text".to_string(),
                x: 50.0,
                y: 680.0,
                width: 150.0,
                height: 20.0,
                confidence: self.confidence * 0.9,
                word_confidences: None,
                font_size: 12.0,
                fragment_type: FragmentType::Line,
            },
        ];

        Ok(OcrProcessingResult {
            text: format!("{}\nAdditional mock text", self.mock_text),
            confidence: self.confidence,
            fragments,
            processing_time_ms: self.processing_delay_ms,
            engine_name: "Mock OCR".to_string(),
            language: options.language.clone(),
            processed_region: None,
            image_dimensions: (800, 600), // Mock dimensions
        })
    }

    fn supported_formats(&self) -> Vec<ImageFormat> {
        vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
    }

    fn engine_name(&self) -> &str {
        "Mock OCR"
    }

    fn engine_type(&self) -> OcrEngine {
        OcrEngine::Mock
    }
}

#[cfg(test)]
mod tests;

#[cfg(test)]
mod postprocessor_tests;

#[cfg(test)]
mod rigorous_tests;