oxidize_pdf/text/ocr/mod.rs
1//! OCR (Optical Character Recognition) support for PDF processing
2//!
3//! This module provides a flexible, pluggable architecture for integrating OCR capabilities
4//! into PDF processing workflows. It's designed to work seamlessly with the page analysis
5//! module to process scanned pages and extract text from images.
6//!
7//! # Architecture
8//!
9//! The OCR system uses a trait-based approach that allows for multiple OCR providers:
10//!
11//! - **OcrProvider trait**: Generic interface for OCR engines
12//! - **Pluggable implementations**: Support for local (Tesseract) and cloud (Azure, AWS) providers
13//! - **Result standardization**: Consistent output format regardless of provider
14//! - **Error handling**: Comprehensive error types for OCR operations
15//!
16//! # Usage
17//!
18//! ## Basic OCR Processing
19//!
20//! ```rust
21//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
22//! use oxidize_pdf::graphics::ImageFormat;
23//!
24//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
25//! let provider = MockOcrProvider::new();
26//! let options = OcrOptions::default();
27//!
28//! // Process image data directly - Mock JPEG data
29//! let image_data = vec![
30//! 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
31//! 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9
32//! ];
33//! let result = provider.process_image(&image_data, &options)?;
34//!
35//! println!("Extracted text: {}", result.text);
36//! println!("Confidence: {:.2}%", result.confidence * 100.0);
37//!
38//! for fragment in result.fragments {
39//! println!("Fragment: '{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
40//! }
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! ## Integration with Page Analysis
46//!
47//! ```rust,no_run
48//! use oxidize_pdf::operations::page_analysis::PageContentAnalyzer;
49//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
50//! use oxidize_pdf::parser::PdfReader;
51//!
52//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
53//! let document = PdfReader::open_document("scanned.pdf")?;
54//! let analyzer = PageContentAnalyzer::new(document);
55//! let provider = MockOcrProvider::new();
56//!
57//! // Find scanned pages
58//! let scanned_pages = analyzer.find_scanned_pages()?;
59//!
60//! for page_num in scanned_pages {
61//! let analysis = analyzer.analyze_page(page_num)?;
62//! if analysis.is_scanned() {
63//! println!("Processing scanned page {}", page_num);
64//! // OCR processing would happen here
65//! }
66//! }
67//! # Ok(())
68//! # }
69//! ```
70
71use crate::graphics::ImageFormat;
72use crate::operations::page_analysis::ContentAnalysis;
73use std::fmt;
74
/// Convenience alias for results of OCR operations, using [`OcrError`] as the error type.
pub type OcrResult<T> = Result<T, OcrError>;
77
/// Errors that can occur during OCR processing.
///
/// `Display` messages are derived via `thiserror`'s `#[error]` attributes, so
/// each variant carries a human-readable description of the failure. Cloud-only
/// concerns (network, authentication, rate limiting) have dedicated variants.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// OCR provider is not available or not configured
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),

    /// Unsupported image format for OCR processing
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),

    /// Invalid or corrupted image data
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),

    /// OCR processing failed
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),

    /// Network error when using cloud OCR providers
    #[error("Network error: {0}")]
    NetworkError(String),

    /// API key or authentication error (cloud providers)
    #[error("Authentication error: {0}")]
    AuthenticationError(String),

    /// Rate limiting or quota exceeded (cloud providers)
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// OCR provider returned low confidence results
    #[error("Low confidence results: {0}")]
    LowConfidence(String),

    /// Generic IO error (converted automatically from `std::io::Error`)
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    Configuration(String),
}
121
/// A rectangular region for selective OCR processing
///
/// Coordinates are in pixels with the origin at the top-left of the image.
#[derive(Debug, Clone, PartialEq)]
pub struct OcrRegion {
    /// X coordinate of the top-left corner (pixels)
    pub x: u32,

    /// Y coordinate of the top-left corner (pixels)
    pub y: u32,

    /// Width of the region (pixels)
    pub width: u32,

    /// Height of the region (pixels)
    pub height: u32,

    /// Optional label for this region (e.g., "header", "table", "paragraph")
    pub label: Option<String>,
}

impl OcrRegion {
    /// Create a new OCR region without a label.
    pub fn new(x: u32, y: u32, width: u32, height: u32) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: None,
        }
    }

    /// Create a new OCR region with a descriptive label.
    pub fn with_label(x: u32, y: u32, width: u32, height: u32, label: impl Into<String>) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: Some(label.into()),
        }
    }

    /// Check if this region contains a point.
    ///
    /// Arithmetic is widened to `u64` so a region whose far edge exceeds
    /// `u32::MAX` (e.g. `x + width` overflowing) is handled correctly instead
    /// of panicking in debug builds or wrapping in release builds.
    pub fn contains_point(&self, x: u32, y: u32) -> bool {
        let (px, py) = (u64::from(x), u64::from(y));
        px >= u64::from(self.x)
            && px < u64::from(self.x) + u64::from(self.width)
            && py >= u64::from(self.y)
            && py < u64::from(self.y) + u64::from(self.height)
    }

    /// Check if this region overlaps with another region.
    ///
    /// Uses `u64` arithmetic for the same overflow-safety reason as
    /// [`OcrRegion::contains_point`]. Regions that merely touch edges do not
    /// count as overlapping.
    pub fn overlaps_with(&self, other: &OcrRegion) -> bool {
        let (ax1, ay1) = (u64::from(self.x), u64::from(self.y));
        let (ax2, ay2) = (ax1 + u64::from(self.width), ay1 + u64::from(self.height));
        let (bx1, by1) = (u64::from(other.x), u64::from(other.y));
        let (bx2, by2) = (bx1 + u64::from(other.width), by1 + u64::from(other.height));
        !(ax2 <= bx1 || bx2 <= ax1 || ay2 <= by1 || by2 <= ay1)
    }
}
177
/// OCR processing options and configuration
///
/// Passed to [`OcrProvider::process_image`] and related methods to control
/// language, confidence filtering, preprocessing, timeouts, and region selection.
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Target language for OCR (ISO 639-1 code, e.g., "en", "es", "fr")
    pub language: String,

    /// Minimum confidence threshold (0.0 to 1.0); enforcement is provider-specific
    pub min_confidence: f64,

    /// Whether to preserve text layout and positioning
    pub preserve_layout: bool,

    /// Image preprocessing options applied before recognition
    pub preprocessing: ImagePreprocessing,

    /// OCR engine specific options, passed through to the provider as key/value pairs
    pub engine_options: std::collections::HashMap<String, String>,

    /// Timeout for OCR operations (in seconds)
    pub timeout_seconds: u32,

    /// Specific regions to process (None = process entire image)
    pub regions: Option<Vec<OcrRegion>>,

    /// Whether to save extracted images for debug purposes
    pub debug_output: bool,
}
205
206impl Default for OcrOptions {
207 fn default() -> Self {
208 Self {
209 language: "en".to_string(),
210 min_confidence: 0.6,
211 preserve_layout: true,
212 preprocessing: ImagePreprocessing::default(),
213 engine_options: std::collections::HashMap::new(),
214 timeout_seconds: 60, // Aumentado para documentos complejos
215 regions: None,
216 debug_output: false,
217 }
218 }
219}
220
/// Image preprocessing options for OCR
///
/// Controls the cleanup steps applied to an image before text recognition.
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Whether to apply image denoising
    pub denoise: bool,

    /// Whether to apply image deskewing
    pub deskew: bool,

    /// Whether to enhance contrast
    pub enhance_contrast: bool,

    /// Whether to apply image sharpening
    pub sharpen: bool,

    /// Scale factor for image resizing (1.0 = no scaling)
    pub scale_factor: f64,
}

impl Default for ImagePreprocessing {
    /// Defaults favor recognition accuracy: denoising, deskewing, and contrast
    /// enhancement are on; sharpening is off and the image keeps its original scale.
    fn default() -> Self {
        ImagePreprocessing {
            denoise: true,
            deskew: true,
            enhance_contrast: true,
            sharpen: false,
            scale_factor: 1.0,
        }
    }
}
251
/// Word-level confidence information for detailed OCR analysis
#[derive(Debug, Clone)]
pub struct WordConfidence {
    /// The word text
    pub word: String,

    /// Confidence score for this specific word (0.0 to 1.0)
    pub confidence: f64,

    /// X position of the word within the fragment (relative to fragment start)
    pub x_offset: f64,

    /// Width of the word in points
    pub width: f64,

    /// Optional character-level confidences (for ultimate granularity)
    pub character_confidences: Option<Vec<CharacterConfidence>>,
}

impl WordConfidence {
    /// Create a new word confidence without character-level detail.
    pub fn new(word: String, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: None,
        }
    }

    /// Create a word confidence with character-level details.
    pub fn with_characters(
        word: String,
        confidence: f64,
        x_offset: f64,
        width: f64,
        character_confidences: Vec<CharacterConfidence>,
    ) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: Some(character_confidences),
        }
    }

    /// Get the average character confidence if character data is available.
    ///
    /// An empty character list yields `Some(0.0)` — consistent with
    /// `OcrTextFragment::average_word_confidence` — instead of the `NaN`
    /// a plain 0/0 division would produce.
    pub fn average_character_confidence(&self) -> Option<f64> {
        self.character_confidences.as_ref().map(|chars| {
            if chars.is_empty() {
                return 0.0;
            }
            let sum: f64 = chars.iter().map(|c| c.confidence).sum();
            sum / chars.len() as f64
        })
    }

    /// Check if this word has low confidence (below threshold)
    pub fn is_low_confidence(&self, threshold: f64) -> bool {
        self.confidence < threshold
    }
}

/// Character-level confidence information for ultimate OCR granularity
#[derive(Debug, Clone)]
pub struct CharacterConfidence {
    /// The character
    pub character: char,

    /// Confidence score for this character (0.0 to 1.0)
    pub confidence: f64,

    /// X position relative to word start
    pub x_offset: f64,

    /// Character width in points
    pub width: f64,
}

impl CharacterConfidence {
    /// Create a new character confidence
    pub fn new(character: char, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            character,
            confidence,
            x_offset,
            width,
        }
    }
}
341
/// Candidate for OCR post-processing correction
///
/// Produced by `OcrTextFragment::get_correction_candidates`; the
/// `suggested_corrections` list starts empty and is filled in by
/// `OcrPostProcessor::process_fragment`.
#[derive(Debug, Clone)]
pub struct CorrectionCandidate {
    /// The original word with low confidence or errors
    pub word: String,

    /// Original confidence score (0.0 to 1.0) reported by the OCR engine
    pub confidence: f64,

    /// Word index within the text fragment it came from
    pub position_in_fragment: usize,

    /// Suggested corrections ranked by likelihood (best first)
    pub suggested_corrections: Vec<CorrectionSuggestion>,

    /// Reason why this word needs correction
    pub correction_reason: CorrectionReason,
}
360
/// A suggested correction for an OCR error
#[derive(Debug, Clone)]
pub struct CorrectionSuggestion {
    /// The corrected word
    pub corrected_word: String,

    /// Confidence in this correction (0.0 to 1.0); used to rank suggestions
    pub correction_confidence: f64,

    /// Type of correction applied
    pub correction_type: CorrectionType,

    /// Human-readable explanation of why this correction was suggested
    pub explanation: Option<String>,
}
376
/// Reasons why a word might need correction
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorrectionReason {
    /// Word has low OCR confidence (below the post-processor's threshold)
    LowConfidence,

    /// Word contains common OCR confusion patterns (e.g. "rn" vs "m")
    ConfusionPattern,

    /// Word not found in the configured dictionary
    NotInDictionary,

    /// Word doesn't fit the surrounding context
    ContextualError,

    /// Word has suspicious character combinations
    SuspiciousPattern,
}
395
/// Types of corrections that can be applied
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CorrectionType {
    /// Character substitution (e.g., "0" -> "O")
    CharacterSubstitution,

    /// Dictionary lookup and replacement
    DictionaryCorrection,

    /// Contextual correction based on surrounding words
    ContextualCorrection,

    /// Pattern-based correction (e.g., "rn" -> "m")
    PatternCorrection,

    /// Manual review suggested
    ManualReview,
}
414
/// OCR post-processor for automatic text correction
///
/// Holds the correction tables and thresholds used to turn low-confidence OCR
/// words into ranked [`CorrectionSuggestion`]s. `OcrPostProcessor::new` seeds
/// the tables with common OCR confusions.
#[derive(Debug, Clone)]
pub struct OcrPostProcessor {
    /// Common OCR character confusions (character -> plausible alternatives)
    pub character_corrections: std::collections::HashMap<char, Vec<char>>,

    /// Dictionary of valid words (optional; enables dictionary-based corrections)
    pub dictionary: Option<std::collections::HashSet<String>>,

    /// Common pattern corrections (substring -> replacement)
    pub pattern_corrections: std::collections::HashMap<String, String>,

    /// Words with confidence below this threshold are considered for correction
    pub correction_threshold: f64,

    /// Maximum edit distance for dictionary-based corrections
    pub max_edit_distance: usize,
}
433
434impl OcrPostProcessor {
435 /// Create a new post-processor with common OCR corrections
436 pub fn new() -> Self {
437 let mut character_corrections = std::collections::HashMap::new();
438
439 // Common OCR character confusions
440 character_corrections.insert('0', vec!['O', 'o', 'Q']);
441 character_corrections.insert('O', vec!['0', 'Q', 'o']);
442 character_corrections.insert('1', vec!['l', 'I', '|']);
443 character_corrections.insert('l', vec!['1', 'I', '|']);
444 character_corrections.insert('I', vec!['1', 'l', '|']);
445 character_corrections.insert('S', vec!['5', '$']);
446 character_corrections.insert('5', vec!['S', '$']);
447 character_corrections.insert('2', vec!['Z', 'z']);
448 character_corrections.insert('Z', vec!['2', 'z']);
449
450 let mut pattern_corrections = std::collections::HashMap::new();
451 pattern_corrections.insert("rn".to_string(), "m".to_string());
452 pattern_corrections.insert("cl".to_string(), "d".to_string());
453 pattern_corrections.insert("fi".to_string(), "fi".to_string()); // ligature
454 pattern_corrections.insert("fl".to_string(), "fl".to_string()); // ligature
455
456 Self {
457 character_corrections,
458 dictionary: None,
459 pattern_corrections,
460 correction_threshold: 0.7,
461 max_edit_distance: 2,
462 }
463 }
464
465 /// Add a dictionary for word validation
466 pub fn with_dictionary(mut self, dictionary: std::collections::HashSet<String>) -> Self {
467 self.dictionary = Some(dictionary);
468 self
469 }
470
471 /// Process a fragment and suggest corrections
472 pub fn process_fragment(&self, fragment: &OcrTextFragment) -> Vec<CorrectionCandidate> {
473 let mut candidates = fragment.get_correction_candidates(self.correction_threshold);
474
475 // Enhance candidates with suggestions
476 for candidate in &mut candidates {
477 candidate.suggested_corrections = self.generate_suggestions(&candidate.word);
478 }
479
480 candidates
481 }
482
483 /// Generate correction suggestions for a word
484 pub fn generate_suggestions(&self, word: &str) -> Vec<CorrectionSuggestion> {
485 let mut suggestions = Vec::new();
486
487 // Character substitution corrections
488 suggestions.extend(self.character_substitution_corrections(word));
489
490 // Pattern-based corrections
491 suggestions.extend(self.pattern_corrections(word));
492
493 // Dictionary corrections (if available)
494 if let Some(dict) = &self.dictionary {
495 suggestions.extend(self.dictionary_corrections(word, dict));
496 }
497
498 // Sort by confidence and limit results
499 suggestions.sort_by(|a, b| b.correction_confidence.total_cmp(&a.correction_confidence));
500 suggestions.truncate(5); // Limit to top 5 suggestions
501
502 suggestions
503 }
504
505 /// Generate character substitution corrections
506 fn character_substitution_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
507 let mut suggestions = Vec::new();
508 let chars: Vec<char> = word.chars().collect();
509
510 for (i, &ch) in chars.iter().enumerate() {
511 if let Some(alternatives) = self.character_corrections.get(&ch) {
512 for &alt_ch in alternatives {
513 let mut corrected_chars = chars.clone();
514 corrected_chars[i] = alt_ch;
515 let corrected_word: String = corrected_chars.into_iter().collect();
516
517 suggestions.push(CorrectionSuggestion {
518 corrected_word,
519 correction_confidence: 0.8,
520 correction_type: CorrectionType::CharacterSubstitution,
521 explanation: Some(format!("'{}' -> '{}' substitution", ch, alt_ch)),
522 });
523 }
524 }
525 }
526
527 suggestions
528 }
529
530 /// Generate pattern-based corrections
531 fn pattern_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
532 let mut suggestions = Vec::new();
533
534 for (pattern, replacement) in &self.pattern_corrections {
535 if word.contains(pattern) {
536 let corrected_word = word.replace(pattern, replacement);
537 suggestions.push(CorrectionSuggestion {
538 corrected_word,
539 correction_confidence: 0.85,
540 correction_type: CorrectionType::PatternCorrection,
541 explanation: Some(format!(
542 "Pattern '{}' -> '{}' correction",
543 pattern, replacement
544 )),
545 });
546 }
547 }
548
549 suggestions
550 }
551
552 /// Generate dictionary-based corrections
553 fn dictionary_corrections(
554 &self,
555 word: &str,
556 dictionary: &std::collections::HashSet<String>,
557 ) -> Vec<CorrectionSuggestion> {
558 let mut suggestions = Vec::new();
559
560 // Check if word is already valid
561 if dictionary.contains(word) {
562 return suggestions;
563 }
564
565 // Find similar words using simple edit distance
566 for dict_word in dictionary {
567 if self.edit_distance(word, dict_word) <= self.max_edit_distance {
568 let confidence = 1.0
569 - (self.edit_distance(word, dict_word) as f64
570 / word.len().max(dict_word.len()) as f64);
571 suggestions.push(CorrectionSuggestion {
572 corrected_word: dict_word.clone(),
573 correction_confidence: confidence * 0.9, // Slightly lower than pattern corrections
574 correction_type: CorrectionType::DictionaryCorrection,
575 explanation: Some(format!(
576 "Dictionary match with edit distance {}",
577 self.edit_distance(word, dict_word)
578 )),
579 });
580 }
581 }
582
583 suggestions
584 }
585
586 /// Calculate simple edit distance (Levenshtein distance)
587 fn edit_distance(&self, s1: &str, s2: &str) -> usize {
588 let len1 = s1.len();
589 let len2 = s2.len();
590
591 let mut dp = vec![vec![0; len2 + 1]; len1 + 1];
592
593 #[allow(clippy::needless_range_loop)]
594 for i in 0..=len1 {
595 dp[i][0] = i;
596 }
597 for j in 0..=len2 {
598 dp[0][j] = j;
599 }
600
601 let s1_chars: Vec<char> = s1.chars().collect();
602 let s2_chars: Vec<char> = s2.chars().collect();
603
604 for i in 1..=len1 {
605 for j in 1..=len2 {
606 if s1_chars[i - 1] == s2_chars[j - 1] {
607 dp[i][j] = dp[i - 1][j - 1];
608 } else {
609 dp[i][j] = 1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1]);
610 }
611 }
612 }
613
614 dp[len1][len2]
615 }
616}
617
618impl Default for OcrPostProcessor {
619 fn default() -> Self {
620 Self::new()
621 }
622}
623
/// Text fragment extracted by OCR with position and confidence information
///
/// Positions and sizes are expressed in page coordinates (points).
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The extracted text content
    pub text: String,

    /// X position in page coordinates (points)
    pub x: f64,

    /// Y position in page coordinates (points)
    pub y: f64,

    /// Width of the text fragment (points)
    pub width: f64,

    /// Height of the text fragment (points)
    pub height: f64,

    /// Confidence score for this fragment (0.0 to 1.0)
    pub confidence: f64,

    /// Word-level confidence scores (optional, for advanced OCR engines)
    pub word_confidences: Option<Vec<WordConfidence>>,

    /// Font size estimation (points)
    pub font_size: f64,

    /// Whether this fragment is part of a word or line
    pub fragment_type: FragmentType,
}
654
655impl OcrTextFragment {
656 /// Create a new OCR text fragment
657 #[allow(clippy::too_many_arguments)]
658 pub fn new(
659 text: String,
660 x: f64,
661 y: f64,
662 width: f64,
663 height: f64,
664 confidence: f64,
665 font_size: f64,
666 fragment_type: FragmentType,
667 ) -> Self {
668 Self {
669 text,
670 x,
671 y,
672 width,
673 height,
674 confidence,
675 word_confidences: None,
676 font_size,
677 fragment_type,
678 }
679 }
680
681 /// Create a fragment with word-level confidence scores
682 #[allow(clippy::too_many_arguments)]
683 pub fn with_word_confidences(
684 text: String,
685 x: f64,
686 y: f64,
687 width: f64,
688 height: f64,
689 confidence: f64,
690 font_size: f64,
691 fragment_type: FragmentType,
692 word_confidences: Vec<WordConfidence>,
693 ) -> Self {
694 Self {
695 text,
696 x,
697 y,
698 width,
699 height,
700 confidence,
701 word_confidences: Some(word_confidences),
702 font_size,
703 fragment_type,
704 }
705 }
706
707 /// Get words with confidence below the threshold
708 pub fn get_low_confidence_words(&self, threshold: f64) -> Vec<&WordConfidence> {
709 self.word_confidences
710 .as_ref()
711 .map(|words| words.iter().filter(|w| w.confidence < threshold).collect())
712 .unwrap_or_default()
713 }
714
715 /// Get the average word confidence if available
716 pub fn average_word_confidence(&self) -> Option<f64> {
717 self.word_confidences.as_ref().map(|words| {
718 if words.is_empty() {
719 return 0.0;
720 }
721 let sum: f64 = words.iter().map(|w| w.confidence).sum();
722 sum / words.len() as f64
723 })
724 }
725
726 /// Get words sorted by confidence (lowest first)
727 pub fn words_by_confidence(&self) -> Vec<&WordConfidence> {
728 self.word_confidences
729 .as_ref()
730 .map(|words| {
731 let mut sorted_words: Vec<_> = words.iter().collect();
732 sorted_words.sort_by(|a, b| a.confidence.total_cmp(&b.confidence));
733 sorted_words
734 })
735 .unwrap_or_default()
736 }
737
738 /// Check if this fragment has any low-confidence words
739 pub fn has_low_confidence_words(&self, threshold: f64) -> bool {
740 self.word_confidences
741 .as_ref()
742 .map(|words| words.iter().any(|w| w.confidence < threshold))
743 .unwrap_or(false)
744 }
745
746 /// Get words that are candidates for correction (low confidence + patterns)
747 pub fn get_correction_candidates(&self, threshold: f64) -> Vec<CorrectionCandidate> {
748 self.word_confidences
749 .as_ref()
750 .map(|words| {
751 words
752 .iter()
753 .enumerate()
754 .filter(|(_, w)| w.confidence < threshold)
755 .map(|(index, word)| CorrectionCandidate {
756 word: word.word.clone(),
757 confidence: word.confidence,
758 position_in_fragment: index,
759 suggested_corrections: vec![], // Will be filled by post-processor
760 correction_reason: CorrectionReason::LowConfidence,
761 })
762 .collect()
763 })
764 .unwrap_or_default()
765 }
766
767 /// Generate a confidence report for this fragment
768 pub fn confidence_report(&self) -> String {
769 let mut report = format!(
770 "Fragment confidence: {:.1}% - \"{}\"\n",
771 self.confidence * 100.0,
772 self.text.trim()
773 );
774
775 if let Some(words) = &self.word_confidences {
776 report.push_str(&format!(
777 " Word-level breakdown ({} words):\n",
778 words.len()
779 ));
780 for (i, word) in words.iter().enumerate() {
781 report.push_str(&format!(
782 " {}: \"{}\" - {:.1}%\n",
783 i + 1,
784 word.word,
785 word.confidence * 100.0
786 ));
787
788 if let Some(chars) = &word.character_confidences {
789 report.push_str(" Characters: ");
790 for ch in chars {
791 report.push_str(&format!(
792 "'{}'({:.0}%) ",
793 ch.character,
794 ch.confidence * 100.0
795 ));
796 }
797 report.push('\n');
798 }
799 }
800 } else {
801 report.push_str(" (No word-level data available)\n");
802 }
803
804 report
805 }
806}
807
/// Granularity of a text fragment produced by OCR
///
/// Indicates whether an [`OcrTextFragment`] represents a single character,
/// a word, a full line, or a paragraph.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// Individual character
    Character,
    /// Complete word
    Word,
    /// Text line
    Line,
    /// Paragraph
    Paragraph,
}
820
/// Complete result of OCR processing
///
/// Returned by [`OcrProvider::process_image`] and related methods.
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// The complete extracted text
    pub text: String,

    /// Overall confidence score (0.0 to 1.0)
    pub confidence: f64,

    /// Individual text fragments with position information
    pub fragments: Vec<OcrTextFragment>,

    /// Processing time in milliseconds, as reported by the provider
    pub processing_time_ms: u64,

    /// OCR engine used for processing
    pub engine_name: String,

    /// Language detected/used
    pub language: String,

    /// Region that was processed (None if entire image was processed)
    pub processed_region: Option<OcrRegion>,

    /// Image dimensions that were processed
    // NOTE(review): presumably (width, height) in pixels — confirm with providers
    pub image_dimensions: (u32, u32),
}
848
849impl OcrProcessingResult {
850 /// Create a new OCR processing result
851 pub fn new(
852 text: String,
853 confidence: f64,
854 fragments: Vec<OcrTextFragment>,
855 processing_time_ms: u64,
856 engine_name: String,
857 language: String,
858 image_dimensions: (u32, u32),
859 ) -> Self {
860 Self {
861 text,
862 confidence,
863 fragments,
864 processing_time_ms,
865 engine_name,
866 language,
867 processed_region: None,
868 image_dimensions,
869 }
870 }
871
872 /// Create a new OCR processing result for a specific region
873 #[allow(clippy::too_many_arguments)]
874 pub fn with_region(
875 text: String,
876 confidence: f64,
877 fragments: Vec<OcrTextFragment>,
878 processing_time_ms: u64,
879 engine_name: String,
880 language: String,
881 image_dimensions: (u32, u32),
882 region: OcrRegion,
883 ) -> Self {
884 Self {
885 text,
886 confidence,
887 fragments,
888 processing_time_ms,
889 engine_name,
890 language,
891 processed_region: Some(region),
892 image_dimensions,
893 }
894 }
895
896 /// Filter fragments by minimum confidence
897 pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
898 self.fragments
899 .iter()
900 .filter(|fragment| fragment.confidence >= min_confidence)
901 .collect()
902 }
903
904 /// Get text fragments within a specific region
905 pub fn fragments_in_region(
906 &self,
907 x: f64,
908 y: f64,
909 width: f64,
910 height: f64,
911 ) -> Vec<&OcrTextFragment> {
912 self.fragments
913 .iter()
914 .filter(|fragment| {
915 fragment.x >= x
916 && fragment.y >= y
917 && fragment.x + fragment.width <= x + width
918 && fragment.y + fragment.height <= y + height
919 })
920 .collect()
921 }
922
923 /// Get fragments of a specific type
924 pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
925 self.fragments
926 .iter()
927 .filter(|fragment| fragment.fragment_type == fragment_type)
928 .collect()
929 }
930
931 /// Calculate average confidence for all fragments
932 pub fn average_confidence(&self) -> f64 {
933 if self.fragments.is_empty() {
934 return 0.0;
935 }
936
937 let sum: f64 = self.fragments.iter().map(|f| f.confidence).sum();
938 sum / self.fragments.len() as f64
939 }
940}
941
/// Supported OCR engines
///
/// Identifies which backend produced (or should produce) OCR results; see
/// [`OcrEngine::supports_format`] for per-engine image format support.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// Mock OCR provider for testing
    Mock,
    /// Tesseract OCR (local processing)
    Tesseract,
    /// Azure Computer Vision OCR
    Azure,
    /// AWS Textract
    Aws,
    /// Google Cloud Vision OCR
    GoogleCloud,
}
956
957impl OcrEngine {
958 /// Get the name of the OCR engine
959 pub fn name(&self) -> &'static str {
960 match self {
961 OcrEngine::Mock => "Mock OCR",
962 OcrEngine::Tesseract => "Tesseract",
963 OcrEngine::Azure => "Azure Computer Vision",
964 OcrEngine::Aws => "AWS Textract",
965 OcrEngine::GoogleCloud => "Google Cloud Vision",
966 }
967 }
968
969 /// Check if this engine supports the given image format
970 pub fn supports_format(&self, format: ImageFormat) -> bool {
971 match self {
972 OcrEngine::Mock => true, // Mock supports all formats
973 OcrEngine::Tesseract => matches!(
974 format,
975 ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
976 ),
977 OcrEngine::Azure => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
978 OcrEngine::Aws => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
979 OcrEngine::GoogleCloud => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
980 }
981 }
982}
983
984impl fmt::Display for OcrEngine {
985 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
986 write!(f, "{}", self.name())
987 }
988}
989
990/// Trait for OCR providers
991///
992/// This trait defines the interface that all OCR providers must implement.
993/// It provides methods for processing images and extracting text with position information.
994///
995/// # Implementation Notes
996///
997/// - Implementations should handle errors gracefully and return meaningful error messages
998/// - The `process_image` method is the core functionality that all providers must implement
999/// - The `process_page` method is a convenience method for working with page analysis results
1000/// - Providers should validate image formats and reject unsupported formats
1001///
1002/// # Examples
1003///
1004/// ```rust
1005/// use oxidize_pdf::text::{OcrProvider, OcrOptions, OcrProcessingResult, OcrError, OcrEngine};
1006/// use oxidize_pdf::graphics::ImageFormat;
1007///
1008/// struct MyOcrProvider;
1009///
1010/// impl OcrProvider for MyOcrProvider {
1011/// fn process_image(&self, image_data: &[u8], options: &OcrOptions) -> Result<OcrProcessingResult, OcrError> {
1012/// // Implementation here
1013/// # Ok(OcrProcessingResult {
1014/// # text: "Sample text".to_string(),
1015/// # confidence: 0.95,
1016/// # fragments: vec![],
1017/// # processing_time_ms: 100,
1018/// # engine_name: "MyOCR".to_string(),
1019/// # language: "en".to_string(),
1020/// # image_dimensions: (800, 600),
1021/// # processed_region: None,
1022/// # })
1023/// }
1024///
1025/// fn supported_formats(&self) -> Vec<ImageFormat> {
1026/// vec![ImageFormat::Jpeg, ImageFormat::Png]
1027/// }
1028///
1029/// fn engine_name(&self) -> &str {
1030/// "MyOCR"
1031/// }
1032///
1033/// fn engine_type(&self) -> OcrEngine {
1034/// OcrEngine::Mock
1035/// }
1036/// }
1037/// ```
1038pub trait OcrProvider: Send + Sync {
1039 /// Process an image and extract text using OCR
1040 ///
1041 /// This is the core method that all OCR providers must implement.
1042 /// It takes image data as bytes and returns structured text results.
1043 ///
1044 /// # Arguments
1045 ///
1046 /// * `image_data` - Raw image bytes (JPEG, PNG, or TIFF)
1047 /// * `options` - OCR processing options and configuration
1048 ///
1049 /// # Returns
1050 ///
1051 /// A `Result` containing the OCR results with text, confidence, and positioning information.
1052 ///
1053 /// # Errors
1054 ///
1055 /// Returns an error if:
1056 /// - The image format is not supported
1057 /// - The image data is corrupted or invalid
1058 /// - OCR processing fails
1059 /// - Network errors occur (for cloud providers)
1060 /// - Authentication fails (for cloud providers)
1061 fn process_image(
1062 &self,
1063 image_data: &[u8],
1064 options: &OcrOptions,
1065 ) -> OcrResult<OcrProcessingResult>;
1066
1067 /// Process a scanned page using content analysis information
1068 ///
1069 /// This method provides a higher-level interface that works with page analysis results.
1070 /// It's particularly useful when integrating with the page analysis module.
1071 ///
1072 /// # Arguments
1073 ///
1074 /// * `page_analysis` - Results from page content analysis
1075 /// * `page_data` - Raw page data or image data
1076 /// * `options` - OCR processing options
1077 ///
1078 /// # Returns
1079 ///
1080 /// OCR results optimized for the specific page content type.
1081 ///
1082 /// # Default Implementation
1083 ///
1084 /// The default implementation simply calls `process_image` with the page data.
1085 /// Providers can override this to provide specialized handling based on page analysis.
1086 fn process_page(
1087 &self,
1088 _page_analysis: &ContentAnalysis,
1089 page_data: &[u8],
1090 options: &OcrOptions,
1091 ) -> OcrResult<OcrProcessingResult> {
1092 self.process_image(page_data, options)
1093 }
1094
1095 /// Process multiple images with region information
1096 ///
1097 /// This method allows for selective OCR processing where each image corresponds
1098 /// to a specific region. This is useful for:
1099 /// - Processing pre-cropped regions of a document
1100 /// - Batch processing of multiple regions with different OCR settings
1101 /// - Optimizing performance by avoiding full-image processing
1102 ///
1103 /// # Arguments
1104 ///
1105 /// * `image_region_pairs` - Vector of (image_data, region) pairs
1106 /// * `options` - OCR processing options (applies to all regions)
1107 ///
1108 /// # Returns
1109 ///
1110 /// A vector of `OcrProcessingResult`, one for each processed region.
1111 /// The order matches the input pairs vector.
1112 ///
1113 /// # Default Implementation
1114 ///
1115 /// The default implementation processes each image separately and sets
1116 /// the region information in the result.
1117 fn process_image_regions(
1118 &self,
1119 image_region_pairs: &[(&[u8], &OcrRegion)],
1120 options: &OcrOptions,
1121 ) -> OcrResult<Vec<OcrProcessingResult>> {
1122 let mut results = Vec::with_capacity(image_region_pairs.len());
1123
1124 for (image_data, region) in image_region_pairs {
1125 let mut result = self.process_image(image_data, options)?;
1126
1127 // Adjust fragment coordinates to match original image coordinates
1128 // (assuming the input image_data is already cropped to the region)
1129 for fragment in &mut result.fragments {
1130 fragment.x += region.x as f64;
1131 fragment.y += region.y as f64;
1132 }
1133
1134 result.processed_region = Some((*region).clone());
1135 results.push(result);
1136 }
1137
1138 Ok(results)
1139 }
1140
    /// Get the list of supported image formats
    ///
    /// # Returns
    ///
    /// A vector of `ImageFormat` values that this provider can process.
    fn supported_formats(&self) -> Vec<ImageFormat>;

    /// Get the human-readable name of this OCR provider
    ///
    /// # Returns
    ///
    /// A string identifying this provider (e.g., "Tesseract", "Azure OCR").
    fn engine_name(&self) -> &str;

    /// Get the engine type for this provider
    ///
    /// # Returns
    ///
    /// The `OcrEngine` enum value corresponding to this provider.
    fn engine_type(&self) -> OcrEngine;
1161
1162 /// Check if this provider supports the given image format
1163 ///
1164 /// # Arguments
1165 ///
1166 /// * `format` - The image format to check
1167 ///
1168 /// # Returns
1169 ///
1170 /// `true` if the format is supported, `false` otherwise.
1171 fn supports_format(&self, format: ImageFormat) -> bool {
1172 self.supported_formats().contains(&format)
1173 }
1174
1175 /// Validate image data before processing
1176 ///
1177 /// This method can be used to perform basic validation of image data
1178 /// before attempting OCR processing.
1179 ///
1180 /// # Arguments
1181 ///
1182 /// * `image_data` - Raw image bytes to validate
1183 ///
1184 /// # Returns
1185 ///
1186 /// `Ok(())` if the image data is valid, `Err(OcrError)` otherwise.
1187 ///
1188 /// # Default Implementation
1189 ///
1190 /// The default implementation performs basic format detection based on magic bytes.
1191 fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
1192 if image_data.len() < 8 {
1193 return Err(OcrError::InvalidImageData(
1194 "Image data too short".to_string(),
1195 ));
1196 }
1197
1198 // Check for common image format signatures
1199 let format = if image_data.starts_with(b"\xFF\xD8\xFF") {
1200 ImageFormat::Jpeg
1201 } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
1202 ImageFormat::Png
1203 } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
1204 ImageFormat::Tiff
1205 } else {
1206 return Err(OcrError::InvalidImageData(
1207 "Unrecognized image format".to_string(),
1208 ));
1209 };
1210
1211 if !self.supports_format(format) {
1212 return Err(OcrError::UnsupportedImageFormat(format));
1213 }
1214
1215 Ok(())
1216 }
1217}
1218
/// Mock OCR provider for testing and development
///
/// This provider simulates OCR processing without actually performing text recognition.
/// It's useful for testing OCR workflows and developing OCR-dependent functionality.
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// let provider = MockOcrProvider::new();
/// let options = OcrOptions::default();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46]; // Mock JPEG data
///
/// let result = provider.process_image(&image_data, &options).unwrap();
/// assert!(result.text.contains("Mock OCR"));
/// ```
#[derive(Clone)]
pub struct MockOcrProvider {
    /// Mock confidence level to return (expected range 0.0 - 1.0)
    confidence: f64,
    /// Mock text returned for every processed image
    mock_text: String,
    /// Simulated processing delay (milliseconds) applied to each call
    processing_delay_ms: u64,
}
1245
1246impl MockOcrProvider {
1247 /// Create a new mock OCR provider with default settings
1248 pub fn new() -> Self {
1249 Self {
1250 confidence: 0.85,
1251 mock_text: "Mock OCR extracted text from scanned image".to_string(),
1252 processing_delay_ms: 100,
1253 }
1254 }
1255
1256 /// Create a mock provider with custom text and confidence
1257 pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
1258 Self {
1259 confidence,
1260 mock_text: text,
1261 processing_delay_ms: 100,
1262 }
1263 }
1264
1265 /// Set the mock text to return
1266 pub fn set_mock_text(&mut self, text: String) {
1267 self.mock_text = text;
1268 }
1269
1270 /// Set the confidence level to return
1271 pub fn set_confidence(&mut self, confidence: f64) {
1272 self.confidence = confidence.clamp(0.0, 1.0);
1273 }
1274
1275 /// Set the simulated processing delay
1276 pub fn set_processing_delay(&mut self, delay_ms: u64) {
1277 self.processing_delay_ms = delay_ms;
1278 }
1279}
1280
1281impl Default for MockOcrProvider {
1282 fn default() -> Self {
1283 Self::new()
1284 }
1285}
1286
1287impl OcrProvider for MockOcrProvider {
1288 fn process_image(
1289 &self,
1290 image_data: &[u8],
1291 options: &OcrOptions,
1292 ) -> OcrResult<OcrProcessingResult> {
1293 // Validate image data
1294 self.validate_image_data(image_data)?;
1295
1296 // Simulate processing time
1297 std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));
1298
1299 // Create mock text fragments
1300 let fragments = vec![
1301 OcrTextFragment {
1302 text: self.mock_text.clone(),
1303 x: 50.0,
1304 y: 700.0,
1305 width: 200.0,
1306 height: 20.0,
1307 confidence: self.confidence,
1308 word_confidences: None,
1309 font_size: 12.0,
1310 fragment_type: FragmentType::Line,
1311 },
1312 OcrTextFragment {
1313 text: "Additional mock text".to_string(),
1314 x: 50.0,
1315 y: 680.0,
1316 width: 150.0,
1317 height: 20.0,
1318 confidence: self.confidence * 0.9,
1319 word_confidences: None,
1320 font_size: 12.0,
1321 fragment_type: FragmentType::Line,
1322 },
1323 ];
1324
1325 Ok(OcrProcessingResult {
1326 text: format!("{}\nAdditional mock text", self.mock_text),
1327 confidence: self.confidence,
1328 fragments,
1329 processing_time_ms: self.processing_delay_ms,
1330 engine_name: "Mock OCR".to_string(),
1331 language: options.language.clone(),
1332 processed_region: None,
1333 image_dimensions: (800, 600), // Mock dimensions
1334 })
1335 }
1336
1337 fn supported_formats(&self) -> Vec<ImageFormat> {
1338 vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
1339 }
1340
1341 fn engine_name(&self) -> &str {
1342 "Mock OCR"
1343 }
1344
1345 fn engine_type(&self) -> OcrEngine {
1346 OcrEngine::Mock
1347 }
1348}
1349
1350#[cfg(test)]
1351mod tests;
1352
1353#[cfg(test)]
1354mod postprocessor_tests;
1355
1356#[cfg(test)]
1357mod rigorous_tests;