oxidize_pdf/text/ocr/mod.rs
//! OCR (Optical Character Recognition) support for PDF processing
//!
//! This module provides a flexible, pluggable architecture for integrating OCR capabilities
//! into PDF processing workflows. It's designed to work seamlessly with the page analysis
//! module to process scanned pages and extract text from images.
//!
//! # Architecture
//!
//! The OCR system uses a trait-based approach that allows for multiple OCR providers:
//!
//! - **OcrProvider trait**: Generic interface for OCR engines
//! - **Pluggable implementations**: Support for local (Tesseract) and cloud (Azure, AWS) providers
//! - **Result standardization**: Consistent output format regardless of provider
//! - **Error handling**: Comprehensive error types for OCR operations
//!
//! # Usage
//!
//! ## Basic OCR Processing
//!
//! ```rust
//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
//! use oxidize_pdf::graphics::ImageFormat;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let provider = MockOcrProvider::new();
//! let options = OcrOptions::default();
//!
//! // Process image data directly - Mock JPEG data
//! let image_data = vec![
//!     0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
//!     0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9,
//! ];
//! let result = provider.process_image(&image_data, &options)?;
//!
//! println!("Extracted text: {}", result.text);
//! println!("Confidence: {:.2}%", result.confidence * 100.0);
//!
//! for fragment in result.fragments {
//!     println!("Fragment: '{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! ## Integration with Page Analysis
//!
//! ```rust,no_run
//! use oxidize_pdf::operations::page_analysis::PageContentAnalyzer;
//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
//! use oxidize_pdf::parser::PdfReader;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let document = PdfReader::open_document("scanned.pdf")?;
//! let analyzer = PageContentAnalyzer::new(document);
//! let provider = MockOcrProvider::new();
//!
//! // Find scanned pages
//! let scanned_pages = analyzer.find_scanned_pages()?;
//!
//! for page_num in scanned_pages {
//!     let analysis = analyzer.analyze_page(page_num)?;
//!     if analysis.is_scanned() {
//!         println!("Processing scanned page {}", page_num);
//!         // OCR processing would happen here
//!     }
//! }
//! # Ok(())
//! # }
//! ```

use crate::graphics::ImageFormat;
use crate::operations::page_analysis::ContentAnalysis;
use std::fmt;

/// Result type for OCR operations
pub type OcrResult<T> = Result<T, OcrError>;

/// Errors that can occur during OCR processing
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// OCR provider is not available or not configured
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),

    /// Unsupported image format for OCR processing
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),

    /// Invalid or corrupted image data
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),

    /// OCR processing failed
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),

    /// Network error when using cloud OCR providers
    #[error("Network error: {0}")]
    NetworkError(String),

    /// API key or authentication error
    #[error("Authentication error: {0}")]
    AuthenticationError(String),

    /// Rate limiting or quota exceeded
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// OCR provider returned low confidence results
    #[error("Low confidence results: {0}")]
    LowConfidence(String),

    /// Generic IO error
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    Configuration(String),
}

/// A rectangular region for selective OCR processing
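///
/// # Example
///
/// A minimal usage sketch (this assumes `OcrRegion` is re-exported from
/// `oxidize_pdf::text` like the other types shown in the module examples):
///
/// ```rust
/// use oxidize_pdf::text::OcrRegion;
///
/// let header = OcrRegion::with_label(0, 0, 600, 100, "header");
/// let body = OcrRegion::new(0, 80, 600, 700);
///
/// assert!(header.contains_point(10, 10));
/// assert!(!header.contains_point(10, 100)); // bottom edge is exclusive
/// assert!(header.overlaps_with(&body)); // rows 80..100 are shared
/// ```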
#[derive(Debug, Clone, PartialEq)]
pub struct OcrRegion {
    /// X coordinate of the top-left corner (pixels)
    pub x: u32,

    /// Y coordinate of the top-left corner (pixels)
    pub y: u32,

    /// Width of the region (pixels)
    pub width: u32,

    /// Height of the region (pixels)
    pub height: u32,

    /// Optional label for this region (e.g., "header", "table", "paragraph")
    pub label: Option<String>,
}

impl OcrRegion {
    /// Create a new OCR region
    pub fn new(x: u32, y: u32, width: u32, height: u32) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: None,
        }
    }

    /// Create a new OCR region with a label
    pub fn with_label(x: u32, y: u32, width: u32, height: u32, label: impl Into<String>) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: Some(label.into()),
        }
    }

    /// Check if this region contains a point
    pub fn contains_point(&self, x: u32, y: u32) -> bool {
        x >= self.x && x < self.x + self.width && y >= self.y && y < self.y + self.height
    }

    /// Check if this region overlaps with another region
    pub fn overlaps_with(&self, other: &OcrRegion) -> bool {
        !(self.x + self.width <= other.x
            || other.x + other.width <= self.x
            || self.y + self.height <= other.y
            || other.y + other.height <= self.y)
    }
}

/// OCR processing options and configuration
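///
/// # Example
///
/// A minimal configuration sketch; unspecified fields keep their defaults:
///
/// ```rust
/// use oxidize_pdf::text::OcrOptions;
///
/// let options = OcrOptions {
///     language: "es".to_string(), // Spanish
///     min_confidence: 0.8,        // discard low-confidence fragments
///     ..Default::default()
/// };
/// assert!(options.preserve_layout); // default still applies
/// ```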
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Target language for OCR (ISO 639-1 code, e.g., "en", "es", "fr")
    pub language: String,

    /// Minimum confidence threshold (0.0 to 1.0)
    pub min_confidence: f64,

    /// Whether to preserve text layout and positioning
    pub preserve_layout: bool,

    /// Image preprocessing options
    pub preprocessing: ImagePreprocessing,

    /// OCR engine specific options
    pub engine_options: std::collections::HashMap<String, String>,

    /// Timeout for OCR operations (in seconds)
    pub timeout_seconds: u32,

    /// Specific regions to process (None = process entire image)
    pub regions: Option<Vec<OcrRegion>>,

    /// Whether to save extracted images for debug purposes
    pub debug_output: bool,
}

impl Default for OcrOptions {
    fn default() -> Self {
        Self {
            language: "en".to_string(),
            min_confidence: 0.6,
            preserve_layout: true,
            preprocessing: ImagePreprocessing::default(),
            engine_options: std::collections::HashMap::new(),
            timeout_seconds: 60, // Increased for complex documents
            regions: None,
            debug_output: false,
        }
    }
}

/// Image preprocessing options for OCR
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Whether to apply image denoising
    pub denoise: bool,

    /// Whether to apply image deskewing
    pub deskew: bool,

    /// Whether to enhance contrast
    pub enhance_contrast: bool,

    /// Whether to apply image sharpening
    pub sharpen: bool,

    /// Scale factor for image resizing (1.0 = no scaling)
    pub scale_factor: f64,
}

impl Default for ImagePreprocessing {
    fn default() -> Self {
        Self {
            denoise: true,
            deskew: true,
            enhance_contrast: true,
            sharpen: false,
            scale_factor: 1.0,
        }
    }
}

/// Word-level confidence information for detailed OCR analysis
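///
/// # Example
///
/// A short sketch (this assumes `WordConfidence` is re-exported from
/// `oxidize_pdf::text` like the other types in this module):
///
/// ```rust
/// use oxidize_pdf::text::WordConfidence;
///
/// let word = WordConfidence::new("invoice".to_string(), 0.55, 0.0, 48.0);
/// assert!(word.is_low_confidence(0.6));
/// assert!(word.average_character_confidence().is_none()); // no char-level data
/// ```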
#[derive(Debug, Clone)]
pub struct WordConfidence {
    /// The word text
    pub word: String,

    /// Confidence score for this specific word (0.0 to 1.0)
    pub confidence: f64,

    /// X position of the word within the fragment (relative to fragment start)
    pub x_offset: f64,

    /// Width of the word in points
    pub width: f64,

    /// Optional character-level confidences (finest granularity)
    pub character_confidences: Option<Vec<CharacterConfidence>>,
}

impl WordConfidence {
    /// Create a new word confidence
    pub fn new(word: String, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: None,
        }
    }

    /// Create a word confidence with character-level details
    pub fn with_characters(
        word: String,
        confidence: f64,
        x_offset: f64,
        width: f64,
        character_confidences: Vec<CharacterConfidence>,
    ) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: Some(character_confidences),
        }
    }

    /// Get the average character confidence if available
    pub fn average_character_confidence(&self) -> Option<f64> {
        self.character_confidences.as_ref().map(|chars| {
            if chars.is_empty() {
                return 0.0;
            }
            let sum: f64 = chars.iter().map(|c| c.confidence).sum();
            sum / chars.len() as f64
        })
    }

    /// Check if this word has low confidence (below threshold)
    pub fn is_low_confidence(&self, threshold: f64) -> bool {
        self.confidence < threshold
    }
}

/// Character-level confidence information for the finest OCR granularity
#[derive(Debug, Clone)]
pub struct CharacterConfidence {
    /// The character
    pub character: char,

    /// Confidence score for this character (0.0 to 1.0)
    pub confidence: f64,

    /// X position relative to word start
    pub x_offset: f64,

    /// Character width in points
    pub width: f64,
}

impl CharacterConfidence {
    /// Create a new character confidence
    pub fn new(character: char, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            character,
            confidence,
            x_offset,
            width,
        }
    }
}

/// Candidate for OCR post-processing correction
#[derive(Debug, Clone)]
pub struct CorrectionCandidate {
    /// The original word with low confidence or errors
    pub word: String,

    /// Original confidence score
    pub confidence: f64,

    /// Position within the text fragment
    pub position_in_fragment: usize,

    /// Suggested corrections ranked by likelihood
    pub suggested_corrections: Vec<CorrectionSuggestion>,

    /// Reason why this word needs correction
    pub correction_reason: CorrectionReason,
}

/// A suggested correction for an OCR error
#[derive(Debug, Clone)]
pub struct CorrectionSuggestion {
    /// The corrected word
    pub corrected_word: String,

    /// Confidence in this correction (0.0 to 1.0)
    pub correction_confidence: f64,

    /// Type of correction applied
    pub correction_type: CorrectionType,

    /// Explanation of why this correction was suggested
    pub explanation: Option<String>,
}

/// Reasons why a word might need correction
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorrectionReason {
    /// Word has low OCR confidence
    LowConfidence,

    /// Word contains common OCR confusion patterns
    ConfusionPattern,

    /// Word not found in dictionary
    NotInDictionary,

    /// Word doesn't fit context
    ContextualError,

    /// Word has suspicious character combinations
    SuspiciousPattern,
}

/// Types of corrections that can be applied
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CorrectionType {
    /// Character substitution (e.g., "0" -> "O")
    CharacterSubstitution,

    /// Dictionary lookup and replacement
    DictionaryCorrection,

    /// Contextual correction based on surrounding words
    ContextualCorrection,

    /// Pattern-based correction (e.g., "rn" -> "m")
    PatternCorrection,

    /// Manual review suggested
    ManualReview,
}

/// OCR post-processor for automatic text correction
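///
/// # Example
///
/// A minimal sketch of suggestion generation (this assumes `OcrPostProcessor`
/// is re-exported from `oxidize_pdf::text` like the other types in this module):
///
/// ```rust
/// use oxidize_pdf::text::OcrPostProcessor;
///
/// let processor = OcrPostProcessor::new();
///
/// // "c0de" contains the common 0 -> O confusion, so a character
/// // substitution suggestion should be among the candidates.
/// let suggestions = processor.generate_suggestions("c0de");
/// assert!(suggestions.iter().any(|s| s.corrected_word == "cOde"));
/// ```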
#[derive(Debug, Clone)]
pub struct OcrPostProcessor {
    /// Common OCR character confusions
    pub character_corrections: std::collections::HashMap<char, Vec<char>>,

    /// Dictionary of valid words (optional)
    pub dictionary: Option<std::collections::HashSet<String>>,

    /// Common pattern corrections
    pub pattern_corrections: std::collections::HashMap<String, String>,

    /// Confidence threshold for correction
    pub correction_threshold: f64,

    /// Maximum edit distance for corrections
    pub max_edit_distance: usize,
}

impl OcrPostProcessor {
    /// Create a new post-processor with common OCR corrections
    pub fn new() -> Self {
        let mut character_corrections = std::collections::HashMap::new();

        // Common OCR character confusions
        character_corrections.insert('0', vec!['O', 'o', 'Q']);
        character_corrections.insert('O', vec!['0', 'Q', 'o']);
        character_corrections.insert('1', vec!['l', 'I', '|']);
        character_corrections.insert('l', vec!['1', 'I', '|']);
        character_corrections.insert('I', vec!['1', 'l', '|']);
        character_corrections.insert('S', vec!['5', '$']);
        character_corrections.insert('5', vec!['S', '$']);
        character_corrections.insert('2', vec!['Z', 'z']);
        character_corrections.insert('Z', vec!['2', 'z']);

        let mut pattern_corrections = std::collections::HashMap::new();
        pattern_corrections.insert("rn".to_string(), "m".to_string());
        pattern_corrections.insert("cl".to_string(), "d".to_string());
        // Unicode ligatures normalized to their ASCII letter pairs
        pattern_corrections.insert("\u{FB01}".to_string(), "fi".to_string()); // "fi" ligature
        pattern_corrections.insert("\u{FB02}".to_string(), "fl".to_string()); // "fl" ligature

        Self {
            character_corrections,
            dictionary: None,
            pattern_corrections,
            correction_threshold: 0.7,
            max_edit_distance: 2,
        }
    }

    /// Add a dictionary for word validation
    pub fn with_dictionary(mut self, dictionary: std::collections::HashSet<String>) -> Self {
        self.dictionary = Some(dictionary);
        self
    }

    /// Process a fragment and suggest corrections
    pub fn process_fragment(&self, fragment: &OcrTextFragment) -> Vec<CorrectionCandidate> {
        let mut candidates = fragment.get_correction_candidates(self.correction_threshold);

        // Enhance candidates with suggestions
        for candidate in &mut candidates {
            candidate.suggested_corrections = self.generate_suggestions(&candidate.word);
        }

        candidates
    }

    /// Generate correction suggestions for a word
    pub fn generate_suggestions(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();

        // Character substitution corrections
        suggestions.extend(self.character_substitution_corrections(word));

        // Pattern-based corrections
        suggestions.extend(self.pattern_corrections(word));

        // Dictionary corrections (if available)
        if let Some(dict) = &self.dictionary {
            suggestions.extend(self.dictionary_corrections(word, dict));
        }

        // Sort by confidence and limit results
        suggestions.sort_by(|a, b| {
            b.correction_confidence
                .partial_cmp(&a.correction_confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        suggestions.truncate(5); // Limit to top 5 suggestions

        suggestions
    }

    /// Generate character substitution corrections
    fn character_substitution_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();
        let chars: Vec<char> = word.chars().collect();

        for (i, &ch) in chars.iter().enumerate() {
            if let Some(alternatives) = self.character_corrections.get(&ch) {
                for &alt_ch in alternatives {
                    let mut corrected_chars = chars.clone();
                    corrected_chars[i] = alt_ch;
                    let corrected_word: String = corrected_chars.into_iter().collect();

                    suggestions.push(CorrectionSuggestion {
                        corrected_word,
                        correction_confidence: 0.8,
                        correction_type: CorrectionType::CharacterSubstitution,
                        explanation: Some(format!("'{}' -> '{}' substitution", ch, alt_ch)),
                    });
                }
            }
        }

        suggestions
    }

    /// Generate pattern-based corrections
    fn pattern_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();

        for (pattern, replacement) in &self.pattern_corrections {
            if word.contains(pattern) {
                let corrected_word = word.replace(pattern, replacement);
                suggestions.push(CorrectionSuggestion {
                    corrected_word,
                    correction_confidence: 0.85,
                    correction_type: CorrectionType::PatternCorrection,
                    explanation: Some(format!(
                        "Pattern '{}' -> '{}' correction",
                        pattern, replacement
                    )),
                });
            }
        }

        suggestions
    }

    /// Generate dictionary-based corrections
    fn dictionary_corrections(
        &self,
        word: &str,
        dictionary: &std::collections::HashSet<String>,
    ) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();

        // Words already in the dictionary need no correction
        if dictionary.contains(word) {
            return suggestions;
        }

        // Find similar words within the configured edit distance
        for dict_word in dictionary {
            let distance = self.edit_distance(word, dict_word);
            if distance <= self.max_edit_distance {
                let max_len = word.chars().count().max(dict_word.chars().count());
                let confidence = 1.0 - (distance as f64 / max_len as f64);
                suggestions.push(CorrectionSuggestion {
                    corrected_word: dict_word.clone(),
                    correction_confidence: confidence * 0.9, // Slightly lower than pattern corrections
                    correction_type: CorrectionType::DictionaryCorrection,
                    explanation: Some(format!(
                        "Dictionary match with edit distance {}",
                        distance
                    )),
                });
            }
        }

        suggestions
    }

    /// Calculate the Levenshtein edit distance between two strings
    fn edit_distance(&self, s1: &str, s2: &str) -> usize {
        // Compare by characters rather than bytes so multi-byte UTF-8 input
        // indexes correctly into the DP table
        let s1_chars: Vec<char> = s1.chars().collect();
        let s2_chars: Vec<char> = s2.chars().collect();
        let len1 = s1_chars.len();
        let len2 = s2_chars.len();

        // dp[i][j] = distance between the first i chars of s1 and first j chars of s2
        let mut dp = vec![vec![0; len2 + 1]; len1 + 1];

        #[allow(clippy::needless_range_loop)]
        for i in 0..=len1 {
            dp[i][0] = i;
        }
        for j in 0..=len2 {
            dp[0][j] = j;
        }

        for i in 1..=len1 {
            for j in 1..=len2 {
                if s1_chars[i - 1] == s2_chars[j - 1] {
                    dp[i][j] = dp[i - 1][j - 1];
                } else {
                    dp[i][j] = 1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1]);
                }
            }
        }

        dp[len1][len2]
    }
}

impl Default for OcrPostProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// Text fragment extracted by OCR with position and confidence information
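///
/// # Example
///
/// A short sketch of word-level confidence inspection (this assumes these
/// types are re-exported from `oxidize_pdf::text` like the rest of the module):
///
/// ```rust
/// use oxidize_pdf::text::{FragmentType, OcrTextFragment, WordConfidence};
///
/// let fragment = OcrTextFragment::with_word_confidences(
///     "Total 42".to_string(),
///     50.0, 700.0, 80.0, 14.0, // x, y, width, height (points)
///     0.9, 12.0, FragmentType::Line,
///     vec![
///         WordConfidence::new("Total".to_string(), 0.95, 0.0, 40.0),
///         WordConfidence::new("42".to_string(), 0.40, 45.0, 20.0),
///     ],
/// );
///
/// assert!(fragment.has_low_confidence_words(0.6));
/// assert_eq!(fragment.get_low_confidence_words(0.6).len(), 1);
/// ```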
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The extracted text content
    pub text: String,

    /// X position in page coordinates (points)
    pub x: f64,

    /// Y position in page coordinates (points)
    pub y: f64,

    /// Width of the text fragment (points)
    pub width: f64,

    /// Height of the text fragment (points)
    pub height: f64,

    /// Confidence score for this fragment (0.0 to 1.0)
    pub confidence: f64,

    /// Word-level confidence scores (optional, for advanced OCR engines)
    pub word_confidences: Option<Vec<WordConfidence>>,

    /// Font size estimation (points)
    pub font_size: f64,

    /// Whether this fragment is a character, word, line, or paragraph
    pub fragment_type: FragmentType,
}

impl OcrTextFragment {
    /// Create a new OCR text fragment
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        text: String,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        confidence: f64,
        font_size: f64,
        fragment_type: FragmentType,
    ) -> Self {
        Self {
            text,
            x,
            y,
            width,
            height,
            confidence,
            word_confidences: None,
            font_size,
            fragment_type,
        }
    }

    /// Create a fragment with word-level confidence scores
    #[allow(clippy::too_many_arguments)]
    pub fn with_word_confidences(
        text: String,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        confidence: f64,
        font_size: f64,
        fragment_type: FragmentType,
        word_confidences: Vec<WordConfidence>,
    ) -> Self {
        Self {
            text,
            x,
            y,
            width,
            height,
            confidence,
            word_confidences: Some(word_confidences),
            font_size,
            fragment_type,
        }
    }

    /// Get words with confidence below the threshold
    pub fn get_low_confidence_words(&self, threshold: f64) -> Vec<&WordConfidence> {
        self.word_confidences
            .as_ref()
            .map(|words| words.iter().filter(|w| w.confidence < threshold).collect())
            .unwrap_or_default()
    }

    /// Get the average word confidence if available
    pub fn average_word_confidence(&self) -> Option<f64> {
        self.word_confidences.as_ref().map(|words| {
            if words.is_empty() {
                return 0.0;
            }
            let sum: f64 = words.iter().map(|w| w.confidence).sum();
            sum / words.len() as f64
        })
    }

    /// Get words sorted by confidence (lowest first)
    pub fn words_by_confidence(&self) -> Vec<&WordConfidence> {
        self.word_confidences
            .as_ref()
            .map(|words| {
                let mut sorted_words: Vec<_> = words.iter().collect();
                sorted_words.sort_by(|a, b| {
                    a.confidence
                        .partial_cmp(&b.confidence)
                        .unwrap_or(std::cmp::Ordering::Equal)
                });
                sorted_words
            })
            .unwrap_or_default()
    }

    /// Check if this fragment has any low-confidence words
    pub fn has_low_confidence_words(&self, threshold: f64) -> bool {
        self.word_confidences
            .as_ref()
            .map(|words| words.iter().any(|w| w.confidence < threshold))
            .unwrap_or(false)
    }

    /// Get words that are candidates for correction (low confidence + patterns)
    pub fn get_correction_candidates(&self, threshold: f64) -> Vec<CorrectionCandidate> {
        self.word_confidences
            .as_ref()
            .map(|words| {
                words
                    .iter()
                    .enumerate()
                    .filter(|(_, w)| w.confidence < threshold)
                    .map(|(index, word)| CorrectionCandidate {
                        word: word.word.clone(),
                        confidence: word.confidence,
                        position_in_fragment: index,
                        suggested_corrections: vec![], // Will be filled by post-processor
                        correction_reason: CorrectionReason::LowConfidence,
                    })
                    .collect()
            })
            .unwrap_or_default()
    }

    /// Generate a confidence report for this fragment
    pub fn confidence_report(&self) -> String {
        let mut report = format!(
            "Fragment confidence: {:.1}% - \"{}\"\n",
            self.confidence * 100.0,
            self.text.trim()
        );

        if let Some(words) = &self.word_confidences {
            report.push_str(&format!(
                "  Word-level breakdown ({} words):\n",
                words.len()
            ));
            for (i, word) in words.iter().enumerate() {
                report.push_str(&format!(
                    "    {}: \"{}\" - {:.1}%\n",
                    i + 1,
                    word.word,
                    word.confidence * 100.0
                ));

                if let Some(chars) = &word.character_confidences {
                    report.push_str("      Characters: ");
                    for ch in chars {
                        report.push_str(&format!(
                            "'{}'({:.0}%) ",
                            ch.character,
                            ch.confidence * 100.0
                        ));
                    }
                    report.push('\n');
                }
            }
        } else {
            report.push_str("  (No word-level data available)\n");
        }

        report
    }
}

/// Type of text fragment
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// Individual character
    Character,
    /// Complete word
    Word,
    /// Text line
    Line,
    /// Paragraph
    Paragraph,
}

/// Complete result of OCR processing
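///
/// # Example
///
/// A minimal filtering sketch using the mock provider:
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let provider = MockOcrProvider::new();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
///
/// let result = provider.process_image(&image_data, &OcrOptions::default())?;
///
/// // Keep only fragments the engine is reasonably sure about
/// let confident = result.filter_by_confidence(0.8);
/// assert!(confident.len() <= result.fragments.len());
/// # Ok(())
/// # }
/// ```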
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// The complete extracted text
    pub text: String,

    /// Overall confidence score (0.0 to 1.0)
    pub confidence: f64,

    /// Individual text fragments with position information
    pub fragments: Vec<OcrTextFragment>,

    /// Processing time in milliseconds
    pub processing_time_ms: u64,

    /// OCR engine used for processing
    pub engine_name: String,

    /// Language detected/used
    pub language: String,

    /// Region that was processed (None if the entire image was processed)
    pub processed_region: Option<OcrRegion>,

    /// Dimensions of the processed image (width, height) in pixels
    pub image_dimensions: (u32, u32),
}

impl OcrProcessingResult {
    /// Create a new OCR processing result
    pub fn new(
        text: String,
        confidence: f64,
        fragments: Vec<OcrTextFragment>,
        processing_time_ms: u64,
        engine_name: String,
        language: String,
        image_dimensions: (u32, u32),
    ) -> Self {
        Self {
            text,
            confidence,
            fragments,
            processing_time_ms,
            engine_name,
            language,
            processed_region: None,
            image_dimensions,
        }
    }

    /// Create a new OCR processing result for a specific region
    #[allow(clippy::too_many_arguments)]
    pub fn with_region(
        text: String,
        confidence: f64,
        fragments: Vec<OcrTextFragment>,
        processing_time_ms: u64,
        engine_name: String,
        language: String,
        image_dimensions: (u32, u32),
        region: OcrRegion,
    ) -> Self {
        Self {
            text,
            confidence,
            fragments,
            processing_time_ms,
            engine_name,
            language,
            processed_region: Some(region),
            image_dimensions,
        }
    }

    /// Filter fragments by minimum confidence
    pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| fragment.confidence >= min_confidence)
            .collect()
    }

    /// Get text fragments fully contained within a specific region
    pub fn fragments_in_region(
        &self,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
    ) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| {
                fragment.x >= x
                    && fragment.y >= y
                    && fragment.x + fragment.width <= x + width
                    && fragment.y + fragment.height <= y + height
            })
            .collect()
    }

    /// Get fragments of a specific type
    pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| fragment.fragment_type == fragment_type)
            .collect()
    }

    /// Calculate the average confidence across all fragments
    pub fn average_confidence(&self) -> f64 {
        if self.fragments.is_empty() {
            return 0.0;
        }

        let sum: f64 = self.fragments.iter().map(|f| f.confidence).sum();
        sum / self.fragments.len() as f64
    }
}

/// Supported OCR engines
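///
/// # Example
///
/// A short sketch of capability checks:
///
/// ```rust
/// use oxidize_pdf::text::OcrEngine;
/// use oxidize_pdf::graphics::ImageFormat;
///
/// assert_eq!(OcrEngine::Tesseract.name(), "Tesseract");
/// assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Tiff));
/// assert!(!OcrEngine::Azure.supports_format(ImageFormat::Tiff));
/// ```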
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// Mock OCR provider for testing
    Mock,
    /// Tesseract OCR (local processing)
    Tesseract,
    /// Azure Computer Vision OCR
    Azure,
    /// AWS Textract
    Aws,
    /// Google Cloud Vision OCR
    GoogleCloud,
}

impl OcrEngine {
    /// Get the name of the OCR engine
    pub fn name(&self) -> &'static str {
        match self {
            OcrEngine::Mock => "Mock OCR",
            OcrEngine::Tesseract => "Tesseract",
            OcrEngine::Azure => "Azure Computer Vision",
            OcrEngine::Aws => "AWS Textract",
            OcrEngine::GoogleCloud => "Google Cloud Vision",
        }
    }

    /// Check if this engine supports the given image format
    pub fn supports_format(&self, format: ImageFormat) -> bool {
        match self {
            OcrEngine::Mock => true, // Mock supports all formats
            OcrEngine::Tesseract => matches!(
                format,
                ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
            ),
            OcrEngine::Azure => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
            OcrEngine::Aws => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
            OcrEngine::GoogleCloud => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
        }
    }
}

impl fmt::Display for OcrEngine {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.name())
    }
}

/// Trait for OCR providers
///
/// This trait defines the interface that all OCR providers must implement.
/// It provides methods for processing images and extracting text with position information.
///
/// # Implementation Notes
///
/// - Implementations should handle errors gracefully and return meaningful error messages
/// - The `process_image` method is the core functionality that all providers must implement
/// - The `process_page` method is a convenience method for working with page analysis results
/// - Providers should validate image formats and reject unsupported formats
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{OcrProvider, OcrOptions, OcrProcessingResult, OcrError, OcrEngine};
/// use oxidize_pdf::graphics::ImageFormat;
///
/// struct MyOcrProvider;
///
/// impl OcrProvider for MyOcrProvider {
///     fn process_image(&self, image_data: &[u8], options: &OcrOptions) -> Result<OcrProcessingResult, OcrError> {
///         // Implementation here
///         # Ok(OcrProcessingResult {
///         #     text: "Sample text".to_string(),
///         #     confidence: 0.95,
///         #     fragments: vec![],
///         #     processing_time_ms: 100,
///         #     engine_name: "MyOCR".to_string(),
///         #     language: "en".to_string(),
///         #     image_dimensions: (800, 600),
///         #     processed_region: None,
///         # })
///     }
///
///     fn supported_formats(&self) -> Vec<ImageFormat> {
///         vec![ImageFormat::Jpeg, ImageFormat::Png]
///     }
///
///     fn engine_name(&self) -> &str {
///         "MyOCR"
///     }
///
///     fn engine_type(&self) -> OcrEngine {
///         OcrEngine::Mock
///     }
/// }
/// ```
pub trait OcrProvider: Send + Sync {
    /// Process an image and extract text using OCR
    ///
    /// This is the core method that all OCR providers must implement.
    /// It takes image data as bytes and returns structured text results.
    ///
    /// # Arguments
    ///
    /// * `image_data` - Raw image bytes (JPEG, PNG, or TIFF)
    /// * `options` - OCR processing options and configuration
    ///
    /// # Returns
    ///
    /// A `Result` containing the OCR results with text, confidence, and positioning information.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The image format is not supported
    /// - The image data is corrupted or invalid
    /// - OCR processing fails
    /// - Network errors occur (for cloud providers)
    /// - Authentication fails (for cloud providers)
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult>;

    /// Process a scanned page using content analysis information
    ///
    /// This method provides a higher-level interface that works with page analysis results.
    /// It's particularly useful when integrating with the page analysis module.
    ///
    /// # Arguments
    ///
    /// * `page_analysis` - Results from page content analysis
    /// * `page_data` - Raw page data or image data
    /// * `options` - OCR processing options
    ///
    /// # Returns
    ///
    /// OCR results optimized for the specific page content type.
    ///
    /// # Default Implementation
    ///
    /// The default implementation simply calls `process_image` with the page data.
    /// Providers can override this to provide specialized handling based on page analysis.
    fn process_page(
        &self,
        _page_analysis: &ContentAnalysis,
        page_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        self.process_image(page_data, options)
    }

    /// Process multiple images with region information
    ///
    /// This method allows for selective OCR processing where each image corresponds
    /// to a specific region. This is useful for:
    /// - Processing pre-cropped regions of a document
    /// - Batch processing of multiple regions with different OCR settings
    /// - Optimizing performance by avoiding full-image processing
    ///
    /// # Arguments
    ///
    /// * `image_region_pairs` - Vector of (image_data, region) pairs
    /// * `options` - OCR processing options (applies to all regions)
    ///
    /// # Returns
    ///
    /// A vector of `OcrProcessingResult`, one for each processed region.
    /// The order matches the input pairs vector.
    ///
    /// # Default Implementation
    ///
    /// The default implementation processes each image separately and sets
    /// the region information in the result.
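    ///
    /// # Example
    ///
    /// A minimal sketch using the mock provider (this assumes `OcrRegion` is
    /// re-exported from `oxidize_pdf::text` like the other types here):
    ///
    /// ```rust
    /// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider, OcrRegion};
    ///
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let provider = MockOcrProvider::new();
    /// let jpeg = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
    /// let region = OcrRegion::new(100, 200, 300, 50);
    ///
    /// let results =
    ///     provider.process_image_regions(&[(&jpeg[..], &region)], &OcrOptions::default())?;
    /// assert_eq!(results.len(), 1);
    /// // Fragment coordinates are shifted by the region offset
    /// assert_eq!(results[0].processed_region.as_ref().map(|r| r.x), Some(100));
    /// # Ok(())
    /// # }
    /// ```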
    fn process_image_regions(
        &self,
        image_region_pairs: &[(&[u8], &OcrRegion)],
        options: &OcrOptions,
    ) -> OcrResult<Vec<OcrProcessingResult>> {
        let mut results = Vec::with_capacity(image_region_pairs.len());

        for (image_data, region) in image_region_pairs {
            let mut result = self.process_image(image_data, options)?;

            // Adjust fragment coordinates to match original image coordinates
            // (assuming the input image_data is already cropped to the region)
            for fragment in &mut result.fragments {
                fragment.x += region.x as f64;
                fragment.y += region.y as f64;
            }

            result.processed_region = Some((*region).clone());
            results.push(result);
        }

        Ok(results)
    }

    /// Get the list of supported image formats
    ///
    /// # Returns
    ///
    /// A vector of `ImageFormat` values that this provider can process.
    fn supported_formats(&self) -> Vec<ImageFormat>;

    /// Get the name of this OCR provider
    ///
    /// # Returns
    ///
    /// A string identifying this provider (e.g., "Tesseract", "Azure OCR").
    fn engine_name(&self) -> &str;

    /// Get the engine type for this provider
    ///
    /// # Returns
    ///
    /// The `OcrEngine` enum value corresponding to this provider.
    fn engine_type(&self) -> OcrEngine;

    /// Check if this provider supports the given image format
    ///
    /// # Arguments
    ///
    /// * `format` - The image format to check
    ///
    /// # Returns
    ///
    /// `true` if the format is supported, `false` otherwise.
    fn supports_format(&self, format: ImageFormat) -> bool {
        self.supported_formats().contains(&format)
    }

    /// Validate image data before processing
    ///
    /// This method can be used to perform basic validation of image data
    /// before attempting OCR processing.
    ///
    /// # Arguments
    ///
    /// * `image_data` - Raw image bytes to validate
    ///
    /// # Returns
    ///
    /// `Ok(())` if the image data is valid, `Err(OcrError)` otherwise.
    ///
    /// # Default Implementation
    ///
    /// The default implementation performs basic format detection based on magic bytes.
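    ///
    /// # Example
    ///
    /// A short sketch using the mock provider:
    ///
    /// ```rust
    /// use oxidize_pdf::text::{MockOcrProvider, OcrProvider};
    ///
    /// let provider = MockOcrProvider::new();
    ///
    /// // JPEG magic bytes pass validation; unrecognized bytes do not
    /// assert!(provider.validate_image_data(&[0xFF, 0xD8, 0xFF, 0xE0, 0, 0, 0, 0]).is_ok());
    /// assert!(provider.validate_image_data(&[0x00; 8]).is_err());
    /// ```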
    fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
        if image_data.len() < 8 {
            return Err(OcrError::InvalidImageData(
                "Image data too short".to_string(),
            ));
        }

        // Check for common image format signatures
        let format = if image_data.starts_with(b"\xFF\xD8\xFF") {
            ImageFormat::Jpeg
        } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
            ImageFormat::Png
        } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
            ImageFormat::Tiff
        } else {
            return Err(OcrError::InvalidImageData(
                "Unrecognized image format".to_string(),
            ));
        };

        if !self.supports_format(format) {
            return Err(OcrError::UnsupportedImageFormat(format));
        }

        Ok(())
    }
}

/// Mock OCR provider for testing and development
///
/// This provider simulates OCR processing without actually performing text recognition.
/// It's useful for testing OCR workflows and developing OCR-dependent functionality.
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// let provider = MockOcrProvider::new();
/// let options = OcrOptions::default();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46]; // Mock JPEG data
///
/// let result = provider.process_image(&image_data, &options).unwrap();
/// assert!(result.text.contains("Mock OCR"));
/// ```
#[derive(Clone)]
pub struct MockOcrProvider {
    /// Mock confidence level to return
    confidence: f64,
    /// Mock text to return
    mock_text: String,
    /// Simulated processing delay (milliseconds)
    processing_delay_ms: u64,
}

impl MockOcrProvider {
    /// Create a new mock OCR provider with default settings
    pub fn new() -> Self {
        Self {
            confidence: 0.85,
            mock_text: "Mock OCR extracted text from scanned image".to_string(),
            processing_delay_ms: 100,
        }
    }

    /// Create a mock provider with custom text and confidence
    pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
        Self {
            confidence,
            mock_text: text,
            processing_delay_ms: 100,
        }
    }

    /// Set the mock text to return
    pub fn set_mock_text(&mut self, text: String) {
        self.mock_text = text;
    }

    /// Set the confidence level to return
    pub fn set_confidence(&mut self, confidence: f64) {
        self.confidence = confidence.clamp(0.0, 1.0);
    }

    /// Set the simulated processing delay
    pub fn set_processing_delay(&mut self, delay_ms: u64) {
        self.processing_delay_ms = delay_ms;
    }
}

impl Default for MockOcrProvider {
    fn default() -> Self {
        Self::new()
    }
}

impl OcrProvider for MockOcrProvider {
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        // Validate image data
        self.validate_image_data(image_data)?;

        // Simulate processing time
        std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));

        // Create mock text fragments
        let fragments = vec![
            OcrTextFragment {
                text: self.mock_text.clone(),
                x: 50.0,
                y: 700.0,
                width: 200.0,
                height: 20.0,
                confidence: self.confidence,
                word_confidences: None,
                font_size: 12.0,
                fragment_type: FragmentType::Line,
            },
            OcrTextFragment {
                text: "Additional mock text".to_string(),
                x: 50.0,
                y: 680.0,
                width: 150.0,
                height: 20.0,
                confidence: self.confidence * 0.9,
                word_confidences: None,
                font_size: 12.0,
                fragment_type: FragmentType::Line,
            },
        ];

        Ok(OcrProcessingResult {
            text: format!("{}\nAdditional mock text", self.mock_text),
            confidence: self.confidence,
            fragments,
            processing_time_ms: self.processing_delay_ms,
            engine_name: "Mock OCR".to_string(),
            language: options.language.clone(),
            processed_region: None,
            image_dimensions: (800, 600), // Mock dimensions
        })
    }

    fn supported_formats(&self) -> Vec<ImageFormat> {
        vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
    }

    fn engine_name(&self) -> &str {
        "Mock OCR"
    }

    fn engine_type(&self) -> OcrEngine {
        OcrEngine::Mock
    }
}

#[cfg(test)]
mod tests;

#[cfg(test)]
mod postprocessor_tests;

#[cfg(test)]
mod rigorous_tests;