oxidize_pdf/text/
ocr.rs

1//! OCR (Optical Character Recognition) support for PDF processing
2//!
3//! This module provides a flexible, pluggable architecture for integrating OCR capabilities
4//! into PDF processing workflows. It's designed to work seamlessly with the page analysis
5//! module to process scanned pages and extract text from images.
6//!
7//! # Architecture
8//!
9//! The OCR system uses a trait-based approach that allows for multiple OCR providers:
10//!
11//! - **OcrProvider trait**: Generic interface for OCR engines
12//! - **Pluggable implementations**: Support for local (Tesseract) and cloud (Azure, AWS) providers
13//! - **Result standardization**: Consistent output format regardless of provider
14//! - **Error handling**: Comprehensive error types for OCR operations
15//!
16//! # Usage
17//!
18//! ## Basic OCR Processing
19//!
20//! ```rust
21//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
22//! use oxidize_pdf::graphics::ImageFormat;
23//!
24//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
25//! let provider = MockOcrProvider::new();
26//! let options = OcrOptions::default();
27//!
28//! // Process image data directly - Mock JPEG data
29//! let image_data = vec![
30//!     0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01,
31//!     0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xFF, 0xD9
32//! ];
33//! let result = provider.process_image(&image_data, &options)?;
34//!
35//! println!("Extracted text: {}", result.text);
36//! println!("Confidence: {:.2}%", result.confidence * 100.0);
37//!
38//! for fragment in result.fragments {
39//!     println!("Fragment: '{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
40//! }
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! ## Integration with Page Analysis
46//!
47//! ```rust,no_run
48//! use oxidize_pdf::operations::page_analysis::PageContentAnalyzer;
49//! use oxidize_pdf::text::{MockOcrProvider, OcrOptions};
50//! use oxidize_pdf::parser::PdfReader;
51//!
52//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
53//! let document = PdfReader::open_document("scanned.pdf")?;
54//! let analyzer = PageContentAnalyzer::new(document);
55//! let provider = MockOcrProvider::new();
56//!
57//! // Find scanned pages
58//! let scanned_pages = analyzer.find_scanned_pages()?;
59//!
60//! for page_num in scanned_pages {
61//!     let analysis = analyzer.analyze_page(page_num)?;
62//!     if analysis.is_scanned() {
63//!         println!("Processing scanned page {}", page_num);
64//!         // OCR processing would happen here
65//!     }
66//! }
67//! # Ok(())
68//! # }
69//! ```
70
71use crate::graphics::ImageFormat;
72use crate::operations::page_analysis::ContentAnalysis;
73use std::fmt;
74
/// Result type for OCR operations
///
/// Shorthand for `Result<T, OcrError>` used throughout the OCR APIs.
pub type OcrResult<T> = Result<T, OcrError>;

/// Errors that can occur during OCR processing
///
/// Covers local-engine failures, cloud-provider transport/auth issues, and
/// configuration problems. `thiserror` derives the `Display` and `Error`
/// impls; `std::io::Error` converts automatically via the `#[from]` variant.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// OCR provider is not available or not configured
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),

    /// Unsupported image format for OCR processing
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),

    /// Invalid or corrupted image data
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),

    /// OCR processing failed
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),

    /// Network error when using cloud OCR providers
    #[error("Network error: {0}")]
    NetworkError(String),

    /// API key or authentication error
    #[error("Authentication error: {0}")]
    AuthenticationError(String),

    /// Rate limiting or quota exceeded
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// OCR provider returned low confidence results
    #[error("Low confidence results: {0}")]
    LowConfidence(String),

    /// Generic IO error
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    Configuration(String),
}
121
/// A rectangular region for selective OCR processing
///
/// Coordinates are in image pixel space with the origin at the top-left
/// corner. A region may carry an optional semantic label (e.g. "header").
#[derive(Debug, Clone, PartialEq)]
pub struct OcrRegion {
    /// X coordinate of the top-left corner (pixels)
    pub x: u32,

    /// Y coordinate of the top-left corner (pixels)
    pub y: u32,

    /// Width of the region (pixels)
    pub width: u32,

    /// Height of the region (pixels)
    pub height: u32,

    /// Optional label for this region (e.g., "header", "table", "paragraph")
    pub label: Option<String>,
}

impl OcrRegion {
    /// Create a new OCR region without a label
    pub fn new(x: u32, y: u32, width: u32, height: u32) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: None,
        }
    }

    /// Create a new OCR region with a label
    pub fn with_label(x: u32, y: u32, width: u32, height: u32, label: impl Into<String>) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: Some(label.into()),
        }
    }

    /// Check if this region contains a point
    ///
    /// The right and bottom edges are exclusive. Arithmetic is widened to
    /// `u64` because the previous `u32` additions (`x + width`) overflowed
    /// for regions near `u32::MAX` — panicking in debug builds and
    /// silently wrapping in release builds.
    pub fn contains_point(&self, x: u32, y: u32) -> bool {
        let (px, py) = (u64::from(x), u64::from(y));
        px >= u64::from(self.x)
            && px < u64::from(self.x) + u64::from(self.width)
            && py >= u64::from(self.y)
            && py < u64::from(self.y) + u64::from(self.height)
    }

    /// Check if this region overlaps with another region
    ///
    /// Standard axis-aligned separation test; widened to `u64` to avoid
    /// `u32` overflow for regions extending past `u32::MAX`.
    pub fn overlaps_with(&self, other: &OcrRegion) -> bool {
        let (ax1, ay1) = (u64::from(self.x), u64::from(self.y));
        let (ax2, ay2) = (ax1 + u64::from(self.width), ay1 + u64::from(self.height));
        let (bx1, by1) = (u64::from(other.x), u64::from(other.y));
        let (bx2, by2) = (bx1 + u64::from(other.width), by1 + u64::from(other.height));
        !(ax2 <= bx1 || bx2 <= ax1 || ay2 <= by1 || by2 <= ay1)
    }
}
177
/// OCR processing options and configuration
///
/// Passed to every `OcrProvider` call; engine-specific switches go in
/// `engine_options` as string key/value pairs.
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Target language for OCR (ISO 639-1 code, e.g., "en", "es", "fr")
    pub language: String,

    /// Minimum confidence threshold (0.0 to 1.0)
    pub min_confidence: f64,

    /// Whether to preserve text layout and positioning
    pub preserve_layout: bool,

    /// Image preprocessing options
    pub preprocessing: ImagePreprocessing,

    /// OCR engine specific options
    pub engine_options: std::collections::HashMap<String, String>,

    /// Timeout for OCR operations (in seconds)
    pub timeout_seconds: u32,

    /// Specific regions to process (None = process entire image)
    pub regions: Option<Vec<OcrRegion>>,

    /// Whether to save extracted images for debug purposes
    pub debug_output: bool,
}

impl Default for OcrOptions {
    /// English, 60% minimum confidence, layout preserved, whole image,
    /// default preprocessing, 60-second timeout.
    fn default() -> Self {
        Self {
            language: "en".to_string(),
            min_confidence: 0.6,
            preserve_layout: true,
            preprocessing: ImagePreprocessing::default(),
            engine_options: std::collections::HashMap::new(),
            timeout_seconds: 60, // Increased to accommodate complex documents
            regions: None,
            debug_output: false,
        }
    }
}
220
/// Image preprocessing options for OCR
///
/// Each flag toggles an optional cleanup step applied before recognition;
/// whether a step is actually honoured depends on the OCR provider.
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Whether to apply image denoising
    pub denoise: bool,

    /// Whether to apply image deskewing
    pub deskew: bool,

    /// Whether to enhance contrast
    pub enhance_contrast: bool,

    /// Whether to apply image sharpening
    pub sharpen: bool,

    /// Scale factor for image resizing (1.0 = no scaling)
    pub scale_factor: f64,
}

impl Default for ImagePreprocessing {
    /// Denoise, deskew, and contrast enhancement on; sharpening off;
    /// no scaling.
    fn default() -> Self {
        Self {
            denoise: true,
            deskew: true,
            enhance_contrast: true,
            sharpen: false,
            scale_factor: 1.0,
        }
    }
}
251
/// Word-level confidence information for detailed OCR analysis
///
/// Geometry is relative to the containing `OcrTextFragment`: `x_offset`
/// is measured from the fragment start and `width` is in points.
#[derive(Debug, Clone)]
pub struct WordConfidence {
    /// The word text
    pub word: String,

    /// Confidence score for this specific word (0.0 to 1.0)
    pub confidence: f64,

    /// X position of the word within the fragment (relative to fragment start)
    pub x_offset: f64,

    /// Width of the word in points
    pub width: f64,

    /// Optional character-level confidences (for ultimate granularity)
    pub character_confidences: Option<Vec<CharacterConfidence>>,
}
270
271impl WordConfidence {
272    /// Create a new word confidence
273    pub fn new(word: String, confidence: f64, x_offset: f64, width: f64) -> Self {
274        Self {
275            word,
276            confidence,
277            x_offset,
278            width,
279            character_confidences: None,
280        }
281    }
282
283    /// Create a word confidence with character-level details
284    pub fn with_characters(
285        word: String,
286        confidence: f64,
287        x_offset: f64,
288        width: f64,
289        character_confidences: Vec<CharacterConfidence>,
290    ) -> Self {
291        Self {
292            word,
293            confidence,
294            x_offset,
295            width,
296            character_confidences: Some(character_confidences),
297        }
298    }
299
300    /// Get the average character confidence if available
301    pub fn average_character_confidence(&self) -> Option<f64> {
302        self.character_confidences.as_ref().map(|chars| {
303            let sum: f64 = chars.iter().map(|c| c.confidence).sum();
304            sum / chars.len() as f64
305        })
306    }
307
308    /// Check if this word has low confidence (below threshold)
309    pub fn is_low_confidence(&self, threshold: f64) -> bool {
310        self.confidence < threshold
311    }
312}
313
/// Character-level confidence information for ultimate OCR granularity
///
/// Positions are relative to the start of the containing word; widths are
/// in points and confidence is normalized to the 0.0–1.0 range.
#[derive(Debug, Clone)]
pub struct CharacterConfidence {
    /// The character
    pub character: char,

    /// Confidence score for this character (0.0 to 1.0)
    pub confidence: f64,

    /// X position relative to word start
    pub x_offset: f64,

    /// Character width in points
    pub width: f64,
}

impl CharacterConfidence {
    /// Build a confidence record for a single recognized character
    pub fn new(character: char, confidence: f64, x_offset: f64, width: f64) -> Self {
        CharacterConfidence {
            character,
            confidence,
            x_offset,
            width,
        }
    }
}
341
/// Candidate for OCR post-processing correction
///
/// Produced by `OcrTextFragment::get_correction_candidates`; the
/// `suggested_corrections` list starts empty and is populated by
/// `OcrPostProcessor::process_fragment`.
#[derive(Debug, Clone)]
pub struct CorrectionCandidate {
    /// The original word with low confidence or errors
    pub word: String,

    /// Original confidence score
    pub confidence: f64,

    /// Position within the text fragment
    pub position_in_fragment: usize,

    /// Suggested corrections ranked by likelihood
    pub suggested_corrections: Vec<CorrectionSuggestion>,

    /// Reason why this word needs correction
    pub correction_reason: CorrectionReason,
}
360
/// A suggested correction for an OCR error
///
/// Emitted by `OcrPostProcessor::generate_suggestions`, which ranks
/// suggestions by `correction_confidence` (highest first).
#[derive(Debug, Clone)]
pub struct CorrectionSuggestion {
    /// The corrected word
    pub corrected_word: String,

    /// Confidence in this correction (0.0 to 1.0)
    pub correction_confidence: f64,

    /// Type of correction applied
    pub correction_type: CorrectionType,

    /// Explanation of why this correction was suggested
    pub explanation: Option<String>,
}
376
/// Reasons why a word might need correction
///
/// Attached to a `CorrectionCandidate` to record which heuristic flagged
/// the word.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorrectionReason {
    /// Word has low OCR confidence
    LowConfidence,

    /// Word contains common OCR confusion patterns
    ConfusionPattern,

    /// Word not found in dictionary
    NotInDictionary,

    /// Word doesn't fit context
    ContextualError,

    /// Word has suspicious character combinations
    SuspiciousPattern,
}
395
/// Types of corrections that can be applied
///
/// Recorded on each `CorrectionSuggestion` so callers can weigh or filter
/// suggestions by the strategy that produced them.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CorrectionType {
    /// Character substitution (e.g., "0" -> "O")
    CharacterSubstitution,

    /// Dictionary lookup and replacement
    DictionaryCorrection,

    /// Contextual correction based on surrounding words
    ContextualCorrection,

    /// Pattern-based correction (e.g., "rn" -> "m")
    PatternCorrection,

    /// Manual review suggested
    ManualReview,
}
414
/// OCR post-processor for automatic text correction
///
/// Combines three correction strategies: per-character confusion
/// substitution, multi-character pattern replacement, and (when a
/// dictionary is supplied) edit-distance word lookup.
#[derive(Debug, Clone)]
pub struct OcrPostProcessor {
    /// Common OCR character confusions
    pub character_corrections: std::collections::HashMap<char, Vec<char>>,

    /// Dictionary of valid words (optional)
    pub dictionary: Option<std::collections::HashSet<String>>,

    /// Common pattern corrections
    pub pattern_corrections: std::collections::HashMap<String, String>,

    /// Confidence threshold for correction
    pub correction_threshold: f64,

    /// Maximum edit distance for corrections
    pub max_edit_distance: usize,
}
433
434impl OcrPostProcessor {
435    /// Create a new post-processor with common OCR corrections
436    pub fn new() -> Self {
437        let mut character_corrections = std::collections::HashMap::new();
438
439        // Common OCR character confusions
440        character_corrections.insert('0', vec!['O', 'o', 'Q']);
441        character_corrections.insert('O', vec!['0', 'Q', 'o']);
442        character_corrections.insert('1', vec!['l', 'I', '|']);
443        character_corrections.insert('l', vec!['1', 'I', '|']);
444        character_corrections.insert('I', vec!['1', 'l', '|']);
445        character_corrections.insert('S', vec!['5', '$']);
446        character_corrections.insert('5', vec!['S', '$']);
447        character_corrections.insert('2', vec!['Z', 'z']);
448        character_corrections.insert('Z', vec!['2', 'z']);
449
450        let mut pattern_corrections = std::collections::HashMap::new();
451        pattern_corrections.insert("rn".to_string(), "m".to_string());
452        pattern_corrections.insert("cl".to_string(), "d".to_string());
453        pattern_corrections.insert("fi".to_string(), "fi".to_string()); // ligature
454        pattern_corrections.insert("fl".to_string(), "fl".to_string()); // ligature
455
456        Self {
457            character_corrections,
458            dictionary: None,
459            pattern_corrections,
460            correction_threshold: 0.7,
461            max_edit_distance: 2,
462        }
463    }
464
465    /// Add a dictionary for word validation
466    pub fn with_dictionary(mut self, dictionary: std::collections::HashSet<String>) -> Self {
467        self.dictionary = Some(dictionary);
468        self
469    }
470
471    /// Process a fragment and suggest corrections
472    pub fn process_fragment(&self, fragment: &OcrTextFragment) -> Vec<CorrectionCandidate> {
473        let mut candidates = fragment.get_correction_candidates(self.correction_threshold);
474
475        // Enhance candidates with suggestions
476        for candidate in &mut candidates {
477            candidate.suggested_corrections = self.generate_suggestions(&candidate.word);
478        }
479
480        candidates
481    }
482
483    /// Generate correction suggestions for a word
484    pub fn generate_suggestions(&self, word: &str) -> Vec<CorrectionSuggestion> {
485        let mut suggestions = Vec::new();
486
487        // Character substitution corrections
488        suggestions.extend(self.character_substitution_corrections(word));
489
490        // Pattern-based corrections
491        suggestions.extend(self.pattern_corrections(word));
492
493        // Dictionary corrections (if available)
494        if let Some(dict) = &self.dictionary {
495            suggestions.extend(self.dictionary_corrections(word, dict));
496        }
497
498        // Sort by confidence and limit results
499        suggestions.sort_by(|a, b| {
500            b.correction_confidence
501                .partial_cmp(&a.correction_confidence)
502                .unwrap_or(std::cmp::Ordering::Equal)
503        });
504        suggestions.truncate(5); // Limit to top 5 suggestions
505
506        suggestions
507    }
508
509    /// Generate character substitution corrections
510    fn character_substitution_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
511        let mut suggestions = Vec::new();
512        let chars: Vec<char> = word.chars().collect();
513
514        for (i, &ch) in chars.iter().enumerate() {
515            if let Some(alternatives) = self.character_corrections.get(&ch) {
516                for &alt_ch in alternatives {
517                    let mut corrected_chars = chars.clone();
518                    corrected_chars[i] = alt_ch;
519                    let corrected_word: String = corrected_chars.into_iter().collect();
520
521                    suggestions.push(CorrectionSuggestion {
522                        corrected_word,
523                        correction_confidence: 0.8,
524                        correction_type: CorrectionType::CharacterSubstitution,
525                        explanation: Some(format!("'{}' -> '{}' substitution", ch, alt_ch)),
526                    });
527                }
528            }
529        }
530
531        suggestions
532    }
533
534    /// Generate pattern-based corrections
535    fn pattern_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
536        let mut suggestions = Vec::new();
537
538        for (pattern, replacement) in &self.pattern_corrections {
539            if word.contains(pattern) {
540                let corrected_word = word.replace(pattern, replacement);
541                suggestions.push(CorrectionSuggestion {
542                    corrected_word,
543                    correction_confidence: 0.85,
544                    correction_type: CorrectionType::PatternCorrection,
545                    explanation: Some(format!(
546                        "Pattern '{}' -> '{}' correction",
547                        pattern, replacement
548                    )),
549                });
550            }
551        }
552
553        suggestions
554    }
555
556    /// Generate dictionary-based corrections
557    fn dictionary_corrections(
558        &self,
559        word: &str,
560        dictionary: &std::collections::HashSet<String>,
561    ) -> Vec<CorrectionSuggestion> {
562        let mut suggestions = Vec::new();
563
564        // Check if word is already valid
565        if dictionary.contains(word) {
566            return suggestions;
567        }
568
569        // Find similar words using simple edit distance
570        for dict_word in dictionary {
571            if self.edit_distance(word, dict_word) <= self.max_edit_distance {
572                let confidence = 1.0
573                    - (self.edit_distance(word, dict_word) as f64
574                        / word.len().max(dict_word.len()) as f64);
575                suggestions.push(CorrectionSuggestion {
576                    corrected_word: dict_word.clone(),
577                    correction_confidence: confidence * 0.9, // Slightly lower than pattern corrections
578                    correction_type: CorrectionType::DictionaryCorrection,
579                    explanation: Some(format!(
580                        "Dictionary match with edit distance {}",
581                        self.edit_distance(word, dict_word)
582                    )),
583                });
584            }
585        }
586
587        suggestions
588    }
589
590    /// Calculate simple edit distance (Levenshtein distance)
591    fn edit_distance(&self, s1: &str, s2: &str) -> usize {
592        let len1 = s1.len();
593        let len2 = s2.len();
594
595        let mut dp = vec![vec![0; len2 + 1]; len1 + 1];
596
597        #[allow(clippy::needless_range_loop)]
598        for i in 0..=len1 {
599            dp[i][0] = i;
600        }
601        for j in 0..=len2 {
602            dp[0][j] = j;
603        }
604
605        let s1_chars: Vec<char> = s1.chars().collect();
606        let s2_chars: Vec<char> = s2.chars().collect();
607
608        for i in 1..=len1 {
609            for j in 1..=len2 {
610                if s1_chars[i - 1] == s2_chars[j - 1] {
611                    dp[i][j] = dp[i - 1][j - 1];
612                } else {
613                    dp[i][j] = 1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1]);
614                }
615            }
616        }
617
618        dp[len1][len2]
619    }
620}
621
impl Default for OcrPostProcessor {
    /// Equivalent to [`OcrPostProcessor::new`]
    fn default() -> Self {
        Self::new()
    }
}
627
/// Text fragment extracted by OCR with position and confidence information
///
/// Positions and sizes are in PDF page coordinates (points). A fragment
/// may additionally carry per-word confidence data for advanced engines.
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The extracted text content
    pub text: String,

    /// X position in page coordinates (points)
    pub x: f64,

    /// Y position in page coordinates (points)
    pub y: f64,

    /// Width of the text fragment (points)
    pub width: f64,

    /// Height of the text fragment (points)
    pub height: f64,

    /// Confidence score for this fragment (0.0 to 1.0)
    pub confidence: f64,

    /// Word-level confidence scores (optional, for advanced OCR engines)
    pub word_confidences: Option<Vec<WordConfidence>>,

    /// Font size estimation (points)
    pub font_size: f64,

    /// Whether this fragment is part of a word or line
    pub fragment_type: FragmentType,
}
658
659impl OcrTextFragment {
660    /// Create a new OCR text fragment
661    #[allow(clippy::too_many_arguments)]
662    pub fn new(
663        text: String,
664        x: f64,
665        y: f64,
666        width: f64,
667        height: f64,
668        confidence: f64,
669        font_size: f64,
670        fragment_type: FragmentType,
671    ) -> Self {
672        Self {
673            text,
674            x,
675            y,
676            width,
677            height,
678            confidence,
679            word_confidences: None,
680            font_size,
681            fragment_type,
682        }
683    }
684
685    /// Create a fragment with word-level confidence scores
686    #[allow(clippy::too_many_arguments)]
687    pub fn with_word_confidences(
688        text: String,
689        x: f64,
690        y: f64,
691        width: f64,
692        height: f64,
693        confidence: f64,
694        font_size: f64,
695        fragment_type: FragmentType,
696        word_confidences: Vec<WordConfidence>,
697    ) -> Self {
698        Self {
699            text,
700            x,
701            y,
702            width,
703            height,
704            confidence,
705            word_confidences: Some(word_confidences),
706            font_size,
707            fragment_type,
708        }
709    }
710
711    /// Get words with confidence below the threshold
712    pub fn get_low_confidence_words(&self, threshold: f64) -> Vec<&WordConfidence> {
713        self.word_confidences
714            .as_ref()
715            .map(|words| words.iter().filter(|w| w.confidence < threshold).collect())
716            .unwrap_or_default()
717    }
718
719    /// Get the average word confidence if available
720    pub fn average_word_confidence(&self) -> Option<f64> {
721        self.word_confidences.as_ref().map(|words| {
722            if words.is_empty() {
723                return 0.0;
724            }
725            let sum: f64 = words.iter().map(|w| w.confidence).sum();
726            sum / words.len() as f64
727        })
728    }
729
730    /// Get words sorted by confidence (lowest first)
731    pub fn words_by_confidence(&self) -> Vec<&WordConfidence> {
732        self.word_confidences
733            .as_ref()
734            .map(|words| {
735                let mut sorted_words: Vec<_> = words.iter().collect();
736                sorted_words.sort_by(|a, b| {
737                    a.confidence
738                        .partial_cmp(&b.confidence)
739                        .unwrap_or(std::cmp::Ordering::Equal)
740                });
741                sorted_words
742            })
743            .unwrap_or_default()
744    }
745
746    /// Check if this fragment has any low-confidence words
747    pub fn has_low_confidence_words(&self, threshold: f64) -> bool {
748        self.word_confidences
749            .as_ref()
750            .map(|words| words.iter().any(|w| w.confidence < threshold))
751            .unwrap_or(false)
752    }
753
754    /// Get words that are candidates for correction (low confidence + patterns)
755    pub fn get_correction_candidates(&self, threshold: f64) -> Vec<CorrectionCandidate> {
756        self.word_confidences
757            .as_ref()
758            .map(|words| {
759                words
760                    .iter()
761                    .enumerate()
762                    .filter(|(_, w)| w.confidence < threshold)
763                    .map(|(index, word)| CorrectionCandidate {
764                        word: word.word.clone(),
765                        confidence: word.confidence,
766                        position_in_fragment: index,
767                        suggested_corrections: vec![], // Will be filled by post-processor
768                        correction_reason: CorrectionReason::LowConfidence,
769                    })
770                    .collect()
771            })
772            .unwrap_or_default()
773    }
774
775    /// Generate a confidence report for this fragment
776    pub fn confidence_report(&self) -> String {
777        let mut report = format!(
778            "Fragment confidence: {:.1}% - \"{}\"\n",
779            self.confidence * 100.0,
780            self.text.trim()
781        );
782
783        if let Some(words) = &self.word_confidences {
784            report.push_str(&format!(
785                "  Word-level breakdown ({} words):\n",
786                words.len()
787            ));
788            for (i, word) in words.iter().enumerate() {
789                report.push_str(&format!(
790                    "    {}: \"{}\" - {:.1}%\n",
791                    i + 1,
792                    word.word,
793                    word.confidence * 100.0
794                ));
795
796                if let Some(chars) = &word.character_confidences {
797                    report.push_str("      Characters: ");
798                    for ch in chars {
799                        report.push_str(&format!(
800                            "'{}'({:.0}%) ",
801                            ch.character,
802                            ch.confidence * 100.0
803                        ));
804                    }
805                    report.push('\n');
806                }
807            }
808        } else {
809            report.push_str("  (No word-level data available)\n");
810        }
811
812        report
813    }
814}
815
/// Type of text fragment
///
/// Granularity of an `OcrTextFragment`, from single characters up to
/// whole paragraphs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// Individual character
    Character,
    /// Complete word
    Word,
    /// Text line
    Line,
    /// Paragraph
    Paragraph,
}
828
/// Complete result of OCR processing
///
/// Aggregates the full extracted text, positioned fragments, timing, and
/// provenance (engine and language) for one processed image.
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// The complete extracted text
    pub text: String,

    /// Overall confidence score (0.0 to 1.0)
    pub confidence: f64,

    /// Individual text fragments with position information
    pub fragments: Vec<OcrTextFragment>,

    /// Processing time in milliseconds
    pub processing_time_ms: u64,

    /// OCR engine used for processing
    pub engine_name: String,

    /// Language detected/used
    pub language: String,

    /// Region that was processed (None if entire image was processed)
    pub processed_region: Option<OcrRegion>,

    /// Image dimensions that were processed
    pub image_dimensions: (u32, u32),
}
856
857impl OcrProcessingResult {
858    /// Create a new OCR processing result
859    pub fn new(
860        text: String,
861        confidence: f64,
862        fragments: Vec<OcrTextFragment>,
863        processing_time_ms: u64,
864        engine_name: String,
865        language: String,
866        image_dimensions: (u32, u32),
867    ) -> Self {
868        Self {
869            text,
870            confidence,
871            fragments,
872            processing_time_ms,
873            engine_name,
874            language,
875            processed_region: None,
876            image_dimensions,
877        }
878    }
879
880    /// Create a new OCR processing result for a specific region
881    #[allow(clippy::too_many_arguments)]
882    pub fn with_region(
883        text: String,
884        confidence: f64,
885        fragments: Vec<OcrTextFragment>,
886        processing_time_ms: u64,
887        engine_name: String,
888        language: String,
889        image_dimensions: (u32, u32),
890        region: OcrRegion,
891    ) -> Self {
892        Self {
893            text,
894            confidence,
895            fragments,
896            processing_time_ms,
897            engine_name,
898            language,
899            processed_region: Some(region),
900            image_dimensions,
901        }
902    }
903
904    /// Filter fragments by minimum confidence
905    pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
906        self.fragments
907            .iter()
908            .filter(|fragment| fragment.confidence >= min_confidence)
909            .collect()
910    }
911
912    /// Get text fragments within a specific region
913    pub fn fragments_in_region(
914        &self,
915        x: f64,
916        y: f64,
917        width: f64,
918        height: f64,
919    ) -> Vec<&OcrTextFragment> {
920        self.fragments
921            .iter()
922            .filter(|fragment| {
923                fragment.x >= x
924                    && fragment.y >= y
925                    && fragment.x + fragment.width <= x + width
926                    && fragment.y + fragment.height <= y + height
927            })
928            .collect()
929    }
930
931    /// Get fragments of a specific type
932    pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
933        self.fragments
934            .iter()
935            .filter(|fragment| fragment.fragment_type == fragment_type)
936            .collect()
937    }
938
939    /// Calculate average confidence for all fragments
940    pub fn average_confidence(&self) -> f64 {
941        if self.fragments.is_empty() {
942            return 0.0;
943        }
944
945        let sum: f64 = self.fragments.iter().map(|f| f.confidence).sum();
946        sum / self.fragments.len() as f64
947    }
948}
949
/// Supported OCR engines
///
/// Identifies a provider; see `OcrEngine::name` for display names and
/// `OcrEngine::supports_format` for per-engine image format support.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// Mock OCR provider for testing
    Mock,
    /// Tesseract OCR (local processing)
    Tesseract,
    /// Azure Computer Vision OCR
    Azure,
    /// AWS Textract
    Aws,
    /// Google Cloud Vision OCR
    GoogleCloud,
}
964
965impl OcrEngine {
966    /// Get the name of the OCR engine
967    pub fn name(&self) -> &'static str {
968        match self {
969            OcrEngine::Mock => "Mock OCR",
970            OcrEngine::Tesseract => "Tesseract",
971            OcrEngine::Azure => "Azure Computer Vision",
972            OcrEngine::Aws => "AWS Textract",
973            OcrEngine::GoogleCloud => "Google Cloud Vision",
974        }
975    }
976
977    /// Check if this engine supports the given image format
978    pub fn supports_format(&self, format: ImageFormat) -> bool {
979        match self {
980            OcrEngine::Mock => true, // Mock supports all formats
981            OcrEngine::Tesseract => matches!(
982                format,
983                ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
984            ),
985            OcrEngine::Azure => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
986            OcrEngine::Aws => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
987            OcrEngine::GoogleCloud => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
988        }
989    }
990}
991
992impl fmt::Display for OcrEngine {
993    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
994        write!(f, "{}", self.name())
995    }
996}
997
998/// Trait for OCR providers
999///
1000/// This trait defines the interface that all OCR providers must implement.
1001/// It provides methods for processing images and extracting text with position information.
1002///
1003/// # Implementation Notes
1004///
1005/// - Implementations should handle errors gracefully and return meaningful error messages
1006/// - The `process_image` method is the core functionality that all providers must implement
1007/// - The `process_page` method is a convenience method for working with page analysis results
1008/// - Providers should validate image formats and reject unsupported formats
1009///
1010/// # Examples
1011///
1012/// ```rust
1013/// use oxidize_pdf::text::{OcrProvider, OcrOptions, OcrProcessingResult, OcrError, OcrEngine};
1014/// use oxidize_pdf::graphics::ImageFormat;
1015///
1016/// struct MyOcrProvider;
1017///
1018/// impl OcrProvider for MyOcrProvider {
1019///     fn process_image(&self, image_data: &[u8], options: &OcrOptions) -> Result<OcrProcessingResult, OcrError> {
1020///         // Implementation here
1021///         # Ok(OcrProcessingResult {
1022///         #     text: "Sample text".to_string(),
1023///         #     confidence: 0.95,
1024///         #     fragments: vec![],
1025///         #     processing_time_ms: 100,
1026///         #     engine_name: "MyOCR".to_string(),
1027///         #     language: "en".to_string(),
1028///         #     image_dimensions: (800, 600),
1029///         #     processed_region: None,
1030///         # })
1031///     }
1032///
1033///     fn supported_formats(&self) -> Vec<ImageFormat> {
1034///         vec![ImageFormat::Jpeg, ImageFormat::Png]
1035///     }
1036///
1037///     fn engine_name(&self) -> &str {
1038///         "MyOCR"
1039///     }
1040///
1041///     fn engine_type(&self) -> OcrEngine {
1042///         OcrEngine::Mock
1043///     }
1044/// }
1045/// ```
1046pub trait OcrProvider: Send + Sync {
1047    /// Process an image and extract text using OCR
1048    ///
1049    /// This is the core method that all OCR providers must implement.
1050    /// It takes image data as bytes and returns structured text results.
1051    ///
1052    /// # Arguments
1053    ///
1054    /// * `image_data` - Raw image bytes (JPEG, PNG, or TIFF)
1055    /// * `options` - OCR processing options and configuration
1056    ///
1057    /// # Returns
1058    ///
1059    /// A `Result` containing the OCR results with text, confidence, and positioning information.
1060    ///
1061    /// # Errors
1062    ///
1063    /// Returns an error if:
1064    /// - The image format is not supported
1065    /// - The image data is corrupted or invalid
1066    /// - OCR processing fails
1067    /// - Network errors occur (for cloud providers)
1068    /// - Authentication fails (for cloud providers)
1069    fn process_image(
1070        &self,
1071        image_data: &[u8],
1072        options: &OcrOptions,
1073    ) -> OcrResult<OcrProcessingResult>;
1074
1075    /// Process a scanned page using content analysis information
1076    ///
1077    /// This method provides a higher-level interface that works with page analysis results.
1078    /// It's particularly useful when integrating with the page analysis module.
1079    ///
1080    /// # Arguments
1081    ///
1082    /// * `page_analysis` - Results from page content analysis
1083    /// * `page_data` - Raw page data or image data
1084    /// * `options` - OCR processing options
1085    ///
1086    /// # Returns
1087    ///
1088    /// OCR results optimized for the specific page content type.
1089    ///
1090    /// # Default Implementation
1091    ///
1092    /// The default implementation simply calls `process_image` with the page data.
1093    /// Providers can override this to provide specialized handling based on page analysis.
1094    fn process_page(
1095        &self,
1096        _page_analysis: &ContentAnalysis,
1097        page_data: &[u8],
1098        options: &OcrOptions,
1099    ) -> OcrResult<OcrProcessingResult> {
1100        self.process_image(page_data, options)
1101    }
1102
1103    /// Process multiple images with region information
1104    ///
1105    /// This method allows for selective OCR processing where each image corresponds
1106    /// to a specific region. This is useful for:
1107    /// - Processing pre-cropped regions of a document  
1108    /// - Batch processing of multiple regions with different OCR settings
1109    /// - Optimizing performance by avoiding full-image processing
1110    ///
1111    /// # Arguments
1112    ///
1113    /// * `image_region_pairs` - Vector of (image_data, region) pairs
1114    /// * `options` - OCR processing options (applies to all regions)
1115    ///
1116    /// # Returns
1117    ///
1118    /// A vector of `OcrProcessingResult`, one for each processed region.
1119    /// The order matches the input pairs vector.
1120    ///
1121    /// # Default Implementation
1122    ///
1123    /// The default implementation processes each image separately and sets
1124    /// the region information in the result.
1125    fn process_image_regions(
1126        &self,
1127        image_region_pairs: &[(&[u8], &OcrRegion)],
1128        options: &OcrOptions,
1129    ) -> OcrResult<Vec<OcrProcessingResult>> {
1130        let mut results = Vec::with_capacity(image_region_pairs.len());
1131
1132        for (image_data, region) in image_region_pairs {
1133            let mut result = self.process_image(image_data, options)?;
1134
1135            // Adjust fragment coordinates to match original image coordinates
1136            // (assuming the input image_data is already cropped to the region)
1137            for fragment in &mut result.fragments {
1138                fragment.x += region.x as f64;
1139                fragment.y += region.y as f64;
1140            }
1141
1142            result.processed_region = Some((*region).clone());
1143            results.push(result);
1144        }
1145
1146        Ok(results)
1147    }
1148
1149    /// Get the list of supported image formats
1150    ///
1151    /// # Returns
1152    ///
1153    /// A vector of `ImageFormat` values that this provider can process.
1154    fn supported_formats(&self) -> Vec<ImageFormat>;
1155
1156    /// Get the name of this OCR provider
1157    ///
1158    /// # Returns
1159    ///
1160    /// A string identifying this provider (e.g., "Tesseract", "Azure OCR").
1161    fn engine_name(&self) -> &str;
1162
1163    /// Get the engine type for this provider
1164    ///
1165    /// # Returns
1166    ///
1167    /// The `OcrEngine` enum value corresponding to this provider.
1168    fn engine_type(&self) -> OcrEngine;
1169
1170    /// Check if this provider supports the given image format
1171    ///
1172    /// # Arguments
1173    ///
1174    /// * `format` - The image format to check
1175    ///
1176    /// # Returns
1177    ///
1178    /// `true` if the format is supported, `false` otherwise.
1179    fn supports_format(&self, format: ImageFormat) -> bool {
1180        self.supported_formats().contains(&format)
1181    }
1182
1183    /// Validate image data before processing
1184    ///
1185    /// This method can be used to perform basic validation of image data
1186    /// before attempting OCR processing.
1187    ///
1188    /// # Arguments
1189    ///
1190    /// * `image_data` - Raw image bytes to validate
1191    ///
1192    /// # Returns
1193    ///
1194    /// `Ok(())` if the image data is valid, `Err(OcrError)` otherwise.
1195    ///
1196    /// # Default Implementation
1197    ///
1198    /// The default implementation performs basic format detection based on magic bytes.
1199    fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
1200        if image_data.len() < 8 {
1201            return Err(OcrError::InvalidImageData(
1202                "Image data too short".to_string(),
1203            ));
1204        }
1205
1206        // Check for common image format signatures
1207        let format = if image_data.starts_with(b"\xFF\xD8\xFF") {
1208            ImageFormat::Jpeg
1209        } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
1210            ImageFormat::Png
1211        } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
1212            ImageFormat::Tiff
1213        } else {
1214            return Err(OcrError::InvalidImageData(
1215                "Unrecognized image format".to_string(),
1216            ));
1217        };
1218
1219        if !self.supports_format(format) {
1220            return Err(OcrError::UnsupportedImageFormat(format));
1221        }
1222
1223        Ok(())
1224    }
1225}
1226
/// Mock OCR provider for testing and development
///
/// This provider simulates OCR processing without actually performing text recognition.
/// It's useful for testing OCR workflows and developing OCR-dependent functionality.
///
/// # Examples
///
/// ```rust
/// use oxidize_pdf::text::{MockOcrProvider, OcrOptions, OcrProvider};
///
/// let provider = MockOcrProvider::new();
/// let options = OcrOptions::default();
/// let image_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46]; // Mock JPEG data
///
/// let result = provider.process_image(&image_data, &options).unwrap();
/// assert!(result.text.contains("Mock OCR"));
/// ```
#[derive(Clone)]
pub struct MockOcrProvider {
    /// Mock confidence level to return (expected range 0.0..=1.0)
    confidence: f64,
    /// Mock text returned as the "recognized" content
    mock_text: String,
    /// Simulated processing delay (milliseconds) applied on every call
    processing_delay_ms: u64,
}
1253
1254impl MockOcrProvider {
1255    /// Create a new mock OCR provider with default settings
1256    pub fn new() -> Self {
1257        Self {
1258            confidence: 0.85,
1259            mock_text: "Mock OCR extracted text from scanned image".to_string(),
1260            processing_delay_ms: 100,
1261        }
1262    }
1263
1264    /// Create a mock provider with custom text and confidence
1265    pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
1266        Self {
1267            confidence,
1268            mock_text: text,
1269            processing_delay_ms: 100,
1270        }
1271    }
1272
1273    /// Set the mock text to return
1274    pub fn set_mock_text(&mut self, text: String) {
1275        self.mock_text = text;
1276    }
1277
1278    /// Set the confidence level to return
1279    pub fn set_confidence(&mut self, confidence: f64) {
1280        self.confidence = confidence.clamp(0.0, 1.0);
1281    }
1282
1283    /// Set the simulated processing delay
1284    pub fn set_processing_delay(&mut self, delay_ms: u64) {
1285        self.processing_delay_ms = delay_ms;
1286    }
1287}
1288
1289impl Default for MockOcrProvider {
1290    fn default() -> Self {
1291        Self::new()
1292    }
1293}
1294
1295impl OcrProvider for MockOcrProvider {
1296    fn process_image(
1297        &self,
1298        image_data: &[u8],
1299        options: &OcrOptions,
1300    ) -> OcrResult<OcrProcessingResult> {
1301        // Validate image data
1302        self.validate_image_data(image_data)?;
1303
1304        // Simulate processing time
1305        std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));
1306
1307        // Create mock text fragments
1308        let fragments = vec![
1309            OcrTextFragment {
1310                text: self.mock_text.clone(),
1311                x: 50.0,
1312                y: 700.0,
1313                width: 200.0,
1314                height: 20.0,
1315                confidence: self.confidence,
1316                word_confidences: None,
1317                font_size: 12.0,
1318                fragment_type: FragmentType::Line,
1319            },
1320            OcrTextFragment {
1321                text: "Additional mock text".to_string(),
1322                x: 50.0,
1323                y: 680.0,
1324                width: 150.0,
1325                height: 20.0,
1326                confidence: self.confidence * 0.9,
1327                word_confidences: None,
1328                font_size: 12.0,
1329                fragment_type: FragmentType::Line,
1330            },
1331        ];
1332
1333        Ok(OcrProcessingResult {
1334            text: format!("{}\nAdditional mock text", self.mock_text),
1335            confidence: self.confidence,
1336            fragments,
1337            processing_time_ms: self.processing_delay_ms,
1338            engine_name: "Mock OCR".to_string(),
1339            language: options.language.clone(),
1340            processed_region: None,
1341            image_dimensions: (800, 600), // Mock dimensions
1342        })
1343    }
1344
1345    fn supported_formats(&self) -> Vec<ImageFormat> {
1346        vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
1347    }
1348
1349    fn engine_name(&self) -> &str {
1350        "Mock OCR"
1351    }
1352
1353    fn engine_type(&self) -> OcrEngine {
1354        OcrEngine::Mock
1355    }
1356}
1357
1358#[cfg(test)]
1359mod tests {
1360    use super::*;
1361
1362    #[test]
1363    fn test_ocr_options_default() {
1364        let options = OcrOptions::default();
1365        assert_eq!(options.language, "en");
1366        assert_eq!(options.min_confidence, 0.6);
1367        assert!(options.preserve_layout);
1368        assert_eq!(options.timeout_seconds, 60); // Updated to match actual default
1369    }
1370
1371    #[test]
1372    fn test_image_preprocessing_default() {
1373        let preprocessing = ImagePreprocessing::default();
1374        assert!(preprocessing.denoise);
1375        assert!(preprocessing.deskew);
1376        assert!(preprocessing.enhance_contrast);
1377        assert!(!preprocessing.sharpen);
1378        assert_eq!(preprocessing.scale_factor, 1.0);
1379    }
1380
1381    #[test]
1382    fn test_ocr_engine_name() {
1383        assert_eq!(OcrEngine::Mock.name(), "Mock OCR");
1384        assert_eq!(OcrEngine::Tesseract.name(), "Tesseract");
1385        assert_eq!(OcrEngine::Azure.name(), "Azure Computer Vision");
1386    }
1387
1388    #[test]
1389    fn test_ocr_engine_supports_format() {
1390        assert!(OcrEngine::Mock.supports_format(ImageFormat::Jpeg));
1391        assert!(OcrEngine::Mock.supports_format(ImageFormat::Png));
1392        assert!(OcrEngine::Mock.supports_format(ImageFormat::Tiff));
1393
1394        assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Jpeg));
1395        assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Png));
1396        assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Tiff));
1397
1398        assert!(OcrEngine::Azure.supports_format(ImageFormat::Jpeg));
1399        assert!(OcrEngine::Azure.supports_format(ImageFormat::Png));
1400        assert!(!OcrEngine::Azure.supports_format(ImageFormat::Tiff));
1401    }
1402
1403    #[test]
1404    fn test_fragment_type_equality() {
1405        assert_eq!(FragmentType::Word, FragmentType::Word);
1406        assert_ne!(FragmentType::Word, FragmentType::Line);
1407        assert_ne!(FragmentType::Character, FragmentType::Paragraph);
1408    }
1409
1410    #[test]
1411    fn test_mock_ocr_provider_creation() {
1412        let provider = MockOcrProvider::new();
1413        assert_eq!(provider.confidence, 0.85);
1414        assert!(provider.mock_text.contains("Mock OCR"));
1415        assert_eq!(provider.processing_delay_ms, 100);
1416    }
1417
1418    #[test]
1419    fn test_mock_ocr_provider_with_custom_text() {
1420        let custom_text = "Custom mock text".to_string();
1421        let provider = MockOcrProvider::with_text_and_confidence(custom_text.clone(), 0.95);
1422        assert_eq!(provider.mock_text, custom_text);
1423        assert_eq!(provider.confidence, 0.95);
1424    }
1425
1426    #[test]
1427    fn test_mock_ocr_provider_process_image() {
1428        let provider = MockOcrProvider::new();
1429        let options = OcrOptions::default();
1430
1431        // Mock JPEG data
1432        let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
1433
1434        let result = provider.process_image(&jpeg_data, &options).unwrap();
1435        assert!(result.text.contains("Mock OCR"));
1436        assert_eq!(result.confidence, 0.85);
1437        assert!(!result.fragments.is_empty());
1438        assert_eq!(result.engine_name, "Mock OCR");
1439        assert_eq!(result.language, "en");
1440    }
1441
1442    #[test]
1443    fn test_mock_ocr_provider_supported_formats() {
1444        let provider = MockOcrProvider::new();
1445        let formats = provider.supported_formats();
1446        assert!(formats.contains(&ImageFormat::Jpeg));
1447        assert!(formats.contains(&ImageFormat::Png));
1448        assert!(formats.contains(&ImageFormat::Tiff));
1449    }
1450
1451    #[test]
1452    fn test_mock_ocr_provider_engine_info() {
1453        let provider = MockOcrProvider::new();
1454        assert_eq!(provider.engine_name(), "Mock OCR");
1455        assert_eq!(provider.engine_type(), OcrEngine::Mock);
1456    }
1457
1458    #[test]
1459    fn test_mock_ocr_provider_supports_format() {
1460        let provider = MockOcrProvider::new();
1461        assert!(provider.supports_format(ImageFormat::Jpeg));
1462        assert!(provider.supports_format(ImageFormat::Png));
1463        assert!(provider.supports_format(ImageFormat::Tiff));
1464    }
1465
1466    #[test]
1467    fn test_mock_ocr_provider_validate_image_data() {
1468        let provider = MockOcrProvider::new();
1469
1470        // Valid JPEG data
1471        let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
1472        assert!(provider.validate_image_data(&jpeg_data).is_ok());
1473
1474        // Invalid data (too short)
1475        let short_data = vec![0xFF, 0xD8];
1476        assert!(provider.validate_image_data(&short_data).is_err());
1477
1478        // Invalid format
1479        let invalid_data = vec![0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09];
1480        assert!(provider.validate_image_data(&invalid_data).is_err());
1481    }
1482
1483    #[test]
1484    fn test_ocr_processing_result_filter_by_confidence() {
1485        let result = OcrProcessingResult {
1486            text: "Test text".to_string(),
1487            confidence: 0.8,
1488            fragments: vec![
1489                OcrTextFragment {
1490                    text: "High confidence".to_string(),
1491                    x: 0.0,
1492                    y: 0.0,
1493                    width: 100.0,
1494                    height: 20.0,
1495                    confidence: 0.9,
1496                    word_confidences: None,
1497                    font_size: 12.0,
1498                    fragment_type: FragmentType::Word,
1499                },
1500                OcrTextFragment {
1501                    text: "Low confidence".to_string(),
1502                    x: 0.0,
1503                    y: 20.0,
1504                    width: 100.0,
1505                    height: 20.0,
1506                    confidence: 0.5,
1507                    word_confidences: None,
1508                    font_size: 12.0,
1509                    fragment_type: FragmentType::Word,
1510                },
1511            ],
1512            processing_time_ms: 100,
1513            engine_name: "Test".to_string(),
1514            language: "en".to_string(),
1515            processed_region: None,
1516            image_dimensions: (800, 600),
1517        };
1518
1519        let high_confidence = result.filter_by_confidence(0.8);
1520        assert_eq!(high_confidence.len(), 1);
1521        assert_eq!(high_confidence[0].text, "High confidence");
1522    }
1523
1524    #[test]
1525    fn test_ocr_processing_result_fragments_in_region() {
1526        let result = OcrProcessingResult {
1527            text: "Test text".to_string(),
1528            confidence: 0.8,
1529            fragments: vec![
1530                OcrTextFragment {
1531                    text: "Inside region".to_string(),
1532                    x: 10.0,
1533                    y: 10.0,
1534                    width: 80.0,
1535                    height: 20.0,
1536                    confidence: 0.9,
1537                    word_confidences: None,
1538                    font_size: 12.0,
1539                    fragment_type: FragmentType::Word,
1540                },
1541                OcrTextFragment {
1542                    text: "Outside region".to_string(),
1543                    x: 200.0,
1544                    y: 200.0,
1545                    width: 80.0,
1546                    height: 20.0,
1547                    confidence: 0.9,
1548                    word_confidences: None,
1549                    font_size: 12.0,
1550                    fragment_type: FragmentType::Word,
1551                },
1552            ],
1553            processing_time_ms: 100,
1554            engine_name: "Test".to_string(),
1555            language: "en".to_string(),
1556            processed_region: None,
1557            image_dimensions: (800, 600),
1558        };
1559
1560        let in_region = result.fragments_in_region(0.0, 0.0, 100.0, 100.0);
1561        assert_eq!(in_region.len(), 1);
1562        assert_eq!(in_region[0].text, "Inside region");
1563    }
1564
1565    #[test]
1566    fn test_ocr_processing_result_fragments_of_type() {
1567        let result = OcrProcessingResult {
1568            text: "Test text".to_string(),
1569            confidence: 0.8,
1570            fragments: vec![
1571                OcrTextFragment {
1572                    text: "Word fragment".to_string(),
1573                    x: 0.0,
1574                    y: 0.0,
1575                    width: 100.0,
1576                    height: 20.0,
1577                    confidence: 0.9,
1578                    word_confidences: None,
1579                    font_size: 12.0,
1580                    fragment_type: FragmentType::Word,
1581                },
1582                OcrTextFragment {
1583                    text: "Line fragment".to_string(),
1584                    x: 0.0,
1585                    y: 20.0,
1586                    width: 200.0,
1587                    height: 20.0,
1588                    confidence: 0.9,
1589                    word_confidences: None,
1590                    font_size: 12.0,
1591                    fragment_type: FragmentType::Line,
1592                },
1593            ],
1594            processing_time_ms: 100,
1595            engine_name: "Test".to_string(),
1596            language: "en".to_string(),
1597            processed_region: None,
1598            image_dimensions: (800, 600),
1599        };
1600
1601        let words = result.fragments_of_type(FragmentType::Word);
1602        assert_eq!(words.len(), 1);
1603        assert_eq!(words[0].text, "Word fragment");
1604
1605        let lines = result.fragments_of_type(FragmentType::Line);
1606        assert_eq!(lines.len(), 1);
1607        assert_eq!(lines[0].text, "Line fragment");
1608    }
1609
1610    #[test]
1611    fn test_ocr_processing_result_average_confidence() {
1612        let result = OcrProcessingResult {
1613            text: "Test text".to_string(),
1614            confidence: 0.8,
1615            fragments: vec![
1616                OcrTextFragment {
1617                    text: "Fragment 1".to_string(),
1618                    x: 0.0,
1619                    y: 0.0,
1620                    width: 100.0,
1621                    height: 20.0,
1622                    confidence: 0.8,
1623                    word_confidences: None,
1624                    font_size: 12.0,
1625                    fragment_type: FragmentType::Word,
1626                },
1627                OcrTextFragment {
1628                    text: "Fragment 2".to_string(),
1629                    x: 0.0,
1630                    y: 20.0,
1631                    width: 100.0,
1632                    height: 20.0,
1633                    confidence: 0.6,
1634                    word_confidences: None,
1635                    font_size: 12.0,
1636                    fragment_type: FragmentType::Word,
1637                },
1638            ],
1639            processing_time_ms: 100,
1640            engine_name: "Test".to_string(),
1641            language: "en".to_string(),
1642            processed_region: None,
1643            image_dimensions: (800, 600),
1644        };
1645
1646        let avg_confidence = result.average_confidence();
1647        assert_eq!(avg_confidence, 0.7);
1648    }
1649
1650    #[test]
1651    fn test_ocr_processing_result_average_confidence_empty() {
1652        let result = OcrProcessingResult {
1653            text: "Test text".to_string(),
1654            confidence: 0.8,
1655            fragments: vec![],
1656            processing_time_ms: 100,
1657            engine_name: "Test".to_string(),
1658            language: "en".to_string(),
1659            processed_region: None,
1660            image_dimensions: (800, 600),
1661        };
1662
1663        let avg_confidence = result.average_confidence();
1664        assert_eq!(avg_confidence, 0.0);
1665    }
1666
1667    // Comprehensive tests for OCR module
1668    mod comprehensive_tests {
1669        use super::*;
1670        use std::collections::HashMap;
1671
1672        // Error Type Tests
1673        #[test]
1674        fn test_ocr_error_display() {
1675            let errors = vec![
1676                OcrError::ProviderNotAvailable("Tesseract not installed".to_string()),
1677                OcrError::UnsupportedImageFormat(ImageFormat::Tiff),
1678                OcrError::InvalidImageData("Corrupted header".to_string()),
1679                OcrError::ProcessingFailed("OCR engine crashed".to_string()),
1680                OcrError::NetworkError("Connection timeout".to_string()),
1681                OcrError::AuthenticationError("Invalid API key".to_string()),
1682                OcrError::RateLimitExceeded("429 Too Many Requests".to_string()),
1683                OcrError::LowConfidence("Confidence below threshold".to_string()),
1684                OcrError::Configuration("Missing language pack".to_string()),
1685            ];
1686
1687            for error in errors {
1688                let display = format!("{error}");
1689                assert!(!display.is_empty());
1690
1691                // Verify error messages contain expected content
1692                match &error {
1693                    OcrError::ProviderNotAvailable(msg) => assert!(display.contains(msg)),
1694                    OcrError::UnsupportedImageFormat(_) => {
1695                        assert!(display.contains("Unsupported image format"))
1696                    }
1697                    OcrError::InvalidImageData(msg) => assert!(display.contains(msg)),
1698                    OcrError::ProcessingFailed(msg) => assert!(display.contains(msg)),
1699                    OcrError::NetworkError(msg) => assert!(display.contains(msg)),
1700                    OcrError::AuthenticationError(msg) => assert!(display.contains(msg)),
1701                    OcrError::RateLimitExceeded(msg) => assert!(display.contains(msg)),
1702                    OcrError::LowConfidence(msg) => assert!(display.contains(msg)),
1703                    OcrError::Configuration(msg) => assert!(display.contains(msg)),
1704                    _ => {}
1705                }
1706            }
1707        }
1708
1709        #[test]
1710        fn test_ocr_error_from_io_error() {
1711            use std::io::{Error as IoError, ErrorKind};
1712
1713            let io_error = IoError::new(ErrorKind::NotFound, "File not found");
1714            let ocr_error: OcrError = io_error.into();
1715
1716            match ocr_error {
1717                OcrError::Io(_) => {
1718                    let display = format!("{ocr_error}");
1719                    assert!(display.contains("IO error"));
1720                }
1721                _ => panic!("Expected OcrError::Io variant"),
1722            }
1723        }
1724
1725        #[test]
1726        fn test_ocr_error_debug_format() {
1727            let error = OcrError::ProcessingFailed("Test error".to_string());
1728            let debug_str = format!("{error:?}");
1729            assert!(debug_str.contains("ProcessingFailed"));
1730            assert!(debug_str.contains("Test error"));
1731        }
1732
1733        // OcrOptions Tests
1734        #[test]
1735        fn test_ocr_options_custom_language() {
1736            let mut options = OcrOptions::default();
1737            assert_eq!(options.language, "en");
1738
1739            options.language = "spa+eng".to_string();
1740            assert_eq!(options.language, "spa+eng");
1741
1742            options.language = "jpn".to_string();
1743            assert_eq!(options.language, "jpn");
1744        }
1745
1746        #[test]
1747        fn test_ocr_options_confidence_threshold() {
1748            let mut options = OcrOptions::default();
1749            assert_eq!(options.min_confidence, 0.6);
1750
1751            // Test various thresholds
1752            options.min_confidence = 0.0;
1753            assert_eq!(options.min_confidence, 0.0);
1754
1755            options.min_confidence = 1.0;
1756            assert_eq!(options.min_confidence, 1.0);
1757
1758            options.min_confidence = 0.85;
1759            assert_eq!(options.min_confidence, 0.85);
1760        }
1761
1762        #[test]
1763        fn test_ocr_options_engine_specific() {
1764            let mut options = OcrOptions::default();
1765            assert!(options.engine_options.is_empty());
1766
1767            // Add engine-specific options
1768            options.engine_options.insert(
1769                "tessedit_char_whitelist".to_string(),
1770                "0123456789".to_string(),
1771            );
1772            options
1773                .engine_options
1774                .insert("tessedit_ocr_engine_mode".to_string(), "3".to_string());
1775
1776            assert_eq!(options.engine_options.len(), 2);
1777            assert_eq!(
1778                options.engine_options.get("tessedit_char_whitelist"),
1779                Some(&"0123456789".to_string())
1780            );
1781        }
1782
1783        #[test]
1784        fn test_ocr_options_clone() {
1785            let mut options = OcrOptions {
1786                language: "fra".to_string(),
1787                min_confidence: 0.75,
1788                preserve_layout: false,
1789                preprocessing: ImagePreprocessing {
1790                    denoise: false,
1791                    deskew: true,
1792                    enhance_contrast: false,
1793                    sharpen: true,
1794                    scale_factor: 1.5,
1795                },
1796                engine_options: HashMap::new(),
1797                timeout_seconds: 60,
1798                regions: None,
1799                debug_output: false,
1800            };
1801
1802            options
1803                .engine_options
1804                .insert("key".to_string(), "value".to_string());
1805
1806            let cloned = options.clone();
1807            assert_eq!(cloned.language, options.language);
1808            assert_eq!(cloned.min_confidence, options.min_confidence);
1809            assert_eq!(cloned.preserve_layout, options.preserve_layout);
1810            assert_eq!(
1811                cloned.preprocessing.scale_factor,
1812                options.preprocessing.scale_factor
1813            );
1814            assert_eq!(cloned.engine_options.get("key"), Some(&"value".to_string()));
1815            assert_eq!(cloned.timeout_seconds, options.timeout_seconds);
1816        }
1817
1818        #[test]
1819        fn test_ocr_options_timeout_configuration() {
1820            let mut options = OcrOptions::default();
1821            assert_eq!(options.timeout_seconds, 60); // Updated to match actual default
1822
1823            options.timeout_seconds = 0; // No timeout
1824            assert_eq!(options.timeout_seconds, 0);
1825
1826            options.timeout_seconds = 300; // 5 minutes
1827            assert_eq!(options.timeout_seconds, 300);
1828        }
1829
1830        // ImagePreprocessing Tests
1831        #[test]
1832        fn test_image_preprocessing_combinations() {
1833            let test_cases = vec![
1834                (true, true, true, true),
1835                (false, false, false, false),
1836                (true, false, true, false),
1837                (false, true, false, true),
1838            ];
1839
1840            for (denoise, deskew, enhance, sharpen) in test_cases {
1841                let preprocessing = ImagePreprocessing {
1842                    denoise,
1843                    deskew,
1844                    enhance_contrast: enhance,
1845                    sharpen,
1846                    scale_factor: 1.0,
1847                };
1848
1849                assert_eq!(preprocessing.denoise, denoise);
1850                assert_eq!(preprocessing.deskew, deskew);
1851                assert_eq!(preprocessing.enhance_contrast, enhance);
1852                assert_eq!(preprocessing.sharpen, sharpen);
1853            }
1854        }
1855
1856        #[test]
1857        fn test_image_preprocessing_scale_factor() {
1858            let mut preprocessing = ImagePreprocessing::default();
1859            assert_eq!(preprocessing.scale_factor, 1.0);
1860
1861            // Test various scale factors
1862            preprocessing.scale_factor = 0.5;
1863            assert_eq!(preprocessing.scale_factor, 0.5);
1864
1865            preprocessing.scale_factor = 2.0;
1866            assert_eq!(preprocessing.scale_factor, 2.0);
1867
1868            preprocessing.scale_factor = 1.25;
1869            assert_eq!(preprocessing.scale_factor, 1.25);
1870        }
1871
1872        #[test]
1873        fn test_image_preprocessing_clone() {
1874            let preprocessing = ImagePreprocessing {
1875                denoise: false,
1876                deskew: true,
1877                enhance_contrast: false,
1878                sharpen: true,
1879                scale_factor: 1.5,
1880            };
1881
1882            let cloned = preprocessing.clone();
1883            assert_eq!(cloned.denoise, preprocessing.denoise);
1884            assert_eq!(cloned.deskew, preprocessing.deskew);
1885            assert_eq!(cloned.enhance_contrast, preprocessing.enhance_contrast);
1886            assert_eq!(cloned.sharpen, preprocessing.sharpen);
1887            assert_eq!(cloned.scale_factor, preprocessing.scale_factor);
1888        }
1889
1890        // OcrTextFragment Tests
1891        #[test]
1892        fn test_ocr_text_fragment_creation() {
1893            let fragment = OcrTextFragment {
1894                text: "Hello World".to_string(),
1895                x: 100.0,
1896                y: 200.0,
1897                width: 150.0,
1898                height: 25.0,
1899                confidence: 0.92,
1900                word_confidences: None,
1901                font_size: 14.0,
1902                fragment_type: FragmentType::Line,
1903            };
1904
1905            assert_eq!(fragment.text, "Hello World");
1906            assert_eq!(fragment.x, 100.0);
1907            assert_eq!(fragment.y, 200.0);
1908            assert_eq!(fragment.width, 150.0);
1909            assert_eq!(fragment.height, 25.0);
1910            assert_eq!(fragment.confidence, 0.92);
1911            assert_eq!(fragment.font_size, 14.0);
1912            assert_eq!(fragment.fragment_type, FragmentType::Line);
1913        }
1914
1915        #[test]
1916        fn test_ocr_text_fragment_clone() {
1917            let fragment = OcrTextFragment {
1918                text: "Test".to_string(),
1919                x: 50.0,
1920                y: 100.0,
1921                width: 40.0,
1922                height: 15.0,
1923                confidence: 0.88,
1924                word_confidences: None,
1925                font_size: 11.0,
1926                fragment_type: FragmentType::Word,
1927            };
1928
1929            let cloned = fragment.clone();
1930            assert_eq!(cloned.text, fragment.text);
1931            assert_eq!(cloned.x, fragment.x);
1932            assert_eq!(cloned.confidence, fragment.confidence);
1933            assert_eq!(cloned.fragment_type, fragment.fragment_type);
1934        }
1935
1936        #[test]
1937        fn test_fragment_type_copy() {
1938            let ft1 = FragmentType::Character;
1939            let ft2 = ft1; // Copy
1940            assert_eq!(ft1, ft2);
1941            assert_eq!(ft1, FragmentType::Character);
1942        }
1943
1944        #[test]
1945        fn test_fragment_position_calculations() {
1946            let fragment = OcrTextFragment {
1947                text: "Test".to_string(),
1948                x: 100.0,
1949                y: 200.0,
1950                width: 50.0,
1951                height: 20.0,
1952                confidence: 0.9,
1953                word_confidences: None,
1954                font_size: 12.0,
1955                fragment_type: FragmentType::Word,
1956            };
1957
1958            // Calculate bounding box
1959            let right = fragment.x + fragment.width;
1960            let bottom = fragment.y + fragment.height;
1961
1962            assert_eq!(right, 150.0);
1963            assert_eq!(bottom, 220.0);
1964        }
1965
1966        // OcrProcessingResult Advanced Tests
1967        #[test]
1968        fn test_ocr_result_complex_region_filtering() {
1969            let fragments = vec![
1970                OcrTextFragment {
1971                    text: "A".to_string(),
1972                    x: 10.0,
1973                    y: 10.0,
1974                    width: 20.0,
1975                    height: 20.0,
1976                    confidence: 0.9,
1977                    word_confidences: None,
1978                    font_size: 12.0,
1979                    fragment_type: FragmentType::Character,
1980                },
1981                OcrTextFragment {
1982                    text: "B".to_string(),
1983                    x: 25.0,
1984                    y: 10.0,
1985                    width: 20.0,
1986                    height: 20.0,
1987                    confidence: 0.9,
1988                    word_confidences: None,
1989                    font_size: 12.0,
1990                    fragment_type: FragmentType::Character,
1991                },
1992                OcrTextFragment {
1993                    text: "C".to_string(),
1994                    x: 10.0,
1995                    y: 35.0,
1996                    width: 20.0,
1997                    height: 20.0,
1998                    confidence: 0.9,
1999                    word_confidences: None,
2000                    font_size: 12.0,
2001                    fragment_type: FragmentType::Character,
2002                },
2003                OcrTextFragment {
2004                    text: "D".to_string(),
2005                    x: 100.0,
2006                    y: 100.0,
2007                    width: 20.0,
2008                    height: 20.0,
2009                    confidence: 0.9,
2010                    word_confidences: None,
2011                    font_size: 12.0,
2012                    fragment_type: FragmentType::Character,
2013                },
2014            ];
2015
2016            let result = OcrProcessingResult {
2017                text: "ABCD".to_string(),
2018                confidence: 0.9,
2019                fragments,
2020                processing_time_ms: 50,
2021                engine_name: "Test".to_string(),
2022                language: "en".to_string(),
2023                processed_region: None,
2024                image_dimensions: (200, 200),
2025            };
2026
2027            // Test overlapping region - B is partially outside (starts at x=25, width=20, so ends at x=45)
2028            let region1 = result.fragments_in_region(0.0, 0.0, 50.0, 50.0);
2029            assert_eq!(region1.len(), 2); // A and C (B extends beyond the region)
2030
2031            // Test exact fit
2032            let region2 = result.fragments_in_region(10.0, 10.0, 20.0, 20.0);
2033            assert_eq!(region2.len(), 1); // Only A
2034
2035            // Test empty region
2036            let region3 = result.fragments_in_region(200.0, 200.0, 50.0, 50.0);
2037            assert_eq!(region3.len(), 0);
2038        }
2039
2040        #[test]
2041        fn test_ocr_result_confidence_edge_cases() {
2042            let fragments = vec![
2043                OcrTextFragment {
2044                    text: "Perfect".to_string(),
2045                    x: 0.0,
2046                    y: 0.0,
2047                    width: 100.0,
2048                    height: 20.0,
2049                    confidence: 1.0,
2050                    word_confidences: None,
2051                    font_size: 12.0,
2052                    fragment_type: FragmentType::Word,
2053                },
2054                OcrTextFragment {
2055                    text: "Zero".to_string(),
2056                    x: 0.0,
2057                    y: 25.0,
2058                    width: 50.0,
2059                    height: 20.0,
2060                    confidence: 0.0,
2061                    word_confidences: None,
2062                    font_size: 12.0,
2063                    fragment_type: FragmentType::Word,
2064                },
2065                OcrTextFragment {
2066                    text: "Mid".to_string(),
2067                    x: 0.0,
2068                    y: 50.0,
2069                    width: 30.0,
2070                    height: 20.0,
2071                    confidence: 0.5,
2072                    word_confidences: None,
2073                    font_size: 12.0,
2074                    fragment_type: FragmentType::Word,
2075                },
2076            ];
2077
2078            let result = OcrProcessingResult {
2079                text: "Perfect Zero Mid".to_string(),
2080                confidence: 0.5,
2081                fragments,
2082                processing_time_ms: 50,
2083                engine_name: "Test".to_string(),
2084                language: "en".to_string(),
2085                processed_region: None,
2086                image_dimensions: (200, 200),
2087            };
2088
2089            // Test boundary conditions
2090            assert_eq!(result.filter_by_confidence(0.0).len(), 3);
2091            assert_eq!(result.filter_by_confidence(0.5).len(), 2);
2092            assert_eq!(result.filter_by_confidence(1.0).len(), 1);
2093            assert_eq!(result.filter_by_confidence(1.1).len(), 0);
2094        }
2095
2096        #[test]
2097        fn test_ocr_result_fragment_type_combinations() {
2098            let fragments = vec![
2099                OcrTextFragment {
2100                    text: "A".to_string(),
2101                    x: 0.0,
2102                    y: 0.0,
2103                    width: 10.0,
2104                    height: 20.0,
2105                    confidence: 0.9,
2106                    word_confidences: None,
2107                    font_size: 12.0,
2108                    fragment_type: FragmentType::Character,
2109                },
2110                OcrTextFragment {
2111                    text: "Word".to_string(),
2112                    x: 20.0,
2113                    y: 0.0,
2114                    width: 40.0,
2115                    height: 20.0,
2116                    confidence: 0.9,
2117                    word_confidences: None,
2118                    font_size: 12.0,
2119                    fragment_type: FragmentType::Word,
2120                },
2121                OcrTextFragment {
2122                    text: "Line of text".to_string(),
2123                    x: 0.0,
2124                    y: 25.0,
2125                    width: 100.0,
2126                    height: 20.0,
2127                    confidence: 0.9,
2128                    word_confidences: None,
2129                    font_size: 12.0,
2130                    fragment_type: FragmentType::Line,
2131                },
2132                OcrTextFragment {
2133                    text: "Paragraph text...".to_string(),
2134                    x: 0.0,
2135                    y: 50.0,
2136                    width: 200.0,
2137                    height: 100.0,
2138                    confidence: 0.9,
2139                    word_confidences: None,
2140                    font_size: 12.0,
2141                    fragment_type: FragmentType::Paragraph,
2142                },
2143            ];
2144
2145            let result = OcrProcessingResult {
2146                text: "Combined".to_string(),
2147                confidence: 0.9,
2148                fragments,
2149                processing_time_ms: 50,
2150                engine_name: "Test".to_string(),
2151                language: "en".to_string(),
2152                processed_region: None,
2153                image_dimensions: (300, 300),
2154            };
2155
2156            assert_eq!(result.fragments_of_type(FragmentType::Character).len(), 1);
2157            assert_eq!(result.fragments_of_type(FragmentType::Word).len(), 1);
2158            assert_eq!(result.fragments_of_type(FragmentType::Line).len(), 1);
2159            assert_eq!(result.fragments_of_type(FragmentType::Paragraph).len(), 1);
2160        }
2161
2162        #[test]
2163        fn test_ocr_result_large_fragment_set() {
2164            // Test with many fragments (performance test)
2165            let mut fragments = Vec::new();
2166            for i in 0..1000 {
2167                fragments.push(OcrTextFragment {
2168                    text: format!("Fragment{i}"),
2169                    x: (i % 10) as f64 * 50.0,
2170                    y: (i / 10) as f64 * 20.0,
2171                    width: 45.0,
2172                    height: 18.0,
2173                    confidence: 0.5 + (i as f64 % 50.0) / 100.0,
2174                    word_confidences: None,
2175                    font_size: 12.0,
2176                    fragment_type: if i % 4 == 0 {
2177                        FragmentType::Line
2178                    } else {
2179                        FragmentType::Word
2180                    },
2181                });
2182            }
2183
2184            let result = OcrProcessingResult {
2185                text: "Large document".to_string(),
2186                confidence: 0.75,
2187                fragments,
2188                processing_time_ms: 500,
2189                engine_name: "Test".to_string(),
2190                language: "en".to_string(),
2191                processed_region: None,
2192                image_dimensions: (500, 2000),
2193            };
2194
2195            // Test various operations on large set
2196            let high_conf = result.filter_by_confidence(0.8);
2197            assert!(high_conf.len() < 1000);
2198
2199            let lines = result.fragments_of_type(FragmentType::Line);
2200            assert_eq!(lines.len(), 250); // 1/4 of fragments
2201
2202            let region = result.fragments_in_region(0.0, 0.0, 200.0, 200.0);
2203            assert!(!region.is_empty());
2204
2205            let avg = result.average_confidence();
2206            assert!(avg > 0.5 && avg < 1.0);
2207        }
2208
2209        #[test]
2210        fn test_ocr_result_empty_handling() {
2211            let result = OcrProcessingResult {
2212                text: String::new(),
2213                confidence: 0.0,
2214                fragments: vec![],
2215                processing_time_ms: 10,
2216                engine_name: "Test".to_string(),
2217                language: "en".to_string(),
2218                processed_region: None,
2219                image_dimensions: (0, 0),
2220            };
2221
2222            assert_eq!(result.filter_by_confidence(0.5).len(), 0);
2223            assert_eq!(result.fragments_in_region(0.0, 0.0, 100.0, 100.0).len(), 0);
2224            assert_eq!(result.fragments_of_type(FragmentType::Word).len(), 0);
2225            assert_eq!(result.average_confidence(), 0.0);
2226        }
2227
2228        // MockOcrProvider Advanced Tests
2229        #[test]
2230        fn test_mock_provider_configuration_mutations() {
2231            let mut provider = MockOcrProvider::new();
2232
2233            // Test text mutation
2234            provider.set_mock_text("Custom mock text".to_string());
2235
2236            // Test confidence mutation
2237            provider.set_confidence(0.95);
2238
2239            // Test delay mutation
2240            provider.set_processing_delay(200);
2241
2242            let options = OcrOptions::default();
2243            let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
2244
2245            let result = provider.process_image(&jpeg_data, &options).unwrap();
2246            assert!(result.text.contains("Custom mock text"));
2247            assert_eq!(result.confidence, 0.95);
2248            assert_eq!(result.processing_time_ms, 200);
2249        }
2250
2251        #[test]
2252        fn test_mock_provider_confidence_clamping() {
2253            let mut provider = MockOcrProvider::new();
2254
2255            // Test clamping above 1.0
2256            provider.set_confidence(1.5);
2257            assert_eq!(provider.confidence, 1.0);
2258
2259            // Test clamping below 0.0
2260            provider.set_confidence(-0.5);
2261            assert_eq!(provider.confidence, 0.0);
2262
2263            // Test normal values
2264            provider.set_confidence(0.75);
2265            assert_eq!(provider.confidence, 0.75);
2266        }
2267
2268        #[test]
2269        fn test_mock_provider_validate_png() {
2270            let provider = MockOcrProvider::new();
2271
2272            // Valid PNG signature
2273            let png_data = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
2274            assert!(provider.validate_image_data(&png_data).is_ok());
2275
2276            // Invalid PNG (corrupted signature)
2277            let bad_png = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0B];
2278            assert!(provider.validate_image_data(&bad_png).is_err());
2279        }
2280
2281        #[test]
2282        fn test_mock_provider_validate_tiff() {
2283            let provider = MockOcrProvider::new();
2284
2285            // Valid TIFF (little endian)
2286            let tiff_le = vec![0x49, 0x49, 0x2A, 0x00, 0x00, 0x00, 0x00, 0x00];
2287            assert!(provider.validate_image_data(&tiff_le).is_ok());
2288
2289            // Valid TIFF (big endian)
2290            let tiff_be = vec![0x4D, 0x4D, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x00];
2291            assert!(provider.validate_image_data(&tiff_be).is_ok());
2292        }
2293
2294        #[test]
2295        fn test_mock_provider_process_page() {
2296            let provider = MockOcrProvider::new();
2297            let options = OcrOptions::default();
2298
2299            // Create a mock ContentAnalysis
2300            let analysis = ContentAnalysis {
2301                page_number: 0,
2302                page_type: crate::operations::page_analysis::PageType::Scanned,
2303                text_ratio: 0.0,
2304                image_ratio: 1.0,
2305                blank_space_ratio: 0.0,
2306                text_fragment_count: 0,
2307                image_count: 1,
2308                character_count: 0,
2309            };
2310
2311            let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
2312
2313            // Test that process_page works (default implementation calls process_image)
2314            let result = provider
2315                .process_page(&analysis, &jpeg_data, &options)
2316                .unwrap();
2317            assert!(result.text.contains("Mock OCR"));
2318        }
2319
2320        #[test]
2321        fn test_mock_provider_thread_safety() {
2322            use std::sync::Arc;
2323            use std::thread;
2324
2325            let provider = Arc::new(MockOcrProvider::new());
2326            let options = Arc::new(OcrOptions::default());
2327
2328            let mut handles = vec![];
2329
2330            // Spawn multiple threads to test Send + Sync
2331            for i in 0..5 {
2332                let provider_clone = Arc::clone(&provider);
2333                let options_clone = Arc::clone(&options);
2334
2335                let handle = thread::spawn(move || {
2336                    let jpeg_data =
2337                        vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
2338                    let result = provider_clone
2339                        .process_image(&jpeg_data, &options_clone)
2340                        .unwrap();
2341                    assert!(result.text.contains("Mock OCR"));
2342                    i
2343                });
2344
2345                handles.push(handle);
2346            }
2347
2348            // Wait for all threads
2349            for handle in handles {
2350                let thread_id = handle.join().unwrap();
2351                assert!(thread_id < 5);
2352            }
2353        }
2354
2355        // OcrEngine Tests
2356        #[test]
2357        fn test_ocr_engine_display() {
2358            assert_eq!(format!("{}", OcrEngine::Mock), "Mock OCR");
2359            assert_eq!(format!("{}", OcrEngine::Tesseract), "Tesseract");
2360            assert_eq!(format!("{}", OcrEngine::Azure), "Azure Computer Vision");
2361            assert_eq!(format!("{}", OcrEngine::Aws), "AWS Textract");
2362            assert_eq!(format!("{}", OcrEngine::GoogleCloud), "Google Cloud Vision");
2363        }
2364
2365        #[test]
2366        fn test_ocr_engine_equality() {
2367            assert_eq!(OcrEngine::Mock, OcrEngine::Mock);
2368            assert_ne!(OcrEngine::Mock, OcrEngine::Tesseract);
2369
2370            // Test Copy trait
2371            let engine1 = OcrEngine::Azure;
2372            let engine2 = engine1;
2373            assert_eq!(engine1, engine2);
2374        }
2375
2376        #[test]
2377        fn test_ocr_engine_format_support_matrix() {
2378            // Test complete format support matrix
2379            let _engines = [
2380                OcrEngine::Mock,
2381                OcrEngine::Tesseract,
2382                OcrEngine::Azure,
2383                OcrEngine::Aws,
2384                OcrEngine::GoogleCloud,
2385            ];
2386
2387            let formats = [ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff];
2388
2389            // Expected support matrix
2390            let expected = vec![
2391                (OcrEngine::Mock, vec![true, true, true]),
2392                (OcrEngine::Tesseract, vec![true, true, true]),
2393                (OcrEngine::Azure, vec![true, true, false]),
2394                (OcrEngine::Aws, vec![true, true, false]),
2395                (OcrEngine::GoogleCloud, vec![true, true, false]),
2396            ];
2397
2398            for (engine, expected_support) in expected {
2399                for (i, format) in formats.iter().enumerate() {
2400                    assert_eq!(
2401                        engine.supports_format(*format),
2402                        expected_support[i],
2403                        "Engine {engine:?} format {format:?} support mismatch"
2404                    );
2405                }
2406            }
2407        }
2408
2409        // Integration Tests
2410        #[test]
2411        fn test_validate_image_data_all_formats() {
2412            let provider = MockOcrProvider::new();
2413
2414            // Test all supported formats
2415            let test_cases = vec![
2416                // JPEG with JFIF marker
2417                (vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46], true),
2418                // JPEG with EXIF marker
2419                (vec![0xFF, 0xD8, 0xFF, 0xE1, 0x00, 0x10, 0x45, 0x78], true),
2420                // PNG
2421                (vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], true),
2422                // TIFF LE
2423                (vec![0x49, 0x49, 0x2A, 0x00, 0x00, 0x00, 0x00, 0x00], true),
2424                // TIFF BE
2425                (vec![0x4D, 0x4D, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x00], true),
2426                // GIF (not supported)
2427                (vec![0x47, 0x49, 0x46, 0x38, 0x39, 0x61, 0x00, 0x00], false),
2428                // BMP (not supported)
2429                (vec![0x42, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], false),
2430                // Too short
2431                (vec![0xFF, 0xD8], false),
2432                // Empty
2433                (vec![], false),
2434            ];
2435
2436            for (data, should_succeed) in test_cases {
2437                let result = provider.validate_image_data(&data);
2438                assert_eq!(
2439                    result.is_ok(),
2440                    should_succeed,
2441                    "Failed for data: {:?}",
2442                    &data[..data.len().min(8)]
2443                );
2444            }
2445        }
2446
2447        #[test]
2448        fn test_ocr_options_with_all_preprocessing() {
2449            let options = OcrOptions {
2450                language: "deu+eng+fra".to_string(),
2451                min_confidence: 0.85,
2452                preserve_layout: true,
2453                preprocessing: ImagePreprocessing {
2454                    denoise: true,
2455                    deskew: true,
2456                    enhance_contrast: true,
2457                    sharpen: true,
2458                    scale_factor: 1.5,
2459                },
2460                engine_options: {
2461                    let mut map = HashMap::new();
2462                    map.insert("param1".to_string(), "value1".to_string());
2463                    map.insert("param2".to_string(), "value2".to_string());
2464                    map
2465                },
2466                timeout_seconds: 120,
2467                regions: None,
2468                debug_output: false,
2469            };
2470
2471            // Verify all fields
2472            assert_eq!(options.language, "deu+eng+fra");
2473            assert_eq!(options.min_confidence, 0.85);
2474            assert!(options.preserve_layout);
2475            assert!(options.preprocessing.denoise);
2476            assert!(options.preprocessing.deskew);
2477            assert!(options.preprocessing.enhance_contrast);
2478            assert!(options.preprocessing.sharpen);
2479            assert_eq!(options.preprocessing.scale_factor, 1.5);
2480            assert_eq!(options.engine_options.len(), 2);
2481            assert_eq!(options.timeout_seconds, 120);
2482        }
2483
2484        #[test]
2485        fn test_fragment_boundary_calculations() {
2486            let fragments = [
2487                OcrTextFragment {
2488                    text: "TopLeft".to_string(),
2489                    x: 0.0,
2490                    y: 0.0,
2491                    width: 50.0,
2492                    height: 20.0,
2493                    confidence: 0.9,
2494                    word_confidences: None,
2495                    font_size: 12.0,
2496                    fragment_type: FragmentType::Word,
2497                },
2498                OcrTextFragment {
2499                    text: "BottomRight".to_string(),
2500                    x: 550.0,
2501                    y: 770.0,
2502                    width: 60.0,
2503                    height: 20.0,
2504                    confidence: 0.9,
2505                    word_confidences: None,
2506                    font_size: 12.0,
2507                    fragment_type: FragmentType::Word,
2508                },
2509            ];
2510
2511            // Calculate document bounds
2512            let min_x = fragments.iter().map(|f| f.x).fold(f64::INFINITY, f64::min);
2513            let min_y = fragments.iter().map(|f| f.y).fold(f64::INFINITY, f64::min);
2514            let max_x = fragments
2515                .iter()
2516                .map(|f| f.x + f.width)
2517                .fold(f64::NEG_INFINITY, f64::max);
2518            let max_y = fragments
2519                .iter()
2520                .map(|f| f.y + f.height)
2521                .fold(f64::NEG_INFINITY, f64::max);
2522
2523            assert_eq!(min_x, 0.0);
2524            assert_eq!(min_y, 0.0);
2525            assert_eq!(max_x, 610.0);
2526            assert_eq!(max_y, 790.0);
2527        }
2528
2529        #[test]
2530        fn test_error_chain_context() {
2531            use std::io::{Error as IoError, ErrorKind};
2532
2533            // Test error context preservation
2534            let io_error = IoError::new(ErrorKind::PermissionDenied, "Access denied to image file");
2535            let ocr_error: OcrError = io_error.into();
2536
2537            let error_chain = format!("{ocr_error}");
2538            assert!(error_chain.contains("IO error"));
2539
2540            // Test custom error with context
2541            let processing_error = OcrError::ProcessingFailed(
2542                "Failed to process page 5: insufficient memory".to_string(),
2543            );
2544            let error_msg = format!("{processing_error}");
2545            assert!(error_msg.contains("page 5"));
2546            assert!(error_msg.contains("insufficient memory"));
2547        }
2548
2549        #[test]
2550        fn test_concurrent_result_processing() {
2551            use std::sync::{Arc, Mutex};
2552            use std::thread;
2553
2554            // Create shared result
2555            let result = Arc::new(OcrProcessingResult {
2556                text: "Concurrent test".to_string(),
2557                confidence: 0.85,
2558                fragments: vec![
2559                    OcrTextFragment {
2560                        text: "Fragment1".to_string(),
2561                        x: 0.0,
2562                        y: 0.0,
2563                        width: 100.0,
2564                        height: 20.0,
2565                        confidence: 0.9,
2566                        word_confidences: None,
2567                        font_size: 12.0,
2568                        fragment_type: FragmentType::Word,
2569                    },
2570                    OcrTextFragment {
2571                        text: "Fragment2".to_string(),
2572                        x: 0.0,
2573                        y: 25.0,
2574                        width: 100.0,
2575                        height: 20.0,
2576                        confidence: 0.8,
2577                        word_confidences: None,
2578                        font_size: 12.0,
2579                        fragment_type: FragmentType::Word,
2580                    },
2581                ],
2582                processing_time_ms: 100,
2583                engine_name: "Test".to_string(),
2584                language: "en".to_string(),
2585                processed_region: None,
2586                image_dimensions: (200, 100),
2587            });
2588
2589            let counter = Arc::new(Mutex::new(0));
2590            let mut handles = vec![];
2591
2592            // Spawn threads to process result concurrently
2593            for _ in 0..10 {
2594                let result_clone = Arc::clone(&result);
2595                let counter_clone = Arc::clone(&counter);
2596
2597                let handle = thread::spawn(move || {
2598                    // Perform various read operations
2599                    let _ = result_clone.filter_by_confidence(0.85);
2600                    let _ = result_clone.fragments_in_region(0.0, 0.0, 200.0, 100.0);
2601                    let _ = result_clone.average_confidence();
2602
2603                    let mut count = counter_clone.lock().unwrap();
2604                    *count += 1;
2605                });
2606
2607                handles.push(handle);
2608            }
2609
2610            // Wait for all threads
2611            for handle in handles {
2612                handle.join().unwrap();
2613            }
2614
2615            assert_eq!(*counter.lock().unwrap(), 10);
2616        }
2617    }
2618}