1use crate::graphics::ImageFormat;
72use crate::operations::page_analysis::ContentAnalysis;
73use std::fmt;
74
/// Convenience alias for fallible operations in the OCR subsystem.
pub type OcrResult<T> = Result<T, OcrError>;
77
/// Errors that can occur while configuring or running OCR.
///
/// Display text comes from the `thiserror` `#[error]` attributes; `Io`
/// additionally converts from `std::io::Error` via `#[from]`.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// The requested OCR backend is not installed, configured, or reachable.
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),

    /// The image format is recognized but not accepted by this provider.
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),

    /// The input bytes do not look like a valid image (too short, bad magic).
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),

    /// The provider accepted the input but failed while recognizing text.
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),

    /// A network-level failure talking to a remote OCR service.
    #[error("Network error: {0}")]
    NetworkError(String),

    /// Credentials were rejected by a remote OCR service.
    #[error("Authentication error: {0}")]
    AuthenticationError(String),

    /// A remote OCR service throttled the request.
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),

    /// Recognition succeeded but the results fell below a confidence bar.
    #[error("Low confidence results: {0}")]
    LowConfidence(String),

    /// Underlying I/O failure (file reads, etc.).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Invalid or inconsistent provider configuration.
    #[error("Configuration error: {0}")]
    Configuration(String),
}
121
/// Per-request tuning options handed to an [`OcrProvider`].
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Recognition language code (default "en"; exact code scheme depends on
    /// the backend — TODO confirm whether ISO 639-1 is assumed everywhere).
    pub language: String,

    /// Minimum acceptable confidence on a 0.0–1.0 scale (defaults to 0.6).
    pub min_confidence: f64,

    /// Whether to preserve positional layout information in the output.
    pub preserve_layout: bool,

    /// Image clean-up steps applied before recognition.
    pub preprocessing: ImagePreprocessing,

    /// Backend-specific key/value settings passed through verbatim.
    pub engine_options: std::collections::HashMap<String, String>,

    /// Upper bound on processing time, in seconds (defaults to 30).
    pub timeout_seconds: u32,
}
143
144impl Default for OcrOptions {
145 fn default() -> Self {
146 Self {
147 language: "en".to_string(),
148 min_confidence: 0.6,
149 preserve_layout: true,
150 preprocessing: ImagePreprocessing::default(),
151 engine_options: std::collections::HashMap::new(),
152 timeout_seconds: 30,
153 }
154 }
155}
156
/// Image clean-up steps to run before text recognition.
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Remove speckle noise.
    pub denoise: bool,

    /// Straighten a rotated/skewed scan.
    pub deskew: bool,

    /// Boost contrast to make glyphs stand out.
    pub enhance_contrast: bool,

    /// Apply a sharpening filter (off by default).
    pub sharpen: bool,

    /// Uniform scaling applied to the image; 1.0 means no resizing.
    pub scale_factor: f64,
}
175
176impl Default for ImagePreprocessing {
177 fn default() -> Self {
178 Self {
179 denoise: true,
180 deskew: true,
181 enhance_contrast: true,
182 sharpen: false,
183 scale_factor: 1.0,
184 }
185 }
186}
187
/// One piece of recognized text with its position and confidence.
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The recognized text content.
    pub text: String,

    // NOTE(review): coordinates appear to be page points with larger y toward
    // the top of the page (mock fragments use y = 700/680) — confirm origin.
    /// Horizontal position of the fragment.
    pub x: f64,

    /// Vertical position of the fragment.
    pub y: f64,

    /// Width of the fragment's bounding box.
    pub width: f64,

    /// Height of the fragment's bounding box.
    pub height: f64,

    /// Recognition confidence on a 0.0–1.0 scale.
    pub confidence: f64,

    /// Estimated font size of the recognized text.
    pub font_size: f64,

    /// Granularity of this fragment (character/word/line/paragraph).
    pub fragment_type: FragmentType,
}
215
/// Granularity level of an [`OcrTextFragment`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// A single character.
    Character,
    /// A single word.
    Word,
    /// A full line of text.
    Line,
    /// A whole paragraph.
    Paragraph,
}
228
/// Complete output of one OCR run over a single image.
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// All recognized text joined together.
    pub text: String,

    /// Overall confidence reported by the engine (0.0–1.0).
    pub confidence: f64,

    /// Individual positioned fragments that make up `text`.
    pub fragments: Vec<OcrTextFragment>,

    /// Wall-clock processing time in milliseconds.
    pub processing_time_ms: u64,

    /// Name of the engine that produced this result.
    pub engine_name: String,

    /// Language the recognition ran with.
    pub language: String,

    /// Source image size as (width, height) in pixels.
    pub image_dimensions: (u32, u32),
}
253
254impl OcrProcessingResult {
255 pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
257 self.fragments
258 .iter()
259 .filter(|fragment| fragment.confidence >= min_confidence)
260 .collect()
261 }
262
263 pub fn fragments_in_region(
265 &self,
266 x: f64,
267 y: f64,
268 width: f64,
269 height: f64,
270 ) -> Vec<&OcrTextFragment> {
271 self.fragments
272 .iter()
273 .filter(|fragment| {
274 fragment.x >= x
275 && fragment.y >= y
276 && fragment.x + fragment.width <= x + width
277 && fragment.y + fragment.height <= y + height
278 })
279 .collect()
280 }
281
282 pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
284 self.fragments
285 .iter()
286 .filter(|fragment| fragment.fragment_type == fragment_type)
287 .collect()
288 }
289
290 pub fn average_confidence(&self) -> f64 {
292 if self.fragments.is_empty() {
293 return 0.0;
294 }
295
296 let sum: f64 = self.fragments.iter().map(|f| f.confidence).sum();
297 sum / self.fragments.len() as f64
298 }
299}
300
/// The OCR backends this module knows about.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// In-process mock implementation for tests.
    Mock,
    /// Local Tesseract engine.
    Tesseract,
    /// Azure Computer Vision service.
    Azure,
    /// AWS Textract service.
    Aws,
    /// Google Cloud Vision service.
    GoogleCloud,
}
315
316impl OcrEngine {
317 pub fn name(&self) -> &'static str {
319 match self {
320 OcrEngine::Mock => "Mock OCR",
321 OcrEngine::Tesseract => "Tesseract",
322 OcrEngine::Azure => "Azure Computer Vision",
323 OcrEngine::Aws => "AWS Textract",
324 OcrEngine::GoogleCloud => "Google Cloud Vision",
325 }
326 }
327
328 pub fn supports_format(&self, format: ImageFormat) -> bool {
330 match self {
331 OcrEngine::Mock => true, OcrEngine::Tesseract => matches!(
333 format,
334 ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
335 ),
336 OcrEngine::Azure => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
337 OcrEngine::Aws => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
338 OcrEngine::GoogleCloud => matches!(format, ImageFormat::Jpeg | ImageFormat::Png),
339 }
340 }
341}
342
343impl fmt::Display for OcrEngine {
344 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
345 write!(f, "{}", self.name())
346 }
347}
348
349pub trait OcrProvider: Send + Sync {
397 fn process_image(
420 &self,
421 image_data: &[u8],
422 options: &OcrOptions,
423 ) -> OcrResult<OcrProcessingResult>;
424
425 fn process_page(
445 &self,
446 _page_analysis: &ContentAnalysis,
447 page_data: &[u8],
448 options: &OcrOptions,
449 ) -> OcrResult<OcrProcessingResult> {
450 self.process_image(page_data, options)
451 }
452
453 fn supported_formats(&self) -> Vec<ImageFormat>;
459
460 fn engine_name(&self) -> &str;
466
467 fn engine_type(&self) -> OcrEngine;
473
474 fn supports_format(&self, format: ImageFormat) -> bool {
484 self.supported_formats().contains(&format)
485 }
486
487 fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
504 if image_data.len() < 8 {
505 return Err(OcrError::InvalidImageData(
506 "Image data too short".to_string(),
507 ));
508 }
509
510 let format = if image_data.starts_with(b"\xFF\xD8\xFF") {
512 ImageFormat::Jpeg
513 } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
514 ImageFormat::Png
515 } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
516 ImageFormat::Tiff
517 } else {
518 return Err(OcrError::InvalidImageData(
519 "Unrecognized image format".to_string(),
520 ));
521 };
522
523 if !self.supports_format(format) {
524 return Err(OcrError::UnsupportedImageFormat(format));
525 }
526
527 Ok(())
528 }
529}
530
/// Test double that fulfils the [`OcrProvider`] contract without doing any
/// real recognition: it returns configurable canned text and confidence.
#[derive(Clone)]
pub struct MockOcrProvider {
    // Confidence reported for the primary fragment (0.0–1.0).
    confidence: f64,
    // Canned text returned as the primary fragment.
    mock_text: String,
    // Simulated processing latency, in milliseconds.
    processing_delay_ms: u64,
}
557
558impl MockOcrProvider {
559 pub fn new() -> Self {
561 Self {
562 confidence: 0.85,
563 mock_text: "Mock OCR extracted text from scanned image".to_string(),
564 processing_delay_ms: 100,
565 }
566 }
567
568 pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
570 Self {
571 confidence,
572 mock_text: text,
573 processing_delay_ms: 100,
574 }
575 }
576
577 pub fn set_mock_text(&mut self, text: String) {
579 self.mock_text = text;
580 }
581
582 pub fn set_confidence(&mut self, confidence: f64) {
584 self.confidence = confidence.clamp(0.0, 1.0);
585 }
586
587 pub fn set_processing_delay(&mut self, delay_ms: u64) {
589 self.processing_delay_ms = delay_ms;
590 }
591}
592
593impl Default for MockOcrProvider {
594 fn default() -> Self {
595 Self::new()
596 }
597}
598
599impl OcrProvider for MockOcrProvider {
600 fn process_image(
601 &self,
602 image_data: &[u8],
603 options: &OcrOptions,
604 ) -> OcrResult<OcrProcessingResult> {
605 self.validate_image_data(image_data)?;
607
608 std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));
610
611 let fragments = vec![
613 OcrTextFragment {
614 text: self.mock_text.clone(),
615 x: 50.0,
616 y: 700.0,
617 width: 200.0,
618 height: 20.0,
619 confidence: self.confidence,
620 font_size: 12.0,
621 fragment_type: FragmentType::Line,
622 },
623 OcrTextFragment {
624 text: "Additional mock text".to_string(),
625 x: 50.0,
626 y: 680.0,
627 width: 150.0,
628 height: 20.0,
629 confidence: self.confidence * 0.9,
630 font_size: 12.0,
631 fragment_type: FragmentType::Line,
632 },
633 ];
634
635 Ok(OcrProcessingResult {
636 text: format!("{}\nAdditional mock text", self.mock_text),
637 confidence: self.confidence,
638 fragments,
639 processing_time_ms: self.processing_delay_ms,
640 engine_name: "Mock OCR".to_string(),
641 language: options.language.clone(),
642 image_dimensions: (800, 600), })
644 }
645
646 fn supported_formats(&self) -> Vec<ImageFormat> {
647 vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
648 }
649
650 fn engine_name(&self) -> &str {
651 "Mock OCR"
652 }
653
654 fn engine_type(&self) -> OcrEngine {
655 OcrEngine::Mock
656 }
657}
658
// Unit tests for the OCR types and the mock provider. All image inputs are
// hand-built byte vectors carrying real magic numbers (JPEG SOI, etc.), so no
// fixture files are needed.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ocr_options_default() {
        let options = OcrOptions::default();
        assert_eq!(options.language, "en");
        assert_eq!(options.min_confidence, 0.6);
        assert!(options.preserve_layout);
        assert_eq!(options.timeout_seconds, 30);
    }

    #[test]
    fn test_image_preprocessing_default() {
        let preprocessing = ImagePreprocessing::default();
        assert!(preprocessing.denoise);
        assert!(preprocessing.deskew);
        assert!(preprocessing.enhance_contrast);
        assert!(!preprocessing.sharpen);
        assert_eq!(preprocessing.scale_factor, 1.0);
    }

    #[test]
    fn test_ocr_engine_name() {
        assert_eq!(OcrEngine::Mock.name(), "Mock OCR");
        assert_eq!(OcrEngine::Tesseract.name(), "Tesseract");
        assert_eq!(OcrEngine::Azure.name(), "Azure Computer Vision");
    }

    #[test]
    fn test_ocr_engine_supports_format() {
        assert!(OcrEngine::Mock.supports_format(ImageFormat::Jpeg));
        assert!(OcrEngine::Mock.supports_format(ImageFormat::Png));
        assert!(OcrEngine::Mock.supports_format(ImageFormat::Tiff));

        assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Jpeg));
        assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Png));
        assert!(OcrEngine::Tesseract.supports_format(ImageFormat::Tiff));

        assert!(OcrEngine::Azure.supports_format(ImageFormat::Jpeg));
        assert!(OcrEngine::Azure.supports_format(ImageFormat::Png));
        assert!(!OcrEngine::Azure.supports_format(ImageFormat::Tiff));
    }

    #[test]
    fn test_fragment_type_equality() {
        assert_eq!(FragmentType::Word, FragmentType::Word);
        assert_ne!(FragmentType::Word, FragmentType::Line);
        assert_ne!(FragmentType::Character, FragmentType::Paragraph);
    }

    #[test]
    fn test_mock_ocr_provider_creation() {
        let provider = MockOcrProvider::new();
        assert_eq!(provider.confidence, 0.85);
        assert!(provider.mock_text.contains("Mock OCR"));
        assert_eq!(provider.processing_delay_ms, 100);
    }

    #[test]
    fn test_mock_ocr_provider_with_custom_text() {
        let custom_text = "Custom mock text".to_string();
        let provider = MockOcrProvider::with_text_and_confidence(custom_text.clone(), 0.95);
        assert_eq!(provider.mock_text, custom_text);
        assert_eq!(provider.confidence, 0.95);
    }

    #[test]
    fn test_mock_ocr_provider_process_image() {
        let provider = MockOcrProvider::new();
        let options = OcrOptions::default();

        // Minimal JPEG header: SOI marker followed by a JFIF APP0 prefix.
        let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];

        let result = provider.process_image(&jpeg_data, &options).unwrap();
        assert!(result.text.contains("Mock OCR"));
        assert_eq!(result.confidence, 0.85);
        assert!(!result.fragments.is_empty());
        assert_eq!(result.engine_name, "Mock OCR");
        assert_eq!(result.language, "en");
    }

    #[test]
    fn test_mock_ocr_provider_supported_formats() {
        let provider = MockOcrProvider::new();
        let formats = provider.supported_formats();
        assert!(formats.contains(&ImageFormat::Jpeg));
        assert!(formats.contains(&ImageFormat::Png));
        assert!(formats.contains(&ImageFormat::Tiff));
    }

    #[test]
    fn test_mock_ocr_provider_engine_info() {
        let provider = MockOcrProvider::new();
        assert_eq!(provider.engine_name(), "Mock OCR");
        assert_eq!(provider.engine_type(), OcrEngine::Mock);
    }

    #[test]
    fn test_mock_ocr_provider_supports_format() {
        let provider = MockOcrProvider::new();
        assert!(provider.supports_format(ImageFormat::Jpeg));
        assert!(provider.supports_format(ImageFormat::Png));
        assert!(provider.supports_format(ImageFormat::Tiff));
    }

    #[test]
    fn test_mock_ocr_provider_validate_image_data() {
        let provider = MockOcrProvider::new();

        // Valid JPEG magic.
        let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
        assert!(provider.validate_image_data(&jpeg_data).is_ok());

        // Below the 8-byte minimum.
        let short_data = vec![0xFF, 0xD8];
        assert!(provider.validate_image_data(&short_data).is_err());

        // Long enough but matches no known magic.
        let invalid_data = vec![0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09];
        assert!(provider.validate_image_data(&invalid_data).is_err());
    }

    #[test]
    fn test_ocr_processing_result_filter_by_confidence() {
        let result = OcrProcessingResult {
            text: "Test text".to_string(),
            confidence: 0.8,
            fragments: vec![
                OcrTextFragment {
                    text: "High confidence".to_string(),
                    x: 0.0,
                    y: 0.0,
                    width: 100.0,
                    height: 20.0,
                    confidence: 0.9,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
                OcrTextFragment {
                    text: "Low confidence".to_string(),
                    x: 0.0,
                    y: 20.0,
                    width: 100.0,
                    height: 20.0,
                    confidence: 0.5,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
            ],
            processing_time_ms: 100,
            engine_name: "Test".to_string(),
            language: "en".to_string(),
            image_dimensions: (800, 600),
        };

        // 0.9 passes the 0.8 threshold; 0.5 does not.
        let high_confidence = result.filter_by_confidence(0.8);
        assert_eq!(high_confidence.len(), 1);
        assert_eq!(high_confidence[0].text, "High confidence");
    }

    #[test]
    fn test_ocr_processing_result_fragments_in_region() {
        let result = OcrProcessingResult {
            text: "Test text".to_string(),
            confidence: 0.8,
            fragments: vec![
                OcrTextFragment {
                    text: "Inside region".to_string(),
                    x: 10.0,
                    y: 10.0,
                    width: 80.0,
                    height: 20.0,
                    confidence: 0.9,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
                OcrTextFragment {
                    text: "Outside region".to_string(),
                    x: 200.0,
                    y: 200.0,
                    width: 80.0,
                    height: 20.0,
                    confidence: 0.9,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
            ],
            processing_time_ms: 100,
            engine_name: "Test".to_string(),
            language: "en".to_string(),
            image_dimensions: (800, 600),
        };

        // Only the fragment fully contained in the 100x100 box qualifies.
        let in_region = result.fragments_in_region(0.0, 0.0, 100.0, 100.0);
        assert_eq!(in_region.len(), 1);
        assert_eq!(in_region[0].text, "Inside region");
    }

    #[test]
    fn test_ocr_processing_result_fragments_of_type() {
        let result = OcrProcessingResult {
            text: "Test text".to_string(),
            confidence: 0.8,
            fragments: vec![
                OcrTextFragment {
                    text: "Word fragment".to_string(),
                    x: 0.0,
                    y: 0.0,
                    width: 100.0,
                    height: 20.0,
                    confidence: 0.9,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
                OcrTextFragment {
                    text: "Line fragment".to_string(),
                    x: 0.0,
                    y: 20.0,
                    width: 200.0,
                    height: 20.0,
                    confidence: 0.9,
                    font_size: 12.0,
                    fragment_type: FragmentType::Line,
                },
            ],
            processing_time_ms: 100,
            engine_name: "Test".to_string(),
            language: "en".to_string(),
            image_dimensions: (800, 600),
        };

        let words = result.fragments_of_type(FragmentType::Word);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "Word fragment");

        let lines = result.fragments_of_type(FragmentType::Line);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "Line fragment");
    }

    #[test]
    fn test_ocr_processing_result_average_confidence() {
        let result = OcrProcessingResult {
            text: "Test text".to_string(),
            confidence: 0.8,
            fragments: vec![
                OcrTextFragment {
                    text: "Fragment 1".to_string(),
                    x: 0.0,
                    y: 0.0,
                    width: 100.0,
                    height: 20.0,
                    confidence: 0.8,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
                OcrTextFragment {
                    text: "Fragment 2".to_string(),
                    x: 0.0,
                    y: 20.0,
                    width: 100.0,
                    height: 20.0,
                    confidence: 0.6,
                    font_size: 12.0,
                    fragment_type: FragmentType::Word,
                },
            ],
            processing_time_ms: 100,
            engine_name: "Test".to_string(),
            language: "en".to_string(),
            image_dimensions: (800, 600),
        };

        // (0.8 + 0.6) / 2 == 0.7 exactly in f64.
        let avg_confidence = result.average_confidence();
        assert_eq!(avg_confidence, 0.7);
    }

    #[test]
    fn test_ocr_processing_result_average_confidence_empty() {
        let result = OcrProcessingResult {
            text: "Test text".to_string(),
            confidence: 0.8,
            fragments: vec![],
            processing_time_ms: 100,
            engine_name: "Test".to_string(),
            language: "en".to_string(),
            image_dimensions: (800, 600),
        };

        // Empty fragment list must not divide by zero.
        let avg_confidence = result.average_confidence();
        assert_eq!(avg_confidence, 0.0);
    }
}