rexis_rag/multimodal/
ocr.rs

1//! # Optical Character Recognition (OCR)
2//!
3//! Multi-engine OCR with text extraction, layout analysis, and confidence scoring.
4
5use super::{
6    BlockType, BoundingBox, Column, OCRConfig, OCREngine, OCREngineType, OCRResult, OCRWord,
7    TextBlock, TextLayout,
8};
9use crate::{RragError, RragResult};
10use std::collections::HashMap;
11use std::path::Path;
12
13/// Default OCR engine implementation
14pub struct DefaultOCREngine {
15    /// Configuration
16    config: OCRConfig,
17
18    /// Primary OCR engine
19    primary_engine: Box<dyn OCREngineImpl>,
20
21    /// Fallback engines
22    fallback_engines: Vec<Box<dyn OCREngineImpl>>,
23
24    /// Text post-processor
25    post_processor: TextPostProcessor,
26
27    /// Layout analyzer
28    layout_analyzer: OCRLayoutAnalyzer,
29}
30
31/// OCR engine implementation trait
32pub trait OCREngineImpl: Send + Sync {
33    /// Extract text from image
34    fn extract_text(&self, image_path: &Path) -> RragResult<OCRResult>;
35
36    /// Get engine capabilities
37    fn capabilities(&self) -> EngineCapabilities;
38
39    /// Engine name
40    fn name(&self) -> &str;
41}
42
43/// Engine capabilities
44#[derive(Debug, Clone)]
45pub struct EngineCapabilities {
46    /// Supported languages
47    pub languages: Vec<String>,
48
49    /// Supports layout detection
50    pub layout_detection: bool,
51
52    /// Supports confidence scores
53    pub confidence_scores: bool,
54
55    /// Supports word-level results
56    pub word_level: bool,
57
58    /// Processing speed (relative)
59    pub speed: ProcessingSpeed,
60
61    /// Accuracy (relative)
62    pub accuracy: AccuracyLevel,
63}
64
/// Relative processing speed of an OCR engine.
///
/// Derives `PartialEq`, `Eq` and `Hash` so values can be compared and used as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ProcessingSpeed {
    /// Fastest tier.
    Fast,
    /// Middle tier.
    Medium,
    /// Slowest tier.
    Slow,
}
72
/// Relative accuracy of an OCR engine.
///
/// Variants are declared from least to most accurate, so the derived ordering gives
/// `Low < Medium < High`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AccuracyLevel {
    /// Lowest accuracy tier.
    Low,
    /// Middle accuracy tier.
    Medium,
    /// Highest accuracy tier.
    High,
}
80
81/// Text post-processor
82pub struct TextPostProcessor {
83    /// Spell checker
84    spell_checker: Option<SpellChecker>,
85
86    /// Language detector
87    language_detector: LanguageDetector,
88
89    /// Text formatter
90    formatter: TextFormatter,
91}
92
/// Rule-based spell checker for OCR output.
///
/// Derives `Debug` and `Clone` so the checker can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct SpellChecker {
    /// Dictionary file name for each language code.
    dictionaries: HashMap<String, String>,

    /// Words whose confidence is below this value are considered for correction.
    confidence_threshold: f32,
}
101
/// Heuristic language detector for recognized text.
///
/// Derives `Debug` and `Clone` so the detector can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct LanguageDetector {
    /// Language codes supplied at construction time.
    supported_languages: Vec<String>,

    /// Minimum confidence for a detection.
    min_confidence: f32,
}
110
/// Formatting options applied to recognized text.
///
/// Derives `Debug` and `Clone` so the formatter can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct TextFormatter {
    /// When `false`, newlines in the text are replaced with spaces.
    preserve_line_breaks: bool,

    /// When `false`, runs of whitespace are collapsed to a single space.
    preserve_spacing: bool,

    /// When `true`, the text is passed through the artifact clean-up step.
    cleanup_artifacts: bool,
}
122
/// Derives blocks, reading order and columns from word-level OCR output.
///
/// Derives `Debug` and `Clone` so the analyzer can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct OCRLayoutAnalyzer {
    /// Block detection threshold.
    block_threshold: f32,

    /// Whether column detection runs during layout analysis.
    column_detection: bool,

    /// Whether blocks are sorted into a reading order during layout analysis.
    reading_order_detection: bool,
}
134
135/// Tesseract OCR engine
136pub struct TesseractEngine {
137    /// Language configuration
138    languages: Vec<String>,
139
140    /// OCR engine mode
141    ocr_mode: TesseractOCRMode,
142
143    /// Page segmentation mode
144    psm: PageSegmentationMode,
145}
146
/// OCR engine modes for the Tesseract backend.
///
/// Derives `PartialEq`, `Eq` and `Hash` so modes can be compared and used as keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TesseractOCRMode {
    /// Legacy engine only.
    LegacyOnly,
    /// Neural engine only.
    NeuralOnly,
    /// Legacy and neural engines together.
    LegacyAndNeural,
}
154
/// Page segmentation modes.
///
/// Derives `PartialEq`, `Eq` and `Hash` so modes can be compared and used as keys.
// NOTE(review): the variant names look like Tesseract's page segmentation options; confirm
// the exact mapping where the mode is handed to the real engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PageSegmentationMode {
    Auto,
    SingleColumn,
    SingleBlockVertText,
    SingleBlock,
    SingleLine,
    SingleWord,
    SingleCharacter,
    SparseText,
}
167
/// EasyOCR-backed OCR engine.
///
/// Derives `Debug` and `Clone` so the engine configuration can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct EasyOCREngine {
    /// Language codes the engine is configured with.
    languages: Vec<String>,

    /// Whether GPU acceleration is enabled.
    gpu_enabled: bool,

    /// Name of the text detection model.
    detection_model: String,

    /// Name of the text recognition model.
    recognition_model: String,
}
182
183/// PaddleOCR engine
184pub struct PaddleOCREngine {
185    /// Language
186    language: String,
187
188    /// Model precision
189    precision: ModelPrecision,
190
191    /// Text direction detection
192    direction_detection: bool,
193}
194
/// Numeric precision levels for a model.
///
/// Derives `PartialEq`, `Eq` and `Hash` so values can be compared and used as keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelPrecision {
    FP16,
    FP32,
    INT8,
}
202
203/// Cloud Vision OCR engine
204pub struct CloudVisionEngine {
205    /// API credentials
206    credentials: CloudCredentials,
207
208    /// API endpoint
209    endpoint: String,
210
211    /// Request timeout
212    timeout_ms: u64,
213}
214
/// Credentials for a cloud OCR service.
///
/// `Debug` is implemented by hand rather than derived so that `api_key` is never written to
/// logs, panic messages or test output.
#[derive(Clone)]
pub struct CloudCredentials {
    /// Secret API key; shown as `<redacted>` in `Debug` output.
    pub api_key: String,
    /// Optional project identifier.
    pub project_id: Option<String>,
    /// Optional service region.
    pub region: Option<String>,
}

impl std::fmt::Debug for CloudCredentials {
    /// Formats the credentials like the derived implementation would, with the key masked.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CloudCredentials")
            .field("api_key", &"<redacted>")
            .field("project_id", &self.project_id)
            .field("region", &self.region)
            .finish()
    }
}
222
223/// OCR quality assessment
224#[derive(Debug, Clone)]
225pub struct OCRQuality {
226    /// Overall confidence
227    pub overall_confidence: f32,
228
229    /// Text quality score
230    pub text_quality: f32,
231
232    /// Layout quality score
233    pub layout_quality: f32,
234
235    /// Language detection confidence
236    pub language_confidence: f32,
237
238    /// Quality issues
239    pub issues: Vec<QualityIssue>,
240}
241
242/// Quality issues in OCR
243#[derive(Debug, Clone)]
244pub struct QualityIssue {
245    /// Issue type
246    pub issue_type: OCRIssueType,
247
248    /// Issue description
249    pub description: String,
250
251    /// Severity
252    pub severity: IssueSeverity,
253
254    /// Location
255    pub location: Option<BoundingBox>,
256
257    /// Suggested fix
258    pub suggested_fix: Option<String>,
259}
260
/// Categories of OCR quality issues.
///
/// Derives `PartialEq`, `Eq` and `Hash` so issues can be filtered or grouped by category.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OCRIssueType {
    LowConfidence,
    PoorImageQuality,
    UnsupportedLanguage,
    LayoutComplexity,
    FontIssues,
    SkewedText,
    NoiseArtifacts,
}
272
/// Severity of a quality issue.
///
/// Variants are declared from least to most severe, so the derived ordering gives
/// `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum IssueSeverity {
    Low,
    Medium,
    High,
    Critical,
}
281
282impl DefaultOCREngine {
283    /// Create new OCR engine
284    pub fn new(config: OCRConfig) -> RragResult<Self> {
285        let primary_engine = Self::create_engine(config.engine, &config)?;
286        let fallback_engines = Self::create_fallback_engines(&config)?;
287        let post_processor = TextPostProcessor::new(&config)?;
288        let layout_analyzer = OCRLayoutAnalyzer::new();
289
290        Ok(Self {
291            config,
292            primary_engine,
293            fallback_engines,
294            post_processor,
295            layout_analyzer,
296        })
297    }
298
299    /// Create OCR engine based on type
300    fn create_engine(
301        engine_type: OCREngineType,
302        config: &OCRConfig,
303    ) -> RragResult<Box<dyn OCREngineImpl>> {
304        match engine_type {
305            OCREngineType::Tesseract => {
306                Ok(Box::new(TesseractEngine::new(config.languages.clone())?))
307            }
308            OCREngineType::EasyOCR => Ok(Box::new(EasyOCREngine::new(config.languages.clone())?)),
309            OCREngineType::PaddleOCR => {
310                let lang = config
311                    .languages
312                    .first()
313                    .unwrap_or(&"en".to_string())
314                    .clone();
315                Ok(Box::new(PaddleOCREngine::new(lang)?))
316            }
317            OCREngineType::CloudVision => Ok(Box::new(CloudVisionEngine::new()?)),
318        }
319    }
320
321    /// Create fallback engines
322    fn create_fallback_engines(config: &OCRConfig) -> RragResult<Vec<Box<dyn OCREngineImpl>>> {
323        let mut engines = Vec::new();
324
325        // Add Tesseract as fallback if not primary
326        if config.engine != OCREngineType::Tesseract {
327            engines
328                .push(Box::new(TesseractEngine::new(config.languages.clone())?)
329                    as Box<dyn OCREngineImpl>);
330        }
331
332        // Add EasyOCR as fallback if not primary
333        if config.engine != OCREngineType::EasyOCR {
334            engines
335                .push(Box::new(EasyOCREngine::new(config.languages.clone())?)
336                    as Box<dyn OCREngineImpl>);
337        }
338
339        Ok(engines)
340    }
341
342    /// Perform OCR with fallback
343    pub fn ocr_with_fallback(&self, image_path: &Path) -> RragResult<OCRResult> {
344        // Try primary engine first
345        match self.primary_engine.extract_text(image_path) {
346            Ok(result) if result.confidence >= self.config.confidence_threshold => {
347                return Ok(result);
348            }
349            Ok(primary_result) => {
350                // Primary engine succeeded but confidence is low, try fallbacks
351                for fallback in &self.fallback_engines {
352                    if let Ok(fallback_result) = fallback.extract_text(image_path) {
353                        if fallback_result.confidence > primary_result.confidence {
354                            return Ok(fallback_result);
355                        }
356                    }
357                }
358                // Return primary result if no better fallback found
359                Ok(primary_result)
360            }
361            Err(_) => {
362                // Primary engine failed, try fallbacks
363                for fallback in &self.fallback_engines {
364                    if let Ok(result) = fallback.extract_text(image_path) {
365                        return Ok(result);
366                    }
367                }
368                Err(RragError::document_processing("All OCR engines failed"))
369            }
370        }
371    }
372
373    /// Assess OCR quality
374    pub fn assess_quality(&self, result: &OCRResult) -> OCRQuality {
375        let mut issues = Vec::new();
376
377        // Check overall confidence
378        if result.confidence < 0.7 {
379            issues.push(QualityIssue {
380                issue_type: OCRIssueType::LowConfidence,
381                description: format!("Overall confidence is low: {:.2}", result.confidence),
382                severity: if result.confidence < 0.5 {
383                    IssueSeverity::High
384                } else {
385                    IssueSeverity::Medium
386                },
387                location: None,
388                suggested_fix: Some(
389                    "Consider using a higher resolution image or different OCR engine".to_string(),
390                ),
391            });
392        }
393
394        // Check for words with very low confidence
395        let low_confidence_words = result.words.iter().filter(|w| w.confidence < 0.5).count();
396
397        if low_confidence_words > result.words.len() / 4 {
398            issues.push(QualityIssue {
399                issue_type: OCRIssueType::LowConfidence,
400                description: format!("{} words have low confidence", low_confidence_words),
401                severity: IssueSeverity::Medium,
402                location: None,
403                suggested_fix: Some(
404                    "Manual review recommended for low-confidence words".to_string(),
405                ),
406            });
407        }
408
409        OCRQuality {
410            overall_confidence: result.confidence,
411            text_quality: self.calculate_text_quality(result),
412            layout_quality: 0.8,      // Simplified
413            language_confidence: 0.9, // Simplified
414            issues,
415        }
416    }
417
418    /// Calculate text quality score
419    fn calculate_text_quality(&self, result: &OCRResult) -> f32 {
420        if result.words.is_empty() {
421            return 0.0;
422        }
423
424        // Average word confidence
425        let avg_confidence =
426            result.words.iter().map(|w| w.confidence).sum::<f32>() / result.words.len() as f32;
427
428        // Penalize for very short words (likely noise)
429        let short_words = result.words.iter().filter(|w| w.text.len() <= 2).count();
430        let short_word_penalty = (short_words as f32 / result.words.len() as f32) * 0.2;
431
432        (avg_confidence - short_word_penalty).max(0.0)
433    }
434}
435
436impl OCREngine for DefaultOCREngine {
437    fn ocr(&self, image_path: &Path) -> RragResult<OCRResult> {
438        let mut result = self.ocr_with_fallback(image_path)?;
439
440        // Post-process text if enabled
441        if self.config.spell_correction {
442            result = self.post_processor.process(result)?;
443        }
444
445        Ok(result)
446    }
447
448    fn get_text_with_confidence(&self, image_path: &Path) -> RragResult<Vec<(String, f32)>> {
449        let result = self.ocr(image_path)?;
450        Ok(result
451            .words
452            .into_iter()
453            .map(|word| (word.text, word.confidence))
454            .collect())
455    }
456
457    fn get_layout(&self, image_path: &Path) -> RragResult<TextLayout> {
458        let result = self.ocr(image_path)?;
459        self.layout_analyzer.analyze_layout(&result)
460    }
461}
462
463impl TesseractEngine {
464    /// Create new Tesseract engine
465    pub fn new(languages: Vec<String>) -> RragResult<Self> {
466        Ok(Self {
467            languages,
468            ocr_mode: TesseractOCRMode::LegacyAndNeural,
469            psm: PageSegmentationMode::Auto,
470        })
471    }
472}
473
474impl OCREngineImpl for TesseractEngine {
475    fn extract_text(&self, image_path: &Path) -> RragResult<OCRResult> {
476        // Simulate Tesseract OCR
477        let text = format!(
478            "Sample text extracted from {:?}",
479            image_path.file_name().unwrap_or_default()
480        );
481
482        let words = vec![
483            OCRWord {
484                text: "Sample".to_string(),
485                confidence: 0.95,
486                bounding_box: BoundingBox {
487                    x: 10,
488                    y: 10,
489                    width: 50,
490                    height: 20,
491                },
492            },
493            OCRWord {
494                text: "text".to_string(),
495                confidence: 0.90,
496                bounding_box: BoundingBox {
497                    x: 65,
498                    y: 10,
499                    width: 30,
500                    height: 20,
501                },
502            },
503        ];
504
505        Ok(OCRResult {
506            text,
507            confidence: 0.925,
508            words,
509            languages: self.languages.clone(),
510        })
511    }
512
513    fn capabilities(&self) -> EngineCapabilities {
514        EngineCapabilities {
515            languages: vec!["eng", "fra", "deu", "spa", "chi_sim"]
516                .iter()
517                .map(|s| s.to_string())
518                .collect(),
519            layout_detection: true,
520            confidence_scores: true,
521            word_level: true,
522            speed: ProcessingSpeed::Medium,
523            accuracy: AccuracyLevel::High,
524        }
525    }
526
527    fn name(&self) -> &str {
528        "Tesseract"
529    }
530}
531
532impl EasyOCREngine {
533    /// Create new EasyOCR engine
534    pub fn new(languages: Vec<String>) -> RragResult<Self> {
535        Ok(Self {
536            languages,
537            gpu_enabled: false,
538            detection_model: "craft".to_string(),
539            recognition_model: "crnn".to_string(),
540        })
541    }
542}
543
544impl OCREngineImpl for EasyOCREngine {
545    fn extract_text(&self, image_path: &Path) -> RragResult<OCRResult> {
546        // Simulate EasyOCR
547        let text = format!(
548            "EasyOCR extracted text from {:?}",
549            image_path.file_name().unwrap_or_default()
550        );
551
552        let words = vec![
553            OCRWord {
554                text: "EasyOCR".to_string(),
555                confidence: 0.88,
556                bounding_box: BoundingBox {
557                    x: 5,
558                    y: 5,
559                    width: 60,
560                    height: 25,
561                },
562            },
563            OCRWord {
564                text: "extracted".to_string(),
565                confidence: 0.92,
566                bounding_box: BoundingBox {
567                    x: 70,
568                    y: 5,
569                    width: 70,
570                    height: 25,
571                },
572            },
573        ];
574
575        Ok(OCRResult {
576            text,
577            confidence: 0.90,
578            words,
579            languages: self.languages.clone(),
580        })
581    }
582
583    fn capabilities(&self) -> EngineCapabilities {
584        EngineCapabilities {
585            languages: vec!["en", "ch_sim", "ch_tra", "ja", "ko", "fr", "de"]
586                .iter()
587                .map(|s| s.to_string())
588                .collect(),
589            layout_detection: true,
590            confidence_scores: true,
591            word_level: true,
592            speed: ProcessingSpeed::Fast,
593            accuracy: AccuracyLevel::Medium,
594        }
595    }
596
597    fn name(&self) -> &str {
598        "EasyOCR"
599    }
600}
601
602impl PaddleOCREngine {
603    /// Create new PaddleOCR engine
604    pub fn new(language: String) -> RragResult<Self> {
605        Ok(Self {
606            language,
607            precision: ModelPrecision::FP32,
608            direction_detection: true,
609        })
610    }
611}
612
613impl OCREngineImpl for PaddleOCREngine {
614    fn extract_text(&self, image_path: &Path) -> RragResult<OCRResult> {
615        // Simulate PaddleOCR
616        let text = format!(
617            "PaddleOCR text from {:?}",
618            image_path.file_name().unwrap_or_default()
619        );
620
621        let words = vec![OCRWord {
622            text: "PaddleOCR".to_string(),
623            confidence: 0.93,
624            bounding_box: BoundingBox {
625                x: 8,
626                y: 8,
627                width: 80,
628                height: 22,
629            },
630        }];
631
632        Ok(OCRResult {
633            text,
634            confidence: 0.93,
635            words,
636            languages: vec![self.language.clone()],
637        })
638    }
639
640    fn capabilities(&self) -> EngineCapabilities {
641        EngineCapabilities {
642            languages: vec!["ch", "en", "fr", "german", "japan", "korean"]
643                .iter()
644                .map(|s| s.to_string())
645                .collect(),
646            layout_detection: true,
647            confidence_scores: true,
648            word_level: true,
649            speed: ProcessingSpeed::Fast,
650            accuracy: AccuracyLevel::High,
651        }
652    }
653
654    fn name(&self) -> &str {
655        "PaddleOCR"
656    }
657}
658
659impl CloudVisionEngine {
660    /// Create new Cloud Vision engine
661    pub fn new() -> RragResult<Self> {
662        Ok(Self {
663            credentials: CloudCredentials {
664                api_key: "demo_key".to_string(),
665                project_id: Some("demo_project".to_string()),
666                region: Some("us-central1".to_string()),
667            },
668            endpoint: "https://vision.googleapis.com".to_string(),
669            timeout_ms: 30000,
670        })
671    }
672}
673
674impl OCREngineImpl for CloudVisionEngine {
675    fn extract_text(&self, image_path: &Path) -> RragResult<OCRResult> {
676        // Simulate Cloud Vision API call
677        let text = format!(
678            "Cloud Vision text from {:?}",
679            image_path.file_name().unwrap_or_default()
680        );
681
682        let words = vec![
683            OCRWord {
684                text: "Cloud".to_string(),
685                confidence: 0.98,
686                bounding_box: BoundingBox {
687                    x: 12,
688                    y: 12,
689                    width: 45,
690                    height: 18,
691                },
692            },
693            OCRWord {
694                text: "Vision".to_string(),
695                confidence: 0.97,
696                bounding_box: BoundingBox {
697                    x: 60,
698                    y: 12,
699                    width: 50,
700                    height: 18,
701                },
702            },
703        ];
704
705        Ok(OCRResult {
706            text,
707            confidence: 0.975,
708            words,
709            languages: vec!["en".to_string()],
710        })
711    }
712
713    fn capabilities(&self) -> EngineCapabilities {
714        EngineCapabilities {
715            languages: vec!["en", "zh", "ja", "ko", "hi", "ar", "fr", "de", "es", "pt"]
716                .iter()
717                .map(|s| s.to_string())
718                .collect(),
719            layout_detection: true,
720            confidence_scores: true,
721            word_level: true,
722            speed: ProcessingSpeed::Slow, // Network latency
723            accuracy: AccuracyLevel::High,
724        }
725    }
726
727    fn name(&self) -> &str {
728        "Cloud Vision"
729    }
730}
731
732impl TextPostProcessor {
733    /// Create new text post-processor
734    pub fn new(config: &OCRConfig) -> RragResult<Self> {
735        let spell_checker = if config.spell_correction {
736            Some(SpellChecker::new(&config.languages)?)
737        } else {
738            None
739        };
740
741        let language_detector = LanguageDetector::new(config.languages.clone());
742        let formatter = TextFormatter::new(config.preserve_formatting);
743
744        Ok(Self {
745            spell_checker,
746            language_detector,
747            formatter,
748        })
749    }
750
751    /// Process OCR result
752    pub fn process(&self, mut result: OCRResult) -> RragResult<OCRResult> {
753        // Spell checking
754        if let Some(ref checker) = self.spell_checker {
755            result = checker.correct(result)?;
756        }
757
758        // Language detection
759        let detected_languages = self.language_detector.detect(&result.text)?;
760        if !detected_languages.is_empty() {
761            result.languages = detected_languages;
762        }
763
764        // Text formatting
765        result = self.formatter.format(result)?;
766
767        Ok(result)
768    }
769}
770
771impl SpellChecker {
772    /// Create new spell checker
773    pub fn new(languages: &[String]) -> RragResult<Self> {
774        let mut dictionaries = HashMap::new();
775        for lang in languages {
776            dictionaries.insert(lang.clone(), format!("dict_{}.txt", lang));
777        }
778
779        Ok(Self {
780            dictionaries,
781            confidence_threshold: 0.7,
782        })
783    }
784
785    /// Correct spelling in OCR result
786    pub fn correct(&self, mut result: OCRResult) -> RragResult<OCRResult> {
787        // Simple spell correction simulation
788        for word in &mut result.words {
789            if word.confidence < self.confidence_threshold {
790                word.text = self.suggest_correction(&word.text);
791                word.confidence = (word.confidence + 0.1).min(1.0);
792            }
793        }
794
795        // Rebuild text from corrected words
796        result.text = result
797            .words
798            .iter()
799            .map(|w| w.text.clone())
800            .collect::<Vec<_>>()
801            .join(" ");
802
803        Ok(result)
804    }
805
806    /// Suggest spelling correction
807    fn suggest_correction(&self, word: &str) -> String {
808        // Simple correction rules (in practice would use proper spell checker)
809        match word.to_lowercase().as_str() {
810            "teh" => "the".to_string(),
811            "adn" => "and".to_string(),
812            "taht" => "that".to_string(),
813            _ => word.to_string(),
814        }
815    }
816}
817
818impl LanguageDetector {
819    /// Create new language detector
820    pub fn new(supported_languages: Vec<String>) -> Self {
821        Self {
822            supported_languages,
823            min_confidence: 0.8,
824        }
825    }
826
827    /// Detect languages in text
828    pub fn detect(&self, text: &str) -> RragResult<Vec<String>> {
829        // Simple language detection (would use proper language detection library)
830        if text.chars().any(|c| c as u32 > 127) {
831            // Contains non-ASCII characters, might be non-English
832            if text.chars().any(|c| '\u{4e00}' <= c && c <= '\u{9fff}') {
833                Ok(vec!["zh".to_string()])
834            } else if text.chars().any(|c| '\u{3040}' <= c && c <= '\u{309f}') {
835                Ok(vec!["ja".to_string()])
836            } else {
837                Ok(vec!["en".to_string()]) // Default to English
838            }
839        } else {
840            Ok(vec!["en".to_string()])
841        }
842    }
843}
844
845impl TextFormatter {
846    /// Create new text formatter
847    pub fn new(preserve_formatting: bool) -> Self {
848        Self {
849            preserve_line_breaks: preserve_formatting,
850            preserve_spacing: preserve_formatting,
851            cleanup_artifacts: true,
852        }
853    }
854
855    /// Format OCR result
856    pub fn format(&self, mut result: OCRResult) -> RragResult<OCRResult> {
857        if self.cleanup_artifacts {
858            result.text = self.cleanup_text(&result.text);
859        }
860
861        if !self.preserve_spacing {
862            result.text = self.normalize_spacing(&result.text);
863        }
864
865        if !self.preserve_line_breaks {
866            result.text = result.text.replace('\n', " ");
867        }
868
869        Ok(result)
870    }
871
872    /// Clean up OCR artifacts
873    fn cleanup_text(&self, text: &str) -> String {
874        text.chars()
875            .filter(|&c| c.is_ascii_graphic() || c.is_whitespace())
876            .collect::<String>()
877            .trim()
878            .to_string()
879    }
880
881    /// Normalize spacing
882    fn normalize_spacing(&self, text: &str) -> String {
883        text.split_whitespace().collect::<Vec<_>>().join(" ")
884    }
885}
886
887impl OCRLayoutAnalyzer {
888    /// Create new layout analyzer
889    pub fn new() -> Self {
890        Self {
891            block_threshold: 0.1,
892            column_detection: true,
893            reading_order_detection: true,
894        }
895    }
896
897    /// Analyze layout from OCR result
898    pub fn analyze_layout(&self, result: &OCRResult) -> RragResult<TextLayout> {
899        let blocks = self.detect_blocks(result)?;
900        let reading_order = self.determine_reading_order(&blocks)?;
901        let columns = if self.column_detection {
902            Some(self.detect_columns(&blocks)?)
903        } else {
904            None
905        };
906
907        Ok(TextLayout {
908            blocks,
909            reading_order,
910            columns,
911        })
912    }
913
914    /// Detect text blocks
915    fn detect_blocks(&self, result: &OCRResult) -> RragResult<Vec<TextBlock>> {
916        let mut blocks = Vec::new();
917
918        // Group words into blocks based on proximity
919        let mut current_block_words = Vec::new();
920        let mut current_y = 0u32;
921
922        for word in &result.words {
923            if current_block_words.is_empty()
924                || (word.bounding_box.y as i32 - current_y as i32).abs() < 10
925            {
926                current_block_words.push(word);
927                current_y = word.bounding_box.y;
928            } else {
929                // Start new block
930                if !current_block_words.is_empty() {
931                    blocks.push(self.create_block_from_words(&current_block_words, blocks.len()));
932                }
933                current_block_words = vec![word];
934                current_y = word.bounding_box.y;
935            }
936        }
937
938        // Add final block
939        if !current_block_words.is_empty() {
940            blocks.push(self.create_block_from_words(&current_block_words, blocks.len()));
941        }
942
943        Ok(blocks)
944    }
945
946    /// Create text block from words
947    fn create_block_from_words(&self, words: &[&OCRWord], id: usize) -> TextBlock {
948        let text = words
949            .iter()
950            .map(|w| w.text.as_str())
951            .collect::<Vec<_>>()
952            .join(" ");
953
954        // Calculate bounding box
955        let min_x = words.iter().map(|w| w.bounding_box.x).min().unwrap_or(0);
956        let min_y = words.iter().map(|w| w.bounding_box.y).min().unwrap_or(0);
957        let max_x = words
958            .iter()
959            .map(|w| w.bounding_box.x + w.bounding_box.width)
960            .max()
961            .unwrap_or(0);
962        let max_y = words
963            .iter()
964            .map(|w| w.bounding_box.y + w.bounding_box.height)
965            .max()
966            .unwrap_or(0);
967
968        let bounding_box = BoundingBox {
969            x: min_x,
970            y: min_y,
971            width: max_x - min_x,
972            height: max_y - min_y,
973        };
974
975        // Determine block type (simplified)
976        let block_type = if text.len() < 20 && words.len() <= 3 {
977            BlockType::Title
978        } else if text.ends_with(':') {
979            BlockType::Heading
980        } else {
981            BlockType::Paragraph
982        };
983
984        TextBlock {
985            id,
986            text,
987            bounding_box,
988            block_type,
989        }
990    }
991
992    /// Determine reading order
993    fn determine_reading_order(&self, blocks: &[TextBlock]) -> RragResult<Vec<usize>> {
994        if !self.reading_order_detection {
995            return Ok((0..blocks.len()).collect());
996        }
997
998        // Sort by Y position first, then by X position
999        let mut indexed_blocks: Vec<(usize, &TextBlock)> = blocks.iter().enumerate().collect();
1000        indexed_blocks.sort_by(|a, b| {
1001            a.1.bounding_box
1002                .y
1003                .cmp(&b.1.bounding_box.y)
1004                .then_with(|| a.1.bounding_box.x.cmp(&b.1.bounding_box.x))
1005        });
1006
1007        Ok(indexed_blocks.into_iter().map(|(idx, _)| idx).collect())
1008    }
1009
1010    /// Detect columns
1011    fn detect_columns(&self, blocks: &[TextBlock]) -> RragResult<Vec<Column>> {
1012        // Simple column detection based on X positions
1013        let mut columns = Vec::new();
1014
1015        if blocks.is_empty() {
1016            return Ok(columns);
1017        }
1018
1019        // Group blocks by X position (simplified)
1020        let mut x_groups: std::collections::HashMap<u32, Vec<usize>> =
1021            std::collections::HashMap::new();
1022
1023        for (idx, block) in blocks.iter().enumerate() {
1024            let x_group = (block.bounding_box.x / 100) * 100; // Group by 100px
1025            x_groups.entry(x_group).or_insert_with(Vec::new).push(idx);
1026        }
1027
1028        // Convert groups to columns
1029        for (_x_pos, block_indices) in x_groups {
1030            columns.push(Column {
1031                index: columns.len(),
1032                blocks: block_indices,
1033                width: 100, // Simplified
1034            });
1035        }
1036
1037        // Sort columns by X position
1038        columns.sort_by_key(|c| c.index);
1039
1040        Ok(columns)
1041    }
1042}
1043
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration builds an engine with the expected settings.
    #[test]
    fn test_ocr_engine_creation() {
        let config = OCRConfig::default();
        let engine = DefaultOCREngine::new(config).unwrap();

        assert_eq!(engine.config.confidence_threshold, 0.7);
        assert!(engine.config.spell_correction);
    }

    /// The Tesseract engine reports its capabilities and name.
    #[test]
    fn test_tesseract_engine() {
        let engine = TesseractEngine::new(vec!["eng".to_string()]).unwrap();
        let capabilities = engine.capabilities();

        assert!(capabilities.confidence_scores);
        assert!(capabilities.layout_detection);
        assert_eq!(engine.name(), "Tesseract");
    }

    /// A known misspelling is corrected.
    #[test]
    fn test_spell_checker() {
        let checker = SpellChecker::new(&["en".to_string()]).unwrap();
        let correction = checker.suggest_correction("teh");
        assert_eq!(correction, "the");
    }

    /// ASCII text is detected as English and CJK ideographs as Chinese.
    #[test]
    fn test_language_detector() {
        let detector = LanguageDetector::new(vec!["en".to_string(), "zh".to_string()]);

        let english_result = detector.detect("Hello world").unwrap();
        assert_eq!(english_result, vec!["en"]);

        let chinese_result = detector.detect("你好世界").unwrap();
        assert_eq!(chinese_result, vec!["zh"]);
    }

    /// Without formatting preservation, whitespace is collapsed and line breaks removed.
    #[test]
    fn test_text_formatter() {
        let formatter = TextFormatter::new(false);

        let result = OCRResult {
            text: "  Hello    world  \n  test  ".to_string(),
            confidence: 0.9,
            words: vec![],
            languages: vec!["en".to_string()],
        };

        let formatted = formatter.format(result).unwrap();
        assert_eq!(formatted.text, "Hello world test");
    }

    /// Two words on one line produce a non-empty layout.
    #[test]
    fn test_layout_analysis() {
        let analyzer = OCRLayoutAnalyzer::new();

        let result = OCRResult {
            text: "Sample text".to_string(),
            confidence: 0.9,
            words: vec![
                OCRWord {
                    text: "Sample".to_string(),
                    confidence: 0.9,
                    bounding_box: BoundingBox {
                        x: 10,
                        y: 10,
                        width: 50,
                        height: 20,
                    },
                },
                OCRWord {
                    text: "text".to_string(),
                    confidence: 0.9,
                    bounding_box: BoundingBox {
                        x: 65,
                        y: 10,
                        width: 30,
                        height: 20,
                    },
                },
            ],
            languages: vec!["en".to_string()],
        };

        let layout = analyzer.analyze_layout(&result).unwrap();
        assert!(!layout.blocks.is_empty());
        assert!(!layout.reading_order.is_empty());
    }
}
rexis_rag/multimodal/ocr.rs

rexis_rag/multimodal/
ocr.rs