use crate::graphics::ImageFormat;
use crate::operations::page_analysis::ContentAnalysis;
use std::fmt;
/// Convenience alias for fallible OCR operations.
pub type OcrResult<T> = Result<T, OcrError>;

/// Errors produced by OCR providers and the surrounding pipeline.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// The requested OCR backend is not installed, configured, or reachable.
    #[error("OCR provider not available: {0}")]
    ProviderNotAvailable(String),
    /// The image format was recognized but is not supported by this provider.
    #[error("Unsupported image format: {0:?}")]
    UnsupportedImageFormat(ImageFormat),
    /// The image bytes could not be parsed (e.g. truncated data or unknown magic bytes).
    #[error("Invalid image data: {0}")]
    InvalidImageData(String),
    /// The OCR engine itself failed while processing.
    #[error("OCR processing failed: {0}")]
    ProcessingFailed(String),
    /// A network failure occurred (relevant for cloud-backed providers).
    #[error("Network error: {0}")]
    NetworkError(String),
    /// Authentication with the provider failed.
    #[error("Authentication error: {0}")]
    AuthenticationError(String),
    /// The provider's rate limit was exceeded.
    #[error("Rate limit exceeded: {0}")]
    RateLimitExceeded(String),
    /// Results fell below the requested confidence threshold.
    #[error("Low confidence results: {0}")]
    LowConfidence(String),
    /// An underlying I/O error.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// Invalid or missing configuration.
    #[error("Configuration error: {0}")]
    Configuration(String),
}
/// A rectangular region of an image, in pixel coordinates.
///
/// `x`/`y` is the top-left corner; `width`/`height` is the extent in pixels.
/// An optional `label` can tag the region (e.g. a section name).
#[derive(Debug, Clone, PartialEq)]
pub struct OcrRegion {
    pub x: u32,
    pub y: u32,
    pub width: u32,
    pub height: u32,
    pub label: Option<String>,
}

impl OcrRegion {
    /// Creates an unlabeled region.
    pub fn new(x: u32, y: u32, width: u32, height: u32) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: None,
        }
    }

    /// Creates a region carrying a descriptive label.
    pub fn with_label(x: u32, y: u32, width: u32, height: u32, label: impl Into<String>) -> Self {
        Self {
            x,
            y,
            width,
            height,
            label: Some(label.into()),
        }
    }

    /// Returns true if `(x, y)` lies inside the region.
    ///
    /// The left/top edges are inclusive, the right/bottom edges exclusive,
    /// so a zero-sized region contains no points. Arithmetic is widened to
    /// `u64` so regions near `u32::MAX` cannot overflow (the previous `u32`
    /// additions panicked in debug builds and wrapped in release builds).
    pub fn contains_point(&self, x: u32, y: u32) -> bool {
        let (px, py) = (u64::from(x), u64::from(y));
        px >= u64::from(self.x)
            && px < u64::from(self.x) + u64::from(self.width)
            && py >= u64::from(self.y)
            && py < u64::from(self.y) + u64::from(self.height)
    }

    /// Returns true if the two regions share any interior area.
    ///
    /// Regions that merely touch along an edge do not overlap. Widened to
    /// `u64` for the same overflow reason as `contains_point`.
    pub fn overlaps_with(&self, other: &OcrRegion) -> bool {
        let (ax1, ay1) = (u64::from(self.x), u64::from(self.y));
        let (ax2, ay2) = (ax1 + u64::from(self.width), ay1 + u64::from(self.height));
        let (bx1, by1) = (u64::from(other.x), u64::from(other.y));
        let (bx2, by2) = (bx1 + u64::from(other.width), by1 + u64::from(other.height));
        ax1 < bx2 && bx1 < ax2 && ay1 < by2 && by1 < ay2
    }
}
/// Configuration for an OCR run.
#[derive(Debug, Clone)]
pub struct OcrOptions {
    /// Language code passed to the engine (e.g. "en").
    pub language: String,
    /// Minimum acceptable confidence in 0.0..=1.0.
    pub min_confidence: f64,
    /// Whether to keep the spatial layout of the recognized text.
    pub preserve_layout: bool,
    /// Image cleanup applied before recognition.
    pub preprocessing: ImagePreprocessing,
    /// Engine-specific key/value options.
    pub engine_options: std::collections::HashMap<String, String>,
    /// Maximum processing time in seconds.
    pub timeout_seconds: u32,
    /// Restrict OCR to these regions; `None` processes the whole image.
    pub regions: Option<Vec<OcrRegion>>,
    /// Emit extra diagnostic output while processing.
    pub debug_output: bool,
}
impl Default for OcrOptions {
fn default() -> Self {
Self {
language: "en".to_string(),
min_confidence: 0.6,
preserve_layout: true,
preprocessing: ImagePreprocessing::default(),
engine_options: std::collections::HashMap::new(),
timeout_seconds: 60, regions: None,
debug_output: false,
}
}
}
/// Image-cleanup toggles applied before OCR.
#[derive(Debug, Clone)]
pub struct ImagePreprocessing {
    /// Remove scan/sensor noise.
    pub denoise: bool,
    /// Straighten rotated scans.
    pub deskew: bool,
    /// Stretch contrast for faint text.
    pub enhance_contrast: bool,
    /// Apply a sharpening filter.
    pub sharpen: bool,
    /// Uniform scale applied to the image (1.0 = unchanged).
    pub scale_factor: f64,
}

impl Default for ImagePreprocessing {
    /// Denoising, deskewing and contrast enhancement on; sharpening off;
    /// no scaling.
    fn default() -> Self {
        ImagePreprocessing {
            denoise: true,
            deskew: true,
            enhance_contrast: true,
            sharpen: false,
            scale_factor: 1.0,
        }
    }
}
/// Per-word OCR confidence with the word's horizontal position inside its
/// fragment.
#[derive(Debug, Clone)]
pub struct WordConfidence {
    /// The recognized word text.
    pub word: String,
    /// Word confidence in 0.0..=1.0.
    pub confidence: f64,
    /// Horizontal offset of the word from the fragment origin.
    pub x_offset: f64,
    /// Rendered width of the word.
    pub width: f64,
    /// Optional per-character confidence breakdown.
    pub character_confidences: Option<Vec<CharacterConfidence>>,
}

impl WordConfidence {
    /// Creates a word confidence without character-level detail.
    pub fn new(word: String, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: None,
        }
    }

    /// Creates a word confidence with character-level detail.
    pub fn with_characters(
        word: String,
        confidence: f64,
        x_offset: f64,
        width: f64,
        character_confidences: Vec<CharacterConfidence>,
    ) -> Self {
        Self {
            word,
            confidence,
            x_offset,
            width,
            character_confidences: Some(character_confidences),
        }
    }

    /// Mean of the character confidences, or `None` when no character-level
    /// data is present.
    ///
    /// An empty character list yields `Some(0.0)` — matching the empty-input
    /// convention of `OcrTextFragment::average_word_confidence`. (Previously
    /// this divided by zero and produced `Some(NaN)`.)
    pub fn average_character_confidence(&self) -> Option<f64> {
        self.character_confidences.as_ref().map(|chars| {
            if chars.is_empty() {
                return 0.0;
            }
            let sum: f64 = chars.iter().map(|c| c.confidence).sum();
            sum / chars.len() as f64
        })
    }

    /// True when the word confidence is strictly below `threshold`.
    pub fn is_low_confidence(&self, threshold: f64) -> bool {
        self.confidence < threshold
    }
}

/// Per-character OCR confidence with the character's horizontal position.
#[derive(Debug, Clone)]
pub struct CharacterConfidence {
    /// The recognized character.
    pub character: char,
    /// Character confidence in 0.0..=1.0.
    pub confidence: f64,
    /// Horizontal offset of the character from the fragment origin.
    pub x_offset: f64,
    /// Rendered width of the character.
    pub width: f64,
}

impl CharacterConfidence {
    /// Creates a new character confidence record.
    pub fn new(character: char, confidence: f64, x_offset: f64, width: f64) -> Self {
        Self {
            character,
            confidence,
            x_offset,
            width,
        }
    }
}
/// A low-confidence word flagged for possible correction, together with the
/// suggestions generated for it.
#[derive(Debug, Clone)]
pub struct CorrectionCandidate {
    /// The word as recognized by the OCR engine.
    pub word: String,
    /// The OCR confidence of the recognized word.
    pub confidence: f64,
    /// Zero-based index of the word within its fragment's word list.
    pub position_in_fragment: usize,
    /// Proposed replacements, best first.
    pub suggested_corrections: Vec<CorrectionSuggestion>,
    /// Why this word was flagged.
    pub correction_reason: CorrectionReason,
}

/// A single proposed replacement for a flagged word.
#[derive(Debug, Clone)]
pub struct CorrectionSuggestion {
    /// The proposed replacement text.
    pub corrected_word: String,
    /// Confidence (0.0..=1.0) that this replacement is correct.
    pub correction_confidence: f64,
    /// Which correction strategy produced this suggestion.
    pub correction_type: CorrectionType,
    /// Optional human-readable rationale.
    pub explanation: Option<String>,
}

/// Why a word was selected as a correction candidate.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorrectionReason {
    /// The word's OCR confidence fell below the configured threshold.
    LowConfidence,
    /// The word matches a known character-confusion pattern.
    ConfusionPattern,
    /// The word was not found in the dictionary.
    NotInDictionary,
    /// The word looks wrong in its surrounding context.
    ContextualError,
    /// The word contains a suspicious character sequence.
    SuspiciousPattern,
}

/// The strategy that produced a correction suggestion.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum CorrectionType {
    /// Single-character swap from the confusion table (e.g. '0' vs 'O').
    CharacterSubstitution,
    /// Nearest dictionary word within the edit-distance limit.
    DictionaryCorrection,
    /// Correction derived from surrounding context.
    ContextualCorrection,
    /// Multi-character pattern replacement (e.g. "rn" vs "m").
    PatternCorrection,
    /// No automatic fix; requires human review.
    ManualReview,
}
/// Applies character-, pattern-, and dictionary-based corrections to OCR
/// output.
#[derive(Debug, Clone)]
pub struct OcrPostProcessor {
    /// For each character, the glyphs OCR commonly confuses it with.
    pub character_corrections: std::collections::HashMap<char, Vec<char>>,
    /// Optional word list used for dictionary-based suggestions.
    pub dictionary: Option<std::collections::HashSet<String>>,
    /// Multi-character sequences and their replacements.
    pub pattern_corrections: std::collections::HashMap<String, String>,
    /// Words below this confidence are treated as correction candidates.
    pub correction_threshold: f64,
    /// Maximum edit distance for dictionary suggestions.
    pub max_edit_distance: usize,
}
impl OcrPostProcessor {
    /// Builds a post-processor preloaded with common OCR confusion tables.
    ///
    /// The character table lists glyphs OCR engines routinely swap
    /// (0/O/o/Q, 1/l/I/|, 5/S/$, 2/Z/z); the pattern table covers
    /// multi-character confusions ("rn" -> "m", "cl" -> "d") and maps the
    /// Unicode ligatures U+FB01/U+FB02 back to plain "fi"/"fl".
    pub fn new() -> Self {
        let mut character_corrections = std::collections::HashMap::new();
        character_corrections.insert('0', vec!['O', 'o', 'Q']);
        character_corrections.insert('O', vec!['0', 'Q', 'o']);
        character_corrections.insert('1', vec!['l', 'I', '|']);
        character_corrections.insert('l', vec!['1', 'I', '|']);
        character_corrections.insert('I', vec!['1', 'l', '|']);
        character_corrections.insert('S', vec!['5', '$']);
        character_corrections.insert('5', vec!['S', '$']);
        character_corrections.insert('2', vec!['Z', 'z']);
        character_corrections.insert('Z', vec!['2', 'z']);
        let mut pattern_corrections = std::collections::HashMap::new();
        pattern_corrections.insert("rn".to_string(), "m".to_string());
        pattern_corrections.insert("cl".to_string(), "d".to_string());
        // BUG FIX: these two entries were identity mappings ("fi" -> "fi",
        // "fl" -> "fl"), which made every word containing "fi"/"fl" suggest
        // itself as a 0.85-confidence "correction". The keys are now the
        // typographic ligature characters.
        pattern_corrections.insert("\u{FB01}".to_string(), "fi".to_string());
        pattern_corrections.insert("\u{FB02}".to_string(), "fl".to_string());
        Self {
            character_corrections,
            dictionary: None,
            pattern_corrections,
            correction_threshold: 0.7,
            max_edit_distance: 2,
        }
    }

    /// Attaches a dictionary, enabling dictionary-based suggestions.
    pub fn with_dictionary(mut self, dictionary: std::collections::HashSet<String>) -> Self {
        self.dictionary = Some(dictionary);
        self
    }

    /// Flags low-confidence words in `fragment` and fills in suggestions
    /// for each of them.
    pub fn process_fragment(&self, fragment: &OcrTextFragment) -> Vec<CorrectionCandidate> {
        let mut candidates = fragment.get_correction_candidates(self.correction_threshold);
        for candidate in &mut candidates {
            candidate.suggested_corrections = self.generate_suggestions(&candidate.word);
        }
        candidates
    }

    /// Generates up to five suggestions for `word`, highest confidence
    /// first, drawing on character, pattern, and — when a dictionary is
    /// attached — dictionary corrections.
    pub fn generate_suggestions(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();
        suggestions.extend(self.character_substitution_corrections(word));
        suggestions.extend(self.pattern_corrections(word));
        if let Some(dict) = &self.dictionary {
            suggestions.extend(self.dictionary_corrections(word, dict));
        }
        suggestions.sort_by(|a, b| b.correction_confidence.total_cmp(&a.correction_confidence));
        suggestions.truncate(5);
        suggestions
    }

    /// One suggestion per (position, alternative) pair from the character
    /// confusion table.
    fn character_substitution_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();
        let chars: Vec<char> = word.chars().collect();
        for (i, &ch) in chars.iter().enumerate() {
            if let Some(alternatives) = self.character_corrections.get(&ch) {
                for &alt_ch in alternatives {
                    let mut corrected_chars = chars.clone();
                    corrected_chars[i] = alt_ch;
                    let corrected_word: String = corrected_chars.into_iter().collect();
                    suggestions.push(CorrectionSuggestion {
                        corrected_word,
                        correction_confidence: 0.8,
                        correction_type: CorrectionType::CharacterSubstitution,
                        explanation: Some(format!("'{}' -> '{}' substitution", ch, alt_ch)),
                    });
                }
            }
        }
        suggestions
    }

    /// One suggestion per pattern-table entry found anywhere in `word`.
    fn pattern_corrections(&self, word: &str) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();
        for (pattern, replacement) in &self.pattern_corrections {
            if word.contains(pattern) {
                let corrected_word = word.replace(pattern, replacement);
                suggestions.push(CorrectionSuggestion {
                    corrected_word,
                    correction_confidence: 0.85,
                    correction_type: CorrectionType::PatternCorrection,
                    explanation: Some(format!(
                        "Pattern '{}' -> '{}' correction",
                        pattern, replacement
                    )),
                });
            }
        }
        suggestions
    }

    /// Suggests dictionary words within `max_edit_distance` of `word`.
    ///
    /// Returns nothing when `word` is already in the dictionary. Confidence
    /// decreases with the distance relative to the longer word.
    fn dictionary_corrections(
        &self,
        word: &str,
        dictionary: &std::collections::HashSet<String>,
    ) -> Vec<CorrectionSuggestion> {
        let mut suggestions = Vec::new();
        if dictionary.contains(word) {
            return suggestions;
        }
        let word_len = word.chars().count();
        for dict_word in dictionary {
            // Compute the distance once per candidate (it was previously
            // recomputed three times per matching word).
            let distance = self.edit_distance(word, dict_word);
            if distance <= self.max_edit_distance {
                let longest = word_len.max(dict_word.chars().count()).max(1);
                let confidence = 1.0 - (distance as f64 / longest as f64);
                suggestions.push(CorrectionSuggestion {
                    corrected_word: dict_word.clone(),
                    correction_confidence: confidence * 0.9,
                    correction_type: CorrectionType::DictionaryCorrection,
                    explanation: Some(format!(
                        "Dictionary match with edit distance {}",
                        distance
                    )),
                });
            }
        }
        suggestions
    }

    /// Levenshtein distance between `s1` and `s2`, counted in Unicode
    /// scalar values.
    ///
    /// The DP table is sized from character counts rather than byte lengths;
    /// the previous byte-length version indexed past the end of the char
    /// vectors (panicking) for any multi-byte input.
    fn edit_distance(&self, s1: &str, s2: &str) -> usize {
        let s1_chars: Vec<char> = s1.chars().collect();
        let s2_chars: Vec<char> = s2.chars().collect();
        let (len1, len2) = (s1_chars.len(), s2_chars.len());
        let mut dp = vec![vec![0usize; len2 + 1]; len1 + 1];
        for (i, row) in dp.iter_mut().enumerate() {
            row[0] = i;
        }
        for (j, cell) in dp[0].iter_mut().enumerate() {
            *cell = j;
        }
        for i in 1..=len1 {
            for j in 1..=len2 {
                dp[i][j] = if s1_chars[i - 1] == s2_chars[j - 1] {
                    dp[i - 1][j - 1]
                } else {
                    1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1])
                };
            }
        }
        dp[len1][len2]
    }
}
impl Default for OcrPostProcessor {
    /// Equivalent to [`OcrPostProcessor::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// A span of recognized text with its position, confidence, and granularity.
#[derive(Debug, Clone)]
pub struct OcrTextFragment {
    /// The recognized text.
    pub text: String,
    /// Horizontal position of the fragment.
    pub x: f64,
    /// Vertical position of the fragment.
    // NOTE(review): the coordinate origin/orientation is engine-dependent and
    // not established here — confirm against the producing provider.
    pub y: f64,
    /// Fragment width.
    pub width: f64,
    /// Fragment height.
    pub height: f64,
    /// Overall fragment confidence in 0.0..=1.0.
    pub confidence: f64,
    /// Optional per-word confidence breakdown.
    pub word_confidences: Option<Vec<WordConfidence>>,
    /// Estimated font size of the text.
    pub font_size: f64,
    /// Granularity of this fragment (character, word, line or paragraph).
    pub fragment_type: FragmentType,
}
impl OcrTextFragment {
    /// Creates a fragment without word-level confidence data.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        text: String,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        confidence: f64,
        font_size: f64,
        fragment_type: FragmentType,
    ) -> Self {
        Self {
            text,
            x,
            y,
            width,
            height,
            confidence,
            word_confidences: None,
            font_size,
            fragment_type,
        }
    }

    /// Creates a fragment carrying a per-word confidence breakdown.
    #[allow(clippy::too_many_arguments)]
    pub fn with_word_confidences(
        text: String,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        confidence: f64,
        font_size: f64,
        fragment_type: FragmentType,
        word_confidences: Vec<WordConfidence>,
    ) -> Self {
        let mut fragment =
            Self::new(text, x, y, width, height, confidence, font_size, fragment_type);
        fragment.word_confidences = Some(word_confidences);
        fragment
    }

    /// Words whose confidence is strictly below `threshold`; empty when no
    /// word-level data exists.
    pub fn get_low_confidence_words(&self, threshold: f64) -> Vec<&WordConfidence> {
        match &self.word_confidences {
            Some(words) => words.iter().filter(|w| w.confidence < threshold).collect(),
            None => Vec::new(),
        }
    }

    /// Mean word confidence: `Some(0.0)` for an empty word list, `None`
    /// when no word-level data exists.
    pub fn average_word_confidence(&self) -> Option<f64> {
        let words = self.word_confidences.as_ref()?;
        if words.is_empty() {
            return Some(0.0);
        }
        let total: f64 = words.iter().map(|w| w.confidence).sum();
        Some(total / words.len() as f64)
    }

    /// All words sorted by ascending confidence (least reliable first).
    pub fn words_by_confidence(&self) -> Vec<&WordConfidence> {
        let mut ranked: Vec<&WordConfidence> = match &self.word_confidences {
            Some(words) => words.iter().collect(),
            None => return Vec::new(),
        };
        ranked.sort_by(|a, b| a.confidence.total_cmp(&b.confidence));
        ranked
    }

    /// True when at least one word falls strictly below `threshold`.
    pub fn has_low_confidence_words(&self, threshold: f64) -> bool {
        match &self.word_confidences {
            Some(words) => words.iter().any(|w| w.confidence < threshold),
            None => false,
        }
    }

    /// Builds a correction candidate — with suggestions left empty for a
    /// post-processor to fill — for every word strictly below `threshold`.
    pub fn get_correction_candidates(&self, threshold: f64) -> Vec<CorrectionCandidate> {
        let words = match &self.word_confidences {
            Some(words) => words,
            None => return Vec::new(),
        };
        words
            .iter()
            .enumerate()
            .filter(|(_, w)| w.confidence < threshold)
            .map(|(position_in_fragment, w)| CorrectionCandidate {
                word: w.word.clone(),
                confidence: w.confidence,
                position_in_fragment,
                suggested_corrections: Vec::new(),
                correction_reason: CorrectionReason::LowConfidence,
            })
            .collect()
    }

    /// Renders a multi-line, human-readable confidence summary covering the
    /// fragment, each word, and (when present) each character.
    pub fn confidence_report(&self) -> String {
        let mut out = format!(
            "Fragment confidence: {:.1}% - \"{}\"\n",
            self.confidence * 100.0,
            self.text.trim()
        );
        match &self.word_confidences {
            None => out.push_str(" (No word-level data available)\n"),
            Some(words) => {
                out.push_str(&format!(
                    " Word-level breakdown ({} words):\n",
                    words.len()
                ));
                for (idx, word) in words.iter().enumerate() {
                    out.push_str(&format!(
                        " {}: \"{}\" - {:.1}%\n",
                        idx + 1,
                        word.word,
                        word.confidence * 100.0
                    ));
                    if let Some(chars) = &word.character_confidences {
                        out.push_str(" Characters: ");
                        for ch in chars {
                            out.push_str(&format!(
                                "'{}'({:.0}%) ",
                                ch.character,
                                ch.confidence * 100.0
                            ));
                        }
                        out.push('\n');
                    }
                }
            }
        }
        out
    }
}
/// Granularity of an [`OcrTextFragment`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FragmentType {
    /// A single character.
    Character,
    /// A single word.
    Word,
    /// A full line of text.
    Line,
    /// A paragraph block.
    Paragraph,
}
/// The complete output of one OCR run.
#[derive(Debug, Clone)]
pub struct OcrProcessingResult {
    /// The full recognized text.
    pub text: String,
    /// Overall confidence reported by the engine, in 0.0..=1.0.
    pub confidence: f64,
    /// Positioned fragments that make up the text.
    pub fragments: Vec<OcrTextFragment>,
    /// Processing time in milliseconds.
    pub processing_time_ms: u64,
    /// Name of the engine that produced the result.
    pub engine_name: String,
    /// Language the engine was asked to recognize.
    pub language: String,
    /// The sub-region that was processed, if OCR ran on a crop.
    pub processed_region: Option<OcrRegion>,
    /// Source image dimensions in pixels, presumably (width, height).
    pub image_dimensions: (u32, u32),
}
impl OcrProcessingResult {
    /// Creates a result covering the full image (no sub-region).
    pub fn new(
        text: String,
        confidence: f64,
        fragments: Vec<OcrTextFragment>,
        processing_time_ms: u64,
        engine_name: String,
        language: String,
        image_dimensions: (u32, u32),
    ) -> Self {
        Self {
            processed_region: None,
            text,
            confidence,
            fragments,
            processing_time_ms,
            engine_name,
            language,
            image_dimensions,
        }
    }

    /// Creates a result for OCR that ran on the given sub-region.
    #[allow(clippy::too_many_arguments)]
    pub fn with_region(
        text: String,
        confidence: f64,
        fragments: Vec<OcrTextFragment>,
        processing_time_ms: u64,
        engine_name: String,
        language: String,
        image_dimensions: (u32, u32),
        region: OcrRegion,
    ) -> Self {
        let mut result = Self::new(
            text,
            confidence,
            fragments,
            processing_time_ms,
            engine_name,
            language,
            image_dimensions,
        );
        result.processed_region = Some(region);
        result
    }

    /// Fragments whose confidence is at least `min_confidence`.
    pub fn filter_by_confidence(&self, min_confidence: f64) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| fragment.confidence >= min_confidence)
            .collect()
    }

    /// Fragments that lie entirely inside the given rectangle.
    pub fn fragments_in_region(
        &self,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
    ) -> Vec<&OcrTextFragment> {
        let right = x + width;
        let bottom = y + height;
        self.fragments
            .iter()
            .filter(|fragment| {
                fragment.x >= x
                    && fragment.y >= y
                    && fragment.x + fragment.width <= right
                    && fragment.y + fragment.height <= bottom
            })
            .collect()
    }

    /// Fragments of the requested granularity.
    pub fn fragments_of_type(&self, fragment_type: FragmentType) -> Vec<&OcrTextFragment> {
        self.fragments
            .iter()
            .filter(|fragment| fragment.fragment_type == fragment_type)
            .collect()
    }

    /// Mean fragment confidence; 0.0 when there are no fragments.
    pub fn average_confidence(&self) -> f64 {
        match self.fragments.len() {
            0 => 0.0,
            n => self.fragments.iter().map(|f| f.confidence).sum::<f64>() / n as f64,
        }
    }
}
/// The OCR backends this module knows about.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OcrEngine {
    /// Deterministic in-process mock, for tests.
    Mock,
    /// Local Tesseract engine.
    Tesseract,
    /// Azure Computer Vision cloud service.
    Azure,
    /// AWS Textract cloud service.
    Aws,
    /// Google Cloud Vision service.
    GoogleCloud,
}
impl OcrEngine {
    /// Human-readable engine name.
    pub fn name(&self) -> &'static str {
        match self {
            Self::Mock => "Mock OCR",
            Self::Tesseract => "Tesseract",
            Self::Azure => "Azure Computer Vision",
            Self::Aws => "AWS Textract",
            Self::GoogleCloud => "Google Cloud Vision",
        }
    }

    /// Whether this engine accepts images in `format`.
    ///
    /// Mock accepts everything; Tesseract adds TIFF on top of JPEG/PNG; the
    /// cloud services accept JPEG and PNG only.
    pub fn supports_format(&self, format: ImageFormat) -> bool {
        match self {
            Self::Mock => true,
            Self::Tesseract => matches!(
                format,
                ImageFormat::Jpeg | ImageFormat::Png | ImageFormat::Tiff
            ),
            Self::Azure | Self::Aws | Self::GoogleCloud => {
                matches!(format, ImageFormat::Jpeg | ImageFormat::Png)
            }
        }
    }
}
impl fmt::Display for OcrEngine {
    /// Formats as the human-readable engine name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.name())
    }
}
/// Common interface implemented by every OCR backend.
///
/// Implementors must supply [`OcrProvider::process_image`],
/// [`OcrProvider::supported_formats`], [`OcrProvider::engine_name`] and
/// [`OcrProvider::engine_type`]; the remaining methods have default
/// implementations built on those.
pub trait OcrProvider: Send + Sync {
    /// Runs OCR over a whole encoded image and returns the recognized text.
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult>;

    /// Runs OCR for an analyzed page; the default ignores the analysis and
    /// delegates to [`OcrProvider::process_image`].
    fn process_page(
        &self,
        _page_analysis: &ContentAnalysis,
        page_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        self.process_image(page_data, options)
    }

    /// OCRs each (image, region) pair, translating fragment coordinates by
    /// the region origin and recording the region on each result.
    ///
    /// Stops at the first failing image and propagates its error.
    fn process_image_regions(
        &self,
        image_region_pairs: &[(&[u8], &OcrRegion)],
        options: &OcrOptions,
    ) -> OcrResult<Vec<OcrProcessingResult>> {
        image_region_pairs
            .iter()
            .map(|(image_data, region)| {
                let mut result = self.process_image(image_data, options)?;
                // Shift fragments from region-local into page coordinates.
                for fragment in result.fragments.iter_mut() {
                    fragment.x += f64::from(region.x);
                    fragment.y += f64::from(region.y);
                }
                result.processed_region = Some((*region).clone());
                Ok(result)
            })
            .collect()
    }

    /// Image formats this backend can decode.
    fn supported_formats(&self) -> Vec<ImageFormat>;

    /// Human-readable backend name.
    fn engine_name(&self) -> &str;

    /// Which engine variant this provider represents.
    fn engine_type(&self) -> OcrEngine;

    /// True when `format` appears in [`OcrProvider::supported_formats`].
    fn supports_format(&self, format: ImageFormat) -> bool {
        self.supported_formats().contains(&format)
    }

    /// Sniffs the magic bytes of `image_data` and verifies both that the
    /// format is recognized (JPEG, PNG or TIFF) and that this provider
    /// supports it.
    fn validate_image_data(&self, image_data: &[u8]) -> OcrResult<()> {
        if image_data.len() < 8 {
            return Err(OcrError::InvalidImageData(
                "Image data too short".to_string(),
            ));
        }
        let sniffed = if image_data.starts_with(b"\xFF\xD8\xFF") {
            Some(ImageFormat::Jpeg)
        } else if image_data.starts_with(b"\x89PNG\r\n\x1a\n") {
            Some(ImageFormat::Png)
        } else if image_data.starts_with(b"II\x2A\x00") || image_data.starts_with(b"MM\x00\x2A") {
            // TIFF, little-endian ("II") or big-endian ("MM") byte order.
            Some(ImageFormat::Tiff)
        } else {
            None
        };
        match sniffed {
            Some(format) if self.supports_format(format) => Ok(()),
            Some(format) => Err(OcrError::UnsupportedImageFormat(format)),
            None => Err(OcrError::InvalidImageData(
                "Unrecognized image format".to_string(),
            )),
        }
    }
}
/// Deterministic OCR provider for tests: returns canned text after a
/// configurable artificial delay.
#[derive(Clone)]
pub struct MockOcrProvider {
    // Confidence reported for the primary fragment, kept in 0.0..=1.0.
    confidence: f64,
    // Text returned as the primary fragment.
    mock_text: String,
    // Simulated processing time in milliseconds.
    processing_delay_ms: u64,
}

impl MockOcrProvider {
    /// Creates a provider with default canned text, 0.85 confidence and a
    /// 100 ms simulated delay.
    pub fn new() -> Self {
        Self::with_text_and_confidence(
            "Mock OCR extracted text from scanned image".to_string(),
            0.85,
        )
    }

    /// Creates a provider with custom text and confidence (100 ms delay).
    pub fn with_text_and_confidence(text: String, confidence: f64) -> Self {
        Self {
            confidence,
            mock_text: text,
            processing_delay_ms: 100,
        }
    }

    /// Replaces the canned text.
    pub fn set_mock_text(&mut self, text: String) {
        self.mock_text = text;
    }

    /// Sets the reported confidence, clamped into 0.0..=1.0.
    pub fn set_confidence(&mut self, confidence: f64) {
        self.confidence = confidence.clamp(0.0, 1.0);
    }

    /// Sets the simulated processing delay in milliseconds.
    pub fn set_processing_delay(&mut self, delay_ms: u64) {
        self.processing_delay_ms = delay_ms;
    }
}

impl Default for MockOcrProvider {
    /// Equivalent to [`MockOcrProvider::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl OcrProvider for MockOcrProvider {
    /// Validates the payload, sleeps for the configured delay, then returns
    /// two canned line fragments built from the mock text.
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        self.validate_image_data(image_data)?;
        // Simulate real OCR latency.
        std::thread::sleep(std::time::Duration::from_millis(self.processing_delay_ms));
        let primary = OcrTextFragment {
            text: self.mock_text.clone(),
            x: 50.0,
            y: 700.0,
            width: 200.0,
            height: 20.0,
            confidence: self.confidence,
            word_confidences: None,
            font_size: 12.0,
            fragment_type: FragmentType::Line,
        };
        let secondary = OcrTextFragment {
            text: "Additional mock text".to_string(),
            x: 50.0,
            y: 680.0,
            width: 150.0,
            height: 20.0,
            confidence: self.confidence * 0.9,
            word_confidences: None,
            font_size: 12.0,
            fragment_type: FragmentType::Line,
        };
        Ok(OcrProcessingResult {
            text: format!("{}\nAdditional mock text", self.mock_text),
            confidence: self.confidence,
            fragments: vec![primary, secondary],
            processing_time_ms: self.processing_delay_ms,
            engine_name: "Mock OCR".to_string(),
            language: options.language.clone(),
            processed_region: None,
            image_dimensions: (800, 600),
        })
    }

    fn supported_formats(&self) -> Vec<ImageFormat> {
        vec![ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff]
    }

    fn engine_name(&self) -> &str {
        "Mock OCR"
    }

    fn engine_type(&self) -> OcrEngine {
        OcrEngine::Mock
    }
}
#[cfg(test)]
mod tests;
#[cfg(test)]
mod postprocessor_tests;
#[cfg(test)]
mod rigorous_tests;