#[cfg(feature = "ocr-tesseract")]
use crate::graphics::ImageFormat;
#[cfg(feature = "ocr-tesseract")]
use crate::text::{
FragmentType, OcrEngine, OcrError, OcrOptions, OcrProcessingResult, OcrProvider, OcrResult,
OcrTextFragment,
};
#[cfg(feature = "ocr-tesseract")]
use rusty_tesseract::{image_to_string, Args, Image};
#[cfg(feature = "ocr-tesseract")]
use std::collections::HashMap;
#[cfg(feature = "ocr-tesseract")]
use std::time::Instant;
#[cfg(feature = "ocr-tesseract")]
/// Settings from which `RustyTesseractProvider` builds the `rusty_tesseract`
/// `Args` passed to every OCR call.
#[derive(Debug, Clone)]
pub struct RustyTesseractConfig {
/// Language code copied into `Args::lang` (`"eng"` in the default config).
pub language: String,
/// Forwarded (widened to `i32`) as `Args::psm`; `None` leaves it unset.
/// NOTE(review): presumably Tesseract's page segmentation mode — confirm.
pub psm: Option<u8>,
/// Forwarded (widened to `i32`) as `Args::oem`; `None` leaves it unset.
/// NOTE(review): presumably Tesseract's OCR engine mode — confirm.
pub oem: Option<u8>,
/// Forwarded as `Args::dpi`; `None` leaves it unset.
pub dpi: Option<u32>,
/// Extra configuration variables (name -> value) copied into
/// `Args::config_variables` for every call.
pub config_variables: HashMap<String, String>,
}
#[cfg(feature = "ocr-tesseract")]
impl Default for RustyTesseractConfig {
    /// Builds the general-purpose configuration: language `"eng"`, `psm = 3`,
    /// `oem = 3`, 300 DPI, a fixed character whitelist and
    /// `preserve_interword_spaces` switched on.
    fn default() -> Self {
        // Characters the engine is allowed to emit with this configuration.
        const CHAR_WHITELIST: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?:;()[]{}\"'-+=%&#@*/\\| \t\n";

        let config_variables: HashMap<String, String> = [
            ("tessedit_char_whitelist", CHAR_WHITELIST),
            ("preserve_interword_spaces", "1"),
        ]
        .iter()
        .map(|&(name, value)| (name.to_string(), value.to_string()))
        .collect();

        Self {
            language: String::from("eng"),
            psm: Some(3),
            oem: Some(3),
            dpi: Some(300),
            config_variables,
        }
    }
}
#[cfg(feature = "ocr-tesseract")]
/// OCR provider that runs images through `rusty_tesseract` using a stored
/// `RustyTesseractConfig`.
pub struct RustyTesseractProvider {
/// Configuration used to build the `Args` for every OCR call.
config: RustyTesseractConfig,
}
#[cfg(feature = "ocr-tesseract")]
impl RustyTesseractProvider {
    /// Creates a provider that uses `RustyTesseractConfig::default()`.
    pub fn new() -> Self {
        Self::with_config(RustyTesseractConfig::default())
    }

    /// Creates a provider that uses `config` as given.
    pub fn with_config(config: RustyTesseractConfig) -> Self {
        Self { config }
    }

    /// Preset for contract-style documents: language `"eng"`, `psm = 1`,
    /// `oem = 1`, 300 DPI, plus the shared document config variables.
    pub fn for_contracts() -> Self {
        Self::with_config(RustyTesseractConfig {
            language: "eng".to_string(),
            psm: Some(1),
            oem: Some(1),
            dpi: Some(300),
            config_variables: Self::document_config_variables(),
        })
    }

    /// Preset for large documents: same as [`Self::for_contracts`] but at
    /// 150 DPI, with `classify_enable_learning` and `tessedit_do_invert`
    /// additionally set to `"0"`.
    pub fn for_large_documents() -> Self {
        let mut config_variables = Self::document_config_variables();
        config_variables.insert("classify_enable_learning".to_string(), "0".to_string());
        config_variables.insert("tessedit_do_invert".to_string(), "0".to_string());

        Self::with_config(RustyTesseractConfig {
            language: "eng".to_string(),
            psm: Some(1),
            oem: Some(1),
            dpi: Some(150),
            config_variables,
        })
    }

    /// Reports whether a Tesseract installation can be reached.
    ///
    /// Returns `Ok(true)` when `rusty_tesseract::get_tesseract_version()`
    /// succeeds and `Ok(false)` when it fails. This function currently never
    /// returns `Err`; the `OcrResult` wrapper is kept for interface
    /// compatibility.
    pub fn test_availability() -> OcrResult<bool> {
        // Actually probe the engine instead of unconditionally claiming it is
        // available.
        Ok(rusty_tesseract::get_tesseract_version().is_ok())
    }

    /// Returns the configuration this provider was built with.
    pub fn config(&self) -> &RustyTesseractConfig {
        &self.config
    }

    /// Builds the `rusty_tesseract` arguments for a single OCR call.
    ///
    /// Starts from a copy of the stored config variables and, when
    /// `options.min_confidence` is positive, additionally sets
    /// `tessedit_reject_mode` to `"2"`.
    fn create_args(&self, options: &OcrOptions) -> Args {
        let mut config_variables = self.config.config_variables.clone();
        if options.min_confidence > 0.0 {
            // NOTE(review): presumably makes Tesseract reject low-quality
            // results; confirm the meaning of mode 2 against Tesseract docs.
            config_variables.insert("tessedit_reject_mode".to_string(), "2".to_string());
        }

        Args {
            lang: self.config.language.clone(),
            config_variables,
            // Saturate rather than wrap: a plain `as i32` would turn DPI values
            // above `i32::MAX` into negative numbers.
            dpi: self.config.dpi.map(|dpi| dpi.min(i32::MAX as u32) as i32),
            psm: self.config.psm.map(i32::from),
            oem: self.config.oem.map(i32::from),
        }
    }

    /// Config variables shared by the document-oriented presets.
    fn document_config_variables() -> HashMap<String, String> {
        [
            ("preserve_interword_spaces", "1"),
            ("tessedit_create_hocr", "0"),
            ("tessedit_create_tsv", "0"),
            ("load_system_dawg", "1"),
            ("load_freq_dawg", "1"),
            ("textord_debug_tabfind", "0"),
            ("textord_use_cjk_fp_model", "0"),
        ]
        .iter()
        .map(|&(name, value)| (name.to_string(), value.to_string()))
        .collect()
    }
}
#[cfg(feature = "ocr-tesseract")]
impl OcrProvider for RustyTesseractProvider {
    /// Image formats this provider advertises: PNG, JPEG and TIFF.
    fn supported_formats(&self) -> Vec<ImageFormat> {
        Vec::from([ImageFormat::Png, ImageFormat::Jpeg, ImageFormat::Tiff])
    }

    /// Name under which this engine reports itself.
    fn engine_name(&self) -> &str {
        "rusty-tesseract"
    }

    /// Engine family of this provider.
    fn engine_type(&self) -> OcrEngine {
        OcrEngine::Tesseract
    }

    /// Decodes `image_data`, runs it through Tesseract and packages the text.
    ///
    /// The result carries the trimmed text, a heuristic confidence from
    /// `estimate_confidence`, and at most one fragment holding the untrimmed
    /// text. Fragment geometry, `processed_region` and `image_dimensions` are
    /// filled with placeholder values.
    ///
    /// # Errors
    ///
    /// Returns `OcrError::ProcessingFailed` when the image format cannot be
    /// guessed, the image cannot be decoded or converted, or the OCR call
    /// itself fails.
    fn process_image(
        &self,
        image_data: &[u8],
        options: &OcrOptions,
    ) -> OcrResult<OcrProcessingResult> {
        use std::io::Cursor;

        let started = Instant::now();

        let reader = rusty_tesseract::image::ImageReader::new(Cursor::new(image_data))
            .with_guessed_format()
            .map_err(|e| {
                OcrError::ProcessingFailed(format!("Failed to guess image format: {}", e))
            })?;
        let decoded = reader
            .decode()
            .map_err(|e| OcrError::ProcessingFailed(format!("Failed to decode image: {}", e)))?;
        let tess_image = Image::from_dynamic_image(&decoded).map_err(|e| {
            OcrError::ProcessingFailed(format!("Failed to create tesseract image: {}", e))
        })?;

        let tess_args = self.create_args(options);
        let raw_text = image_to_string(&tess_image, &tess_args)
            .map_err(|e| OcrError::ProcessingFailed(format!("OCR processing failed: {}", e)))?;
        let elapsed = started.elapsed();

        let score = f64::from(estimate_confidence(&raw_text));
        let cleaned = raw_text.trim().to_string();

        // Only non-blank output yields a fragment; it keeps the untrimmed text.
        let mut fragments = Vec::new();
        if !cleaned.is_empty() {
            fragments.push(OcrTextFragment {
                text: raw_text,
                x: 0.0,
                y: 0.0,
                width: 0.0,
                height: 0.0,
                font_size: 12.0,
                confidence: score,
                word_confidences: None,
                fragment_type: FragmentType::Word,
            });
        }

        Ok(OcrProcessingResult {
            text: cleaned,
            confidence: score,
            fragments,
            processing_time_ms: elapsed.as_millis() as u64,
            engine_name: self.engine_name().to_owned(),
            language: self.config.language.clone(),
            processed_region: None,
            image_dimensions: (0, 0),
        })
    }
}
#[cfg(feature = "ocr-tesseract")]
/// Heuristically scores OCR output in `[0.0, 1.0]` from its character mix.
///
/// Blank (empty or whitespace-only) text scores `0.0`. Otherwise the score
/// starts at `0.7` and is adjusted on the trimmed text:
///
/// * `+0.1` when more than half of the characters are alphabetic,
/// * `+0.1` when whitespace makes up strictly between 10% and 30%,
/// * `+0.05` when ASCII punctuation is below 20%, otherwise `-0.1`.
///
/// This is an estimate derived from the text alone, not a confidence value
/// reported by the OCR engine.
fn estimate_confidence(text: &str) -> f32 {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return 0.0;
    }

    // Count everything in characters. Dividing character counts by the byte
    // length (`str::len`) would deflate every ratio for multi-byte text.
    let (mut total, mut alpha, mut spaces, mut punct) = (0usize, 0usize, 0usize, 0usize);
    for c in trimmed.chars() {
        total += 1;
        if c.is_alphabetic() {
            alpha += 1;
        }
        if c.is_whitespace() {
            spaces += 1;
        }
        if c.is_ascii_punctuation() {
            punct += 1;
        }
    }

    // `total` is at least 1 here because `trimmed` is non-empty.
    let total = total as f32;
    let ratio = |count: usize| count as f32 / total;

    let mut confidence: f32 = 0.7;
    if ratio(alpha) > 0.5 {
        confidence += 0.1;
    }

    let space_ratio = ratio(spaces);
    if space_ratio > 0.1 && space_ratio < 0.3 {
        confidence += 0.1;
    }

    if ratio(punct) < 0.2 {
        confidence += 0.05;
    } else {
        confidence -= 0.1;
    }

    confidence.clamp(0.0, 1.0)
}
#[cfg(feature = "ocr-tesseract")]
impl Default for RustyTesseractProvider {
    /// Builds a provider from the default `RustyTesseractConfig`, exactly as
    /// `RustyTesseractProvider::new` does.
    fn default() -> Self {
        Self::with_config(RustyTesseractConfig::default())
    }
}
/// Placeholder provider compiled when the `ocr-tesseract` feature is disabled.
#[cfg(not(feature = "ocr-tesseract"))]
pub struct RustyTesseractProvider;

#[cfg(not(feature = "ocr-tesseract"))]
impl RustyTesseractProvider {
    /// Always fails: OCR support was not compiled in.
    ///
    /// # Errors
    ///
    /// Returns an error telling the caller to rebuild with
    /// `--features ocr-tesseract`.
    pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
        const FEATURE_DISABLED: &str =
            "OCR feature not enabled. Compile with --features ocr-tesseract";
        Err(Box::from(FEATURE_DISABLED))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config targets English with `psm = 3`, `oem = 3`, 300 DPI.
    #[cfg(feature = "ocr-tesseract")]
    #[test]
    fn test_config_default() {
        let RustyTesseractConfig {
            language,
            psm,
            oem,
            dpi,
            ..
        } = RustyTesseractConfig::default();

        assert_eq!(language, "eng");
        assert_eq!((psm, oem), (Some(3), Some(3)));
        assert_eq!(dpi, Some(300));
    }

    /// Constructing a provider with the default config must not panic.
    #[cfg(feature = "ocr-tesseract")]
    #[test]
    fn test_provider_creation() {
        drop(RustyTesseractProvider::new());
    }

    /// Blank text scores zero, and clean prose outranks punctuation noise.
    #[cfg(feature = "ocr-tesseract")]
    #[test]
    fn test_confidence_estimation() {
        let blank_score = estimate_confidence("");
        let prose_score = estimate_confidence("This is a normal text with proper spacing.");
        let noise_score = estimate_confidence("!!@#$%^&*()");

        assert_eq!(blank_score, 0.0);
        assert!(prose_score > 0.5);
        assert!(noise_score < prose_score);
    }

    /// The provider identifies itself as the Tesseract engine.
    #[cfg(feature = "ocr-tesseract")]
    #[test]
    fn test_engine_info() {
        let provider = RustyTesseractProvider::new();

        assert_eq!(provider.engine_name(), "rusty-tesseract");
        assert_eq!(provider.engine_type(), OcrEngine::Tesseract);
    }
}