mod config;
mod detector;
mod engine;
mod error;
mod postprocessor;
mod preprocessor;
mod recognizer;
pub use config::{DetResizeStrategy, OcrConfig, OcrConfigBuilder};
pub use detector::TextDetector;
pub use engine::{OcrEngine, OcrOutput, OcrSpan};
pub use error::OcrError;
pub use postprocessor::DetectedBox;
pub use preprocessor::{crop_text_region, preprocess_for_detection, preprocess_for_recognition};
pub use recognizer::{RecognitionResult, TextRecognizer};
use crate::{PdfDocument, Result};
/// Classification of a PDF page, used to decide whether OCR should be run.
///
/// Values are produced by [`detect_page_type`]. The enum is a plain tag with
/// no payload, so it is `Copy` and can be used as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PageType {
    /// The embedded text is treated as usable: either the page has no images,
    /// or neither the scanned nor the hybrid heuristic matched.
    NativeText,
    /// The page has at least one image and its embedded text is very short
    /// or mostly U+FFFD replacement characters.
    ScannedPage,
    /// The page has a large image together with a moderate amount of
    /// embedded text.
    HybridPage,
}
/// Classifies a page as native text, scanned, or hybrid using heuristics.
///
/// The decision is based on two inputs: the text of the page's spans (joined
/// with single spaces and trimmed) and the pixel dimensions of the page's
/// embedded images.
///
/// * No images at all: [`PageType::NativeText`].
/// * Text of at most 50 bytes, or text that looks garbled (more than 20% of
///   more than 10 characters are U+FFFD): [`PageType::ScannedPage`].
/// * Largest image above the coverage threshold and text shorter than 500
///   bytes: [`PageType::HybridPage`].
/// * Anything else: [`PageType::NativeText`].
///
/// # Errors
///
/// Propagates any error from `doc.extract_images`. A failure of
/// `doc.extract_spans` is not an error here; it is handled as "no spans".
pub fn detect_page_type(doc: &mut PdfDocument, page: usize) -> Result<PageType> {
    // Best effort: if span extraction fails, continue with an empty span list.
    let spans = doc.extract_spans(page).unwrap_or_default();

    // Build "span0 span1 span2 ..." by hand, one space between spans.
    let mut joined = String::new();
    for (index, span) in spans.iter().enumerate() {
        if index > 0 {
            joined.push(' ');
        }
        joined.push_str(span.text.as_str());
    }
    let body = joined.trim();

    // NOTE(review): the 50/500 thresholds below compare against the byte
    // length, so multi-byte scripts reach them with fewer characters.
    let byte_len = body.len();

    // Count all characters and the U+FFFD replacement characters in one pass.
    let mut char_total = 0usize;
    let mut replacement_total = 0usize;
    for ch in body.chars() {
        char_total += 1;
        if ch == '\u{FFFD}' {
            replacement_total += 1;
        }
    }
    // `max(1)` keeps the division well-defined for empty text.
    let replacement_ratio = replacement_total as f32 / char_total.max(1) as f32;
    let garbled = char_total > 10 && replacement_ratio > 0.20;

    let images = doc.extract_images(page)?;
    if images.is_empty() {
        return Ok(PageType::NativeText);
    }

    // Pixel area of the biggest embedded image.
    let mut largest_pixels = 0.0_f32;
    for image in images.iter() {
        let pixels = (image.width() as f32) * (image.height() as f32);
        largest_pixels = largest_pixels.max(pixels);
    }

    // Fixed 612 x 792 reference area (presumably US Letter in PDF points);
    // the real page size is not consulted. The image must exceed it 4x.
    let reference_area: f32 = 612.0 * 792.0;
    let high_coverage = largest_pixels > reference_area * 4.0;

    let classification = if byte_len <= 50 || garbled {
        PageType::ScannedPage
    } else if high_coverage && byte_len < 500 {
        PageType::HybridPage
    } else {
        PageType::NativeText
    };
    Ok(classification)
}
/// Reports whether the page is a candidate for OCR.
///
/// Returns `true` when [`detect_page_type`] classifies the page as
/// [`PageType::ScannedPage`] or [`PageType::HybridPage`], `false` for
/// [`PageType::NativeText`].
///
/// # Errors
///
/// Propagates any error returned by [`detect_page_type`].
pub fn needs_ocr(doc: &mut PdfDocument, page: usize) -> Result<bool> {
    detect_page_type(doc, page).map(|kind| match kind {
        PageType::ScannedPage | PageType::HybridPage => true,
        PageType::NativeText => false,
    })
}
/// Options controlling OCR-backed text extraction for a page.
///
/// `Default` yields a scale of `300.0 / 72.0` with native-text fallback
/// enabled; `with_dpi` builds the same options with a different scale.
#[derive(Debug, Clone)]
pub struct OcrExtractOptions {
/// OCR configuration carried alongside the extraction options.
/// NOTE(review): no function in this section reads this field — confirm
/// whether callers are expected to construct the engine from it.
pub config: OcrConfig,
/// Scale factor passed to `to_text_spans` by `ocr_page_spans`.
/// The constructors compute it as `dpi / 72.0`.
pub scale: f32,
/// When `true`, `ocr_page` returns the page's native text if the page has
/// no images, and `extract_text_with_ocr` returns native text if OCR fails
/// on a scanned page.
pub fallback_to_native: bool,
}
impl Default for OcrExtractOptions {
    /// Builds the default options: default [`OcrConfig`], a scale of
    /// `300.0 / 72.0` (the value `with_dpi(300.0)` would produce), and
    /// fallback to native text enabled.
    fn default() -> Self {
        const DEFAULT_DPI: f32 = 300.0;
        const DPI_DIVISOR: f32 = 72.0;

        let config = OcrConfig::default();
        let scale = DEFAULT_DPI / DPI_DIVISOR;
        Self {
            config,
            scale,
            fallback_to_native: true,
        }
    }
}
impl OcrExtractOptions {
    /// Creates options whose scale is `dpi / 72.0`; every other field takes
    /// its `Default` value.
    ///
    /// `dpi` is used as given — it is not validated or clamped.
    pub fn with_dpi(dpi: f32) -> Self {
        let scale = dpi / 72.0;
        Self {
            scale,
            ..Self::default()
        }
    }
}
pub fn ocr_page(
doc: &mut PdfDocument,
page: usize,
engine: &OcrEngine,
options: &OcrExtractOptions,
) -> Result<String> {
let images = doc.extract_images(page)?;
if images.is_empty() {
if options.fallback_to_native {
return doc.extract_text(page);
}
return Ok(String::new());
}
let largest_image = images
.iter()
.max_by_key(|img| (img.width() as u64) * (img.height() as u64))
.unwrap();
let dynamic_image = largest_image.to_dynamic_image()?;
let ocr_result = engine
.ocr_image(&dynamic_image)
.map_err(|e| crate::error::Error::Image(format!("OCR failed: {}", e)))?;
Ok(ocr_result.text_in_reading_order())
}
pub fn ocr_page_spans(
doc: &mut PdfDocument,
page: usize,
engine: &OcrEngine,
options: &OcrExtractOptions,
) -> Result<Vec<crate::layout::text_block::TextSpan>> {
let images = doc.extract_images(page)?;
if images.is_empty() {
return Ok(Vec::new());
}
let largest_image = images
.iter()
.max_by_key(|img| (img.width() as u64) * (img.height() as u64))
.unwrap();
let dynamic_image = largest_image.to_dynamic_image()?;
let ocr_result = engine
.ocr_image(&dynamic_image)
.map_err(|e| crate::error::Error::Image(format!("OCR failed: {}", e)))?;
Ok(ocr_result.to_text_spans(options.scale))
}
/// Extracts a page's text, using OCR when the page looks scanned or hybrid.
///
/// The page is first classified with [`detect_page_type`]:
///
/// * **Native text**, or **scanned with no engine**: the native text is
///   returned, including any extraction error.
/// * **Scanned with an engine**: the OCR text is returned. If OCR fails, a
///   warning is logged and the native text is returned when
///   `options.fallback_to_native` is set; otherwise the OCR error is returned.
/// * **Hybrid**: native text is extracted best-effort (a failure counts as
///   empty text). With an engine, the OCR text wins only when its trimmed
///   byte length is more than twice that of the native text. If OCR fails or
///   no engine is given, the native text is returned.
///
/// # Errors
///
/// Propagates errors from [`detect_page_type`], from native extraction on the
/// native/scanned paths, and from OCR on a scanned page when fallback is off.
pub fn extract_text_with_ocr(
    doc: &mut PdfDocument,
    page: usize,
    engine: Option<&OcrEngine>,
    options: OcrExtractOptions,
) -> Result<String> {
    match (detect_page_type(doc, page)?, engine) {
        // Nothing to OCR, or nothing to OCR with.
        (PageType::NativeText, _) | (PageType::ScannedPage, None) => doc.extract_text(page),

        (PageType::ScannedPage, Some(ocr_engine)) => {
            let failure = match ocr_page(doc, page, ocr_engine, &options) {
                Ok(ocr_text) => return Ok(ocr_text),
                Err(e) => e,
            };
            log::warn!("OCR failed for scanned page {}: {}", page, failure);
            if !options.fallback_to_native {
                return Err(failure);
            }
            doc.extract_text(page)
        },

        (PageType::HybridPage, None) => Ok(doc.extract_text(page).unwrap_or_default()),

        (PageType::HybridPage, Some(ocr_engine)) => {
            // Native extraction is best-effort here: an error becomes "".
            let native_text = doc.extract_text(page).unwrap_or_default();

            let ocr_text = match ocr_page(doc, page, ocr_engine, &options) {
                Ok(text) => text,
                Err(e) => {
                    log::warn!("OCR failed for hybrid page {}: {}, using native text", page, e);
                    return Ok(native_text);
                },
            };

            let native_len = native_text.trim().len();
            let ocr_len = ocr_text.trim().len();
            // OCR has to yield more than double the native text to be preferred.
            let prefer_ocr = ocr_len > native_len * 2;

            if prefer_ocr {
                log::debug!(
                    "Hybrid page {}: OCR ({} chars) >> native ({} chars), using OCR",
                    page,
                    ocr_len,
                    native_len
                );
                return Ok(ocr_text);
            }

            log::debug!(
                "Hybrid page {}: native ({} chars) >= OCR ({} chars), using native",
                page,
                native_len,
                ocr_len
            );
            Ok(native_text)
        },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Largest absolute difference accepted when comparing scale factors.
    const SCALE_TOLERANCE: f32 = 0.01;

    /// Smoke test: the re-exported configuration type can be constructed.
    #[test]
    fn test_ocr_module_compiles() {
        let _: OcrConfig = OcrConfig::default();
    }

    /// The default options use a scale of 300/72 and enable native fallback.
    #[test]
    fn test_ocr_extract_options_default() {
        let OcrExtractOptions {
            scale,
            fallback_to_native,
            ..
        } = OcrExtractOptions::default();

        let expected_scale = 300.0 / 72.0;
        assert!((scale - expected_scale).abs() < SCALE_TOLERANCE);
        assert!(fallback_to_native);
    }

    /// `with_dpi` converts the requested DPI into a scale of `dpi / 72`.
    #[test]
    fn test_ocr_extract_options_with_dpi() {
        let scale = OcrExtractOptions::with_dpi(200.0).scale;
        let expected_scale = 200.0 / 72.0;
        assert!((scale - expected_scale).abs() < SCALE_TOLERANCE);
    }

    /// Variants compare equal to themselves and unequal to each other.
    #[test]
    fn test_page_type_enum() {
        assert!(PageType::NativeText == PageType::NativeText);
        assert!(PageType::NativeText != PageType::ScannedPage);
        assert!(PageType::ScannedPage != PageType::HybridPage);
    }
}