mod backend;
mod config;
mod detector;
mod engine;
mod error;
mod postprocessor;
mod preprocessor;
mod recognizer;
pub use config::{DetResizeStrategy, OcrConfig, OcrConfigBuilder};
pub use detector::TextDetector;
pub use engine::{OcrEngine, OcrOutput, OcrSpan};
pub use error::OcrError;
pub use postprocessor::DetectedBox;
pub use preprocessor::{crop_text_region, preprocess_for_detection, preprocess_for_recognition};
pub use recognizer::{RecognitionResult, TextRecognizer};
use crate::{PdfDocument, Result};
/// Coarse classification of a PDF page, used to decide whether OCR is worthwhile.
///
/// Values are produced by [`detect_page_type`], which collapses the finer-grained
/// `PageKind` reported by the page classifier into these three buckets.
///
/// The enum has no payload, so it derives `Eq` and `Hash` in addition to `PartialEq`
/// and can be used directly as a `HashMap` key or `HashSet` member.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum PageType {
    /// The classifier reported a text layer or an empty page; [`needs_ocr`] is `false`.
    NativeText,
    /// The classifier reported a scanned page; [`needs_ocr`] is `true`.
    ScannedPage,
    /// The classifier reported image text or mixed content; [`needs_ocr`] is `true`.
    HybridPage,
}
/// Classifies `page` of `doc` into a coarse [`PageType`].
///
/// The page is first classified with `PdfDocument::classify_page`; the resulting
/// `PageKind` is then folded as follows:
///
/// * `Scanned` -> [`PageType::ScannedPage`]
/// * `ImageText`, `Mixed` -> [`PageType::HybridPage`]
/// * `TextLayer`, `Empty` -> [`PageType::NativeText`]
///
/// # Errors
///
/// Propagates any error returned by `classify_page`.
pub fn detect_page_type(doc: &PdfDocument, page: usize) -> Result<PageType> {
    use crate::extractors::auto::PageKind;

    let classification = doc.classify_page(page)?;
    // Deliberately exhaustive (no `_` arm) so a new `PageKind` forces a decision here.
    let page_type = match classification.kind {
        PageKind::Scanned => PageType::ScannedPage,
        PageKind::Mixed | PageKind::ImageText => PageType::HybridPage,
        PageKind::Empty | PageKind::TextLayer => PageType::NativeText,
    };
    Ok(page_type)
}
/// Reports whether `page` should be run through OCR.
///
/// Returns `true` when [`detect_page_type`] yields [`PageType::ScannedPage`] or
/// [`PageType::HybridPage`], and `false` for [`PageType::NativeText`].
///
/// # Errors
///
/// Propagates any error from [`detect_page_type`].
pub fn needs_ocr(doc: &PdfDocument, page: usize) -> Result<bool> {
    detect_page_type(doc, page)
        .map(|kind| matches!(kind, PageType::ScannedPage | PageType::HybridPage))
}
/// Options accepted by the page-level OCR helpers in this module
/// (`ocr_page`, `ocr_page_spans`, `extract_text_with_ocr`).
#[derive(Debug, Clone)]
pub struct OcrExtractOptions {
/// OCR configuration carried alongside the extraction options.
/// NOTE(review): none of the functions in this file read this field; presumably it
/// is consumed by callers when building the engine — confirm.
pub config: OcrConfig,
/// Scale factor handed to `to_text_spans` by `ocr_page_spans`.
/// `Default` sets it to `300.0 / 72.0`; `with_dpi` sets it to `dpi / 72.0`.
pub scale: f32,
/// When `true`, `ocr_page` returns the native text of a page that has no images
/// (instead of an empty string), and `extract_text_with_ocr` falls back to native
/// extraction when OCR fails on a scanned page (instead of returning the error).
pub fallback_to_native: bool,
}
impl Default for OcrExtractOptions {
    /// Default options: default `OcrConfig`, a scale of `300.0 / 72.0`
    /// (the value `with_dpi(300.0)` would produce), and native fallback enabled.
    fn default() -> Self {
        let config = OcrConfig::default();
        let scale = 300.0 / 72.0;
        Self {
            config,
            scale,
            fallback_to_native: true,
        }
    }
}
impl OcrExtractOptions {
    /// Creates options whose `scale` is `dpi / 72.0`; every other field keeps its
    /// [`Default`] value.
    ///
    /// `dpi` is not validated: zero, negative or non-finite values are passed through
    /// to `scale` unchanged.
    pub fn with_dpi(dpi: f32) -> Self {
        let defaults = Self::default();
        Self {
            scale: dpi / 72.0,
            ..defaults
        }
    }
}
/// Runs OCR on the largest image embedded in `page` and returns the recognized text
/// in reading order.
///
/// Only one image is processed: the one with the greatest `width() * height()`.
///
/// If the page has no images, the result depends on `options.fallback_to_native`:
/// the page's native text when `true`, an empty string when `false`. No other field
/// of `options` is read here.
///
/// # Errors
///
/// * Propagates errors from `extract_images`, `to_dynamic_image`, and (on the
///   fallback path) `extract_text`.
/// * Wraps an OCR engine failure in `Error::Image` with an `"OCR failed: ..."` message.
pub fn ocr_page(
    doc: &PdfDocument,
    page: usize,
    engine: &OcrEngine,
    options: &OcrExtractOptions,
) -> Result<String> {
    let images = doc.extract_images(page)?;

    // `max_by_key` yields `None` only when there are no images, so this one match
    // handles both the "nothing to OCR" case and the selection, without an `unwrap()`.
    let largest_image = match images
        .iter()
        .max_by_key(|img| (img.width() as u64) * (img.height() as u64))
    {
        Some(img) => img,
        None if options.fallback_to_native => return doc.extract_text(page),
        None => return Ok(String::new()),
    };

    let dynamic_image = largest_image.to_dynamic_image()?;
    let ocr_result = engine
        .ocr_image(&dynamic_image)
        .map_err(|e| crate::error::Error::Image(format!("OCR failed: {}", e)))?;
    Ok(ocr_result.text_in_reading_order())
}
/// Runs OCR on the largest image embedded in `page` and returns the result as text
/// spans.
///
/// Only one image is processed: the one with the greatest `width() * height()`.
/// The OCR output is converted with `to_text_spans(options.scale)`. A page without
/// images yields an empty vector; `options.fallback_to_native` is not consulted here.
///
/// # Errors
///
/// * Propagates errors from `extract_images` and `to_dynamic_image`.
/// * Wraps an OCR engine failure in `Error::Image` with an `"OCR failed: ..."` message.
pub fn ocr_page_spans(
    doc: &PdfDocument,
    page: usize,
    engine: &OcrEngine,
    options: &OcrExtractOptions,
) -> Result<Vec<crate::layout::text_block::TextSpan>> {
    let images = doc.extract_images(page)?;

    // `max_by_key` yields `None` only when there are no images, which removes the
    // need for a separate emptiness check followed by `unwrap()`.
    let largest_image = match images
        .iter()
        .max_by_key(|img| (img.width() as u64) * (img.height() as u64))
    {
        Some(img) => img,
        None => return Ok(Vec::new()),
    };

    let dynamic_image = largest_image.to_dynamic_image()?;
    let ocr_result = engine
        .ocr_image(&dynamic_image)
        .map_err(|e| crate::error::Error::Image(format!("OCR failed: {}", e)))?;
    Ok(ocr_result.to_text_spans(options.scale))
}
/// Combines natively extracted text with OCR text, appending only the OCR lines that
/// the native text does not already contain.
///
/// Rules, in order:
///
/// 1. If `ocr` is blank, `native` is returned unchanged.
/// 2. If `native` is blank, `ocr` is returned unchanged.
/// 3. Otherwise each non-blank OCR line is trimmed and compared after normalization
///    (whitespace runs collapsed to one space, lowercased). A line is dropped when its
///    normalized form is a substring of the normalized native text, or when an
///    equivalent OCR line was already kept.
/// 4. Surviving lines are appended, in their original order and trimmed form, after
///    the native text (with trailing whitespace removed), one per line. If none
///    survive, `native` is returned unchanged.
///
/// Step 3 is a plain substring test, not a word-boundary match, so a short OCR line
/// that happens to occur inside a longer native word is treated as a duplicate.
pub(crate) fn merge_native_and_ocr(native: &str, ocr: &str) -> String {
    /// Collapses whitespace runs to single spaces and lowercases the result.
    fn normalize(s: &str) -> String {
        s.split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
            .to_lowercase()
    }

    if ocr.trim().is_empty() {
        return native.to_string();
    }
    let native_trimmed = native.trim_end();
    // After `trim_end`, an all-whitespace string is already empty.
    if native_trimmed.is_empty() {
        return ocr.to_string();
    }

    let native_norm = normalize(native_trimmed);
    // Normalized keys of the OCR lines kept so far: each line is normalized exactly
    // once and duplicate detection is a hash lookup instead of a rescan.
    let mut seen_keys: std::collections::HashSet<String> = std::collections::HashSet::new();
    let mut extra: Vec<&str> = Vec::new();

    for line in ocr.lines() {
        let line = line.trim();
        let key = normalize(line);
        // An empty key means the line was blank.
        if key.is_empty() || native_norm.contains(key.as_str()) {
            continue;
        }
        // `insert` returns `false` when an equivalent line was already kept.
        if seen_keys.insert(key) {
            extra.push(line);
        }
    }

    if extra.is_empty() {
        return native.to_string();
    }
    format!("{native_trimmed}\n{}", extra.join("\n"))
}
/// Extracts the text of `page`, using OCR where the page type calls for it.
///
/// The page is classified with [`detect_page_type`] and handled per type:
///
/// * [`PageType::NativeText`] - native extraction only; `engine` is ignored.
/// * [`PageType::ScannedPage`] - OCR via [`ocr_page`] when an engine is supplied. If
///   OCR fails, a warning is logged and the result is the native text when
///   `options.fallback_to_native` is set, otherwise the OCR error. Without an engine
///   the native text is returned.
/// * [`PageType::HybridPage`] - native text is extracted on a best-effort basis and,
///   when an engine is supplied, merged with the OCR text via
///   `merge_native_and_ocr`. An OCR failure is logged and the native text is
///   returned on its own. This branch never returns an error.
///
/// # Errors
///
/// Propagates errors from [`detect_page_type`], from native extraction on native and
/// scanned pages, and from OCR on scanned pages when fallback is disabled.
pub fn extract_text_with_ocr(
    doc: &PdfDocument,
    page: usize,
    engine: Option<&OcrEngine>,
    options: OcrExtractOptions,
) -> Result<String> {
    match detect_page_type(doc, page)? {
        PageType::NativeText => doc.extract_text(page),
        PageType::ScannedPage => match engine {
            None => doc.extract_text(page),
            Some(ocr_engine) => ocr_page(doc, page, ocr_engine, &options).or_else(|e| {
                log::warn!("OCR failed for scanned page {}: {}", page, e);
                if options.fallback_to_native {
                    doc.extract_text(page)
                } else {
                    Err(e)
                }
            }),
        },
        PageType::HybridPage => {
            // Best effort: a native-extraction failure must not abort the hybrid path,
            // but it is logged so an empty native contribution can be diagnosed.
            let native_text = doc.extract_text(page).unwrap_or_else(|e| {
                log::warn!(
                    "Native text extraction failed for hybrid page {}: {}, continuing without it",
                    page,
                    e
                );
                String::new()
            });
            match engine {
                None => Ok(native_text),
                Some(ocr_engine) => match ocr_page(doc, page, ocr_engine, &options) {
                    Ok(ocr_text) => Ok(merge_native_and_ocr(&native_text, &ocr_text)),
                    Err(e) => {
                        log::warn!("OCR failed for hybrid page {}: {}, using native text", page, e);
                        Ok(native_text)
                    },
                },
            }
        },
    }
}
// Unit tests for the option defaults, the `PageType` enum and the native/OCR merge
// helper. None of them need a PDF document or an OCR engine.
#[cfg(test)]
mod tests {
use super::*;
// Smoke test: building the default config only has to compile and run.
#[test]
fn test_ocr_module_compiles() {
let _ = OcrConfig::default();
}
// Defaults: scale of 300/72 and native fallback enabled.
#[test]
fn test_ocr_extract_options_default() {
let options = OcrExtractOptions::default();
assert!((options.scale - 300.0 / 72.0).abs() < 0.01);
assert!(options.fallback_to_native);
}
// `with_dpi` derives the scale as dpi / 72.
#[test]
fn test_ocr_extract_options_with_dpi() {
let options = OcrExtractOptions::with_dpi(200.0);
assert!((options.scale - 200.0 / 72.0).abs() < 0.01);
}
// Variants compare equal to themselves and unequal to each other.
#[test]
fn test_page_type_enum() {
assert_eq!(PageType::NativeText, PageType::NativeText);
assert_ne!(PageType::NativeText, PageType::ScannedPage);
assert_ne!(PageType::ScannedPage, PageType::HybridPage);
}
// Text that appears on only one side must survive the merge.
#[test]
fn merge_unions_disjoint_native_and_image_text() {
let m = merge_native_and_ocr(
"Native paragraph stays.\nSecond native line.",
"Caption text from the figure",
);
assert!(m.contains("Native paragraph stays."), "{m:?}");
assert!(m.contains("Second native line."), "{m:?}");
assert!(m.contains("Caption text from the figure"), "{m:?}");
}
// The merge is a union, not a "pick the longer text" choice: a short OCR string
// is still appended to a much longer native text.
#[test]
fn merge_is_not_either_or_when_native_is_longer() {
let native = "A very long native paragraph ".repeat(8);
let m = merge_native_and_ocr(&native, "INVOICE 42");
assert!(m.contains("INVOICE 42"), "in-image text must survive: {m:?}");
assert!(m.contains("A very long native paragraph"), "{m:?}");
}
// OCR lines that repeat native text (ignoring case) are dropped, new lines are
// kept, and a blank side returns the other side unchanged.
#[test]
fn merge_dedups_sidecar_and_handles_empties() {
let m = merge_native_and_ocr("Hello World\nKeep me", "hello world\nNEW LINE");
assert_eq!(m.matches("Hello World").count(), 1, "{m:?}");
assert!(m.contains("NEW LINE"), "{m:?}");
assert_eq!(merge_native_and_ocr("only native", " "), "only native");
assert_eq!(merge_native_and_ocr(" ", "only ocr"), "only ocr");
}
}