use std::sync::Arc;
use crate::error::LiteParseError;
use crate::ocr::{OcrEngine, OcrOptions, OcrResult};
use crate::types::{Page, TextItem};
use image::{ImageBuffer, Rgba};
use pdfium::Document;
/// A page rasterized for OCR, tagged with the position of the page it came from.
pub(crate) struct RenderedPage {
/// Index of the source page within the `pages` slice passed to `render_pages_for_ocr`.
pub idx: usize,
/// Raw pixel buffer of the rendered page after conversion to 8-bit RGB
/// (the output of `to_rgb8().into_raw()` on the rendered RGBA bitmap).
pub rgb_bytes: Vec<u8>,
/// Width reported by the rendered bitmap.
pub width: u32,
/// Height reported by the rendered bitmap.
pub height: u32,
}
pub(crate) fn render_pages_for_ocr(
document: &Document,
pages: &[Page],
dpi: f32,
) -> Result<Vec<RenderedPage>, LiteParseError> {
let mut rendered = Vec::new();
for (idx, page) in pages.iter().enumerate() {
let text_length: usize = page
.text_items
.iter()
.filter(|item| !is_likely_garbled(&item.text))
.map(|item| item.text.len())
.sum();
let page_obj = document.page((page.page_number - 1) as i32)?;
let has_images = !page_obj.image_bounds(25.0, 0.9).is_empty();
let page_area = page.page_width * page.page_height;
let text_bbox_area: f32 = page
.text_items
.iter()
.filter(|item| !is_likely_garbled(&item.text))
.map(|item| item.width * item.height)
.sum();
let text_coverage = if page_area > 0.0 {
text_bbox_area / page_area
} else {
0.0
};
let needs_ocr =
text_length < 20 || text_coverage < 0.15 || has_images || page_is_garbled(page);
if !needs_ocr {
continue;
}
let bitmap = page_obj.render(dpi)?;
let width = bitmap.width() as u32;
let height = bitmap.height() as u32;
let rgba = bitmap.to_rgba();
let img: ImageBuffer<Rgba<u8>, Vec<u8>> = ImageBuffer::from_raw(width, height, rgba)
.ok_or(LiteParseError::Other(
"failed to create image buffer".into(),
))?;
let rgb_img = image::DynamicImage::ImageRgba8(img).to_rgb8();
let rgb_bytes = rgb_img.into_raw();
rendered.push(RenderedPage {
idx,
rgb_bytes,
width,
height,
});
}
Ok(rendered)
}
/// Runs OCR on every rendered page and merges the recognized text into `pages`.
///
/// One blocking task is spawned per entry of `rendered`. Each task first takes
/// a permit from a semaphore holding `max(num_workers, 1)` permits, so at most
/// that many recognitions run at once. Tasks are awaited, and their results
/// merged, in the order of `rendered`.
///
/// A page whose OCR returned no results is left untouched. Otherwise its
/// native text is pruned and the OCR results are appended (see
/// `merge_ocr_results_into_page`), with OCR boxes scaled by `72.0 / dpi`.
///
/// # Errors
///
/// Returns `LiteParseError::Ocr` only when every task failed and at least one
/// of the failed pages had sparse native text. Any other combination of
/// failures is reported on stderr and the function still returns `Ok(())`.
/// Only the first failure is logged in detail.
///
/// # Panics
///
/// Panics if a `RenderedPage::idx` is out of bounds for `pages`.
pub(crate) async fn ocr_and_merge_rendered(
    pages: &mut [Page],
    rendered: Vec<RenderedPage>,
    dpi: f32,
    ocr_engine: Arc<dyn OcrEngine>,
    ocr_language: &str,
    num_workers: usize,
) -> Result<(), LiteParseError> {
    // Always keep at least one permit, otherwise no task could ever start.
    let limiter = Arc::new(tokio::sync::Semaphore::new(num_workers.max(1)));
    let runtime = tokio::runtime::Handle::current();

    // Spawn everything up front; the semaphore, not the spawn order, limits
    // how many recognitions are in flight.
    let jobs: Vec<_> = rendered
        .into_iter()
        .map(|image| {
            let page_idx = image.idx;
            let page_number = pages[page_idx].page_number;
            let engine = ocr_engine.clone();
            let limiter = limiter.clone();
            let language = ocr_language.to_string();
            let runtime = runtime.clone();
            let task = tokio::task::spawn_blocking(move || {
                let _permit = runtime
                    .block_on(limiter.acquire_owned())
                    .expect("semaphore closed");
                let options = OcrOptions { language };
                runtime.block_on(engine.recognize(
                    &image.rgb_bytes,
                    image.width,
                    image.height,
                    &options,
                ))
            });
            (page_idx, page_number, task)
        })
        .collect();

    let scale_factor = 72.0 / dpi;
    let total_tasks = jobs.len();
    let mut failed_tasks = 0usize;
    let mut failed_sparse_text_page = false;
    let mut first_error: Option<String> = None;

    for (page_idx, page_number, task) in jobs {
        let ocr_results: Vec<OcrResult> = match task.await {
            Ok(Ok(results)) => results,
            // The engine ran but reported an error.
            Ok(Err(ocr_err)) => {
                failed_tasks += 1;
                failed_sparse_text_page |= page_has_sparse_native_text(&pages[page_idx]);
                first_error.get_or_insert_with(|| {
                    let msg = ocr_err.to_string();
                    eprintln!("[ocr] failed for page {}: {}", page_number, msg);
                    msg
                });
                continue;
            }
            // The blocking task itself did not complete.
            Err(join_err) => {
                failed_tasks += 1;
                failed_sparse_text_page |= page_has_sparse_native_text(&pages[page_idx]);
                first_error.get_or_insert_with(|| {
                    let msg = join_err.to_string();
                    eprintln!("[ocr] task panicked for page {}: {}", page_number, msg);
                    msg
                });
                continue;
            }
        };

        if !ocr_results.is_empty() {
            merge_ocr_results_into_page(&mut pages[page_idx], &ocr_results, scale_factor);
        }
    }

    if failed_tasks == 0 {
        return Ok(());
    }

    // Every task failed and at least one page had too little native text to
    // stand on its own: surface the failure to the caller.
    if failed_tasks == total_tasks && failed_sparse_text_page {
        let detail = first_error.unwrap_or_else(|| "unknown error".to_string());
        return Err(LiteParseError::Ocr(format!(
            "OCR failed for all {} page(s): {}",
            total_tasks, detail
        )));
    }

    eprintln!(
        "[ocr] {}/{} page(s) failed OCR; continuing with partial results",
        failed_tasks, total_tasks
    );
    Ok(())
}

/// Prunes a page's native text and appends the usable OCR results to it.
///
/// Native text is cleared entirely when the page as a whole looks garbled;
/// otherwise only individually garbled items are removed. An OCR result is
/// skipped when its confidence is `<= 0.1`, when its scaled box overlaps a
/// surviving native item (tolerance 2.0), or when its cleaned text is empty.
/// Accepted results are pushed with font name `"OCR"`, the box height as font
/// size, and the confidence rounded to three decimals.
fn merge_ocr_results_into_page(page: &mut Page, ocr_results: &[OcrResult], scale_factor: f32) {
    if page_is_garbled(page) {
        page.text_items.clear();
    } else {
        page.text_items
            .retain(|item| !is_likely_garbled(&item.text));
    }

    // Overlap is checked only against native text, never against OCR items
    // appended earlier in this same loop.
    let native_count = page.text_items.len();

    for result in ocr_results {
        if result.confidence <= 0.1 {
            continue;
        }

        let x = result.bbox[0] * scale_factor;
        let y = result.bbox[1] * scale_factor;
        let width = (result.bbox[2] - result.bbox[0]) * scale_factor;
        let height = (result.bbox[3] - result.bbox[1]) * scale_factor;

        if overlaps_existing_text(&page.text_items[..native_count], x, y, width, height, 2.0) {
            continue;
        }

        let text = clean_ocr_table_artifacts(&result.text);
        if text.is_empty() {
            continue;
        }

        page.text_items.push(TextItem {
            text,
            x,
            y,
            width,
            height,
            font_name: Some("OCR".to_string()),
            font_size: Some(height),
            confidence: Some((result.confidence * 1000.0).round() / 1000.0),
            ..Default::default()
        });
    }
}
/// Reports whether a page's usable native text is too thin to rely on.
///
/// Only items that do not look garbled are counted. The page is sparse when
/// those items total fewer than 20 bytes of text, or when their bounding boxes
/// cover less than 15% of the page area. A page with non-positive area is
/// treated as having 0% coverage.
fn page_has_sparse_native_text(page: &Page) -> bool {
    let mut text_bytes = 0usize;
    let mut covered_area = 0.0f32;
    for item in page
        .text_items
        .iter()
        .filter(|item| !is_likely_garbled(&item.text))
    {
        text_bytes += item.text.len();
        covered_area += item.width * item.height;
    }

    if text_bytes < 20 {
        return true;
    }

    let page_area = page.page_width * page.page_height;
    let coverage = if page_area > 0.0 {
        covered_area / page_area
    } else {
        0.0
    };
    coverage < 0.15
}
/// Heuristic check for text that is probably mis-decoded.
///
/// Text is flagged when it has at least 10 ASCII letters and fewer than one
/// in ten of them is a vowel. Shorter strings are never flagged.
fn is_likely_garbled(text: &str) -> bool {
    let (letters, vowels) = count_letters_and_vowels(text);
    letters >= 10 && vowels * 10 < letters
}
/// Counts ASCII letters in `text` and how many of them are vowels.
///
/// Returns `(letters, vowels)`. Only `a`, `e`, `i`, `o`, `u` (either case) are
/// vowels; non-ASCII characters, digits and punctuation are ignored.
fn count_letters_and_vowels(text: &str) -> (usize, usize) {
    text.chars()
        .filter(char::is_ascii_alphabetic)
        .fold((0, 0), |(letters, vowels), ch| {
            let is_vowel = matches!(ch.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u');
            (letters + 1, vowels + usize::from(is_vowel))
        })
}
/// Heuristic check for a page whose native text is garbled as a whole.
///
/// Letter and vowel counts are summed over every text item. The page is
/// flagged when it has at least 30 ASCII letters and fewer than one in five
/// of them is a vowel.
fn page_is_garbled(page: &Page) -> bool {
    let (letters, vowels) = page
        .text_items
        .iter()
        .map(|item| count_letters_and_vowels(&item.text))
        .fold((0usize, 0usize), |(letter_sum, vowel_sum), (l, v)| {
            (letter_sum + l, vowel_sum + v)
        });
    letters >= 30 && vowels * 5 < letters
}
/// Tests whether an OCR box intersects any existing text item.
///
/// Each item's rectangle is grown by `tolerance` on every side before the
/// comparison, so boxes lying within `tolerance` of an item also count as
/// overlapping. Returns `false` for an empty `items` slice.
fn overlaps_existing_text(
    items: &[TextItem],
    ocr_x: f32,
    ocr_y: f32,
    ocr_w: f32,
    ocr_h: f32,
    tolerance: f32,
) -> bool {
    items.iter().any(|item| {
        let within_x =
            ocr_x < item.x + item.width + tolerance && ocr_x + ocr_w > item.x - tolerance;
        let within_y =
            ocr_y < item.y + item.height + tolerance && ocr_y + ocr_h > item.y - tolerance;
        within_x && within_y
    })
}
/// Strips table-border noise that OCR attaches to numeric cell values.
///
/// The input is whitespace-trimmed, then leading and trailing `|`, `[`, `]`,
/// `(`, `)`, `{`, `}` are peeled off and the remainder is trimmed again. The
/// peeled form is returned only when it looks like a table value: made up
/// solely of digits, spaces and `, . % - + * /`, or exactly `N/A`, `Z` or `-`.
/// In every other case — ordinary words, or nothing left after peeling — the
/// trimmed input is returned with its brackets intact.
fn clean_ocr_table_artifacts(text: &str) -> String {
    const BORDER_CHARS: [char; 7] = ['|', '[', ']', '(', ')', '{', '}'];

    fn is_numeric_cell_char(c: char) -> bool {
        c.is_ascii_digit() || matches!(c, ',' | '.' | ' ' | '%' | '-' | '+' | '*' | '/')
    }

    let trimmed = text.trim();
    let core = trimmed
        .trim_start_matches(BORDER_CHARS)
        .trim_end_matches(BORDER_CHARS)
        .trim();

    // Covers both blank input and input made only of border characters:
    // there is nothing to recover, so hand back what OCR produced.
    if core.is_empty() {
        return trimmed.to_string();
    }

    let looks_like_table_value =
        matches!(core, "N/A" | "Z" | "-") || core.chars().all(is_numeric_cell_char);

    let kept = if looks_like_table_value { core } else { trimmed };
    kept.to_string()
}
// Unit tests for the OCR text clean-up, the overlap check, and the failure
// policy of `ocr_and_merge_rendered`.
#[cfg(test)]
mod tests {
use super::*;
// Border characters are stripped only around numeric-looking values; words
// and border-only strings come back unchanged.
#[test]
fn test_clean_ocr_table_artifacts() {
assert_eq!(clean_ocr_table_artifacts("44520]"), "44520");
assert_eq!(clean_ocr_table_artifacts("|123"), "123");
assert_eq!(clean_ocr_table_artifacts("0.3|"), "0.3");
assert_eq!(clean_ocr_table_artifacts("(note)"), "(note)");
assert_eq!(clean_ocr_table_artifacts("|hello|"), "|hello|");
assert_eq!(clean_ocr_table_artifacts("N/A"), "N/A");
assert_eq!(clean_ocr_table_artifacts(""), "");
assert_eq!(clean_ocr_table_artifacts("|||"), "|||");
}
// Builds a text item with the given box and a one-character placeholder text.
fn make_item(x: f32, y: f32, w: f32, h: f32) -> TextItem {
TextItem {
text: "x".into(),
x,
y,
width: w,
height: h,
..Default::default()
}
}
// An OCR box fully inside a native item overlaps it.
#[test]
fn test_overlaps_existing_text_inside() {
let items = vec![make_item(10.0, 10.0, 20.0, 5.0)];
assert!(overlaps_existing_text(&items, 12.0, 11.0, 5.0, 2.0, 2.0));
}
// A box far away from the native item does not overlap.
#[test]
fn test_overlaps_existing_text_disjoint() {
let items = vec![make_item(10.0, 10.0, 20.0, 5.0)];
assert!(!overlaps_existing_text(&items, 100.0, 100.0, 5.0, 5.0, 2.0));
}
// The item spans x = 10..30. With tolerance 2.0 a box starting at x = 31
// still counts as overlapping, while one starting at x = 35 does not.
#[test]
fn test_overlaps_existing_text_tolerance() {
let items = vec![make_item(10.0, 10.0, 20.0, 5.0)];
assert!(overlaps_existing_text(&items, 31.0, 10.0, 5.0, 5.0, 2.0));
assert!(!overlaps_existing_text(&items, 35.0, 10.0, 5.0, 5.0, 2.0));
}
// With no native items nothing can overlap.
#[test]
fn test_overlaps_empty() {
assert!(!overlaps_existing_text(&[], 0.0, 0.0, 1.0, 1.0, 0.0));
}
// Surrounding whitespace is trimmed; whitespace-only input becomes empty.
#[test]
fn test_clean_ocr_keeps_whitespace_trimmed() {
assert_eq!(clean_ocr_table_artifacts(" "), "");
assert_eq!(clean_ocr_table_artifacts(" 123 "), "123");
}
// OCR engine stub whose `recognize` always returns an error mentioning a
// missing traineddata file.
struct FailingEngine;
impl OcrEngine for FailingEngine {
fn name(&self) -> &str {
"failing"
}
fn recognize<'a, 'b: 'a, 'c: 'a>(
&'a self,
_image_data: &'c [u8],
_width: u32,
_height: u32,
_options: &'b OcrOptions,
) -> std::pin::Pin<
Box<
dyn Future<
Output = Result<Vec<OcrResult>, Box<dyn std::error::Error + Send + Sync>>,
> + Send
+ '_,
>,
> {
Box::pin(async move { Err("Error opening data file tessdata/eng.traineddata".into()) })
}
}
// A 100x100 page with no native text at all.
fn make_blank_page(page_number: usize) -> Page {
Page {
page_number,
page_width: 100.0,
page_height: 100.0,
text_items: Vec::new(),
}
}
// A 1x1 rendered image (three zero bytes) pointing at page index `idx`.
fn make_rendered(idx: usize) -> RenderedPage {
RenderedPage {
idx,
rgb_bytes: vec![0u8, 0u8, 0u8],
width: 1,
height: 1,
}
}
// A 100x100 page with one readable item of more than 20 bytes whose 50x50 box
// covers 25% of the page, so `page_has_sparse_native_text` is false for it.
fn make_native_text_page(page_number: usize) -> Page {
Page {
page_number,
page_width: 100.0,
page_height: 100.0,
text_items: vec![TextItem {
text: "this page already has real native text content".into(),
x: 0.0,
y: 0.0,
width: 50.0,
height: 50.0,
..Default::default()
}],
}
}
// A 100x100 page with enough text bytes but a 10x5 box (0.5% coverage), so the
// page counts as sparse through the coverage rule alone.
fn make_low_coverage_text_page(page_number: usize) -> Page {
Page {
page_number,
page_width: 100.0,
page_height: 100.0,
text_items: vec![TextItem {
text: "small native header that is not enough".into(),
x: 0.0,
y: 0.0,
width: 10.0,
height: 5.0,
..Default::default()
}],
}
}
// Every page is blank and every OCR task fails: the error must be returned
// and must carry the engine's message.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_all_pages_fail_returns_error() {
let mut pages = vec![make_blank_page(1), make_blank_page(2)];
let rendered = vec![make_rendered(0), make_rendered(1)];
let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);
let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;
let err = result.expect_err("expected systemic OCR failure to be surfaced");
let msg = err.to_string();
assert!(
msg.contains("OCR failed for all 2 page(s)"),
"unexpected error message: {msg}"
);
assert!(
msg.contains("traineddata"),
"error should carry the underlying cause: {msg}"
);
}
// With nothing rendered no task runs, so even a failing engine yields Ok.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_no_rendered_pages_is_ok() {
let mut pages = vec![make_blank_page(1)];
let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);
let result = ocr_and_merge_rendered(&mut pages, Vec::new(), 72.0, engine, "eng", 2).await;
assert!(result.is_ok(), "empty OCR set should succeed: {result:?}");
}
// All OCR tasks fail, but every page already has sufficient native text:
// the call succeeds and the native items are left in place.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_native_text_pages_not_failed_on_ocr_error() {
let mut pages = vec![make_native_text_page(1), make_native_text_page(2)];
let rendered = vec![make_rendered(0), make_rendered(1)];
let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);
let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;
assert!(
result.is_ok(),
"OCR failure on already-native-text pages must not abort the parse: {result:?}"
);
assert_eq!(pages[0].text_items.len(), 1);
assert_eq!(pages[1].text_items.len(), 1);
}
// All OCR tasks fail and one of the two pages is blank: a single sparse page
// among the failures is enough to turn the outcome into an error.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_mixed_failure_with_sparse_text_page_returns_error() {
let mut pages = vec![make_native_text_page(1), make_blank_page(2)];
let rendered = vec![make_rendered(0), make_rendered(1)];
let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);
let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;
let err = result.expect_err("a text-starved page losing all OCR must surface an error");
assert!(
err.to_string().contains("OCR failed for all 2 page(s)"),
"unexpected error message: {err}"
);
}
// A page with enough text bytes but low coverage is still sparse, so losing
// its OCR is an error.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn test_low_coverage_text_page_failure_returns_error() {
let mut pages = vec![make_low_coverage_text_page(1)];
let rendered = vec![make_rendered(0)];
let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);
let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;
let err = result.expect_err("low-coverage text page losing OCR must surface an error");
assert!(
err.to_string().contains("OCR failed for all 1 page(s)"),
"unexpected error message: {err}"
);
}
}