liteparse 2.0.8

use std::sync::Arc;

use crate::error::LiteParseError;
use crate::ocr::{OcrEngine, OcrOptions, OcrResult};
use crate::types::{Page, TextItem};
use pdfium::{Document, ImageBounds};

/// Minimum dark filled-path area (pt², ~72 DPI page space) not covered by
/// native text before a page is sent to OCR. 400 pt² is roughly one word at
/// a 10–12pt size; anything smaller is likely a bullet, icon, dot leader, or
/// decoration whose loss is acceptable. A false trigger only costs an extra
/// OCR pass (the overlap filter discards OCR results that duplicate native
/// text); measured on a 121-page financial report, the trigger fires on ~3
/// pages at this threshold.
const UNCOVERED_VECTOR_AREA_THRESHOLD: f32 = 400.0;

/// Owned page bitmap prepared for OCR. Indices refer to positions in the `pages` slice.
pub(crate) struct RenderedPage {
    pub idx: usize,
    pub rgb_bytes: Vec<u8>,
    pub width: u32,
    pub height: u32,
}

/// Render pages that need OCR from an already-open document.
///
/// The pdfium `Document` holds raw pointers that are not `Send`, so callers must
/// drop it before awaiting the OCR engine.
pub(crate) fn render_pages_for_ocr(
    document: &Document,
    pages: &[Page],
    dpi: f32,
) -> Result<Vec<RenderedPage>, LiteParseError> {
    let mut rendered = Vec::new();
    for (idx, page) in pages.iter().enumerate() {
        // Count only usable native text. Substitution-cipher-style corrupt
        // encodings (e.g. PDFs with a broken cmap) produce long "text" that looks
        // populated but is unreadable — without this, such pages bypass OCR
        // because text_length >= 20 and coverage looks fine. The same applies to
        // unmappable items (Type3 fonts with no ToUnicode), whose text is a
        // char-code fallback and whose bounding boxes come from deceptive
        // declared metrics.
        let text_length: usize = page
            .text_items
            .iter()
            .filter(|item| !is_unusable_native(item))
            .map(|item| item.text.len())
            .sum();
        let page_obj = document.page((page.page_number - 1) as i32)?;
        let has_images = !page_obj.image_bounds(25.0, 0.9).is_empty();

        let page_area = page.page_width * page.page_height;
        let text_bbox_area: f32 = page
            .text_items
            .iter()
            .filter(|item| !is_unusable_native(item))
            .map(|item| item.width * item.height)
            .sum();
        let text_coverage = if page_area > 0.0 {
            text_bbox_area / page_area
        } else {
            0.0
        };

        let mut needs_ocr =
            text_length < 20 || text_coverage < 0.15 || has_images || page_is_garbled(page);

        // Text drawn as filled vector outlines lives outside the text layer
        // entirely: no text items, no image XObjects, so none of the above
        // triggers fire on a text-dense page. Detect it by measuring filled
        // path area that native text doesn't account for. Checked last so the
        // page-object walk only runs when the cheap predicates all pass.
        if !needs_ocr {
            let path_bounds = page_obj.filled_path_bounds(3.0, 0.9);
            let uncovered = uncovered_path_area(&path_bounds, &page.text_items);

            needs_ocr = uncovered >= UNCOVERED_VECTOR_AREA_THRESHOLD;
        }

        if !needs_ocr {
            continue;
        }

        let bitmap = page_obj.render(dpi)?;
        let width = bitmap.width() as u32;
        let height = bitmap.height() as u32;
        // RGB is what OCR consumes; converting straight from BGRA avoids an
        // intermediate full-frame RGBA buffer per page.
        let rgb_bytes = bitmap.to_rgb();

        rendered.push(RenderedPage {
            idx,
            rgb_bytes,
            width,
            height,
        });
    }
    Ok(rendered)
}

/// Run OCR on pre-rendered page bitmaps and merge results into `pages`.
pub(crate) async fn ocr_and_merge_rendered(
    pages: &mut [Page],
    rendered: Vec<RenderedPage>,
    dpi: f32,
    ocr_engine: Arc<dyn OcrEngine>,
    ocr_language: &str,
    num_workers: usize,
) -> Result<(), LiteParseError> {
    // Phase 1: spawn one async task per page. A semaphore limits how many run
    // `recognize` concurrently to `num_workers`.
    //
    // The permit MUST be acquired in async context (`acquire_owned().await`),
    // not inside `spawn_blocking` via `block_on`. Acquiring it on a blocking
    // thread parks that OS thread until a permit is free; with more pages than
    // tokio's blocking pool (default `max_blocking_threads = 512`), every pool
    // thread ends up parked waiting on the semaphore. The single task holding
    // the permit then calls `recognize`, whose HTTP client resolves DNS via its
    // own internal `spawn_blocking` — which can never get a thread, so the
    // request never goes out, the permit is never released, and the whole OCR
    // pass deadlocks. Acquiring the permit asynchronously parks the lightweight
    // task instead, so only `num_workers` blocking threads are ever consumed.
    let num_workers = num_workers.max(1);
    let semaphore = Arc::new(tokio::sync::Semaphore::new(num_workers));
    let mut handles = Vec::with_capacity(rendered.len());

    let handle = tokio::runtime::Handle::current();

    for r in rendered {
        let engine = ocr_engine.clone();
        let sem = semaphore.clone();
        let language = ocr_language.to_string();
        let page_number = pages[r.idx].page_number;
        let rt_handle = handle.clone();

        handles.push((
            r.idx,
            page_number,
            tokio::spawn(async move {
                // Park the task (not an OS thread) until a permit is available.
                let _permit = sem.acquire_owned().await.expect("semaphore closed");
                let options = OcrOptions { language };
                // Offload the (possibly CPU-blocking, e.g. Tesseract) recognize
                // onto a blocking thread. Because the permit is already held,
                // at most `num_workers` blocking threads are in use at once,
                // leaving the rest of the pool free for the HTTP client's
                // internal DNS resolution.
                match tokio::task::spawn_blocking(move || {
                    rt_handle.block_on(engine.recognize(&r.rgb_bytes, r.width, r.height, &options))
                })
                .await
                {
                    Ok(result) => result,
                    Err(join_err) => {
                        Err(Box::new(join_err) as Box<dyn std::error::Error + Send + Sync>)
                    }
                }
            }),
        ));
    }

    // Phase 3: collect results and merge into pages.
    let scale_factor = 72.0 / dpi;

    // Track OCR task outcomes so we can distinguish a systemic failure (e.g.
    // missing Tesseract language data, which fails identically on every page)
    // from incidental per-page failures. Without this, every page logs the same
    // error and `parse()` still returns "success" with no OCR text.
    //
    // We additionally track whether any *sparse-text* page failed: a page is
    // rendered for OCR if it has sparse native text OR merely contains an image
    // (`needs_ocr = text_length < 20 || text_coverage < 0.15 || has_images`).
    // A native-text PDF with a logo on every page is rendered for OCR
    // enrichment but already has all its text. We must only fail loud when OCR
    // failure destroyed a sparse page's likely primary text source — otherwise
    // a broken OCR setup would abort perfectly good native-text documents.
    let total_tasks = handles.len();
    let mut failed_tasks = 0usize;
    let mut failed_sparse_text_page = false;
    let mut first_error: Option<String> = None;

    for (idx, page_number, handle) in handles {
        let ocr_results: Vec<OcrResult> = match handle.await {
            Ok(Ok(results)) => results,
            Ok(Err(e)) => {
                failed_tasks += 1;
                failed_sparse_text_page |= page_has_sparse_native_text(&pages[idx]);
                // Only log the first failure to avoid flooding stderr with an
                // identical message for every page.
                if first_error.is_none() {
                    let msg = e.to_string();
                    eprintln!("[ocr] failed for page {}: {}", page_number, msg);
                    first_error = Some(msg);
                }
                continue;
            }
            Err(e) => {
                failed_tasks += 1;
                failed_sparse_text_page |= page_has_sparse_native_text(&pages[idx]);
                if first_error.is_none() {
                    let msg = e.to_string();
                    eprintln!("[ocr] task panicked for page {}: {}", page_number, msg);
                    first_error = Some(msg);
                }
                continue;
            }
        };

        if ocr_results.is_empty() {
            continue;
        }

        let page = &mut pages[idx];
        // Drop unusable native items (substitution-cipher cmap corruption, or
        // unmappable Type3 text) so OCR can replace them. Without this,
        // garbled-but-spatially-present native text suppresses every OCR
        // result that overlaps it via the overlap check below, leaving the
        // output stuck with unreadable text. We apply both per-item and
        // per-page checks: short garbled labels ("GDWH", "XVG") can't be
        // flagged alone, but their host page can.
        if page_is_garbled(page) {
            page.text_items.clear();
        } else {
            page.text_items.retain(|item| !is_unusable_native(item));
        }

        // Only check overlap against native (already-extracted) PDF text. Comparing
        // each OCR result against previously-accepted OCR results caused adjacent
        // OCR lines whose bounding boxes touched within tolerance to suppress each
        // other, dropping every second line on scanned pages.
        let native_count = page.text_items.len();
        for r in &ocr_results {
            if r.confidence <= 0.1 {
                continue;
            }

            // Prefer the screen-space axis-aligned bbox derived from the polygon
            // (when present) so rotated detections carry a tight upright bbox.
            // The polygon also lets us recover an explicit rotation angle so the
            // projector can route rotated sidebar text through its rotation
            // reading-order handler instead of mistaking it for body text.
            let (ocr_x, ocr_y, ocr_w, ocr_h, rotation) = match r.polygon {
                Some(poly) => {
                    let xs = [poly[0][0], poly[1][0], poly[2][0], poly[3][0]];
                    let ys = [poly[0][1], poly[1][1], poly[2][1], poly[3][1]];
                    let x_min = xs.iter().copied().fold(f32::INFINITY, f32::min);
                    let x_max = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
                    let y_min = ys.iter().copied().fold(f32::INFINITY, f32::min);
                    let y_max = ys.iter().copied().fold(f32::NEG_INFINITY, f32::max);
                    let rot = polygon_rotation_deg(&poly);
                    (
                        x_min * scale_factor,
                        y_min * scale_factor,
                        (x_max - x_min) * scale_factor,
                        (y_max - y_min) * scale_factor,
                        rot,
                    )
                }
                None => (
                    r.bbox[0] * scale_factor,
                    r.bbox[1] * scale_factor,
                    (r.bbox[2] - r.bbox[0]) * scale_factor,
                    (r.bbox[3] - r.bbox[1]) * scale_factor,
                    0.0,
                ),
            };

            if overlaps_existing_text(
                &page.text_items[..native_count],
                ocr_x,
                ocr_y,
                ocr_w,
                ocr_h,
                2.0,
            ) {
                continue;
            }

            let cleaned = clean_ocr_table_artifacts(&r.text);
            if cleaned.is_empty() {
                continue;
            }

            // For native rotated text the font_size approximates line height,
            // which for 90/270° rotations corresponds to the *narrow* screen
            // dimension. Use the perpendicular extent for rotated OCR text so
            // downstream font-size heuristics stay sane.
            let font_size_hint = if rotation == 90.0 || rotation == 270.0 {
                ocr_w.max(1.0)
            } else {
                ocr_h
            };

            page.text_items.push(TextItem {
                text: cleaned,
                x: ocr_x,
                y: ocr_y,
                width: ocr_w,
                height: ocr_h,
                rotation,
                font_name: Some("OCR".to_string()),
                font_size: Some(font_size_hint),
                confidence: Some((r.confidence * 1000.0).round() / 1000.0),
                ..Default::default()
            });
        }
    }

    // If every OCR task failed *and* at least one of those failures was on a
    // sparse-text page (the same length/coverage predicate that sends pages to
    // OCR as text-poor in `render_pages_for_ocr`), treat it as a systemic
    // failure. Returning an error surfaces the root cause (e.g. missing language
    // data) instead of silently emitting an empty or mostly-empty page. We
    // deliberately do NOT fail when the only failures were on pages that already
    // had substantial native text and were merely rendered for image-based OCR
    // enrichment — a broken OCR setup must not abort an otherwise-good
    // native-text document.
    if total_tasks > 0 && failed_tasks == total_tasks && failed_sparse_text_page {
        let detail = first_error.unwrap_or_else(|| "unknown error".to_string());
        return Err(LiteParseError::Ocr(format!(
            "OCR failed for all {} page(s): {}",
            total_tasks, detail
        )));
    }

    // Surface a concise summary for partial failures without flooding stderr.
    if failed_tasks > 0 {
        eprintln!(
            "[ocr] {}/{} page(s) failed OCR; continuing with partial results",
            failed_tasks, total_tasks
        );
    }

    Ok(())
}

/// True when the page's native (already-extracted) text is sparse enough that
/// OCR is likely its primary text source. Mirrors the non-image predicates in
/// `render_pages_for_ocr` (`text_length < 20 || text_coverage < 0.15`) so the
/// systemic-failure guard matches the same pages that were rendered because
/// their native text was insufficient.
fn page_has_sparse_native_text(page: &Page) -> bool {
    let text_length: usize = page
        .text_items
        .iter()
        .filter(|item| !is_unusable_native(item))
        .map(|item| item.text.len())
        .sum();
    let page_area = page.page_width * page.page_height;
    let text_bbox_area: f32 = page
        .text_items
        .iter()
        .filter(|item| !is_unusable_native(item))
        .map(|item| item.width * item.height)
        .sum();
    let text_coverage = if page_area > 0.0 {
        text_bbox_area / page_area
    } else {
        0.0
    };

    text_length < 20 || text_coverage < 0.15
}

/// A native text item that cannot be trusted as a text source: either its
/// Unicode mapping failed outright (Type3 fonts with no ToUnicode — the text
/// is a char-code fallback and the bbox comes from deceptive declared
/// metrics), or its content looks substitution-cipher garbled.
fn is_unusable_native(item: &TextItem) -> bool {
    item.has_unicode_map_error || is_likely_garbled(&item.text)
}

/// Total area of filled vector paths not accounted for by native text items.
/// Glyph outlines drawn as paths produce filled regions with no overlapping
/// text item; rules and table borders are stroke-only and already excluded
/// upstream, and shading rects behind real text are subtracted away by the
/// text overlap. Coverage is approximated by summing per-item intersections
/// (clamped to the path's own area), which can only over-estimate coverage —
/// i.e. err toward not triggering OCR.
fn uncovered_path_area(paths: &[ImageBounds], items: &[TextItem]) -> f32 {
    let mut uncovered = 0.0f32;
    for p in paths {
        let p_area = p.width * p.height;
        if p_area <= 0.0 {
            continue;
        }
        let mut covered = 0.0f32;
        for item in items {
            let ix = (p.x + p.width).min(item.x + item.width) - p.x.max(item.x);
            let iy = (p.y + p.height).min(item.y + item.height) - p.y.max(item.y);
            if ix > 0.0 && iy > 0.0 {
                covered += ix * iy;
                if covered >= p_area {
                    break;
                }
            }
        }
        uncovered += (p_area - covered).max(0.0);
    }
    uncovered
}

/// Heuristic for substitution-cipher / broken-cmap garbling: real Latin-script
/// text has a vowel ratio of roughly 30–45%, but a substitution permutation
/// almost always maps the original A/E/I/O/U onto non-vowel letters, driving
/// the apparent vowel ratio to near zero. Texts without enough ASCII letters
/// to judge (non-Latin scripts, numbers, short labels) are treated as fine.
fn is_likely_garbled(text: &str) -> bool {
    let (letters, vowels) = count_letters_and_vowels(text);
    if letters < 10 {
        return false;
    }
    vowels * 10 < letters
}

fn count_letters_and_vowels(text: &str) -> (usize, usize) {
    let mut letters = 0usize;
    let mut vowels = 0usize;
    for ch in text.chars() {
        if ch.is_ascii_alphabetic() {
            letters += 1;
            if matches!(ch.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u') {
                vowels += 1;
            }
        }
    }
    (letters, vowels)
}

/// Page-level garbled check: even when individual items are too short to judge
/// in isolation (e.g. "GDWH", "FXUUHQFB XVG"), a page whose aggregate vowel
/// ratio collapses to single digits is almost certainly substitution-encoded.
/// Used to drop all native items on the page before OCR merge, so short
/// garbled labels don't suppress overlapping OCR results.
fn page_is_garbled(page: &Page) -> bool {
    let mut total_letters = 0usize;
    let mut total_vowels = 0usize;
    for it in &page.text_items {
        let (l, v) = count_letters_and_vowels(&it.text);
        total_letters += l;
        total_vowels += v;
    }
    if total_letters < 30 {
        return false;
    }
    // Real Latin-script vowel ratios sit ~30–45% across English, Portuguese,
    // Spanish, French, etc. A page-wide ratio under 20% is well outside any
    // natural-language range and signals substitution-style corruption. (A
    // simple +3 Caesar shift still leaves some U/Y letters from the original
    // O/Y mapping, so a 10% bound is too tight to catch this in practice.)
    total_vowels * 5 < total_letters
}

/// Recover a discrete CCW rotation in degrees from a 4-point OCR polygon.
/// Returns one of 0.0, 90.0, 180.0, 270.0 — snapping to the nearest right
/// angle — or 0.0 for nearly-square/degenerate polygons.
///
/// Point ordering varies between OCR engines: some emit TL→TR→BR→BL in the
/// glyphs' upright reading frame (so poly[0]→poly[1] is always the reading
/// direction), but others (notably PaddleOCR 3.x with
/// `use_textline_orientation=True`) emit polygons in screen-axis order, where
/// poly[0]→poly[1] is always horizontal in screen space regardless of how the
/// text actually reads. To handle both, we pick the *longer* of the two
/// adjacent edges as the reading direction — the text always runs along the
/// long axis of its bounding quadrilateral.
fn polygon_rotation_deg(poly: &[[f32; 2]; 4]) -> f32 {
    let e0 = [poly[1][0] - poly[0][0], poly[1][1] - poly[0][1]];
    let e1 = [poly[2][0] - poly[1][0], poly[2][1] - poly[1][1]];
    let len0 = (e0[0] * e0[0] + e0[1] * e0[1]).sqrt();
    let len1 = (e1[0] * e1[0] + e1[1] * e1[1]).sqrt();
    if len0.max(len1) < 1.0 {
        return 0.0;
    }
    // Treat near-square polygons as un-rotated — there's no reliable reading
    // axis to pick from. Single-char/CJK detections fall in here.
    let (longer, shorter) = if len0 >= len1 {
        (len0, len1)
    } else {
        (len1, len0)
    };
    if shorter > 0.0 && longer / shorter < 1.3 {
        return 0.0;
    }
    let reading = if len0 >= len1 { e0 } else { e1 };
    // atan2 with screen-down y; negate to get the conventional CCW angle.
    let angle_ccw = -reading[1].atan2(reading[0]).to_degrees();
    let normalized = angle_ccw.rem_euclid(360.0);
    ((normalized / 90.0).round() as i32 * 90).rem_euclid(360) as f32
}

/// Check if an OCR bounding box overlaps with any existing text item.
fn overlaps_existing_text(
    items: &[TextItem],
    ocr_x: f32,
    ocr_y: f32,
    ocr_w: f32,
    ocr_h: f32,
    tolerance: f32,
) -> bool {
    for item in items {
        let item_right = item.x + item.width;
        let item_bottom = item.y + item.height;

        let overlap_x = ocr_x < item_right + tolerance && ocr_x + ocr_w > item.x - tolerance;
        let overlap_y = ocr_y < item_bottom + tolerance && ocr_y + ocr_h > item.y - tolerance;

        if overlap_x && overlap_y {
            return true;
        }
    }
    false
}

/// Clean common OCR artifacts from table border misreads.
/// OCR often misreads vertical table border lines as bracket-like characters.
fn clean_ocr_table_artifacts(text: &str) -> String {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    // Strip leading/trailing border artifact characters: | [ ] ( ) { }
    let without_artifacts: &str = trimmed
        .trim_start_matches(['|', '[', ']', '(', ')', '{', '}'])
        .trim_end_matches(['|', '[', ']', '(', ')', '{', '}'])
        .trim();

    if without_artifacts.is_empty() {
        return trimmed.to_string();
    }

    // Only use cleaned version if core content looks numeric-ish
    // This avoids incorrectly stripping brackets from content like "(note)"
    let is_numeric_ish = without_artifacts
        .chars()
        .all(|c| c.is_ascii_digit() || matches!(c, ',' | '.' | ' ' | '%' | '-' | '+' | '*' | '/'))
        || without_artifacts == "N/A"
        || without_artifacts == "Z"
        || without_artifacts == "-";

    if is_numeric_ish {
        without_artifacts.to_string()
    } else {
        trimmed.to_string()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_polygon_rotation_horizontal() {
        let p = [[0.0, 0.0], [100.0, 0.0], [100.0, 20.0], [0.0, 20.0]];
        assert_eq!(polygon_rotation_deg(&p), 0.0);
    }

    #[test]
    fn test_polygon_rotation_90_ccw() {
        // Upright text rotated 90° CCW: TL→TR edge points upward (screen y decreasing).
        let p = [[10.0, 100.0], [10.0, 0.0], [30.0, 0.0], [30.0, 100.0]];
        assert_eq!(polygon_rotation_deg(&p), 90.0);
    }

    #[test]
    fn test_polygon_rotation_270_ccw() {
        // Upright text rotated 270° CCW (= 90° CW): TL→TR edge points downward.
        let p = [[10.0, 0.0], [10.0, 100.0], [30.0, 100.0], [30.0, 0.0]];
        assert_eq!(polygon_rotation_deg(&p), 270.0);
    }

    #[test]
    fn test_polygon_rotation_screen_axis_vertical() {
        // PaddleOCR-style: tall+narrow sidebar polygon in screen-axis order
        // (smallest-y first). poly[0]→poly[1] is the SHORT horizontal edge,
        // not the reading direction. The longer edge picks out the rotation.
        let p = [[20.0, 50.0], [50.0, 50.0], [50.0, 750.0], [20.0, 750.0]];
        let r = polygon_rotation_deg(&p);
        assert!(r == 90.0 || r == 270.0, "expected 90 or 270, got {r}");
    }

    #[test]
    fn test_polygon_rotation_near_square() {
        // Single-char detections (CJK glyphs, etc.) should not be classified
        // as rotated — there's no reliable reading axis.
        let p = [[0.0, 0.0], [20.0, 0.0], [20.0, 22.0], [0.0, 22.0]];
        assert_eq!(polygon_rotation_deg(&p), 0.0);
    }

    #[test]
    fn test_polygon_rotation_180() {
        let p = [[100.0, 20.0], [0.0, 20.0], [0.0, 0.0], [100.0, 0.0]];
        assert_eq!(polygon_rotation_deg(&p), 180.0);
    }

    #[test]
    fn test_clean_ocr_table_artifacts() {
        assert_eq!(clean_ocr_table_artifacts("44520]"), "44520");
        assert_eq!(clean_ocr_table_artifacts("|123"), "123");
        assert_eq!(clean_ocr_table_artifacts("0.3|"), "0.3");
        assert_eq!(clean_ocr_table_artifacts("(note)"), "(note)");
        assert_eq!(clean_ocr_table_artifacts("|hello|"), "|hello|");
        assert_eq!(clean_ocr_table_artifacts("N/A"), "N/A");
        assert_eq!(clean_ocr_table_artifacts(""), "");
        assert_eq!(clean_ocr_table_artifacts("|||"), "|||");
    }

    fn make_item(x: f32, y: f32, w: f32, h: f32) -> TextItem {
        TextItem {
            text: "x".into(),
            x,
            y,
            width: w,
            height: h,
            ..Default::default()
        }
    }

    #[test]
    fn test_overlaps_existing_text_inside() {
        let items = vec![make_item(10.0, 10.0, 20.0, 5.0)];
        assert!(overlaps_existing_text(&items, 12.0, 11.0, 5.0, 2.0, 2.0));
    }

    #[test]
    fn test_overlaps_existing_text_disjoint() {
        let items = vec![make_item(10.0, 10.0, 20.0, 5.0)];
        assert!(!overlaps_existing_text(&items, 100.0, 100.0, 5.0, 5.0, 2.0));
    }

    #[test]
    fn test_overlaps_existing_text_tolerance() {
        let items = vec![make_item(10.0, 10.0, 20.0, 5.0)];
        // Just outside but within tolerance
        assert!(overlaps_existing_text(&items, 31.0, 10.0, 5.0, 5.0, 2.0));
        // Beyond tolerance
        assert!(!overlaps_existing_text(&items, 35.0, 10.0, 5.0, 5.0, 2.0));
    }

    #[test]
    fn test_overlaps_empty() {
        assert!(!overlaps_existing_text(&[], 0.0, 0.0, 1.0, 1.0, 0.0));
    }

    fn pb(x: f32, y: f32, w: f32, h: f32) -> ImageBounds {
        ImageBounds {
            x,
            y,
            width: w,
            height: h,
        }
    }

    #[test]
    fn test_uncovered_path_area_no_text() {
        // A sentence-sized outlined region with no native text at all.
        let paths = vec![pb(50.0, 300.0, 200.0, 12.0)];
        let area = uncovered_path_area(&paths, &[]);
        assert!((area - 2400.0).abs() < 1.0);
        assert!(area >= UNCOVERED_VECTOR_AREA_THRESHOLD);
    }

    #[test]
    fn test_uncovered_path_area_fully_covered_by_text() {
        // Shading rect behind real text: fully covered, must not trigger.
        let paths = vec![pb(50.0, 300.0, 200.0, 12.0)];
        let items = vec![make_item(40.0, 295.0, 250.0, 25.0)];
        let area = uncovered_path_area(&paths, &items);
        assert_eq!(area, 0.0);
    }

    #[test]
    fn test_uncovered_path_area_partial_coverage() {
        // Half the outlined region is covered by a text item.
        let paths = vec![pb(0.0, 0.0, 100.0, 10.0)];
        let items = vec![make_item(0.0, 0.0, 50.0, 10.0)];
        let area = uncovered_path_area(&paths, &items);
        assert!((area - 500.0).abs() < 1.0);
    }

    #[test]
    fn test_uncovered_path_area_small_decoration_below_threshold() {
        // A few bullet-sized filled paths shouldn't reach the threshold.
        let paths = vec![pb(10.0, 10.0, 8.0, 8.0), pb(10.0, 30.0, 8.0, 8.0)];
        let area = uncovered_path_area(&paths, &[]);
        assert!(area < UNCOVERED_VECTOR_AREA_THRESHOLD);
    }

    #[test]
    fn test_unusable_native_unicode_map_error() {
        let mut item = make_item(0.0, 0.0, 10.0, 10.0);
        assert!(!is_unusable_native(&item));
        item.has_unicode_map_error = true;
        assert!(is_unusable_native(&item));
    }

    #[test]
    fn test_clean_ocr_keeps_whitespace_trimmed() {
        assert_eq!(clean_ocr_table_artifacts("   "), "");
        assert_eq!(clean_ocr_table_artifacts(" 123 "), "123");
    }

    // A mock OCR engine that always fails, simulating a systemic error such as
    // missing Tesseract language data (the root cause behind issue #253).
    struct FailingEngine;
    impl OcrEngine for FailingEngine {
        fn name(&self) -> &str {
            "failing"
        }
        fn recognize<'a, 'b: 'a, 'c: 'a>(
            &'a self,
            _image_data: &'c [u8],
            _width: u32,
            _height: u32,
            _options: &'b OcrOptions,
        ) -> std::pin::Pin<
            Box<
                dyn Future<
                        Output = Result<Vec<OcrResult>, Box<dyn std::error::Error + Send + Sync>>,
                    > + Send
                    + '_,
            >,
        > {
            Box::pin(async move { Err("Error opening data file tessdata/eng.traineddata".into()) })
        }
    }

    fn make_blank_page(page_number: usize) -> Page {
        Page {
            page_number,
            page_width: 100.0,
            page_height: 100.0,
            text_items: Vec::new(),
        }
    }

    fn make_rendered(idx: usize) -> RenderedPage {
        RenderedPage {
            idx,
            // 1x1 RGB pixel; the engine never inspects it.
            rgb_bytes: vec![0u8, 0u8, 0u8],
            width: 1,
            height: 1,
        }
    }

    // A page that already has substantial native text coverage, as would be the
    // case for a native-text PDF page that was only rendered for OCR because it
    // also contains an image.
    fn make_native_text_page(page_number: usize) -> Page {
        Page {
            page_number,
            page_width: 100.0,
            page_height: 100.0,
            text_items: vec![TextItem {
                text: "this page already has real native text content".into(),
                x: 0.0,
                y: 0.0,
                width: 50.0,
                height: 50.0,
                ..Default::default()
            }],
        }
    }

    // A page with >20 bytes of native text but very low page coverage. These
    // are still text-poor enough that `render_pages_for_ocr` sends them to OCR
    // (`text_coverage < 0.15`), so a systemic OCR failure should not be silently
    // swallowed.
    fn make_low_coverage_text_page(page_number: usize) -> Page {
        Page {
            page_number,
            page_width: 100.0,
            page_height: 100.0,
            text_items: vec![TextItem {
                text: "small native header that is not enough".into(),
                x: 0.0,
                y: 0.0,
                width: 10.0,
                height: 5.0,
                ..Default::default()
            }],
        }
    }

    // When every OCR task fails (e.g. missing language data), the function must
    // return an error instead of silently reporting success with no OCR text.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_all_pages_fail_returns_error() {
        let mut pages = vec![make_blank_page(1), make_blank_page(2)];
        let rendered = vec![make_rendered(0), make_rendered(1)];
        let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);

        let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;

        let err = result.expect_err("expected systemic OCR failure to be surfaced");
        let msg = err.to_string();
        assert!(
            msg.contains("OCR failed for all 2 page(s)"),
            "unexpected error message: {msg}"
        );
        assert!(
            msg.contains("traineddata"),
            "error should carry the underlying cause: {msg}"
        );
    }

    // With no rendered pages there is nothing to OCR; this must remain a no-op
    // success rather than tripping the all-failed guard.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_no_rendered_pages_is_ok() {
        let mut pages = vec![make_blank_page(1)];
        let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);

        let result = ocr_and_merge_rendered(&mut pages, Vec::new(), 72.0, engine, "eng", 2).await;

        assert!(result.is_ok(), "empty OCR set should succeed: {result:?}");
    }

    // Regression guard: when OCR fails but every failing page already had native
    // text (it was only rendered for image-based enrichment), a broken OCR setup
    // must NOT abort the parse — the native text is still valid output.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_native_text_pages_not_failed_on_ocr_error() {
        let mut pages = vec![make_native_text_page(1), make_native_text_page(2)];
        let rendered = vec![make_rendered(0), make_rendered(1)];
        let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);

        let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;

        assert!(
            result.is_ok(),
            "OCR failure on already-native-text pages must not abort the parse: {result:?}"
        );
        // Native text is preserved untouched.
        assert_eq!(pages[0].text_items.len(), 1);
        assert_eq!(pages[1].text_items.len(), 1);
    }

    // When failures span both a sparse-text page and a native-text page, the
    // sparse-text page lost its likely primary text source, so we still fail
    // loud.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_mixed_failure_with_sparse_text_page_returns_error() {
        let mut pages = vec![make_native_text_page(1), make_blank_page(2)];
        let rendered = vec![make_rendered(0), make_rendered(1)];
        let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);

        let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;

        let err = result.expect_err("a text-starved page losing all OCR must surface an error");
        assert!(
            err.to_string().contains("OCR failed for all 2 page(s)"),
            "unexpected error message: {err}"
        );
    }

    // Regression guard for the review finding: low-coverage pages are rendered
    // for OCR even when their native text length is >20 bytes. A systemic OCR
    // failure on such pages must still surface as an error.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_low_coverage_text_page_failure_returns_error() {
        let mut pages = vec![make_low_coverage_text_page(1)];
        let rendered = vec![make_rendered(0)];
        let engine: Arc<dyn OcrEngine> = Arc::new(FailingEngine);

        let result = ocr_and_merge_rendered(&mut pages, rendered, 72.0, engine, "eng", 2).await;

        let err = result.expect_err("low-coverage text page losing OCR must surface an error");
        assert!(
            err.to_string().contains("OCR failed for all 1 page(s)"),
            "unexpected error message: {err}"
        );
    }
}