liteparse 2.0.8

Fast, lightweight PDF and document parsing with spatial text extraction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
use std::sync::Arc;

use crate::error::LiteParseError;
use crate::ocr::{OcrEngine, OcrOptions, OcrResult};
use crate::types::{Page, TextItem};
use pdfium::{Document, ImageBounds};

/// Minimum dark filled-path area (pt², ~72 DPI page space) not covered by
/// native text before a page is sent to OCR. 400 pt² is roughly one word at
/// a 10–12pt size; anything smaller is likely a bullet, icon, dot leader, or
/// decoration whose loss is acceptable. A false trigger only costs an extra
/// OCR pass (the overlap filter discards OCR results that duplicate native
/// text); measured on a 121-page financial report, the trigger fires on ~3
/// pages at this threshold.
///
/// `render_pages_for_ocr` compares the value returned by
/// `uncovered_path_area` against this constant with `>=`, so an uncovered
/// area of exactly 400.0 does trigger OCR.
const UNCOVERED_VECTOR_AREA_THRESHOLD: f32 = 400.0;

/// Owned page bitmap prepared for OCR. Indices refer to positions in the `pages` slice.
///
/// Holds only owned data (no handle to the renderer), so it can be moved into
/// the per-page OCR tasks spawned by `ocr_and_merge_rendered`.
pub(crate) struct RenderedPage {
    /// Position of the source page within the `pages` slice. This is the
    /// slice index, not the PDF page number.
    pub idx: usize,
    /// Pixel data produced by the bitmap's `to_rgb()` conversion and handed
    /// to the OCR engine unchanged.
    /// NOTE(review): presumably tightly packed 3-bytes-per-pixel RGB — confirm
    /// against the bitmap implementation.
    pub rgb_bytes: Vec<u8>,
    /// Bitmap width as reported by the renderer (presumably pixels at the
    /// requested DPI — confirm).
    pub width: u32,
    /// Bitmap height as reported by the renderer (presumably pixels at the
    /// requested DPI — confirm).
    pub height: u32,
}

/// Pick out the pages that need OCR and rasterize them from an open document.
///
/// A page is rendered when at least one of the following holds:
///
/// * its usable native text is shorter than 20 bytes,
/// * the summed bounding-box area of its usable native text covers less than
///   15% of the page,
/// * the page object reports at least one image region,
/// * the page as a whole looks substitution-garbled (`page_is_garbled`), or
/// * filled vector paths not covered by native text add up to at least
///   `UNCOVERED_VECTOR_AREA_THRESHOLD`.
///
/// "Usable" excludes items rejected by `is_unusable_native`: text from a
/// broken cmap or an unmappable Type3 font looks populated but is unreadable,
/// and would otherwise let such a page skip OCR.
///
/// The pdfium `Document` holds raw pointers that are not `Send`, so callers
/// must drop it before awaiting the OCR engine; that is why the output is a
/// list of fully owned bitmaps.
///
/// # Errors
///
/// Propagates any error from looking up a page in `document` or from
/// rendering it.
pub(crate) fn render_pages_for_ocr(
    document: &Document,
    pages: &[Page],
    dpi: f32,
) -> Result<Vec<RenderedPage>, LiteParseError> {
    let mut out = Vec::new();

    for (idx, page) in pages.iter().enumerate() {
        // One pass over the trustworthy native items gathers both the text
        // length and the bounding-box area they occupy.
        let mut usable_len = 0usize;
        let mut usable_area = 0.0f32;
        for item in &page.text_items {
            if is_unusable_native(item) {
                continue;
            }
            usable_len += item.text.len();
            usable_area += item.width * item.height;
        }

        // `page_number` is 1-based; the document lookup is 0-based.
        let pdf_page = document.page((page.page_number - 1) as i32)?;
        let has_images = !pdf_page.image_bounds(25.0, 0.9).is_empty();

        let page_area = page.page_width * page.page_height;
        let coverage = if page_area > 0.0 {
            usable_area / page_area
        } else {
            0.0
        };

        let cheap_trigger =
            usable_len < 20 || coverage < 0.15 || has_images || page_is_garbled(page);

        // Glyphs drawn as filled outlines are neither text items nor image
        // XObjects, so a text-dense page of them trips none of the cheap
        // triggers. The page-object walk is the expensive check, so the
        // short-circuit keeps it for pages that passed everything else.
        let needs_ocr = cheap_trigger || {
            let filled = pdf_page.filled_path_bounds(3.0, 0.9);
            uncovered_path_area(&filled, &page.text_items) >= UNCOVERED_VECTOR_AREA_THRESHOLD
        };
        if !needs_ocr {
            continue;
        }

        let bitmap = pdf_page.render(dpi)?;
        let (width, height) = (bitmap.width() as u32, bitmap.height() as u32);
        out.push(RenderedPage {
            idx,
            // OCR consumes RGB; converting directly from the rendered bitmap
            // avoids an intermediate full-frame RGBA buffer per page.
            rgb_bytes: bitmap.to_rgb(),
            width,
            height,
        });
    }

    Ok(out)
}

/// Run OCR on pre-rendered page bitmaps and merge results into `pages`.
///
/// Every entry of `rendered` is recognized on its own spawned task; a
/// semaphore keeps at most `num_workers` (raised to 1 if 0 is passed)
/// recognitions in flight at once. Accepted results are appended to
/// `pages[idx].text_items` as `TextItem`s whose `font_name` is `"OCR"`, with
/// coordinates multiplied by `72.0 / dpi`. `dpi` is therefore expected to be
/// the same value the bitmaps were rendered at.
///
/// An OCR result is dropped when its confidence is `<= 0.1`, when it overlaps
/// native text that is still on the page (within a tolerance of 2.0), or when
/// its cleaned text is empty.
///
/// # Errors
///
/// Returns `LiteParseError::Ocr` only when every OCR task failed and at least
/// one of the failed pages has sparse native text (see
/// `page_has_sparse_native_text`). Any other failure pattern is summarized on
/// stderr and the function returns `Ok(())` with partial results.
///
/// # Panics
///
/// Panics if a `RenderedPage::idx` is out of bounds for `pages`. The function
/// calls `tokio::runtime::Handle::current()`, so it has to run inside a Tokio
/// runtime.
pub(crate) async fn ocr_and_merge_rendered(
    pages: &mut [Page],
    rendered: Vec<RenderedPage>,
    dpi: f32,
    ocr_engine: Arc<dyn OcrEngine>,
    ocr_language: &str,
    num_workers: usize,
) -> Result<(), LiteParseError> {
    // Phase 1: spawn one async task per page. A semaphore limits how many run
    // `recognize` concurrently to `num_workers`.
    //
    // The permit MUST be acquired in async context (`acquire_owned().await`),
    // not inside `spawn_blocking` via `block_on`. Acquiring it on a blocking
    // thread parks that OS thread until a permit is free; with more pages than
    // tokio's blocking pool (default `max_blocking_threads = 512`), every pool
    // thread ends up parked waiting on the semaphore. The single task holding
    // the permit then calls `recognize`, whose HTTP client resolves DNS via its
    // own internal `spawn_blocking` — which can never get a thread, so the
    // request never goes out, the permit is never released, and the whole OCR
    // pass deadlocks. Acquiring the permit asynchronously parks the lightweight
    // task instead, so only `num_workers` blocking threads are ever consumed.
    let num_workers = num_workers.max(1);
    let semaphore = Arc::new(tokio::sync::Semaphore::new(num_workers));
    let mut handles = Vec::with_capacity(rendered.len());

    // Captured once so each blocking closure can drive the async `recognize`
    // future to completion with `block_on`.
    let handle = tokio::runtime::Handle::current();

    for r in rendered {
        let engine = ocr_engine.clone();
        let sem = semaphore.clone();
        let language = ocr_language.to_string();
        let page_number = pages[r.idx].page_number;
        let rt_handle = handle.clone();

        handles.push((
            r.idx,
            page_number,
            tokio::spawn(async move {
                // Park the task (not an OS thread) until a permit is available.
                let _permit = sem.acquire_owned().await.expect("semaphore closed");
                let options = OcrOptions { language };
                // Offload the (possibly CPU-blocking, e.g. Tesseract) recognize
                // onto a blocking thread. Because the permit is already held,
                // at most `num_workers` blocking threads are in use at once,
                // leaving the rest of the pool free for the HTTP client's
                // internal DNS resolution.
                match tokio::task::spawn_blocking(move || {
                    rt_handle.block_on(engine.recognize(&r.rgb_bytes, r.width, r.height, &options))
                })
                .await
                {
                    Ok(result) => result,
                    // A join error from the blocking closure is folded into
                    // the ordinary error channel, so the collection loop
                    // below reports it as "failed" rather than "panicked".
                    Err(join_err) => {
                        Err(Box::new(join_err) as Box<dyn std::error::Error + Send + Sync>)
                    }
                }
            }),
        ));
    }

    // Phase 2: collect results and merge into pages.
    // OCR coordinates are scaled by 72/dpi to bring them into the same space
    // as the native text items they are compared with below.
    let scale_factor = 72.0 / dpi;

    // Track OCR task outcomes so we can distinguish a systemic failure (e.g.
    // missing Tesseract language data, which fails identically on every page)
    // from incidental per-page failures. Without this, every page logs the same
    // error and `parse()` still returns "success" with no OCR text.
    //
    // We additionally track whether any *sparse-text* page failed: a page is
    // rendered for OCR if it has sparse native text OR for an unrelated reason
    // such as merely containing an image (see the `needs_ocr` computation in
    // `render_pages_for_ocr` for the full list of triggers).
    // A native-text PDF with a logo on every page is rendered for OCR
    // enrichment but already has all its text. We must only fail loud when OCR
    // failure destroyed a sparse page's likely primary text source — otherwise
    // a broken OCR setup would abort perfectly good native-text documents.
    let total_tasks = handles.len();
    let mut failed_tasks = 0usize;
    let mut failed_sparse_text_page = false;
    let mut first_error: Option<String> = None;

    // Tasks are awaited in submission order, so results are merged page by
    // page in the order of `rendered` regardless of which task finishes first.
    for (idx, page_number, handle) in handles {
        let ocr_results: Vec<OcrResult> = match handle.await {
            Ok(Ok(results)) => results,
            // `recognize` returned an error (or its blocking closure failed to
            // join — see the mapping inside the spawned task).
            Ok(Err(e)) => {
                failed_tasks += 1;
                failed_sparse_text_page |= page_has_sparse_native_text(&pages[idx]);
                // Only log the first failure to avoid flooding stderr with an
                // identical message for every page.
                if first_error.is_none() {
                    let msg = e.to_string();
                    eprintln!("[ocr] failed for page {}: {}", page_number, msg);
                    first_error = Some(msg);
                }
                continue;
            }
            // The outer spawned task itself could not be joined.
            Err(e) => {
                failed_tasks += 1;
                failed_sparse_text_page |= page_has_sparse_native_text(&pages[idx]);
                if first_error.is_none() {
                    let msg = e.to_string();
                    eprintln!("[ocr] task panicked for page {}: {}", page_number, msg);
                    first_error = Some(msg);
                }
                continue;
            }
        };

        // Nothing recognized: leave the page exactly as it was, including any
        // native items that would otherwise be dropped below.
        if ocr_results.is_empty() {
            continue;
        }

        let page = &mut pages[idx];
        // Drop unusable native items (substitution-cipher cmap corruption, or
        // unmappable Type3 text) so OCR can replace them. Without this,
        // garbled-but-spatially-present native text suppresses every OCR
        // result that overlaps it via the overlap check below, leaving the
        // output stuck with unreadable text. We apply both per-item and
        // per-page checks: short garbled labels ("GDWH", "XVG") can't be
        // flagged alone, but their host page can.
        if page_is_garbled(page) {
            page.text_items.clear();
        } else {
            page.text_items.retain(|item| !is_unusable_native(item));
        }

        // Only check overlap against native (already-extracted) PDF text. Comparing
        // each OCR result against previously-accepted OCR results caused adjacent
        // OCR lines whose bounding boxes touched within tolerance to suppress each
        // other, dropping every second line on scanned pages.
        // `native_count` freezes the boundary: items pushed below land after it.
        let native_count = page.text_items.len();
        for r in &ocr_results {
            // Discard detections the engine itself reports as near-worthless.
            if r.confidence <= 0.1 {
                continue;
            }

            // Prefer the screen-space axis-aligned bbox derived from the polygon
            // (when present) so rotated detections carry a tight upright bbox.
            // The polygon also lets us recover an explicit rotation angle so the
            // projector can route rotated sidebar text through its rotation
            // reading-order handler instead of mistaking it for body text.
            let (ocr_x, ocr_y, ocr_w, ocr_h, rotation) = match r.polygon {
                Some(poly) => {
                    let xs = [poly[0][0], poly[1][0], poly[2][0], poly[3][0]];
                    let ys = [poly[0][1], poly[1][1], poly[2][1], poly[3][1]];
                    let x_min = xs.iter().copied().fold(f32::INFINITY, f32::min);
                    let x_max = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
                    let y_min = ys.iter().copied().fold(f32::INFINITY, f32::min);
                    let y_max = ys.iter().copied().fold(f32::NEG_INFINITY, f32::max);
                    let rot = polygon_rotation_deg(&poly);
                    (
                        x_min * scale_factor,
                        y_min * scale_factor,
                        (x_max - x_min) * scale_factor,
                        (y_max - y_min) * scale_factor,
                        rot,
                    )
                }
                // No polygon: `bbox` is read as [x0, y0, x1, y1] and the
                // detection is treated as un-rotated.
                None => (
                    r.bbox[0] * scale_factor,
                    r.bbox[1] * scale_factor,
                    (r.bbox[2] - r.bbox[0]) * scale_factor,
                    (r.bbox[3] - r.bbox[1]) * scale_factor,
                    0.0,
                ),
            };

            // Native text wins: skip OCR output that lands on (or within a
            // tolerance of 2.0 of) a surviving native item.
            if overlaps_existing_text(
                &page.text_items[..native_count],
                ocr_x,
                ocr_y,
                ocr_w,
                ocr_h,
                2.0,
            ) {
                continue;
            }

            let cleaned = clean_ocr_table_artifacts(&r.text);
            if cleaned.is_empty() {
                continue;
            }

            // For native rotated text the font_size approximates line height,
            // which for 90/270° rotations corresponds to the *narrow* screen
            // dimension. Use the perpendicular extent for rotated OCR text so
            // downstream font-size heuristics stay sane.
            // Exact float comparison is fine here: `rotation` is either the
            // literal 0.0 or a value snapped by `polygon_rotation_deg`.
            let font_size_hint = if rotation == 90.0 || rotation == 270.0 {
                ocr_w.max(1.0)
            } else {
                ocr_h
            };

            page.text_items.push(TextItem {
                text: cleaned,
                x: ocr_x,
                y: ocr_y,
                width: ocr_w,
                height: ocr_h,
                rotation,
                font_name: Some("OCR".to_string()),
                font_size: Some(font_size_hint),
                // Confidence is rounded to three decimal places.
                confidence: Some((r.confidence * 1000.0).round() / 1000.0),
                ..Default::default()
            });
        }
    }

    // If every OCR task failed *and* at least one of those failures was on a
    // sparse-text page (the same length/coverage predicate that sends pages to
    // OCR as text-poor in `render_pages_for_ocr`), treat it as a systemic
    // failure. Returning an error surfaces the root cause (e.g. missing language
    // data) instead of silently emitting an empty or mostly-empty page. We
    // deliberately do NOT fail when the only failures were on pages that already
    // had substantial native text and were merely rendered for image-based OCR
    // enrichment — a broken OCR setup must not abort an otherwise-good
    // native-text document.
    if total_tasks > 0 && failed_tasks == total_tasks && failed_sparse_text_page {
        let detail = first_error.unwrap_or_else(|| "unknown error".to_string());
        return Err(LiteParseError::Ocr(format!(
            "OCR failed for all {} page(s): {}",
            total_tasks, detail
        )));
    }

    // Surface a concise summary for partial failures without flooding stderr.
    if failed_tasks > 0 {
        eprintln!(
            "[ocr] {}/{} page(s) failed OCR; continuing with partial results",
            failed_tasks, total_tasks
        );
    }

    Ok(())
}

/// Reports whether a page's trustworthy native text is thin enough that OCR
/// is probably its main text source.
///
/// This applies the two text-based thresholds used in `render_pages_for_ocr`:
/// fewer than 20 bytes of usable text, or usable text bounding boxes covering
/// less than 15% of the page area. Items rejected by `is_unusable_native` are
/// left out of both measurements. A page with non-positive area is treated as
/// having zero coverage, and therefore as sparse.
///
/// The image, page-level garbling and vector-path triggers are intentionally
/// not part of this predicate; it only answers "was the native text
/// insufficient?" for the systemic-failure guard in `ocr_and_merge_rendered`.
fn page_has_sparse_native_text(page: &Page) -> bool {
    let usable = || page.text_items.iter().filter(|it| !is_unusable_native(it));

    let usable_len: usize = usable().map(|it| it.text.len()).sum();
    if usable_len < 20 {
        return true;
    }

    let page_area = page.page_width * page.page_height;
    let covered_area: f32 = usable().map(|it| it.width * it.height).sum();
    let coverage = if page_area > 0.0 {
        covered_area / page_area
    } else {
        0.0
    };

    coverage < 0.15
}

/// Decides whether a native text item must be ignored as a text source.
///
/// An item is unusable when its Unicode mapping failed outright (Type3 fonts
/// with no ToUnicode: the text is a char-code fallback and the bounding box
/// comes from deceptive declared metrics), or when its content looks like
/// substitution-cipher garbling according to `is_likely_garbled`.
fn is_unusable_native(item: &TextItem) -> bool {
    if item.has_unicode_map_error {
        return true;
    }
    is_likely_garbled(&item.text)
}

/// Sums the area of filled vector paths that native text items do not explain.
///
/// Glyph outlines drawn as paths show up as filled regions with no text item
/// on top of them. Rules and table borders are stroke-only and excluded
/// upstream, and shading rectangles behind real text are cancelled out by the
/// text that overlaps them.
///
/// For each path, coverage is the sum of its rectangle intersections with
/// every item, capped at the path's own area. Overlapping items may be counted
/// twice, so coverage can only be over-estimated: the error is always toward
/// *not* triggering OCR. Paths with non-positive area contribute nothing.
fn uncovered_path_area(paths: &[ImageBounds], items: &[TextItem]) -> f32 {
    paths
        .iter()
        .filter_map(|path| {
            let path_area = path.width * path.height;
            if path_area <= 0.0 {
                return None;
            }

            let path_right = path.x + path.width;
            let path_bottom = path.y + path.height;

            let mut overlap_total = 0.0f32;
            for item in items {
                let dx = path_right.min(item.x + item.width) - path.x.max(item.x);
                let dy = path_bottom.min(item.y + item.height) - path.y.max(item.y);
                if dx > 0.0 && dy > 0.0 {
                    overlap_total += dx * dy;
                    // Fully explained already; further items cannot change
                    // the clamped result.
                    if overlap_total >= path_area {
                        break;
                    }
                }
            }

            Some((path_area - overlap_total).max(0.0))
        })
        .fold(0.0f32, |total, leftover| total + leftover)
}

/// Flags text that looks like the product of a substitution cipher or a
/// broken cmap.
///
/// Real Latin-script text has a vowel ratio of roughly 30–45%. A substitution
/// permutation almost always maps the original A/E/I/O/U onto non-vowel
/// letters, which pushes the apparent ratio toward zero. Text is reported as
/// garbled when it has at least 10 ASCII letters and fewer than 10% of them
/// are vowels.
///
/// Anything with fewer than 10 ASCII letters (non-Latin scripts, numbers,
/// short labels) cannot be judged and is treated as fine.
fn is_likely_garbled(text: &str) -> bool {
    let (letter_count, vowel_count) = count_letters_and_vowels(text);
    letter_count >= 10 && vowel_count * 10 < letter_count
}

/// Counts the ASCII letters in `text` and how many of them are vowels.
///
/// Returns `(letters, vowels)`. Only `a`–`z` / `A`–`Z` are counted as
/// letters; vowels are `a`, `e`, `i`, `o`, `u` in either case. Digits,
/// punctuation, whitespace and non-ASCII characters are ignored.
fn count_letters_and_vowels(text: &str) -> (usize, usize) {
    text.chars()
        .filter(char::is_ascii_alphabetic)
        .fold((0, 0), |(letters, vowels), ch| {
            let is_vowel = matches!(ch.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u');
            (letters + 1, vowels + usize::from(is_vowel))
        })
}

/// Page-wide version of the garbled-text heuristic.
///
/// Individual items such as "GDWH" or "FXUUHQFB XVG" are too short to judge
/// on their own, but pooling the letter and vowel counts of every item on the
/// page gives enough signal. The page is reported as garbled when it has at
/// least 30 ASCII letters in total and fewer than 20% of them are vowels.
///
/// Real Latin-script text (English, Portuguese, Spanish, French, ...) sits at
/// roughly 30–45% vowels, so anything under 20% is well outside the natural
/// range. The bound is looser than the 10% per-item bound because a simple
/// substitution still maps some letters onto vowels — under a +3 Caesar
/// shift, for example, x, b, f, l and r become a, e, i, o and u — so the
/// page-wide ratio of corrupted text does not drop all the way to zero.
///
/// Callers use this to drop every native item on the page before merging OCR
/// output, so short garbled labels cannot suppress overlapping OCR results.
fn page_is_garbled(page: &Page) -> bool {
    let (letters, vowels) = page
        .text_items
        .iter()
        .map(|item| count_letters_and_vowels(&item.text))
        .fold((0usize, 0usize), |(l_sum, v_sum), (l, v)| {
            (l_sum + l, v_sum + v)
        });

    letters >= 30 && vowels * 5 < letters
}

/// Derives a quarter-turn CCW rotation, in degrees, from a 4-point OCR polygon.
///
/// The result is always one of 0.0, 90.0, 180.0 or 270.0 (the measured angle
/// snapped to the nearest right angle). Polygons whose longest inspected edge
/// is under one unit, and polygons that are close to square (long/short edge
/// ratio below 1.3, typical of single-character or CJK detections), yield 0.0
/// because they offer no dependable reading axis.
///
/// OCR engines disagree on vertex order. Some emit TL→TR→BR→BL in the glyphs'
/// own upright frame, so `poly[0]→poly[1]` is the reading direction. Others
/// (notably PaddleOCR 3.x with `use_textline_orientation=True`) emit
/// screen-axis order, where `poly[0]→poly[1]` is horizontal on screen however
/// the text reads. Both are handled by taking the *longer* of the first two
/// edges as the reading direction, since text runs along the long axis of its
/// bounding quadrilateral.
fn polygon_rotation_deg(poly: &[[f32; 2]; 4]) -> f32 {
    let edge = |from: usize, to: usize| -> [f32; 2] {
        [poly[to][0] - poly[from][0], poly[to][1] - poly[from][1]]
    };
    let length = |v: [f32; 2]| -> f32 { (v[0] * v[0] + v[1] * v[1]).sqrt() };

    let first = edge(0, 1);
    let second = edge(1, 2);
    let first_len = length(first);
    let second_len = length(second);

    // Too small to carry any orientation information.
    if first_len.max(second_len) < 1.0 {
        return 0.0;
    }

    // Ties go to the first edge.
    let (reading, long_side, short_side) = if first_len >= second_len {
        (first, first_len, second_len)
    } else {
        (second, second_len, first_len)
    };

    // Near-square: neither edge is a trustworthy reading axis.
    if short_side > 0.0 && long_side / short_side < 1.3 {
        return 0.0;
    }

    // Screen y grows downward, so the atan2 result is negated to obtain the
    // conventional counter-clockwise angle, then wrapped into [0, 360].
    let ccw_degrees = (-reading[1].atan2(reading[0]).to_degrees()).rem_euclid(360.0);
    let snapped = (ccw_degrees / 90.0).round() as i32 * 90;
    snapped.rem_euclid(360) as f32
}

/// Tests whether an OCR bounding box collides with any item in `items`.
///
/// The OCR box is given by its top-left corner (`ocr_x`, `ocr_y`) and size
/// (`ocr_w`, `ocr_h`). Each item's box is grown by `tolerance` on every side
/// before the comparison, so boxes that merely come within `tolerance` of each
/// other count as overlapping. Returns `false` for an empty `items` slice.
fn overlaps_existing_text(
    items: &[TextItem],
    ocr_x: f32,
    ocr_y: f32,
    ocr_w: f32,
    ocr_h: f32,
    tolerance: f32,
) -> bool {
    items.iter().any(|item| {
        let hits_x =
            ocr_x < item.x + item.width + tolerance && ocr_x + ocr_w > item.x - tolerance;
        let hits_y =
            ocr_y < item.y + item.height + tolerance && ocr_y + ocr_h > item.y - tolerance;
        hits_x && hits_y
    })
}

/// Removes bracket-like characters that OCR produces when it misreads the
/// vertical border lines of a table.
///
/// The input is whitespace-trimmed first. Leading and trailing `| [ ] ( ) { }`
/// characters are then peeled off to find the core content, and that stripped
/// form is returned only when the core looks like a table cell value: made up
/// solely of ASCII digits and `, . % - + * /` or spaces, or exactly `N/A`,
/// `Z` or `-`.
///
/// In every other case the trimmed input is returned with its brackets
/// intact, so ordinary text such as "(note)" is not damaged. That includes
/// input consisting only of border characters, which is returned as-is.
/// Blank input yields an empty string.
fn clean_ocr_table_artifacts(text: &str) -> String {
    const BORDER_ARTIFACTS: [char; 7] = ['|', '[', ']', '(', ')', '{', '}'];
    let is_border = |c: char| BORDER_ARTIFACTS.contains(&c);
    let is_cell_char =
        |c: char| c.is_ascii_digit() || matches!(c, ',' | '.' | ' ' | '%' | '-' | '+' | '*' | '/');

    let original = text.trim();
    let core = original
        .trim_start_matches(is_border)
        .trim_end_matches(is_border)
        .trim();

    // Covers both blank input and input made only of border characters.
    if core.is_empty() {
        return original.to_string();
    }

    // Strip only when what remains is numeric-ish or a known placeholder.
    let looks_like_cell = core.chars().all(is_cell_char) || matches!(core, "N/A" | "Z" | "-");
    let chosen = if looks_like_cell { core } else { original };
    chosen.to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    // ---- polygon_rotation_deg -------------------------------------------

    #[test]
    fn test_polygon_rotation_horizontal() {
        // Wide axis-aligned box whose first edge runs left to right.
        let quad = [[0.0, 0.0], [100.0, 0.0], [100.0, 20.0], [0.0, 20.0]];
        assert_eq!(polygon_rotation_deg(&quad), 0.0);
    }

    #[test]
    fn test_polygon_rotation_90_ccw() {
        // Text turned 90° counter-clockwise: the first edge climbs the screen
        // (y decreases from poly[0] to poly[1]).
        let quad = [[10.0, 100.0], [10.0, 0.0], [30.0, 0.0], [30.0, 100.0]];
        assert_eq!(polygon_rotation_deg(&quad), 90.0);
    }

    #[test]
    fn test_polygon_rotation_270_ccw() {
        // Text turned 270° counter-clockwise (90° clockwise): the first edge
        // descends the screen.
        let quad = [[10.0, 0.0], [10.0, 100.0], [30.0, 100.0], [30.0, 0.0]];
        assert_eq!(polygon_rotation_deg(&quad), 270.0);
    }

    #[test]
    fn test_polygon_rotation_screen_axis_vertical() {
        // PaddleOCR-style corner order (smallest y first) for a tall, narrow
        // sidebar: the first edge is the SHORT horizontal side, so the
        // rotation has to come from the longer second edge.
        let quad = [[20.0, 50.0], [50.0, 50.0], [50.0, 750.0], [20.0, 750.0]];
        let r = polygon_rotation_deg(&quad);
        assert!([90.0, 270.0].contains(&r), "expected 90 or 270, got {r}");
    }

    #[test]
    fn test_polygon_rotation_near_square() {
        // Single-glyph detections (CJK characters and the like) have no
        // dependable reading axis and must come back as un-rotated.
        let quad = [[0.0, 0.0], [20.0, 0.0], [20.0, 22.0], [0.0, 22.0]];
        assert_eq!(polygon_rotation_deg(&quad), 0.0);
    }

    #[test]
    fn test_polygon_rotation_180() {
        // First edge runs right to left.
        let quad = [[100.0, 20.0], [0.0, 20.0], [0.0, 0.0], [100.0, 0.0]];
        assert_eq!(polygon_rotation_deg(&quad), 180.0);
    }

    // ---- clean_ocr_table_artifacts --------------------------------------

    #[test]
    fn test_clean_ocr_table_artifacts() {
        // (raw OCR text, expected cleaned text)
        let cases = [
            ("44520]", "44520"),
            ("|123", "123"),
            ("0.3|", "0.3"),
            ("(note)", "(note)"),
            ("|hello|", "|hello|"),
            ("N/A", "N/A"),
            ("", ""),
            ("|||", "|||"),
        ];
        for (raw, expected) in cases {
            assert_eq!(clean_ocr_table_artifacts(raw), expected);
        }
    }

    // ---- overlaps_existing_text -----------------------------------------

    /// Build a text item with the given rectangle and placeholder text `"x"`;
    /// all remaining fields take their defaults.
    fn make_item(x: f32, y: f32, width: f32, height: f32) -> TextItem {
        TextItem {
            text: "x".into(),
            x,
            y,
            width,
            height,
            ..Default::default()
        }
    }

    #[test]
    fn test_overlaps_existing_text_inside() {
        let items = [make_item(10.0, 10.0, 20.0, 5.0)];
        assert!(overlaps_existing_text(&items, 12.0, 11.0, 5.0, 2.0, 2.0));
    }

    #[test]
    fn test_overlaps_existing_text_disjoint() {
        let items = [make_item(10.0, 10.0, 20.0, 5.0)];
        assert!(!overlaps_existing_text(&items, 100.0, 100.0, 5.0, 5.0, 2.0));
    }

    #[test]
    fn test_overlaps_existing_text_tolerance() {
        let items = [make_item(10.0, 10.0, 20.0, 5.0)];
        // One unit past the item's right edge: still inside the tolerance band.
        assert!(overlaps_existing_text(&items, 31.0, 10.0, 5.0, 5.0, 2.0));
        // Five units past the right edge: outside the tolerance band.
        assert!(!overlaps_existing_text(&items, 35.0, 10.0, 5.0, 5.0, 2.0));
    }

    #[test]
    fn test_overlaps_empty() {
        assert!(!overlaps_existing_text(&[], 0.0, 0.0, 1.0, 1.0, 0.0));
    }

    // ---- uncovered_path_area --------------------------------------------

    /// Shorthand for a path bounding box.
    fn pb(x: f32, y: f32, width: f32, height: f32) -> ImageBounds {
        ImageBounds {
            x,
            y,
            width,
            height,
        }
    }

    #[test]
    fn test_uncovered_path_area_no_text() {
        // One sentence-sized outlined region and no native text whatsoever.
        let paths = vec![pb(50.0, 300.0, 200.0, 12.0)];
        let area = uncovered_path_area(&paths, &[]);
        assert!((area - 2400.0).abs() < 1.0);
        assert!(area >= UNCOVERED_VECTOR_AREA_THRESHOLD);
    }

    #[test]
    fn test_uncovered_path_area_fully_covered_by_text() {
        // A shading rectangle sitting entirely behind real text contributes
        // nothing.
        let paths = vec![pb(50.0, 300.0, 200.0, 12.0)];
        let items = vec![make_item(40.0, 295.0, 250.0, 25.0)];
        assert_eq!(uncovered_path_area(&paths, &items), 0.0);
    }

    #[test]
    fn test_uncovered_path_area_partial_coverage() {
        // A text item hides the left half of the outlined region.
        let paths = vec![pb(0.0, 0.0, 100.0, 10.0)];
        let items = vec![make_item(0.0, 0.0, 50.0, 10.0)];
        let area = uncovered_path_area(&paths, &items);
        assert!((area - 500.0).abs() < 1.0);
    }

    #[test]
    fn test_uncovered_path_area_small_decoration_below_threshold() {
        // A couple of bullet-sized filled paths stay under the threshold.
        let paths = vec![pb(10.0, 10.0, 8.0, 8.0), pb(10.0, 30.0, 8.0, 8.0)];
        let area = uncovered_path_area(&paths, &[]);
        assert!(area < UNCOVERED_VECTOR_AREA_THRESHOLD);
    }

    // ---- is_unusable_native ---------------------------------------------

    #[test]
    fn test_unusable_native_unicode_map_error() {
        let healthy = make_item(0.0, 0.0, 10.0, 10.0);
        assert!(!is_unusable_native(&healthy));

        // Same item, but flagged with a unicode-map error.
        let flagged = TextItem {
            has_unicode_map_error: true,
            ..healthy
        };
        assert!(is_unusable_native(&flagged));
    }

    #[test]
    fn test_clean_ocr_keeps_whitespace_trimmed() {
        let cases = [("   ", ""), (" 123 ", "123")];
        for (raw, expected) in cases {
            assert_eq!(clean_ocr_table_artifacts(raw), expected);
        }
    }

    // ---- ocr_and_merge_rendered failure handling ------------------------

    // Stand-in OCR engine whose every call errors out, mimicking a systemic
    // fault such as absent Tesseract language data (the root cause behind
    // issue #253).
    struct FailingEngine;
    impl OcrEngine for FailingEngine {
        fn name(&self) -> &str {
            "failing"
        }
        fn recognize<'a, 'b: 'a, 'c: 'a>(
            &'a self,
            _image_data: &'c [u8],
            _width: u32,
            _height: u32,
            _options: &'b OcrOptions,
        ) -> std::pin::Pin<
            Box<
                dyn Future<
                        Output = Result<Vec<OcrResult>, Box<dyn std::error::Error + Send + Sync>>,
                    > + Send
                    + '_,
            >,
        > {
            Box::pin(async move { Err("Error opening data file tessdata/eng.traineddata".into()) })
        }
    }

    /// The always-failing engine, already type-erased for the function under
    /// test.
    fn failing_engine() -> Arc<dyn OcrEngine> {
        Arc::new(FailingEngine)
    }

    /// A 100x100 page holding exactly the given text items.
    fn page_with_items(page_number: usize, text_items: Vec<TextItem>) -> Page {
        Page {
            page_number,
            page_width: 100.0,
            page_height: 100.0,
            text_items,
        }
    }

    fn make_blank_page(page_number: usize) -> Page {
        page_with_items(page_number, Vec::new())
    }

    fn make_rendered(idx: usize) -> RenderedPage {
        RenderedPage {
            idx,
            // A single black RGB pixel; the failing engine never looks at it.
            rgb_bytes: vec![0u8; 3],
            width: 1,
            height: 1,
        }
    }

    // A page whose native text already covers a large share of the page, as
    // happens when a native-text PDF page is rendered for OCR only because it
    // also carries an image.
    fn make_native_text_page(page_number: usize) -> Page {
        let mut item = make_item(0.0, 0.0, 50.0, 50.0);
        item.text = "this page already has real native text content".into();
        page_with_items(page_number, vec![item])
    }

    // A page with more than 20 bytes of native text but very little page
    // coverage. Such pages are still text-poor enough that
    // `render_pages_for_ocr` sends them to OCR (`text_coverage < 0.15`), so a
    // systemic OCR failure on them must not be swallowed.
    fn make_low_coverage_text_page(page_number: usize) -> Page {
        let mut item = make_item(0.0, 0.0, 10.0, 5.0);
        item.text = "small native header that is not enough".into();
        page_with_items(page_number, vec![item])
    }

    // If every OCR task fails (missing language data, for instance) the call
    // has to return an error rather than report success with no OCR text.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_all_pages_fail_returns_error() {
        let mut pages = vec![make_blank_page(1), make_blank_page(2)];
        let rendered: Vec<_> = (0..2).map(make_rendered).collect();

        let result =
            ocr_and_merge_rendered(&mut pages, rendered, 72.0, failing_engine(), "eng", 2).await;

        let msg = result
            .expect_err("expected systemic OCR failure to be surfaced")
            .to_string();
        assert!(
            msg.contains("OCR failed for all 2 page(s)"),
            "unexpected error message: {msg}"
        );
        assert!(
            msg.contains("traineddata"),
            "error should carry the underlying cause: {msg}"
        );
    }

    // Nothing rendered means nothing to OCR: the call stays a successful
    // no-op and must not trip the all-failed guard.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_no_rendered_pages_is_ok() {
        let mut pages = vec![make_blank_page(1)];

        let result =
            ocr_and_merge_rendered(&mut pages, Vec::new(), 72.0, failing_engine(), "eng", 2).await;

        assert!(result.is_ok(), "empty OCR set should succeed: {result:?}");
    }

    // Regression guard: if OCR fails but each failing page already carried
    // native text (it was rendered only for image-based enrichment), a broken
    // OCR setup must NOT abort the parse, because the native text is still
    // valid output.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_native_text_pages_not_failed_on_ocr_error() {
        let mut pages = vec![make_native_text_page(1), make_native_text_page(2)];
        let rendered: Vec<_> = (0..2).map(make_rendered).collect();

        let result =
            ocr_and_merge_rendered(&mut pages, rendered, 72.0, failing_engine(), "eng", 2).await;

        assert!(
            result.is_ok(),
            "OCR failure on already-native-text pages must not abort the parse: {result:?}"
        );
        // Each page still holds its single native item.
        for page in &pages {
            assert_eq!(page.text_items.len(), 1);
        }
    }

    // Failures covering both a native-text page and a text-starved page still
    // fail loudly: the sparse page has lost its likely primary text source.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_mixed_failure_with_sparse_text_page_returns_error() {
        let mut pages = vec![make_native_text_page(1), make_blank_page(2)];
        let rendered: Vec<_> = (0..2).map(make_rendered).collect();

        let result =
            ocr_and_merge_rendered(&mut pages, rendered, 72.0, failing_engine(), "eng", 2).await;

        let err = result.expect_err("a text-starved page losing all OCR must surface an error");
        assert!(
            err.to_string().contains("OCR failed for all 2 page(s)"),
            "unexpected error message: {err}"
        );
    }

    // Regression guard for the review finding: low-coverage pages get rendered
    // for OCR even when their native text exceeds 20 bytes, so a systemic OCR
    // failure on such a page must still come back as an error.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_low_coverage_text_page_failure_returns_error() {
        let mut pages = vec![make_low_coverage_text_page(1)];
        let rendered = vec![make_rendered(0)];

        let result =
            ocr_and_merge_rendered(&mut pages, rendered, 72.0, failing_engine(), "eng", 2).await;

        let err = result.expect_err("low-coverage text page losing OCR must surface an error");
        assert!(
            err.to_string().contains("OCR failed for all 1 page(s)"),
            "unexpected error message: {err}"
        );
    }
}