Skip to main content

edgeparse_core/pdf/
raster_table_ocr.rs

1//! Recover text signal from raster table images using local OCR.
2
3use std::collections::{BTreeMap, HashMap, HashSet};
4use std::env;
5use std::fs;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8use std::sync::OnceLock;
9use std::time::{SystemTime, UNIX_EPOCH};
10
11use image::{GenericImageView, GrayImage, Luma};
12use serde::Deserialize;
13
14use crate::models::bbox::BoundingBox;
15use crate::models::chunks::{ImageChunk, TextChunk};
16use crate::models::content::ContentElement;
17use crate::models::enums::{PdfLayer, TextFormat, TextType};
18use crate::models::table::{
19    TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
20};
21
// Broaden image eligibility so moderately cropped tables are considered.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.40;
const MIN_IMAGE_AREA_RATIO: f64 = 0.035;
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
// Accuracy-first: accept degraded glyphs at lower confidence —
// dual-OEM consensus and spatial coherence filtering will eliminate noise.
const MIN_OCR_WORD_CONFIDENCE: f64 = 6.0;
// Reject artificially-high confidence noise (Tesseract artefacts above 100).
const MAX_OCR_WORD_CONFIDENCE: f64 = 101.0;
const RASTER_DARK_THRESHOLD: u8 = 180;
const RASTER_CHART_INK_THRESHOLD: u8 = 240;
const MIN_BORDERED_VERTICAL_LINES: usize = 3;
const MIN_BORDERED_HORIZONTAL_LINES: usize = 3;
// Accuracy-first: lighter lines are still valid table borders.
const MIN_LINE_DARK_RATIO: f64 = 0.28;
// Whole-table OCR is skipped in favour of per-cell OCR when the table crop is
// narrower or shorter than this many raster pixels.
const MIN_CELL_SIZE_PX: u32 = 10;
// Twice this value is used as padding around the table crop before
// whole-table OCR.
const CELL_INSET_PX: u32 = 5;
// Border width handed to `expand_white_border` for the table crop; word
// coordinates are shifted back by the same amount after OCR.
const TABLE_RASTER_OCR_BORDER_PX: u32 = 14;
// Typography-grounded scale: pdftoppm renders at PDFTOPPM_DPI (150, which is
// also pdftoppm's documented default). Scaling by 2 gives 300 DPI effective —
// the Tesseract-documented optimum. At 12pt body text the em height is 25px raw
// → 50px scaled; cap height (roughly 0.7 em in common text faces) is then
// ≈ 35px, inside the 32-40px range this module targets for Tesseract.
// Over-scaling (×5 = 125px em) amplifies anti-aliasing and hurts LSTM
// segmentation.
const PDFTOPPM_DPI: u32 = 150;
const OCR_SCALE_FACTOR: u32 = 2;
/// Effective DPI seen by Tesseract = PDFTOPPM_DPI × OCR_SCALE_FACTOR.
const TESSERACT_EFFECTIVE_DPI: u32 = PDFTOPPM_DPI * OCR_SCALE_FACTOR;
const MIN_DOMINANT_IMAGE_WIDTH_RATIO: f64 = 0.65;
const MIN_DOMINANT_IMAGE_AREA_RATIO: f64 = 0.40;
const MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE: usize = 80;
const MIN_DOMINANT_IMAGE_OCR_WORDS: usize = 18;
const MIN_DOMINANT_IMAGE_TEXT_LINES: usize = 6;
const MIN_DENSE_PROSE_BLOCK_LINES: usize = 3;
const MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO: f64 = 0.32;
// Permit minor breaks in rasterized lines while still enforcing structure.
const MIN_TRUE_GRID_LINE_CONTINUITY: f64 = 0.60;
// A table is excluded from page-raster OCR only when both the page-wide and
// the table-local native character counts exceed this value.
const MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR: usize = 180;
// Minimum share of the page area the candidate tables must cover before the
// page is rasterized at all.
const MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR: f64 = 0.08;
// Upper bound on the number of candidate tables processed per page.
const MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR: usize = 24;
const LOCAL_BINARIZATION_RADIUS: u32 = 14;
const MIN_BINARIZATION_BLOCK_PIXELS: usize = 81;
// Handle sparse numeric tables where only a few cells OCR cleanly.
// Tables whose share of text-bearing cells is below this ratio still qualify
// for page-raster OCR.
const MIN_RASTER_TABLE_TEXT_CELL_RATIO: f64 = 0.05;
const MIN_RASTER_TABLE_ROWS_WITH_TEXT: usize = 1;
const MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO: f64 = 0.40;
const MIN_BORDERED_CELL_DARK_RATIO: f64 = 0.03;
const MIN_BORDERED_INKED_CELL_RATIO: f64 = 0.18;
const MIN_BORDERED_ROWS_WITH_INK: usize = 2;
const MAX_BORDERED_TABLE_PER_CELL_FALLBACK_CELLS: usize = 24;
const MIN_BRIGHT_PHOTO_MID_TONE_RATIO: f64 = 0.24;
const MIN_BRIGHT_PHOTO_HISTOGRAM_BINS: usize = 8;
const MIN_BRIGHT_PHOTO_ENTROPY: f64 = 1.6;
74
/// A single word reported by an OCR engine together with its pixel box.
#[derive(Debug, Clone)]
struct OcrWord {
    /// Key of the text line the word belongs to. The RapidOCR path fills it
    /// with `(0, line index, 0)`.
    /// NOTE(review): the Tesseract path presumably stores the TSV
    /// block/paragraph/line numbers here — confirm in the TSV parser.
    line_key: (u32, u32, u32),
    /// Left edge of the word box, in pixels of the image that was OCR'd.
    left: u32,
    /// Top edge of the word box, in pixels of the image that was OCR'd.
    top: u32,
    /// Width of the word box in pixels.
    width: u32,
    /// Height of the word box in pixels.
    height: u32,
    /// Recognized text of the word.
    text: String,
    /// Confidence as reported by the engine; the RapidOCR path copies the
    /// line score unchanged.
    confidence: f64,
}
85
/// A cluster of horizontal positions.
/// NOTE(review): not referenced in the visible part of this file; presumably
/// used to detect table columns from OCR word positions — confirm at the use
/// site.
#[derive(Debug, Clone)]
struct XCluster {
    /// Representative x position of the cluster.
    center: f64,
    /// Number of members in the cluster.
    count: usize,
    /// Line keys (same shape as `OcrWord::line_key`) associated with the cluster.
    lines: HashSet<(u32, u32, u32)>,
}
92
/// A table row under construction: a vertical extent plus one text per cell.
/// NOTE(review): not referenced in the visible part of this file; coordinate
/// space of `top_y`/`bottom_y` should be confirmed at the use site.
#[derive(Clone)]
struct OcrRowBuild {
    /// Upper bound of the row.
    top_y: f64,
    /// Lower bound of the row.
    bottom_y: f64,
    /// Text collected for each cell of the row.
    cell_texts: Vec<String>,
}
99
/// A table cell without text tokens, located both in the table structure and
/// in the rendered page raster.
#[derive(Debug, Clone)]
struct EmptyCellRaster {
    /// Index of the owning row in `TableBorder::rows`.
    row_idx: usize,
    /// Index of the cell within that row's `cells`.
    cell_idx: usize,
    /// Left edge in raster pixels (inclusive when matching OCR word centres).
    x1: u32,
    /// Top edge in raster pixels (inclusive when matching OCR word centres).
    y1: u32,
    /// Right edge in raster pixels (exclusive when matching OCR word centres).
    x2: u32,
    /// Bottom edge in raster pixels (exclusive when matching OCR word centres).
    y2: u32,
}
109
/// Grid lines found in a raster image.
/// NOTE(review): not referenced in the visible part of this file; the values
/// are presumably pixel coordinates of detected ruling lines — confirm.
#[derive(Debug, Clone)]
struct RasterTableGrid {
    /// Positions of the vertical lines.
    vertical_lines: Vec<u32>,
    /// Positions of the horizontal lines.
    horizontal_lines: Vec<u32>,
}
115
/// One OCR attempt paired with a numeric score.
/// NOTE(review): not referenced in the visible part of this file; presumably
/// used to pick the best of several OCR runs — confirm at the use site.
#[derive(Debug, Clone)]
struct OcrCandidateScore {
    /// Words produced by this attempt.
    words: Vec<OcrWord>,
    /// Score assigned to this attempt.
    score: f64,
}
121
/// One entry of an image listing.
/// NOTE(review): judging by the name this mirrors a row of `pdfimages -list`
/// output; the parser is not visible here — confirm.
#[derive(Debug, Clone)]
struct PdfImagesListEntry {
    /// The entry's image type as text.
    image_type: String,
}
126
/// OCR backend chosen for the process via the `EDGEPARSE_OCR_ENGINE`
/// environment variable (see `selected_ocr_engine`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OcrEngine {
    /// Default backend; also used when RapidOCR is requested but unavailable.
    Tesseract,
    /// Python-hosted RapidOCR, selected only when a Python interpreter that
    /// can import `rapidocr` was found.
    RapidOcr,
}
132
/// One recognized text line as emitted (JSON) by `RAPIDOCR_RUNNER`.
#[derive(Debug, Deserialize)]
struct RapidOcrLine {
    /// Smallest x coordinate of the detection box.
    left: u32,
    /// Smallest y coordinate of the detection box.
    top: u32,
    /// Horizontal extent of the detection box (the runner emits at least 1).
    width: u32,
    /// Vertical extent of the detection box (the runner emits at least 1).
    height: u32,
    /// Recognized text of the whole line; split into words later.
    text: String,
    /// Recognition score reported by RapidOCR for the line.
    confidence: f64,
}
142
// Process-wide cache of the engine choice; filled on the first call to
// `selected_ocr_engine`.
static OCR_ENGINE: OnceLock<OcrEngine> = OnceLock::new();
// Process-wide cache of the Python command able to import `rapidocr`
// (`None` when no probed interpreter succeeded).
static RAPIDOCR_PYTHON: OnceLock<Option<String>> = OnceLock::new();
145
146const RAPIDOCR_RUNNER: &str = r#"
147import json, sys
148from rapidocr import RapidOCR
149
150engine = RapidOCR()
151result = engine(sys.argv[1], use_det=True, use_cls=True, use_rec=True)
152
153if result is None:
154    print('[]')
155    raise SystemExit(0)
156
157boxes = getattr(result, 'boxes', []) or []
158txts = getattr(result, 'txts', []) or []
159scores = getattr(result, 'scores', []) or []
160out = []
161for box, text, score in zip(boxes, txts, scores):
162    if not text or not str(text).strip():
163        continue
164    xs = [pt[0] for pt in box]
165    ys = [pt[1] for pt in box]
166    out.append({
167        'left': int(min(xs)),
168        'top': int(min(ys)),
169        'width': max(1, int(max(xs) - min(xs))),
170        'height': max(1, int(max(ys) - min(ys))),
171        'text': str(text),
172        'confidence': float(score),
173    })
174print(json.dumps(out, ensure_ascii=False))
175"#;
176
177fn selected_ocr_engine() -> OcrEngine {
178    *OCR_ENGINE.get_or_init(|| match env::var("EDGEPARSE_OCR_ENGINE") {
179        Ok(value) => match value.to_ascii_lowercase().as_str() {
180            "rapidocr" if rapidocr_python_command().is_some() => OcrEngine::RapidOcr,
181            "rapidocr" => OcrEngine::Tesseract,
182            _ => OcrEngine::Tesseract,
183        },
184        Err(_) => OcrEngine::Tesseract,
185    })
186}
187
188fn rapidocr_python_command() -> Option<&'static str> {
189    RAPIDOCR_PYTHON
190        .get_or_init(|| {
191            let preferred = env::var("EDGEPARSE_OCR_PYTHON").ok();
192            let mut candidates = Vec::new();
193            if let Some(cmd) = preferred {
194                candidates.push(cmd);
195            }
196            candidates.push("python3".to_string());
197            candidates.push("python".to_string());
198
199            for candidate in candidates {
200                let ok = Command::new(&candidate)
201                    .arg("-c")
202                    .arg("import rapidocr")
203                    .output()
204                    .ok()
205                    .is_some_and(|out| out.status.success());
206                if ok {
207                    return Some(candidate);
208                }
209            }
210            None
211        })
212        .as_deref()
213}
214
215fn rapidocr_lines_to_words(lines: Vec<RapidOcrLine>) -> Vec<OcrWord> {
216    let mut words = Vec::new();
217
218    for (line_idx, line) in lines.into_iter().enumerate() {
219        let tokens: Vec<&str> = line.text.split_whitespace().collect();
220        if tokens.is_empty() {
221            continue;
222        }
223
224        let total_chars: u32 = tokens
225            .iter()
226            .map(|token| token.chars().count() as u32)
227            .sum();
228        if total_chars == 0 {
229            continue;
230        }
231
232        let mut cursor = line.left;
233        let mut remaining_width = line.width.max(tokens.len() as u32);
234        let mut remaining_chars = total_chars;
235
236        for (token_idx, token) in tokens.iter().enumerate() {
237            let token_chars = token.chars().count() as u32;
238            let width = if token_idx == tokens.len() - 1 || remaining_chars <= token_chars {
239                remaining_width.max(1)
240            } else {
241                let proportional = ((remaining_width as f64) * (token_chars as f64)
242                    / (remaining_chars as f64))
243                    .round() as u32;
244                proportional.max(1).min(remaining_width)
245            };
246
247            words.push(OcrWord {
248                line_key: (0, line_idx as u32, 0),
249                left: cursor,
250                top: line.top,
251                width,
252                height: line.height.max(1),
253                text: (*token).to_string(),
254                confidence: line.confidence,
255            });
256
257            cursor = cursor.saturating_add(width);
258            remaining_width = remaining_width.saturating_sub(width);
259            remaining_chars = remaining_chars.saturating_sub(token_chars);
260        }
261    }
262
263    words
264}
265
266fn run_rapidocr_words(image: &GrayImage) -> Option<Vec<OcrWord>> {
267    let python = rapidocr_python_command()?;
268    let temp_dir = create_temp_dir(0).ok()?;
269    let image_path = temp_dir.join("ocr.png");
270    if image.save(&image_path).is_err() {
271        let _ = fs::remove_dir_all(&temp_dir);
272        return None;
273    }
274
275    let output = Command::new(python)
276        .current_dir(&temp_dir)
277        .arg("-c")
278        .arg(RAPIDOCR_RUNNER)
279        .arg("ocr.png")
280        .output()
281        .ok()?;
282    let _ = fs::remove_dir_all(&temp_dir);
283    if !output.status.success() {
284        return None;
285    }
286
287    let json = String::from_utf8_lossy(&output.stdout);
288    let lines: Vec<RapidOcrLine> = serde_json::from_str(&json).ok()?;
289    let words = rapidocr_lines_to_words(lines);
290    (!words.is_empty()).then_some(words)
291}
292
293/// Recover OCR text chunks for image-backed table regions on a single page.
294pub fn recover_raster_table_text_chunks(
295    input_path: &Path,
296    page_bbox: &BoundingBox,
297    page_number: u32,
298    text_chunks: &[TextChunk],
299    image_chunks: &[ImageChunk],
300) -> Vec<TextChunk> {
301    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
302        return Vec::new();
303    }
304
305    let candidates: Vec<&ImageChunk> = image_chunks
306        .iter()
307        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
308        .collect();
309    if candidates.is_empty() {
310        return Vec::new();
311    }
312
313    let temp_dir = match create_temp_dir(page_number) {
314        Ok(dir) => dir,
315        Err(_) => return Vec::new(),
316    };
317
318    let result =
319        recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
320
321    let _ = fs::remove_dir_all(&temp_dir);
322    result
323}
324
325/// Recover OCR text lines from dominant non-table page images.
326///
327/// This is for infographic-like pages where the PDF contains a large raster
328/// image but little or no native text. The extracted OCR signal is injected
329/// back into the normal text pipeline as line chunks so downstream grouping can
330/// rebuild headings, paragraphs, and lists.
331pub fn recover_dominant_image_text_chunks(
332    input_path: &Path,
333    page_bbox: &BoundingBox,
334    page_number: u32,
335    text_chunks: &[TextChunk],
336    image_chunks: &[ImageChunk],
337) -> Vec<TextChunk> {
338    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
339        return Vec::new();
340    }
341
342    let candidates: Vec<&ImageChunk> = image_chunks
343        .iter()
344        .filter(|image| is_dominant_image_text_candidate(image, page_bbox, text_chunks))
345        .collect();
346    if candidates.is_empty() {
347        return Vec::new();
348    }
349
350    let temp_dir = match create_temp_dir(page_number) {
351        Ok(dir) => dir,
352        Err(_) => return Vec::new(),
353    };
354
355    let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
356        Some(files) => files,
357        None => {
358            let _ = fs::remove_dir_all(&temp_dir);
359            return Vec::new();
360        }
361    };
362
363    let mut recovered = Vec::new();
364    for image in candidates {
365        let Some(image_index) = image.index else {
366            continue;
367        };
368        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
369            continue;
370        };
371        let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
372            continue;
373        };
374        if recover_bordered_raster_table_from_gray(&gray, image).is_some()
375            || is_obvious_bar_chart_raster(&gray)
376            || is_natural_photograph_raster(&gray)
377            || is_dark_ui_screenshot_raster(&gray)
378        {
379            continue;
380        }
381
382        let Some(words) = run_tesseract_tsv_words_best(&gray, &["11", "6"], |candidate| {
383            looks_like_dense_prose_image_ocr(candidate)
384        }) else {
385            continue;
386        };
387
388        recovered.extend(lines_from_ocr_words(
389            &words,
390            image,
391            gray.width(),
392            gray.height(),
393            text_chunks,
394        ));
395    }
396
397    let _ = fs::remove_dir_all(&temp_dir);
398    recovered
399}
400
401/// Recover synthetic table borders for strongly numeric raster tables.
402pub fn recover_raster_table_borders(
403    input_path: &Path,
404    page_bbox: &BoundingBox,
405    page_number: u32,
406    text_chunks: &[TextChunk],
407    image_chunks: &[ImageChunk],
408) -> Vec<TableBorder> {
409    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
410        return Vec::new();
411    }
412
413    let candidates: Vec<&ImageChunk> = image_chunks
414        .iter()
415        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
416        .collect();
417    if candidates.is_empty() {
418        return Vec::new();
419    }
420
421    let temp_dir = match create_temp_dir(page_number) {
422        Ok(dir) => dir,
423        Err(_) => return Vec::new(),
424    };
425
426    let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
427        Some(files) => files,
428        None => {
429            let _ = fs::remove_dir_all(&temp_dir);
430            return Vec::new();
431        }
432    };
433
434    let mut tables = Vec::new();
435    for image in candidates {
436        let Some(image_index) = image.index else {
437            continue;
438        };
439        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
440            continue;
441        };
442        let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
443            continue;
444        };
445        if is_obvious_bar_chart_raster(&gray)
446            || is_natural_photograph_raster(&gray)
447            || is_dark_ui_screenshot_raster(&gray)
448        {
449            continue;
450        }
451        if let Some(table) = recover_bordered_raster_table_from_gray(&gray, image) {
452            let chart_words = run_tesseract_tsv_words_best(&gray, &["6", "11"], |_| true);
453            if chart_words
454                .as_deref()
455                .is_some_and(looks_like_chart_label_ocr)
456            {
457                continue;
458            }
459            tables.push(table);
460            continue;
461        }
462        let Some(words) = run_tesseract_tsv_words_best(&gray, &["6", "11"], |candidate| {
463            looks_like_table_ocr(candidate)
464        }) else {
465            continue;
466        };
467
468        if looks_like_numeric_table_ocr(&words) {
469            if let Some(table) = build_numeric_table_border(&words, image) {
470                if is_matrixish_ocr_artifact_table(&table) {
471                    continue;
472                }
473                tables.push(table);
474                continue;
475            }
476        }
477
478        if let Some(table) = build_structured_ocr_table_border(&words, image) {
479            if is_matrixish_ocr_artifact_table(&table) {
480                continue;
481            }
482            tables.push(table);
483        }
484    }
485
486    let _ = fs::remove_dir_all(&temp_dir);
487    tables
488}
489
490/// Recover OCR text into empty bordered tables by rasterizing the full page.
491///
492/// This targets graphics-dominant pages where native PDF text is sparse but the
493/// page still exposes strong bordered geometry. It enriches existing empty
494/// `TableBorder` cells directly from the rendered page appearance.
495pub fn recover_page_raster_table_cell_text(
496    input_path: &Path,
497    page_bbox: &BoundingBox,
498    page_number: u32,
499    elements: &mut [ContentElement],
500) {
501    if page_bbox.area() <= 0.0 {
502        return;
503    }
504
505    let native_text_chars = page_native_text_chars(elements);
506
507    let candidate_indices: Vec<usize> = elements
508        .iter()
509        .enumerate()
510        .filter_map(|(idx, elem)| {
511            let table = table_candidate_ref(elem)?;
512            let local_text_chars = native_text_chars_in_region(elements, &table.bbox);
513            if !table_needs_page_raster_ocr(table) {
514                return None;
515            }
516            if native_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR
517                && local_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR
518            {
519                return None;
520            }
521            Some(idx)
522        })
523        .take(MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR)
524        .collect();
525    if candidate_indices.is_empty() {
526        return;
527    }
528
529    let coverage: f64 = candidate_indices
530        .iter()
531        .filter_map(|idx| table_candidate_ref(&elements[*idx]).map(|table| table.bbox.area()))
532        .sum::<f64>()
533        / page_bbox.area().max(1.0);
534    if coverage < MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR {
535        return;
536    }
537
538    let temp_dir = match create_temp_dir(page_number) {
539        Ok(dir) => dir,
540        Err(_) => return,
541    };
542    let prefix = temp_dir.join("page");
543    let status = Command::new("pdftoppm")
544        .arg("-png")
545        .arg("-f")
546        .arg(page_number.to_string())
547        .arg("-l")
548        .arg(page_number.to_string())
549        .arg("-singlefile")
550        .arg(input_path)
551        .arg(&prefix)
552        .status();
553    match status {
554        Ok(s) if s.success() => {}
555        _ => {
556            let _ = fs::remove_dir_all(&temp_dir);
557            return;
558        }
559    }
560
561    let page_image_path = prefix.with_extension("png");
562    let gray = match image::open(&page_image_path) {
563        Ok(img) => img.to_luma8(),
564        Err(_) => {
565            let _ = fs::remove_dir_all(&temp_dir);
566            return;
567        }
568    };
569
570    for idx in candidate_indices {
571        let Some(elem) = elements.get_mut(idx) else {
572            continue;
573        };
574        let Some(table) = table_candidate_mut(elem) else {
575            continue;
576        };
577        enrich_empty_table_from_page_raster(&gray, page_bbox, table);
578    }
579
580    let _ = fs::remove_dir_all(&temp_dir);
581}
582
583fn table_candidate_ref(elem: &ContentElement) -> Option<&TableBorder> {
584    match elem {
585        ContentElement::TableBorder(table) => Some(table),
586        ContentElement::Table(table) => Some(&table.table_border),
587        _ => None,
588    }
589}
590
591fn table_candidate_mut(elem: &mut ContentElement) -> Option<&mut TableBorder> {
592    match elem {
593        ContentElement::TableBorder(table) => Some(table),
594        ContentElement::Table(table) => Some(&mut table.table_border),
595        _ => None,
596    }
597}
598
599fn page_native_text_chars(elements: &[ContentElement]) -> usize {
600    native_text_chars_in_region(elements, &BoundingBox::new(None, f64::MIN, f64::MIN, f64::MAX, f64::MAX))
601}
602
603fn native_text_chars_in_region(elements: &[ContentElement], region: &BoundingBox) -> usize {
604    elements
605        .iter()
606        .filter(|elem| region.overlaps(elem.bbox()))
607        .map(|elem| match elem {
608            ContentElement::Paragraph(p) => p.base.value().chars().count(),
609            ContentElement::Heading(h) => h.base.base.value().chars().count(),
610            ContentElement::NumberHeading(h) => h.base.base.base.value().chars().count(),
611            ContentElement::TextBlock(tb) => tb.value().chars().count(),
612            ContentElement::TextLine(tl) => tl.value().chars().count(),
613            ContentElement::TextChunk(tc) => tc.value.chars().count(),
614            ContentElement::List(list) => list
615                .list_items
616                .iter()
617                .flat_map(|item| item.contents.iter())
618                .map(|content| match content {
619                    ContentElement::Paragraph(p) => p.base.value().chars().count(),
620                    ContentElement::TextBlock(tb) => tb.value().chars().count(),
621                    ContentElement::TextLine(tl) => tl.value().chars().count(),
622                    ContentElement::TextChunk(tc) => tc.value.chars().count(),
623                    _ => 0,
624                })
625                .sum(),
626            _ => 0,
627        })
628        .sum()
629}
630
631fn recover_from_page_images(
632    input_path: &Path,
633    temp_dir: &Path,
634    page_number: u32,
635    candidates: Vec<&ImageChunk>,
636    text_chunks: &[TextChunk],
637) -> Vec<TextChunk> {
638    let image_files = match extract_visible_page_image_files(input_path, page_number, temp_dir) {
639        Some(files) => files,
640        None => return Vec::new(),
641    };
642    if image_files.is_empty() {
643        return Vec::new();
644    }
645
646    let mut recovered = Vec::new();
647    for image in candidates {
648        let Some(image_index) = image.index else {
649            continue;
650        };
651        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
652            continue;
653        };
654        let bordered_table = recover_bordered_raster_table(image_path, image);
655        if let Some(caption) = recover_bordered_raster_caption(image_path, image) {
656            recovered.push(caption);
657        }
658        if bordered_table.is_some() {
659            continue;
660        }
661        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
662            continue;
663        };
664        // Images extracted via pdfimages are at their native PDF DPI.
665        // We pass PDFTOPPM_DPI as a reasonable hint; Tesseract uses this only for
666        // geometry heuristics, not LSTM recognition, so approximate is fine.
667        let native_dpi = PDFTOPPM_DPI.to_string();
668        let Ok(tsv_output) = Command::new("tesseract")
669            .current_dir(temp_dir)
670            .arg(file_name)
671            .arg("stdout")
672            .arg("--dpi")
673            .arg(&native_dpi)
674            .arg("--psm")
675            .arg("6")
676            .arg("-c")
677            .arg("load_system_dawg=0")
678            .arg("-c")
679            .arg("load_freq_dawg=0")
680            .arg("tsv")
681            .output()
682        else {
683            continue;
684        };
685        if !tsv_output.status.success() {
686            continue;
687        }
688
689        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
690        let words = parse_tesseract_tsv(&tsv);
691        if !looks_like_table_ocr(&words) {
692            continue;
693        }
694
695        recovered.extend(words_to_text_chunks(&words, image, text_chunks));
696    }
697
698    recovered
699}
700
701fn table_needs_page_raster_ocr(table: &TableBorder) -> bool {
702    if table.num_rows < 1 || table.num_columns < 2 {
703        return false;
704    }
705
706    let total_cells = table.rows.iter().map(|row| row.cells.len()).sum::<usize>();
707    if total_cells == 0 {
708        return false;
709    }
710
711    let text_cells = table_text_cell_count(table);
712    let text_cell_ratio = text_cells as f64 / total_cells as f64;
713    text_cells == 0 || text_cell_ratio < MIN_RASTER_TABLE_TEXT_CELL_RATIO
714}
715
716fn table_text_cell_count(table: &TableBorder) -> usize {
717    table
718        .rows
719        .iter()
720        .flat_map(|row| row.cells.iter())
721        .filter(|cell| cell_has_substantive_text(cell))
722        .count()
723}
724
725fn cell_has_substantive_text(cell: &TableBorderCell) -> bool {
726    let has_token_text = cell.content.iter().any(|token| {
727        matches!(token.token_type, TableTokenType::Text)
728            && token.base.value.chars().any(|ch| ch.is_alphanumeric())
729    });
730    if has_token_text {
731        return true;
732    }
733
734    cell.contents.iter().any(|elem| match elem {
735        ContentElement::Paragraph(p) => p.base.value().chars().any(|ch| ch.is_alphanumeric()),
736        ContentElement::Heading(h) => h.base.base.value().chars().any(|ch| ch.is_alphanumeric()),
737        ContentElement::NumberHeading(h) => h
738            .base
739            .base
740            .base
741            .value()
742            .chars()
743            .any(|ch| ch.is_alphanumeric()),
744        ContentElement::TextBlock(tb) => tb.value().chars().any(|ch| ch.is_alphanumeric()),
745        ContentElement::TextLine(tl) => tl.value().chars().any(|ch| ch.is_alphanumeric()),
746        ContentElement::TextChunk(tc) => tc.value.chars().any(|ch| ch.is_alphanumeric()),
747        _ => false,
748    })
749}
750
751fn enrich_empty_table_from_page_raster(
752    gray: &GrayImage,
753    page_bbox: &BoundingBox,
754    table: &mut TableBorder,
755) {
756    // Collect empty cells first, so we can OCR the whole table once and then
757    // distribute words into cells. This avoids calling tesseract per cell.
758    let mut empty_cells: Vec<EmptyCellRaster> = Vec::new();
759    for (row_idx, row) in table.rows.iter().enumerate() {
760        for (cell_idx, cell) in row.cells.iter().enumerate() {
761            if cell
762                .content
763                .iter()
764                .any(|token| matches!(token.token_type, TableTokenType::Text))
765            {
766                continue;
767            }
768            let Some((x1, y1, x2, y2)) = page_bbox_to_raster_box(gray, page_bbox, &cell.bbox)
769            else {
770                continue;
771            };
772            empty_cells.push(EmptyCellRaster {
773                row_idx,
774                cell_idx,
775                x1,
776                y1,
777                x2,
778                y2,
779            });
780        }
781    }
782    if empty_cells.is_empty() {
783        return;
784    }
785
786    // Fallback to legacy per-cell OCR when we can't build a stable table crop.
787    let Some((tx1, ty1, tx2, ty2)) = page_bbox_to_raster_box(gray, page_bbox, &table.bbox) else {
788        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
789        return;
790    };
791
792    let pad = CELL_INSET_PX * 2;
793    let crop_left = tx1.saturating_sub(pad);
794    let crop_top = ty1.saturating_sub(pad);
795    let crop_right = (tx2 + pad).min(gray.width());
796    let crop_bottom = (ty2 + pad).min(gray.height());
797    if crop_right <= crop_left || crop_bottom <= crop_top {
798        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
799        return;
800    }
801
802    let crop_width = crop_right - crop_left;
803    let crop_height = crop_bottom - crop_top;
804    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
805        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
806        return;
807    }
808
809    let cropped = gray
810        .view(crop_left, crop_top, crop_width, crop_height)
811        .to_image();
812    let is_bar_chart = is_obvious_bar_chart_raster(&cropped);
813    let is_photo = is_natural_photograph_raster(&cropped);
814    let is_ui = is_dark_ui_screenshot_raster(&cropped);
815    if is_bar_chart || is_photo || is_ui {
816        return;
817    }
818    let bordered = expand_white_border(&cropped, TABLE_RASTER_OCR_BORDER_PX);
819    let scaled = image::imageops::resize(
820        &bordered,
821        bordered.width() * OCR_SCALE_FACTOR,
822        bordered.height() * OCR_SCALE_FACTOR,
823        image::imageops::FilterType::Lanczos3,
824    );
825
826    let Some(words) = run_tesseract_tsv_words(&scaled, "6") else {
827        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
828        return;
829    };
830    if words.is_empty() {
831        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
832        return;
833    }
834    let chart_like = looks_like_chart_label_ocr(&words);
835    if chart_like {
836        return;
837    }
838
839    let mut buckets: Vec<Vec<(u32, u32, String)>> = vec![Vec::new(); empty_cells.len()];
840    let scale = f64::from(OCR_SCALE_FACTOR);
841    let border = f64::from(TABLE_RASTER_OCR_BORDER_PX);
842
843    for word in &words {
844        let cx_scaled = f64::from(word.left) + f64::from(word.width) / 2.0;
845        let cy_scaled = f64::from(word.top) + f64::from(word.height) / 2.0;
846
847        let cx_crop = cx_scaled / scale - border;
848        let cy_crop = cy_scaled / scale - border;
849        if cx_crop < 0.0 || cy_crop < 0.0 {
850            continue;
851        }
852
853        let cx_page = match u32::try_from(cx_crop.round() as i64) {
854            Ok(v) => crop_left.saturating_add(v),
855            Err(_) => continue,
856        };
857        let cy_page = match u32::try_from(cy_crop.round() as i64) {
858            Ok(v) => crop_top.saturating_add(v),
859            Err(_) => continue,
860        };
861
862        for (idx, cell) in empty_cells.iter().enumerate() {
863            if cx_page >= cell.x1 && cx_page < cell.x2 && cy_page >= cell.y1 && cy_page < cell.y2 {
864                buckets[idx].push((cy_page, cx_page, word.text.clone()));
865                break;
866            }
867        }
868    }
869
870    for (idx, cell) in empty_cells.iter().enumerate() {
871        let Some(row) = table.rows.get_mut(cell.row_idx) else {
872            continue;
873        };
874        let Some(target) = row.cells.get_mut(cell.cell_idx) else {
875            continue;
876        };
877        if target
878            .content
879            .iter()
880            .any(|token| matches!(token.token_type, TableTokenType::Text))
881        {
882            continue;
883        }
884        let mut parts = std::mem::take(&mut buckets[idx]);
885        if parts.is_empty() {
886            continue;
887        }
888        parts.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
889        let raw = parts
890            .into_iter()
891            .map(|(_, _, t)| t)
892            .collect::<Vec<_>>()
893            .join(" ");
894        let text = normalize_page_raster_cell_text(&target.bbox, raw);
895        if text.is_empty() {
896            continue;
897        }
898        target.content.push(TableToken {
899            base: TextChunk {
900                value: text,
901                bbox: target.bbox.clone(),
902                font_name: "OCR".to_string(),
903                font_size: target.bbox.height().max(6.0),
904                font_weight: 400.0,
905                italic_angle: 0.0,
906                font_color: "#000000".to_string(),
907                contrast_ratio: 21.0,
908                symbol_ends: Vec::new(),
909                text_format: TextFormat::Normal,
910                text_type: TextType::Regular,
911                pdf_layer: PdfLayer::Content,
912                ocg_visible: true,
913                index: None,
914                page_number: target.bbox.page_number,
915                level: None,
916                mcid: None,
917            },
918            token_type: TableTokenType::Text,
919        });
920    }
921}
922
923fn fill_cells_with_per_cell_ocr(
924    gray: &GrayImage,
925    table: &mut TableBorder,
926    empty_cells: &[EmptyCellRaster],
927) {
928    for cell in empty_cells {
929        let Some(row) = table.rows.get_mut(cell.row_idx) else {
930            continue;
931        };
932        let Some(target) = row.cells.get_mut(cell.cell_idx) else {
933            continue;
934        };
935        if target
936            .content
937            .iter()
938            .any(|token| matches!(token.token_type, TableTokenType::Text))
939        {
940            continue;
941        }
942        let Some(text) =
943            extract_page_raster_cell_text(gray, &target.bbox, cell.x1, cell.y1, cell.x2, cell.y2)
944        else {
945            continue;
946        };
947        if text.is_empty() {
948            continue;
949        }
950        target.content.push(TableToken {
951            base: TextChunk {
952                value: text,
953                bbox: target.bbox.clone(),
954                font_name: "OCR".to_string(),
955                font_size: target.bbox.height().max(6.0),
956                font_weight: 400.0,
957                italic_angle: 0.0,
958                font_color: "#000000".to_string(),
959                contrast_ratio: 21.0,
960                symbol_ends: Vec::new(),
961                text_format: TextFormat::Normal,
962                text_type: TextType::Regular,
963                pdf_layer: PdfLayer::Content,
964                ocg_visible: true,
965                index: None,
966                page_number: target.bbox.page_number,
967                level: None,
968                mcid: None,
969            },
970            token_type: TableTokenType::Text,
971        });
972    }
973}
974
975fn page_bbox_to_raster_box(
976    gray: &GrayImage,
977    page_bbox: &BoundingBox,
978    bbox: &BoundingBox,
979) -> Option<(u32, u32, u32, u32)> {
980    if page_bbox.width() <= 0.0 || page_bbox.height() <= 0.0 {
981        return None;
982    }
983
984    let left = ((bbox.left_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
985        .clamp(0.0, f64::from(gray.width()));
986    let right = ((bbox.right_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
987        .clamp(0.0, f64::from(gray.width()));
988    let top = ((page_bbox.top_y - bbox.top_y) / page_bbox.height() * f64::from(gray.height()))
989        .clamp(0.0, f64::from(gray.height()));
990    let bottom = ((page_bbox.top_y - bbox.bottom_y) / page_bbox.height()
991        * f64::from(gray.height()))
992    .clamp(0.0, f64::from(gray.height()));
993
994    let x1 = left.floor() as u32;
995    let x2 = right.ceil() as u32;
996    let y1 = top.floor() as u32;
997    let y2 = bottom.ceil() as u32;
998    (x2 > x1 && y2 > y1).then_some((x1, y1, x2, y2))
999}
1000
1001fn extract_page_raster_cell_text(
1002    gray: &GrayImage,
1003    cell_bbox: &BoundingBox,
1004    x1: u32,
1005    y1: u32,
1006    x2: u32,
1007    y2: u32,
1008) -> Option<String> {
1009    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
1010    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
1011    let crop_left = x1 + inset_x;
1012    let crop_top = y1 + inset_y;
1013    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
1014    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
1015    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
1016        return Some(String::new());
1017    }
1018
1019    let cropped = gray
1020        .view(crop_left, crop_top, crop_width, crop_height)
1021        .to_image();
1022    let bordered = expand_white_border(&cropped, 12);
1023    let scaled = image::imageops::resize(
1024        &bordered,
1025        bordered.width() * OCR_SCALE_FACTOR,
1026        bordered.height() * OCR_SCALE_FACTOR,
1027        image::imageops::FilterType::Lanczos3,
1028    );
1029
1030    // Improved PSM selection based on cell aspect ratio
1031    let aspect_ratio = cell_bbox.width() / cell_bbox.height();
1032    let is_vertical = aspect_ratio < 0.8;
1033
1034    // PSM modes ordered by likelihood of success for each cell shape.
1035    // Typography rationale:
1036    //   PSM 6  — single uniform text block (multi-line header/paragraph cells)
1037    //   PSM 7  — single text line (most data cells; one baseline)
1038    //   PSM 8  — single word (numeric data, codes, percentages — one token)
1039    //   PSM 11 — sparse text (cells with scattered numbers / partial fills)
1040    //   PSM 13 — raw line (bypasses heuristics; last resort for oddly typeset cells)
1041    // PSM 10 (single character) is intentionally excluded: table cells always
1042    // contain at least one full token, so char-level segmentation yields fragments.
1043    let psm_modes: [&str; 5] = if is_vertical {
1044        ["7", "8", "6", "11", "13"]
1045    } else {
1046        ["6", "7", "8", "11", "13"]
1047    };
1048
1049    let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
1050    Some(normalize_page_raster_cell_text(cell_bbox, raw_text))
1051}
1052
1053fn normalize_page_raster_cell_text(cell_bbox: &BoundingBox, text: String) -> String {
1054    let normalized = text
1055        .replace('|', " ")
1056        .replace('—', "-")
1057        .replace(['“', '”'], "\"")
1058        .replace('’', "'")
1059        .split_whitespace()
1060        .collect::<Vec<_>>()
1061        .join(" ");
1062
1063    if normalized.is_empty() {
1064        return normalized;
1065    }
1066
1067    let narrow_cell = cell_bbox.width() <= cell_bbox.height() * 1.15;
1068    if narrow_cell && normalized.len() <= 3 && !normalized.chars().any(|ch| ch.is_ascii_digit()) {
1069        return String::new();
1070    }
1071
1072    normalized
1073}
1074
1075fn is_ocr_candidate(
1076    image: &ImageChunk,
1077    page_bbox: &BoundingBox,
1078    text_chunks: &[TextChunk],
1079) -> bool {
1080    let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1081    let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1082    if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
1083        return false;
1084    }
1085
1086    let overlapping_chunks: Vec<&TextChunk> = text_chunks
1087        .iter()
1088        .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1089        .collect();
1090    let native_text_chars: usize = overlapping_chunks
1091        .iter()
1092        .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1093        .sum();
1094
1095    native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
1096        || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
1097}
1098
1099fn is_dominant_image_text_candidate(
1100    image: &ImageChunk,
1101    page_bbox: &BoundingBox,
1102    text_chunks: &[TextChunk],
1103) -> bool {
1104    let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1105    let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1106    if width_ratio < MIN_DOMINANT_IMAGE_WIDTH_RATIO || area_ratio < MIN_DOMINANT_IMAGE_AREA_RATIO {
1107        return false;
1108    }
1109
1110    let native_text_chars: usize = text_chunks
1111        .iter()
1112        .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1113        .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1114        .sum();
1115
1116    native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE
1117}
1118
1119fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
1120    let mut words = Vec::new();
1121    for line in tsv.lines().skip(1) {
1122        let mut cols = line.splitn(12, '\t');
1123        let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1124        if level != 5 {
1125            continue;
1126        }
1127        let _page_num = cols.next();
1128        let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1129        let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1130        let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1131        let _word_num = cols.next();
1132        let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1133        let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1134        let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1135        let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1136        let confidence = cols
1137            .next()
1138            .and_then(|s| s.parse::<f64>().ok())
1139            .unwrap_or(-1.0);
1140        let text = cols.next().unwrap_or("").trim().to_string();
1141        if !(MIN_OCR_WORD_CONFIDENCE..=MAX_OCR_WORD_CONFIDENCE).contains(&confidence)
1142            || text.is_empty()
1143            || width == 0
1144            || height == 0
1145            || !text.chars().any(|ch| ch.is_alphanumeric())
1146        {
1147            continue;
1148        }
1149        words.push(OcrWord {
1150            line_key: (block_num, par_num, line_num),
1151            left,
1152            top,
1153            width,
1154            height,
1155            text,
1156            confidence,
1157        });
1158    }
1159    words
1160}
1161
1162fn looks_like_chart_label_ocr(words: &[OcrWord]) -> bool {
1163    if words.len() < 8 {
1164        return false;
1165    }
1166
1167    let min_left = words.iter().map(|word| word.left).min().unwrap_or(0);
1168    let min_top = words.iter().map(|word| word.top).min().unwrap_or(0);
1169    let max_right = words
1170        .iter()
1171        .map(|word| word.left.saturating_add(word.width))
1172        .max()
1173        .unwrap_or(0);
1174    let max_bottom = words
1175        .iter()
1176        .map(|word| word.top.saturating_add(word.height))
1177        .max()
1178        .unwrap_or(0);
1179    let image_width = max_right.saturating_sub(min_left);
1180    let image_height = max_bottom.saturating_sub(min_top);
1181    if image_width < 160 || image_height < 120 {
1182        return false;
1183    }
1184
1185    let width_f = f64::from(image_width);
1186    let height_f = f64::from(image_height);
1187    let outer_x = width_f * 0.18;
1188    let outer_y = height_f * 0.18;
1189    let inner_left = width_f * 0.22;
1190    let inner_right = width_f * 0.78;
1191    let inner_top = height_f * 0.22;
1192    let inner_bottom = height_f * 0.78;
1193
1194    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1195    let mut outer_words = 0usize;
1196    let mut inner_words = 0usize;
1197
1198    for word in words {
1199        by_line.entry(word.line_key).or_default().push(word);
1200
1201        let center_x = f64::from(word.left.saturating_sub(min_left)) + f64::from(word.width) / 2.0;
1202        let center_y = f64::from(word.top.saturating_sub(min_top)) + f64::from(word.height) / 2.0;
1203
1204        if center_x <= outer_x
1205            || center_x >= width_f - outer_x
1206            || center_y <= outer_y
1207            || center_y >= height_f - outer_y
1208        {
1209            outer_words += 1;
1210        }
1211
1212        if center_x >= inner_left
1213            && center_x <= inner_right
1214            && center_y >= inner_top
1215            && center_y <= inner_bottom
1216        {
1217            inner_words += 1;
1218        }
1219    }
1220
1221    if by_line.len() < 5 {
1222        return false;
1223    }
1224
1225    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1226    let mut clusters: Vec<XCluster> = Vec::new();
1227    for line_words in by_line.values() {
1228        for word in line_words {
1229            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1230            if let Some(cluster) = clusters
1231                .iter_mut()
1232                .find(|cluster| (cluster.center - center).abs() <= tolerance)
1233            {
1234                cluster.center =
1235                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1236                cluster.count += 1;
1237                cluster.lines.insert(word.line_key);
1238            } else {
1239                let mut lines = HashSet::new();
1240                lines.insert(word.line_key);
1241                clusters.push(XCluster {
1242                    center,
1243                    count: 1,
1244                    lines,
1245                });
1246            }
1247        }
1248    }
1249
1250    let stable_centers: Vec<f64> = clusters
1251        .iter()
1252        .filter(|cluster| cluster.lines.len() >= 4 && cluster.count >= 4)
1253        .map(|cluster| cluster.center)
1254        .collect();
1255    let mut sorted_stable_centers = stable_centers.clone();
1256    sorted_stable_centers
1257        .sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1258    let max_stable_gap = sorted_stable_centers
1259        .windows(2)
1260        .map(|pair| pair[1] - pair[0])
1261        .fold(0.0, f64::max);
1262    let spans_full_table_width = stable_centers.len() >= 3
1263        && stable_centers
1264            .iter()
1265            .any(|center| *center - f64::from(min_left) <= width_f * 0.25)
1266        && stable_centers
1267            .iter()
1268            .any(|center| *center - f64::from(min_left) >= width_f * 0.75)
1269        && stable_centers.iter().any(|center| {
1270            let rel = *center - f64::from(min_left);
1271            rel >= inner_left && rel <= inner_right
1272        })
1273        && max_stable_gap <= width_f * 0.45;
1274    if spans_full_table_width {
1275        let table_like_lines = by_line
1276            .values()
1277            .filter(|line_words| {
1278                let mut seen = HashSet::<usize>::new();
1279                for word in *line_words {
1280                    let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1281                    for (idx, stable_center) in stable_centers.iter().enumerate() {
1282                        if (center - stable_center).abs() <= tolerance {
1283                            seen.insert(idx);
1284                        }
1285                    }
1286                }
1287                seen.len() >= 3
1288            })
1289            .count();
1290        if table_like_lines >= 4 {
1291            return false;
1292        }
1293    }
1294
1295    let mut short_lines = 0usize;
1296    let mut peripheral_label_lines = 0usize;
1297    let mut wide_sentence_lines = 0usize;
1298    let mut axisish_numeric_lines = 0usize;
1299
1300    for line_words in by_line.values() {
1301        let line_left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
1302        let line_top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
1303        let line_right = line_words
1304            .iter()
1305            .map(|word| word.left.saturating_add(word.width))
1306            .max()
1307            .unwrap_or(0);
1308        let line_bottom = line_words
1309            .iter()
1310            .map(|word| word.top.saturating_add(word.height))
1311            .max()
1312            .unwrap_or(0);
1313        if line_right <= line_left || line_bottom <= line_top {
1314            continue;
1315        }
1316
1317        let word_count = line_words.len();
1318        let numeric_in_line = line_words
1319            .iter()
1320            .filter(|word| is_numeric_like(&word.text))
1321            .count();
1322        let line_width_ratio =
1323            f64::from(line_right.saturating_sub(line_left)) / f64::from(image_width.max(1));
1324        let touches_outer_band = f64::from(line_left.saturating_sub(min_left)) <= outer_x
1325            || f64::from(line_right.saturating_sub(min_left)) >= width_f - outer_x
1326            || f64::from(line_top.saturating_sub(min_top)) <= outer_y
1327            || f64::from(line_bottom.saturating_sub(min_top)) >= height_f - outer_y;
1328
1329        if word_count <= 3 {
1330            short_lines += 1;
1331        }
1332        if touches_outer_band && word_count <= 4 {
1333            peripheral_label_lines += 1;
1334        }
1335        if touches_outer_band && word_count <= 3 && numeric_in_line > 0 {
1336            axisish_numeric_lines += 1;
1337        }
1338        if word_count >= 4 && line_width_ratio >= 0.45 && numeric_in_line == 0 {
1339            wide_sentence_lines += 1;
1340        }
1341    }
1342
1343    let total_lines = by_line.len();
1344    let outer_dominant = outer_words * 10 >= words.len() * 5;
1345    let inner_sparse = inner_words * 10 <= words.len() * 5;
1346    let label_dominant = peripheral_label_lines * 10 >= total_lines * 6;
1347    let short_line_dominant = short_lines * 10 >= total_lines * 6;
1348    let axis_signal = axisish_numeric_lines >= 2;
1349
1350    outer_dominant
1351        && inner_sparse
1352        && label_dominant
1353        && short_line_dominant
1354        && axis_signal
1355        && wide_sentence_lines <= 2
1356}
1357
1358fn looks_like_matrix_formula_ocr(words: &[OcrWord]) -> bool {
1359    if words.len() < 6 {
1360        return false;
1361    }
1362
1363    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1364    for word in words {
1365        by_line.entry(word.line_key).or_default().push(word);
1366    }
1367
1368    if by_line.len() < 2 || by_line.len() > 4 {
1369        return false;
1370    }
1371
1372    let substantive_words = words
1373        .iter()
1374        .filter(|word| is_substantive_table_word(&word.text))
1375        .count();
1376    let short_formulaish_words = words
1377        .iter()
1378        .filter(|word| is_short_formulaish_word(&word.text))
1379        .count();
1380    let slash_words = words.iter().filter(|word| word.text.contains('/')).count();
1381    let equation_label_words = words
1382        .iter()
1383        .filter(|word| looks_like_equation_label_word(&word.text))
1384        .count();
1385    let dense_lines = by_line.values().filter(|line| line.len() >= 3).count();
1386    let short_lines = by_line
1387        .values()
1388        .filter(|line| line.iter().all(|word| is_short_formulaish_word(&word.text)))
1389        .count();
1390
1391    substantive_words == 0
1392        && dense_lines >= 2
1393        && short_lines * 10 >= by_line.len() * 7
1394        && short_formulaish_words * 10 >= words.len() * 7
1395        && (slash_words > 0 || equation_label_words >= 2)
1396}
1397
/// Returns `true` when `text` carries enough content to count as a real table word.
///
/// Only alphanumeric characters are considered (lowercased). A word is substantive
/// when it has at least four letters, or is a digit-only token of two or more digits
/// containing something other than `0`/`1`, or has at least five alphanumeric
/// characters of which two or more are letters.
///
/// Lengths are measured in characters, not UTF-8 bytes, so accented or other
/// multi-byte letters are not over-counted.
fn is_substantive_table_word(text: &str) -> bool {
    let normalized: String = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
        .collect();
    if normalized.is_empty() {
        return false;
    }

    // `String::len` would return bytes; a three-letter accented word is 6 bytes.
    let char_count = normalized.chars().count();
    let alpha_count = normalized.chars().filter(|ch| ch.is_alphabetic()).count();
    let digit_count = normalized.chars().filter(|ch| ch.is_ascii_digit()).count();
    let has_non_binary_digit = normalized
        .chars()
        .any(|ch| ch.is_ascii_digit() && !matches!(ch, '0' | '1'));

    alpha_count >= 4
        || (digit_count >= 2 && alpha_count == 0 && has_non_binary_digit)
        || (char_count >= 5 && alpha_count >= 2)
}
1418
/// Returns `true` when `text` is short enough to look like a formula fragment.
///
/// Only alphanumeric characters are counted (lowercased). A token with no
/// alphanumeric characters qualifies, as does one with at most three of them, or at
/// most four when the raw text contains `/`.
///
/// Lengths are measured in characters, not UTF-8 bytes, so a two-letter accented
/// token is still treated as short.
fn is_short_formulaish_word(text: &str) -> bool {
    // `String::len` would count bytes; count characters of the normalized form instead.
    let char_count = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
        .count();

    char_count <= 3 || (text.contains('/') && char_count <= 4)
}
1431
/// Returns `true` for tokens shaped like an equation label: after stripping leading
/// and trailing non-alphanumeric characters, one ASCII uppercase letter followed by
/// one to three ASCII digits and nothing else (for example `A1` or `(B12)`).
fn looks_like_equation_label_word(text: &str) -> bool {
    let core = text.trim_matches(|ch: char| !ch.is_alphanumeric());
    let mut rest = core.chars();
    match rest.next() {
        Some(lead) if lead.is_ascii_alphabetic() && lead.is_ascii_uppercase() => {
            // Whatever follows the leading letter must be a short run of digits.
            let suffix = rest.as_str();
            !suffix.is_empty()
                && suffix.len() <= 3
                && suffix.chars().all(|ch| ch.is_ascii_digit())
        }
        _ => false,
    }
}
1445
1446fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
1447    if words.len() < 8 {
1448        return false;
1449    }
1450
1451    if looks_like_chart_label_ocr(words) {
1452        return false;
1453    }
1454
1455    if looks_like_matrix_formula_ocr(words) {
1456        return false;
1457    }
1458
1459    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1460    for word in words {
1461        by_line.entry(word.line_key).or_default().push(word);
1462    }
1463
1464    let mut qualifying_lines = Vec::new();
1465    let mut numeric_like_count = 0usize;
1466    let mut max_right = 0u32;
1467    for line_words in by_line.values_mut() {
1468        line_words.sort_by_key(|word| word.left);
1469        let numeric_words = line_words
1470            .iter()
1471            .filter(|word| is_numeric_like(&word.text))
1472            .count();
1473        numeric_like_count += numeric_words;
1474        if line_words.len() >= 3 || numeric_words >= 2 {
1475            max_right = max_right.max(
1476                line_words
1477                    .iter()
1478                    .map(|word| word.left.saturating_add(word.width))
1479                    .max()
1480                    .unwrap_or(0),
1481            );
1482            qualifying_lines.push(line_words.clone());
1483        }
1484    }
1485
1486    if qualifying_lines.len() < 2 {
1487        return false;
1488    }
1489
1490    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1491    let mut clusters: Vec<XCluster> = Vec::new();
1492    for line in &qualifying_lines {
1493        for word in line {
1494            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1495            if let Some(cluster) = clusters
1496                .iter_mut()
1497                .find(|cluster| (cluster.center - center).abs() <= tolerance)
1498            {
1499                cluster.center =
1500                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1501                cluster.count += 1;
1502                cluster.lines.insert(word.line_key);
1503            } else {
1504                let mut lines = HashSet::new();
1505                lines.insert(word.line_key);
1506                clusters.push(XCluster {
1507                    center,
1508                    count: 1,
1509                    lines,
1510                });
1511            }
1512        }
1513    }
1514
1515    let repeated_clusters: Vec<&XCluster> = clusters
1516        .iter()
1517        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
1518        .collect();
1519    if repeated_clusters.len() < 3 {
1520        return false;
1521    }
1522
1523    let repeated_centers: Vec<f64> = repeated_clusters
1524        .iter()
1525        .map(|cluster| cluster.center)
1526        .collect();
1527    let structured_lines = qualifying_lines
1528        .iter()
1529        .filter(|line| {
1530            let mut seen = HashSet::<usize>::new();
1531            for word in *line {
1532                let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1533                for (idx, repeated_center) in repeated_centers.iter().enumerate() {
1534                    if (center - repeated_center).abs() <= tolerance {
1535                        seen.insert(idx);
1536                    }
1537                }
1538            }
1539            seen.len() >= 3
1540                || (seen.len() >= 2
1541                    && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
1542        })
1543        .count();
1544
1545    let alphabetic_words = words
1546        .iter()
1547        .filter(|word| word.text.chars().any(|ch| ch.is_alphabetic()))
1548        .count();
1549
1550    // Geometric guard: repeated vertical bands alone are not enough for tables.
1551    // Dense prose in infographics often forms stable x-clusters but lacks numeric
1552    // signal. Require either numeric evidence or stronger column multiplicity.
1553    if numeric_like_count == 0
1554        && alphabetic_words * 10 >= words.len() * 9
1555        && repeated_clusters.len() <= 4
1556    {
1557        return false;
1558    }
1559
1560    structured_lines >= 3
1561        || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
1562}
1563
1564fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
1565    if !looks_like_table_ocr(words) {
1566        return false;
1567    }
1568
1569    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1570    for word in words {
1571        by_line.entry(word.line_key).or_default().push(word);
1572    }
1573
1574    let numeric_like_count = words
1575        .iter()
1576        .filter(|word| is_numeric_like(&word.text))
1577        .count();
1578    let numeric_lines = by_line
1579        .values()
1580        .filter(|line| {
1581            line.iter()
1582                .filter(|word| is_numeric_like(&word.text))
1583                .count()
1584                >= 2
1585        })
1586        .count();
1587
1588    numeric_like_count >= 12 && numeric_lines >= 3
1589}
1590
1591fn looks_like_dense_prose_image_ocr(words: &[OcrWord]) -> bool {
1592    if words.len() < MIN_DOMINANT_IMAGE_OCR_WORDS || looks_like_table_ocr(words) {
1593        return false;
1594    }
1595
1596    if looks_like_chart_label_ocr(words) {
1597        return false;
1598    }
1599
1600    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1601    let mut alphabetic_words = 0usize;
1602    let mut numeric_like_words = 0usize;
1603    for word in words {
1604        by_line.entry(word.line_key).or_default().push(word);
1605        if word.text.chars().any(|ch| ch.is_alphabetic()) {
1606            alphabetic_words += 1;
1607        }
1608        if is_numeric_like(&word.text) {
1609            numeric_like_words += 1;
1610        }
1611    }
1612
1613    if by_line.len() < MIN_DOMINANT_IMAGE_TEXT_LINES || alphabetic_words * 3 < words.len() * 2 {
1614        return false;
1615    }
1616    if numeric_like_words * 4 > words.len() {
1617        return false;
1618    }
1619
1620    let multiword_lines = by_line
1621        .values()
1622        .filter(|line| line.iter().filter(|word| word.text.len() >= 2).count() >= 3)
1623        .count();
1624    multiword_lines >= 4 && has_dense_prose_block_geometry(words)
1625}
1626
1627fn has_dense_prose_block_geometry(words: &[OcrWord]) -> bool {
1628    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1629    for word in words {
1630        by_line.entry(word.line_key).or_default().push(word);
1631    }
1632
1633    let mut spatial_lines = Vec::new();
1634    for line_words in by_line.values() {
1635        if line_words.len() < 3 {
1636            continue;
1637        }
1638
1639        let left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
1640        let right = line_words
1641            .iter()
1642            .map(|word| word.left.saturating_add(word.width))
1643            .max()
1644            .unwrap_or(0);
1645        let top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
1646        let bottom = line_words
1647            .iter()
1648            .map(|word| word.top.saturating_add(word.height))
1649            .max()
1650            .unwrap_or(0);
1651
1652        if right <= left || bottom <= top {
1653            continue;
1654        }
1655
1656        spatial_lines.push(SpatialOcrLine {
1657            left,
1658            top,
1659            right,
1660            bottom,
1661            text: String::new(),
1662            word_count: line_words.len(),
1663            line_count: 1,
1664            line_height_sum: bottom.saturating_sub(top).max(1),
1665        });
1666    }
1667
1668    spatial_lines.sort_by_key(|line| (line.top, line.left));
1669    if spatial_lines.len() < MIN_DENSE_PROSE_BLOCK_LINES {
1670        return false;
1671    }
1672
1673    let image_width = spatial_lines
1674        .iter()
1675        .map(|line| line.right)
1676        .max()
1677        .unwrap_or(0);
1678    if image_width == 0 {
1679        return false;
1680    }
1681
1682    let median_height = {
1683        let mut heights: Vec<u32> = spatial_lines
1684            .iter()
1685            .map(|line| line.bottom.saturating_sub(line.top).max(1))
1686            .collect();
1687        heights.sort_unstable();
1688        heights[heights.len() / 2]
1689    };
1690
1691    let mut best_line_count = 1usize;
1692    let mut best_left = spatial_lines[0].left;
1693    let mut best_right = spatial_lines[0].right;
1694    let mut current_line_count = 1usize;
1695    let mut current_left = spatial_lines[0].left;
1696    let mut current_right = spatial_lines[0].right;
1697
1698    for pair in spatial_lines.windows(2) {
1699        let prev = &pair[0];
1700        let curr = &pair[1];
1701        if spatial_lines_share_block_geometry(prev, curr, image_width, median_height) {
1702            current_line_count += 1;
1703            current_left = current_left.min(curr.left);
1704            current_right = current_right.max(curr.right);
1705        } else {
1706            if current_line_count > best_line_count {
1707                best_line_count = current_line_count;
1708                best_left = current_left;
1709                best_right = current_right;
1710            }
1711            current_line_count = 1;
1712            current_left = curr.left;
1713            current_right = curr.right;
1714        }
1715    }
1716
1717    if current_line_count > best_line_count {
1718        best_line_count = current_line_count;
1719        best_left = current_left;
1720        best_right = current_right;
1721    }
1722
1723    let block_width_ratio =
1724        f64::from(best_right.saturating_sub(best_left)) / f64::from(image_width);
1725    best_line_count >= MIN_DENSE_PROSE_BLOCK_LINES
1726        && block_width_ratio >= MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO
1727}
1728
1729fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
1730    let image_width = words
1731        .iter()
1732        .map(|word| word.left.saturating_add(word.width))
1733        .max()?;
1734    let image_height = words
1735        .iter()
1736        .map(|word| word.top.saturating_add(word.height))
1737        .max()?;
1738    if image_width == 0 || image_height == 0 {
1739        return None;
1740    }
1741
1742    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1743    for word in words {
1744        by_line.entry(word.line_key).or_default().push(word);
1745    }
1746
1747    let max_right = words
1748        .iter()
1749        .map(|word| word.left.saturating_add(word.width))
1750        .max()
1751        .unwrap_or(0);
1752    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1753
1754    let mut clusters: Vec<XCluster> = Vec::new();
1755    for line_words in by_line.values() {
1756        for word in line_words {
1757            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1758            if let Some(cluster) = clusters
1759                .iter_mut()
1760                .find(|cluster| (cluster.center - center).abs() <= tolerance)
1761            {
1762                cluster.center =
1763                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1764                cluster.count += 1;
1765                cluster.lines.insert(word.line_key);
1766            } else {
1767                let mut lines = HashSet::new();
1768                lines.insert(word.line_key);
1769                clusters.push(XCluster {
1770                    center,
1771                    count: 1,
1772                    lines,
1773                });
1774            }
1775        }
1776    }
1777    let mut centers: Vec<f64> = clusters
1778        .into_iter()
1779        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
1780        .map(|cluster| cluster.center)
1781        .collect();
1782    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1783    if centers.len() < 3 {
1784        return None;
1785    }
1786
1787    let mut built_rows = Vec::<OcrRowBuild>::new();
1788    let mut row_fill_counts = Vec::<usize>::new();
1789    for line_words in by_line.values() {
1790        let mut sorted_words = line_words.clone();
1791        sorted_words.sort_by_key(|word| word.left);
1792
1793        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
1794        for word in &sorted_words {
1795            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1796            if let Some((col_idx, distance)) = centers
1797                .iter()
1798                .enumerate()
1799                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
1800                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
1801            {
1802                if distance <= tolerance {
1803                    cells[col_idx].push(word);
1804                }
1805            }
1806        }
1807
1808        let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
1809        let numeric_cells = cells
1810            .iter()
1811            .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
1812            .count();
1813        if filled_cells < 3 && numeric_cells < 2 {
1814            continue;
1815        }
1816        row_fill_counts.push(filled_cells);
1817
1818        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
1819        let bottom_px = sorted_words
1820            .iter()
1821            .map(|word| word.top.saturating_add(word.height))
1822            .max()
1823            .unwrap_or(0);
1824        let top_y =
1825            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
1826        let bottom_y = image.bbox.top_y
1827            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
1828        let cell_texts = cells
1829            .iter()
1830            .map(|cell_words| {
1831                cell_words
1832                    .iter()
1833                    .map(|word| word.text.as_str())
1834                    .collect::<Vec<_>>()
1835                    .join(" ")
1836            })
1837            .collect();
1838        built_rows.push(OcrRowBuild {
1839            top_y,
1840            bottom_y,
1841            cell_texts,
1842        });
1843    }
1844
1845    if built_rows.len() < 2 {
1846        return None;
1847    }
1848    if row_fill_counts.is_empty() {
1849        return None;
1850    }
1851
1852    let mut sorted_fill_counts = row_fill_counts.clone();
1853    sorted_fill_counts.sort_unstable();
1854    let median_fill_ratio =
1855        sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
1856    if median_fill_ratio < MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO {
1857        return None;
1858    }
1859
1860    built_rows.sort_by(|a, b| {
1861        b.top_y
1862            .partial_cmp(&a.top_y)
1863            .unwrap_or(std::cmp::Ordering::Equal)
1864    });
1865    let x_coordinates = build_boundaries_from_centers(
1866        &centers,
1867        image.bbox.left_x,
1868        image.bbox.right_x,
1869        image_width,
1870    );
1871    let row_bounds: Vec<(f64, f64)> = built_rows
1872        .iter()
1873        .map(|row| (row.top_y, row.bottom_y))
1874        .collect();
1875    let y_coordinates = build_row_boundaries(&row_bounds);
1876    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
1877        return None;
1878    }
1879
1880    let mut rows = Vec::new();
1881    for (row_idx, row_build) in built_rows.iter().enumerate() {
1882        let row_bbox = BoundingBox::new(
1883            image.bbox.page_number,
1884            image.bbox.left_x,
1885            y_coordinates[row_idx + 1],
1886            image.bbox.right_x,
1887            y_coordinates[row_idx],
1888        );
1889        let mut cells = Vec::new();
1890        for col_idx in 0..centers.len() {
1891            let cell_bbox = BoundingBox::new(
1892                image.bbox.page_number,
1893                x_coordinates[col_idx],
1894                y_coordinates[row_idx + 1],
1895                x_coordinates[col_idx + 1],
1896                y_coordinates[row_idx],
1897            );
1898            let text = row_build
1899                .cell_texts
1900                .get(col_idx)
1901                .cloned()
1902                .unwrap_or_default();
1903            let mut content = Vec::new();
1904            if !text.trim().is_empty() {
1905                content.push(TableToken {
1906                    base: TextChunk {
1907                        value: text.trim().to_string(),
1908                        bbox: cell_bbox.clone(),
1909                        font_name: "OCR".to_string(),
1910                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
1911                        font_weight: 400.0,
1912                        italic_angle: 0.0,
1913                        font_color: "#000000".to_string(),
1914                        contrast_ratio: 21.0,
1915                        symbol_ends: Vec::new(),
1916                        text_format: TextFormat::Normal,
1917                        text_type: TextType::Regular,
1918                        pdf_layer: PdfLayer::Content,
1919                        ocg_visible: true,
1920                        index: None,
1921                        page_number: image.bbox.page_number,
1922                        level: None,
1923                        mcid: None,
1924                    },
1925                    token_type: TableTokenType::Text,
1926                });
1927            }
1928            cells.push(TableBorderCell {
1929                bbox: cell_bbox,
1930                index: None,
1931                level: None,
1932                row_number: row_idx,
1933                col_number: col_idx,
1934                row_span: 1,
1935                col_span: 1,
1936                content,
1937                contents: Vec::new(),
1938                semantic_type: None,
1939            });
1940        }
1941        rows.push(TableBorderRow {
1942            bbox: row_bbox,
1943            index: None,
1944            level: None,
1945            row_number: row_idx,
1946            cells,
1947            semantic_type: None,
1948        });
1949    }
1950
1951    Some(TableBorder {
1952        bbox: image.bbox.clone(),
1953        index: None,
1954        level: None,
1955        x_coordinates: x_coordinates.clone(),
1956        x_widths: vec![0.0; x_coordinates.len()],
1957        y_coordinates: y_coordinates.clone(),
1958        y_widths: vec![0.0; y_coordinates.len()],
1959        rows,
1960        num_rows: built_rows.len(),
1961        num_columns: centers.len(),
1962        is_bad_table: false,
1963        is_table_transformer: true,
1964        previous_table: None,
1965        next_table: None,
1966    })
1967}
1968
1969fn build_structured_ocr_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
1970    let image_width = words
1971        .iter()
1972        .map(|word| word.left.saturating_add(word.width))
1973        .max()?;
1974    let image_height = words
1975        .iter()
1976        .map(|word| word.top.saturating_add(word.height))
1977        .max()?;
1978    if image_width == 0 || image_height == 0 {
1979        return None;
1980    }
1981
1982    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1983    for word in words {
1984        by_line.entry(word.line_key).or_default().push(word);
1985    }
1986
1987    let max_right = words
1988        .iter()
1989        .map(|word| word.left.saturating_add(word.width))
1990        .max()
1991        .unwrap_or(0);
1992    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1993
1994    let mut clusters: Vec<XCluster> = Vec::new();
1995    for line_words in by_line.values() {
1996        for word in line_words {
1997            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1998            if let Some(cluster) = clusters
1999                .iter_mut()
2000                .find(|cluster| (cluster.center - center).abs() <= tolerance)
2001            {
2002                cluster.center =
2003                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
2004                cluster.count += 1;
2005                cluster.lines.insert(word.line_key);
2006            } else {
2007                let mut lines = HashSet::new();
2008                lines.insert(word.line_key);
2009                clusters.push(XCluster {
2010                    center,
2011                    count: 1,
2012                    lines,
2013                });
2014            }
2015        }
2016    }
2017
2018    let mut centers: Vec<f64> = clusters
2019        .into_iter()
2020        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
2021        .map(|cluster| cluster.center)
2022        .collect();
2023    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
2024    if centers.len() < 3 {
2025        return None;
2026    }
2027
2028    let mut built_rows = Vec::<OcrRowBuild>::new();
2029    let mut row_fill_counts = Vec::<usize>::new();
2030    let mut occupied_columns = vec![0usize; centers.len()];
2031
2032    for line_words in by_line.values() {
2033        let mut sorted_words = line_words.clone();
2034        sorted_words.sort_by_key(|word| word.left);
2035
2036        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
2037        for word in &sorted_words {
2038            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
2039            if let Some((col_idx, distance)) = centers
2040                .iter()
2041                .enumerate()
2042                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
2043                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
2044            {
2045                if distance <= tolerance {
2046                    cells[col_idx].push(word);
2047                }
2048            }
2049        }
2050
2051        let filled_indices: Vec<usize> = cells
2052            .iter()
2053            .enumerate()
2054            .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
2055            .collect();
2056        if filled_indices.len() < 2 {
2057            continue;
2058        }
2059
2060        let span = filled_indices.last().unwrap_or(&0) - filled_indices.first().unwrap_or(&0) + 1;
2061        if filled_indices.len() < 3 && span < 3 {
2062            continue;
2063        }
2064
2065        row_fill_counts.push(filled_indices.len());
2066        for idx in &filled_indices {
2067            if let Some(count) = occupied_columns.get_mut(*idx) {
2068                *count += 1;
2069            }
2070        }
2071
2072        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
2073        let bottom_px = sorted_words
2074            .iter()
2075            .map(|word| word.top.saturating_add(word.height))
2076            .max()
2077            .unwrap_or(0);
2078        let top_y =
2079            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
2080        let bottom_y = image.bbox.top_y
2081            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
2082        let cell_texts = cells
2083            .iter()
2084            .map(|cell_words| {
2085                let mut sorted_cell_words = cell_words.clone();
2086                sorted_cell_words.sort_by_key(|word| word.left);
2087                sorted_cell_words
2088                    .iter()
2089                    .map(|word| word.text.as_str())
2090                    .collect::<Vec<_>>()
2091                    .join(" ")
2092            })
2093            .collect();
2094        built_rows.push(OcrRowBuild {
2095            top_y,
2096            bottom_y,
2097            cell_texts,
2098        });
2099    }
2100
2101    if built_rows.len() < 3 || row_fill_counts.is_empty() {
2102        return None;
2103    }
2104
2105    let repeated_columns = occupied_columns.iter().filter(|count| **count >= 2).count();
2106    if repeated_columns < 3 {
2107        return None;
2108    }
2109
2110    let mut sorted_fill_counts = row_fill_counts.clone();
2111    sorted_fill_counts.sort_unstable();
2112    let median_fill_ratio =
2113        sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
2114    if median_fill_ratio < 0.5 {
2115        return None;
2116    }
2117
2118    built_rows.sort_by(|a, b| {
2119        b.top_y
2120            .partial_cmp(&a.top_y)
2121            .unwrap_or(std::cmp::Ordering::Equal)
2122    });
2123    let x_coordinates = build_boundaries_from_centers(
2124        &centers,
2125        image.bbox.left_x,
2126        image.bbox.right_x,
2127        image_width,
2128    );
2129    let row_bounds: Vec<(f64, f64)> = built_rows
2130        .iter()
2131        .map(|row| (row.top_y, row.bottom_y))
2132        .collect();
2133    let y_coordinates = build_row_boundaries(&row_bounds);
2134    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
2135        return None;
2136    }
2137
2138    let mut rows = Vec::new();
2139    for (row_idx, row_build) in built_rows.iter().enumerate() {
2140        let row_bbox = BoundingBox::new(
2141            image.bbox.page_number,
2142            image.bbox.left_x,
2143            y_coordinates[row_idx + 1],
2144            image.bbox.right_x,
2145            y_coordinates[row_idx],
2146        );
2147        let mut cells = Vec::new();
2148        for col_idx in 0..centers.len() {
2149            let cell_bbox = BoundingBox::new(
2150                image.bbox.page_number,
2151                x_coordinates[col_idx],
2152                y_coordinates[row_idx + 1],
2153                x_coordinates[col_idx + 1],
2154                y_coordinates[row_idx],
2155            );
2156            let text = row_build
2157                .cell_texts
2158                .get(col_idx)
2159                .cloned()
2160                .unwrap_or_default();
2161            let mut content = Vec::new();
2162            if !text.trim().is_empty() {
2163                content.push(TableToken {
2164                    base: TextChunk {
2165                        value: text.trim().to_string(),
2166                        bbox: cell_bbox.clone(),
2167                        font_name: "OCR".to_string(),
2168                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
2169                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
2170                        italic_angle: 0.0,
2171                        font_color: "#000000".to_string(),
2172                        contrast_ratio: 21.0,
2173                        symbol_ends: Vec::new(),
2174                        text_format: TextFormat::Normal,
2175                        text_type: TextType::Regular,
2176                        pdf_layer: PdfLayer::Content,
2177                        ocg_visible: true,
2178                        index: None,
2179                        page_number: image.bbox.page_number,
2180                        level: None,
2181                        mcid: None,
2182                    },
2183                    token_type: TableTokenType::Text,
2184                });
2185            }
2186            cells.push(TableBorderCell {
2187                bbox: cell_bbox,
2188                index: None,
2189                level: None,
2190                row_number: row_idx,
2191                col_number: col_idx,
2192                row_span: 1,
2193                col_span: 1,
2194                content,
2195                contents: Vec::new(),
2196                semantic_type: None,
2197            });
2198        }
2199        rows.push(TableBorderRow {
2200            bbox: row_bbox,
2201            index: None,
2202            level: None,
2203            row_number: row_idx,
2204            cells,
2205            semantic_type: None,
2206        });
2207    }
2208
2209    Some(TableBorder {
2210        bbox: image.bbox.clone(),
2211        index: None,
2212        level: None,
2213        x_coordinates: x_coordinates.clone(),
2214        x_widths: vec![0.0; x_coordinates.len()],
2215        y_coordinates: y_coordinates.clone(),
2216        y_widths: vec![0.0; y_coordinates.len()],
2217        rows,
2218        num_rows: built_rows.len(),
2219        num_columns: centers.len(),
2220        is_bad_table: false,
2221        is_table_transformer: true,
2222        previous_table: None,
2223        next_table: None,
2224    })
2225}
2226
2227fn is_matrixish_ocr_artifact_table(table: &TableBorder) -> bool {
2228    if !table.is_table_transformer
2229        || table.num_rows < 2
2230        || table.num_rows > 4
2231        || table.num_columns < 3
2232        || table.bbox.height() > table.bbox.width() * 0.55
2233    {
2234        return false;
2235    }
2236
2237    let texts: Vec<String> = table
2238        .rows
2239        .iter()
2240        .flat_map(|row| row.cells.iter())
2241        .map(table_cell_text)
2242        .filter(|text| !text.is_empty())
2243        .collect();
2244    if texts.len() < 6 {
2245        return false;
2246    }
2247
2248    let substantive_cells = texts
2249        .iter()
2250        .filter(|text| is_substantive_ocr_cell_text(text))
2251        .count();
2252    let short_cells = texts
2253        .iter()
2254        .filter(|text| is_short_ocr_cell_text(text))
2255        .count();
2256    let ambiguous_cells = texts
2257        .iter()
2258        .filter(|text| is_ambiguous_matrix_cell_text(text))
2259        .count();
2260
2261    substantive_cells == 0
2262        && short_cells * 10 >= texts.len() * 8
2263        && ambiguous_cells * 10 >= texts.len() * 5
2264}
2265
2266fn table_cell_text(cell: &TableBorderCell) -> String {
2267    cell.content
2268        .iter()
2269        .map(|token| token.base.value.trim())
2270        .filter(|value| !value.is_empty())
2271        .collect::<Vec<_>>()
2272        .join(" ")
2273}
2274
2275fn is_substantive_ocr_cell_text(text: &str) -> bool {
2276    text.split_whitespace().any(is_substantive_table_word)
2277}
2278
/// Returns `true` when `text` contains between one and four alphanumeric characters.
///
/// Punctuation and whitespace are ignored. The count is taken in characters, not UTF-8
/// bytes, so short non-ASCII cells such as `"été"` are classified the same way as short
/// ASCII cells.
fn is_short_ocr_cell_text(text: &str) -> bool {
    // `take(5)` stops scanning as soon as the cell is known to be too long.
    let alphanumeric_chars = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .take(5)
        .count();
    (1..=4).contains(&alphanumeric_chars)
}
2287
/// Returns `true` when `text` looks like a matrix/diagram glyph rather than table content.
///
/// That is the case when it contains a slash, backslash, `=`, `|` or any bracket, or when
/// its alphanumeric characters (lower-cased) number one to four and are all drawn from
/// `0 1 o d q i l`.
fn is_ambiguous_matrix_cell_text(text: &str) -> bool {
    let has_structural_symbol = text.chars().any(|ch| {
        matches!(
            ch,
            '/' | '\\' | '=' | '|' | '[' | ']' | '{' | '}' | '(' | ')'
        )
    });
    if has_structural_symbol {
        return true;
    }

    let mut glyph_count = 0usize;
    let lowered = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase);
    for ch in lowered {
        // Every accepted glyph is ASCII, so counting characters equals counting bytes.
        if !matches!(ch, '0' | '1' | 'o' | 'd' | 'q' | 'i' | 'l') {
            return false;
        }
        glyph_count += 1;
    }
    (1..=4).contains(&glyph_count)
}
2304
2305fn recover_bordered_raster_caption(image_path: &Path, image: &ImageChunk) -> Option<TextChunk> {
2306    let gray = image::open(image_path).ok()?.to_luma8();
2307    recover_bordered_raster_caption_from_gray(&gray, image)
2308}
2309
2310fn recover_bordered_raster_caption_from_gray(
2311    gray: &GrayImage,
2312    image: &ImageChunk,
2313) -> Option<TextChunk> {
2314    let grid = detect_bordered_raster_grid(gray)?;
2315    let first_h = *grid.horizontal_lines.first()?;
2316    if first_h <= 2 {
2317        return None;
2318    }
2319
2320    let crop = gray.view(0, 0, gray.width(), first_h).to_image();
2321    let caption_text = normalize_caption_text(&run_tesseract_plain_text(&crop, "7")?);
2322    if caption_text.is_empty() || !caption_text.chars().any(|ch| ch.is_alphabetic()) {
2323        return None;
2324    }
2325
2326    let bbox = raster_box_to_page_bbox(
2327        image,
2328        0,
2329        0,
2330        gray.width(),
2331        first_h.max(1),
2332        gray.width().max(1),
2333        gray.height().max(1),
2334    )?;
2335    let font_size = (bbox.height() * 0.55).clamp(10.0, 16.0);
2336    Some(TextChunk {
2337        value: caption_text,
2338        bbox,
2339        font_name: "OCR".to_string(),
2340        font_size,
2341        font_weight: 700.0,
2342        italic_angle: 0.0,
2343        font_color: "#000000".to_string(),
2344        contrast_ratio: 21.0,
2345        symbol_ends: Vec::new(),
2346        text_format: TextFormat::Normal,
2347        text_type: TextType::Regular,
2348        pdf_layer: PdfLayer::Content,
2349        ocg_visible: true,
2350        index: None,
2351        page_number: image.bbox.page_number,
2352        level: None,
2353        mcid: None,
2354    })
2355}
2356
2357fn recover_bordered_raster_table(image_path: &Path, image: &ImageChunk) -> Option<TableBorder> {
2358    let gray = image::open(image_path).ok()?.to_luma8();
2359    recover_bordered_raster_table_from_gray(&gray, image)
2360}
2361
2362fn recover_bordered_raster_table_from_gray(
2363    gray: &GrayImage,
2364    image: &ImageChunk,
2365) -> Option<TableBorder> {
2366    let grid = detect_bordered_raster_grid(gray)?;
2367    let num_cols = grid.vertical_lines.len().checked_sub(1)?;
2368    let num_rows = grid.horizontal_lines.len().checked_sub(1)?;
2369    if num_cols < 2 || num_rows < 2 {
2370        return None;
2371    }
2372    let table_bbox = raster_box_to_page_bbox(
2373        image,
2374        *grid.vertical_lines.first()?,
2375        *grid.horizontal_lines.first()?,
2376        *grid.vertical_lines.last()?,
2377        *grid.horizontal_lines.last()?,
2378        gray.width(),
2379        gray.height(),
2380    )?;
2381
2382    let x_coordinates = raster_boundaries_to_page(
2383        &grid.vertical_lines,
2384        image.bbox.left_x,
2385        image.bbox.right_x,
2386        gray.width(),
2387    )?;
2388    let y_coordinates = raster_boundaries_to_page_desc(
2389        &grid.horizontal_lines,
2390        image.bbox.bottom_y,
2391        image.bbox.top_y,
2392        gray.height(),
2393    )?;
2394
2395    if !bordered_grid_has_cell_ink(gray, &grid) {
2396        return None;
2397    }
2398
2399    let mut rows = Vec::with_capacity(num_rows);
2400    let mut non_empty_cells = 0usize;
2401    let mut rows_with_text = 0usize;
2402    let mut total_cells = 0usize;
2403    let mut whole_table_buckets =
2404        collect_bordered_table_ocr_buckets(gray, &grid, num_rows, num_cols)
2405            .unwrap_or_else(|| vec![Vec::new(); num_rows * num_cols]);
2406    let allow_per_cell_fallback =
2407        num_rows.saturating_mul(num_cols) <= MAX_BORDERED_TABLE_PER_CELL_FALLBACK_CELLS;
2408    for row_idx in 0..num_rows {
2409        let row_bbox = BoundingBox::new(
2410            image.bbox.page_number,
2411            image.bbox.left_x,
2412            y_coordinates[row_idx + 1],
2413            image.bbox.right_x,
2414            y_coordinates[row_idx],
2415        );
2416        let mut cells = Vec::with_capacity(num_cols);
2417        let mut row_has_text = false;
2418
2419        for col_idx in 0..num_cols {
2420            let x1 = grid.vertical_lines[col_idx];
2421            let x2 = grid.vertical_lines[col_idx + 1];
2422            let y1 = grid.horizontal_lines[row_idx];
2423            let y2 = grid.horizontal_lines[row_idx + 1];
2424            let cell_bbox = BoundingBox::new(
2425                image.bbox.page_number,
2426                x_coordinates[col_idx],
2427                y_coordinates[row_idx + 1],
2428                x_coordinates[col_idx + 1],
2429                y_coordinates[row_idx],
2430            );
2431            let bucket_idx = row_idx * num_cols + col_idx;
2432            let text = if let Some(parts) = whole_table_buckets.get_mut(bucket_idx) {
2433                if parts.is_empty() {
2434                    String::new()
2435                } else {
2436                    parts.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
2437                    let raw = parts
2438                        .iter()
2439                        .map(|(_, _, text)| text.as_str())
2440                        .collect::<Vec<_>>()
2441                        .join(" ");
2442                    normalize_raster_cell_text(row_idx, col_idx, raw)
2443                }
2444            } else {
2445                String::new()
2446            };
2447            let text = if text.is_empty() && allow_per_cell_fallback {
2448                extract_raster_cell_text(gray, row_idx, col_idx, x1, y1, x2, y2).unwrap_or_default()
2449            } else {
2450                text
2451            };
2452            total_cells += 1;
2453
2454            let mut content = Vec::new();
2455            if !text.is_empty() {
2456                row_has_text = true;
2457                non_empty_cells += 1;
2458                content.push(TableToken {
2459                    base: TextChunk {
2460                        value: text,
2461                        bbox: cell_bbox.clone(),
2462                        font_name: "OCR".to_string(),
2463                        font_size: (cell_bbox.height() * 0.55).max(6.0),
2464                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
2465                        italic_angle: 0.0,
2466                        font_color: "#000000".to_string(),
2467                        contrast_ratio: 21.0,
2468                        symbol_ends: Vec::new(),
2469                        text_format: TextFormat::Normal,
2470                        text_type: TextType::Regular,
2471                        pdf_layer: PdfLayer::Content,
2472                        ocg_visible: true,
2473                        index: None,
2474                        page_number: image.bbox.page_number,
2475                        level: None,
2476                        mcid: None,
2477                    },
2478                    token_type: TableTokenType::Text,
2479                });
2480            }
2481
2482            cells.push(TableBorderCell {
2483                bbox: cell_bbox,
2484                index: None,
2485                level: None,
2486                row_number: row_idx,
2487                col_number: col_idx,
2488                row_span: 1,
2489                col_span: 1,
2490                content,
2491                contents: Vec::new(),
2492                semantic_type: None,
2493            });
2494        }
2495
2496        if row_has_text {
2497            rows_with_text += 1;
2498        }
2499
2500        rows.push(TableBorderRow {
2501            bbox: row_bbox,
2502            index: None,
2503            level: None,
2504            row_number: row_idx,
2505            cells,
2506            semantic_type: None,
2507        });
2508    }
2509
2510    if total_cells == 0 {
2511        return None;
2512    }
2513    let text_cell_ratio = non_empty_cells as f64 / total_cells as f64;
2514    if text_cell_ratio < MIN_RASTER_TABLE_TEXT_CELL_RATIO
2515        || rows_with_text < MIN_RASTER_TABLE_ROWS_WITH_TEXT
2516    {
2517        return None;
2518    }
2519
2520    Some(TableBorder {
2521        bbox: table_bbox,
2522        index: None,
2523        level: None,
2524        x_coordinates: x_coordinates.clone(),
2525        x_widths: vec![0.0; x_coordinates.len()],
2526        y_coordinates: y_coordinates.clone(),
2527        y_widths: vec![0.0; y_coordinates.len()],
2528        rows,
2529        num_rows,
2530        num_columns: num_cols,
2531        is_bad_table: false,
2532        is_table_transformer: true,
2533        previous_table: None,
2534        next_table: None,
2535    })
2536}
2537
2538fn collect_bordered_table_ocr_buckets(
2539    gray: &GrayImage,
2540    grid: &RasterTableGrid,
2541    num_rows: usize,
2542    num_cols: usize,
2543) -> Option<Vec<Vec<(u32, u32, String)>>> {
2544    if num_rows == 0 || num_cols == 0 {
2545        return None;
2546    }
2547
2548    let bordered = expand_white_border(gray, TABLE_RASTER_OCR_BORDER_PX);
2549    let scaled = image::imageops::resize(
2550        &bordered,
2551        bordered.width() * OCR_SCALE_FACTOR,
2552        bordered.height() * OCR_SCALE_FACTOR,
2553        image::imageops::FilterType::Lanczos3,
2554    );
2555    let words = run_tesseract_tsv_words_best(&scaled, &["6", "11"], |_| true)?;
2556    if words.is_empty() || looks_like_chart_label_ocr(&words) {
2557        return None;
2558    }
2559
2560    let mut buckets = vec![Vec::new(); num_rows * num_cols];
2561    let scale = f64::from(OCR_SCALE_FACTOR);
2562    let border = f64::from(TABLE_RASTER_OCR_BORDER_PX);
2563
2564    for word in words {
2565        let cx_scaled = f64::from(word.left) + f64::from(word.width) / 2.0;
2566        let cy_scaled = f64::from(word.top) + f64::from(word.height) / 2.0;
2567
2568        let cx = cx_scaled / scale - border;
2569        let cy = cy_scaled / scale - border;
2570        if cx < 0.0 || cy < 0.0 {
2571            continue;
2572        }
2573
2574        let cx = match u32::try_from(cx.round() as i64) {
2575            Ok(value) => value,
2576            Err(_) => continue,
2577        };
2578        let cy = match u32::try_from(cy.round() as i64) {
2579            Ok(value) => value,
2580            Err(_) => continue,
2581        };
2582
2583        let col_idx = grid
2584            .vertical_lines
2585            .windows(2)
2586            .position(|span| cx >= span[0] && cx < span[1]);
2587        let row_idx = grid
2588            .horizontal_lines
2589            .windows(2)
2590            .position(|span| cy >= span[0] && cy < span[1]);
2591        let (Some(row_idx), Some(col_idx)) = (row_idx, col_idx) else {
2592            continue;
2593        };
2594
2595        buckets[row_idx * num_cols + col_idx].push((cy, cx, word.text));
2596    }
2597
2598    Some(buckets)
2599}
2600
2601fn is_obvious_bar_chart_raster(gray: &GrayImage) -> bool {
2602    let width = gray.width();
2603    let height = gray.height();
2604    if width < 160 || height < 120 {
2605        return false;
2606    }
2607
2608    let min_ink_pixels = (f64::from(width) * 0.35).ceil() as u32;
2609    let min_run_height = (height / 80).max(6);
2610    let wide_ink_row_runs = merge_runs(
2611        (0..height)
2612            .filter(|&y| count_ink_in_row(gray, y, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels),
2613    );
2614    let thick_runs = wide_ink_row_runs
2615        .into_iter()
2616        .filter(|(start, end)| end.saturating_sub(*start) + 1 >= min_run_height)
2617        .count();
2618
2619    thick_runs >= 3 || is_obvious_vertical_bar_chart_raster(gray)
2620}
2621
2622fn is_obvious_vertical_bar_chart_raster(gray: &GrayImage) -> bool {
2623    let width = gray.width();
2624    let height = gray.height();
2625    if width < 160 || height < 120 {
2626        return false;
2627    }
2628
2629    let min_ink_pixels = (f64::from(height) * 0.08).ceil() as u32;
2630    let min_bar_width = (width / 28).max(10);
2631    let min_bar_height = (height / 8).max(16);
2632    let max_baseline_delta = (height / 14).max(8);
2633    let min_fill_ratio = 0.10;
2634
2635    let candidate_runs =
2636        merge_runs((0..width).filter(|&x| {
2637            count_ink_in_column(gray, x, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels
2638        }));
2639    let mut baselines = Vec::new();
2640    let mut has_dominant_bar = false;
2641    let mut qualifying_bars = 0usize;
2642
2643    for (start, end) in candidate_runs {
2644        let run_width = end.saturating_sub(start) + 1;
2645        if run_width < min_bar_width {
2646            continue;
2647        }
2648
2649        let mut top = height;
2650        let mut bottom = 0u32;
2651        let mut ink_pixels = 0usize;
2652        for x in start..=end {
2653            for y in 0..height {
2654                if gray.get_pixel(x, y).0[0] < RASTER_CHART_INK_THRESHOLD {
2655                    top = top.min(y);
2656                    bottom = bottom.max(y);
2657                    ink_pixels += 1;
2658                }
2659            }
2660        }
2661
2662        if top >= height || bottom <= top {
2663            continue;
2664        }
2665
2666        let run_height = bottom.saturating_sub(top) + 1;
2667        if run_height < min_bar_height {
2668            continue;
2669        }
2670
2671        let bbox_area = run_width as usize * run_height as usize;
2672        if bbox_area == 0 {
2673            continue;
2674        }
2675
2676        let fill_ratio = ink_pixels as f64 / bbox_area as f64;
2677        if fill_ratio < min_fill_ratio {
2678            continue;
2679        }
2680
2681        qualifying_bars += 1;
2682        if run_width >= min_bar_width.saturating_mul(2) {
2683            has_dominant_bar = true;
2684        }
2685        baselines.push(bottom);
2686    }
2687
2688    if baselines.len() < 2 {
2689        return false;
2690    }
2691
2692    baselines.sort_unstable();
2693    let median_baseline = baselines[baselines.len() / 2];
2694    let aligned_baselines = baselines
2695        .iter()
2696        .filter(|baseline| baseline.abs_diff(median_baseline) <= max_baseline_delta)
2697        .count();
2698
2699    aligned_baselines >= 2 && (has_dominant_bar || (qualifying_bars >= 4 && aligned_baselines >= 4))
2700}
2701
2702/// Return true when the image appears to be a natural photograph rather than a
2703/// synthetic chart, diagram, or scanned document page.
2704///
2705/// Photographs have a broadly distributed pixel histogram — many mid-tone pixels
2706/// (neither pure white nor pure black).  Synthetic images (charts, tables,
2707/// diagrams) consist mostly of a white background (~255) with sparse dark ink
2708/// (~0-50).  We classify an image as photographic when either at least 30% of
2709/// its pixels fall in the mid-tone band [40, 215], or when a bright image still
2710/// shows photo-like tonal diversity via a wide histogram support and high
2711/// entropy. Numeric table recovery should be skipped for photographic images
2712/// because OCR'd annotation labels (axis ticks, caption fragments) are not table
2713/// data.
2714fn is_natural_photograph_raster(gray: &GrayImage) -> bool {
2715    let total = (gray.width() * gray.height()) as usize;
2716    if total < 400 {
2717        return false;
2718    }
2719
2720    let mut histogram = [0usize; 256];
2721    for pixel in gray.pixels() {
2722        histogram[pixel[0] as usize] += 1;
2723    }
2724
2725    let mid_tone_count: usize = histogram[40..=215].iter().sum();
2726    if mid_tone_count * 10 >= total * 3 {
2727        return true;
2728    }
2729
2730    let mut coarse_histogram = [0usize; 16];
2731    for (value, count) in histogram.iter().enumerate() {
2732        coarse_histogram[value / 16] += count;
2733    }
2734
2735    let occupied_bins = coarse_histogram
2736        .iter()
2737        .filter(|count| **count as f64 >= total as f64 * 0.01)
2738        .count();
2739    let entropy = coarse_histogram.iter().fold(0.0, |acc, count| {
2740        if *count == 0 {
2741            return acc;
2742        }
2743        let probability = *count as f64 / total as f64;
2744        acc - probability * probability.log2()
2745    });
2746
2747    mid_tone_count as f64 / total as f64 >= MIN_BRIGHT_PHOTO_MID_TONE_RATIO
2748        && occupied_bins >= MIN_BRIGHT_PHOTO_HISTOGRAM_BINS
2749        && entropy >= MIN_BRIGHT_PHOTO_ENTROPY
2750}
2751
2752/// Return true for dark UI or video-player screenshots that are visually rich
2753/// but not document tables.
2754fn is_dark_ui_screenshot_raster(gray: &GrayImage) -> bool {
2755    let total = (gray.width() * gray.height()) as usize;
2756    if total < 400 {
2757        return false;
2758    }
2759
2760    let very_dark_count = gray.pixels().filter(|p| p[0] <= 39).count();
2761    let non_extreme_count = gray.pixels().filter(|p| p[0] >= 15 && p[0] <= 240).count();
2762    let bright_detail_count = gray.pixels().filter(|p| p[0] >= 180 && p[0] <= 245).count();
2763
2764    very_dark_count * 20 >= total * 13
2765        && non_extreme_count * 2 >= total
2766        && bright_detail_count * 20 >= total
2767}
2768
2769fn bordered_grid_has_cell_ink(gray: &GrayImage, grid: &RasterTableGrid) -> bool {
2770    let num_cols = match grid.vertical_lines.len().checked_sub(1) {
2771        Some(value) => value,
2772        None => return false,
2773    };
2774    let num_rows = match grid.horizontal_lines.len().checked_sub(1) {
2775        Some(value) => value,
2776        None => return false,
2777    };
2778    if num_cols == 0 || num_rows == 0 {
2779        return false;
2780    }
2781
2782    let mut total_cells = 0usize;
2783    let mut inked_cells = 0usize;
2784    let mut rows_with_ink = 0usize;
2785
2786    for row_idx in 0..num_rows {
2787        let mut row_has_ink = false;
2788        for col_idx in 0..num_cols {
2789            total_cells += 1;
2790            let x1 = grid.vertical_lines[col_idx];
2791            let x2 = grid.vertical_lines[col_idx + 1];
2792            let y1 = grid.horizontal_lines[row_idx];
2793            let y2 = grid.horizontal_lines[row_idx + 1];
2794
2795            let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
2796            let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
2797            let crop_left = x1 + inset_x;
2798            let crop_top = y1 + inset_y;
2799            let crop_width = x2.saturating_sub(x1 + inset_x * 2);
2800            let crop_height = y2.saturating_sub(y1 + inset_y * 2);
2801            if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
2802                continue;
2803            }
2804
2805            let dark_pixels = (crop_top..crop_top + crop_height)
2806                .flat_map(|y| (crop_left..crop_left + crop_width).map(move |x| (x, y)))
2807                .filter(|&(x, y)| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
2808                .count();
2809            let area = (crop_width as usize) * (crop_height as usize);
2810            if area == 0 {
2811                continue;
2812            }
2813
2814            let dark_ratio = dark_pixels as f64 / area as f64;
2815            if dark_ratio >= MIN_BORDERED_CELL_DARK_RATIO {
2816                inked_cells += 1;
2817                row_has_ink = true;
2818            }
2819        }
2820        if row_has_ink {
2821            rows_with_ink += 1;
2822        }
2823    }
2824
2825    if total_cells == 0 {
2826        return false;
2827    }
2828
2829    (inked_cells as f64 / total_cells as f64) >= MIN_BORDERED_INKED_CELL_RATIO
2830        && rows_with_ink >= MIN_BORDERED_ROWS_WITH_INK
2831}
2832
2833fn detect_bordered_raster_grid(gray: &GrayImage) -> Option<RasterTableGrid> {
2834    let mut best_grid: Option<(RasterTableGrid, f64)> = None;
2835    for variant in build_ocr_variants(gray) {
2836        let Some((grid, score)) = detect_bordered_raster_grid_single(&variant) else {
2837            continue;
2838        };
2839        match &best_grid {
2840            Some((_, best_score)) if *best_score >= score => {}
2841            _ => best_grid = Some((grid, score)),
2842        }
2843    }
2844    best_grid.map(|(grid, _)| grid)
2845}
2846
2847fn detect_bordered_raster_grid_single(gray: &GrayImage) -> Option<(RasterTableGrid, f64)> {
2848    let width = gray.width();
2849    let height = gray.height();
2850    if width < 100 || height < 80 {
2851        return None;
2852    }
2853
2854    let min_vertical_dark = (f64::from(height) * MIN_LINE_DARK_RATIO).ceil() as u32;
2855    let min_horizontal_dark = (f64::from(width) * MIN_LINE_DARK_RATIO).ceil() as u32;
2856
2857    let vertical_runs =
2858        merge_runs((0..width).filter(|&x| count_dark_in_column(gray, x) >= min_vertical_dark));
2859    let horizontal_runs =
2860        merge_runs((0..height).filter(|&y| count_dark_in_row(gray, y) >= min_horizontal_dark));
2861    if vertical_runs.len() < MIN_BORDERED_VERTICAL_LINES
2862        || horizontal_runs.len() < MIN_BORDERED_HORIZONTAL_LINES
2863    {
2864        return None;
2865    }
2866
2867    let mut vertical_lines: Vec<u32> = vertical_runs
2868        .into_iter()
2869        .map(|(start, end)| (start + end) / 2)
2870        .collect();
2871    let mut horizontal_lines: Vec<u32> = horizontal_runs
2872        .into_iter()
2873        .map(|(start, end)| (start + end) / 2)
2874        .collect();
2875
2876    let (&rough_min_x, &rough_max_x) = vertical_lines.first().zip(vertical_lines.last())?;
2877    let (&rough_min_y, &rough_max_y) = horizontal_lines.first().zip(horizontal_lines.last())?;
2878    if rough_max_x <= rough_min_x || rough_max_y <= rough_min_y {
2879        return None;
2880    }
2881
2882    vertical_lines.retain(|&x| {
2883        dark_ratio_in_column(gray, x, rough_min_y, rough_max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY
2884    });
2885    horizontal_lines.retain(|&y| {
2886        dark_ratio_in_row(gray, y, rough_min_x, rough_max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY
2887    });
2888    if vertical_lines.len() < MIN_BORDERED_VERTICAL_LINES
2889        || horizontal_lines.len() < MIN_BORDERED_HORIZONTAL_LINES
2890    {
2891        return None;
2892    }
2893
2894    if vertical_lines
2895        .windows(2)
2896        .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
2897        || horizontal_lines
2898            .windows(2)
2899            .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
2900    {
2901        return None;
2902    }
2903    if !grid_lines_are_continuous(&vertical_lines, &horizontal_lines, gray) {
2904        return None;
2905    }
2906
2907    let continuity = grid_continuity_score(&vertical_lines, &horizontal_lines, gray);
2908    let line_score = vertical_lines.len() as f64 + horizontal_lines.len() as f64;
2909    let score = continuity * 100.0 + line_score;
2910
2911    Some((
2912        RasterTableGrid {
2913            vertical_lines,
2914            horizontal_lines,
2915        },
2916        score,
2917    ))
2918}
2919
2920fn grid_lines_are_continuous(
2921    vertical_lines: &[u32],
2922    horizontal_lines: &[u32],
2923    gray: &GrayImage,
2924) -> bool {
2925    let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2926        return false;
2927    };
2928    let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2929        return false;
2930    };
2931    if max_x <= min_x || max_y <= min_y {
2932        return false;
2933    }
2934
2935    vertical_lines
2936        .iter()
2937        .all(|&x| dark_ratio_in_column(gray, x, min_y, max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2938        && horizontal_lines
2939            .iter()
2940            .all(|&y| dark_ratio_in_row(gray, y, min_x, max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2941}
2942
2943fn grid_continuity_score(
2944    vertical_lines: &[u32],
2945    horizontal_lines: &[u32],
2946    gray: &GrayImage,
2947) -> f64 {
2948    let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2949        return 0.0;
2950    };
2951    let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2952        return 0.0;
2953    };
2954    if max_x <= min_x || max_y <= min_y {
2955        return 0.0;
2956    }
2957
2958    let mut samples = 0usize;
2959    let mut sum = 0.0;
2960    for &x in vertical_lines {
2961        sum += dark_ratio_in_column(gray, x, min_y, max_y);
2962        samples += 1;
2963    }
2964    for &y in horizontal_lines {
2965        sum += dark_ratio_in_row(gray, y, min_x, max_x);
2966        samples += 1;
2967    }
2968    if samples == 0 {
2969        0.0
2970    } else {
2971        sum / samples as f64
2972    }
2973}
2974
2975fn count_dark_in_column(gray: &GrayImage, x: u32) -> u32 {
2976    count_ink_in_column(gray, x, RASTER_DARK_THRESHOLD)
2977}
2978
2979fn count_ink_in_column(gray: &GrayImage, x: u32, threshold: u8) -> u32 {
2980    (0..gray.height())
2981        .filter(|&y| gray.get_pixel(x, y).0[0] < threshold)
2982        .count() as u32
2983}
2984
2985fn count_dark_in_row(gray: &GrayImage, y: u32) -> u32 {
2986    count_ink_in_row(gray, y, RASTER_DARK_THRESHOLD)
2987}
2988
2989fn count_ink_in_row(gray: &GrayImage, y: u32, threshold: u8) -> u32 {
2990    (0..gray.width())
2991        .filter(|&x| gray.get_pixel(x, y).0[0] < threshold)
2992        .count() as u32
2993}
2994
2995fn dark_ratio_in_column(gray: &GrayImage, x: u32, y1: u32, y2: u32) -> f64 {
2996    if y2 <= y1 || x >= gray.width() {
2997        return 0.0;
2998    }
2999    let dark = (y1..=y2)
3000        .filter(|&y| y < gray.height() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
3001        .count();
3002    dark as f64 / f64::from(y2 - y1 + 1)
3003}
3004
3005fn dark_ratio_in_row(gray: &GrayImage, y: u32, x1: u32, x2: u32) -> f64 {
3006    if x2 <= x1 || y >= gray.height() {
3007        return 0.0;
3008    }
3009    let dark = (x1..=x2)
3010        .filter(|&x| x < gray.width() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
3011        .count();
3012    dark as f64 / f64::from(x2 - x1 + 1)
3013}
3014
/// Group a sequence of indices into inclusive runs of consecutive values.
///
/// A value extends the current run only when it is exactly one greater than
/// the previous value; anything else (a gap, a repeat, a smaller value) closes
/// the current run and starts a new one. Returns the runs as `(start, end)`
/// pairs in input order; an empty input yields an empty vector.
fn merge_runs(values: impl Iterator<Item = u32>) -> Vec<(u32, u32)> {
    let mut runs = Vec::new();
    let mut current: Option<(u32, u32)> = None;

    for value in values {
        current = match current {
            // `checked_add` keeps a run ending at `u32::MAX` from overflowing.
            Some((start, end)) if end.checked_add(1) == Some(value) => Some((start, value)),
            Some(finished) => {
                runs.push(finished);
                Some((value, value))
            }
            None => Some((value, value)),
        };
    }

    // Flush the run that was still open when the input ended.
    runs.extend(current);
    runs
}
3041
/// Turn column centres (in image pixels) into page-space column boundaries.
///
/// The result starts with `left_edge` and ends with `right_edge`; in between
/// there is one boundary per pair of neighbouring centres, placed at their
/// midpoint and scaled from `[0, image_width]` pixels onto the page span.
/// Boundaries are clamped so the sequence never decreases and never exceeds
/// `right_edge`. With fewer than two centres, a zero image width, or an empty
/// page span, only the two outer edges are returned.
fn build_boundaries_from_centers(
    centers: &[f64],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Vec<f64> {
    let degenerate = centers.len() < 2 || image_width == 0 || right_edge <= left_edge;
    if degenerate {
        return vec![left_edge, right_edge.max(left_edge)];
    }

    let page_width = right_edge - left_edge;
    let width_px = f64::from(image_width);

    let mut boundaries = Vec::with_capacity(centers.len() + 1);
    boundaries.push(left_edge);

    // `floor` is the previous boundary; later boundaries may not fall below it.
    let mut floor = left_edge;
    for (current, next) in centers.iter().zip(&centers[1..]) {
        let midpoint_px = ((current + next) / 2.0).clamp(0.0, width_px);
        let projected = left_edge + midpoint_px / width_px * page_width;
        floor = projected.clamp(floor, right_edge);
        boundaries.push(floor);
    }

    boundaries.push(right_edge);
    boundaries
}
3068
/// Build row boundaries from a list of row extents.
///
/// Each row is a pair of coordinates. The result starts with the first
/// coordinate of the first row, ends with the second coordinate of the last
/// row, and places one boundary halfway between the second coordinate of each
/// row and the first coordinate of the next.
///
/// Returns an empty vector when `rows` is empty.
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let (Some(first), Some(last)) = (rows.first(), rows.last()) else {
        return Vec::new();
    };

    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    boundaries.extend(
        rows.windows(2)
            .map(|pair| (pair[0].1 + pair[1].0) / 2.0),
    );
    boundaries.push(last.1);
    boundaries
}
3078
/// Map raster line positions (pixels) onto an ascending page axis.
///
/// Pixel `0` maps to `left_edge` and pixel `image_width` maps to `right_edge`.
/// Returns `None` when `image_width` is zero.
fn raster_boundaries_to_page(
    lines: &[u32],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Option<Vec<f64>> {
    if image_width == 0 {
        return None;
    }

    let units_per_pixel = (right_edge - left_edge) / f64::from(image_width);
    let mut page_positions = Vec::with_capacity(lines.len());
    for &line in lines {
        page_positions.push(left_edge + f64::from(line) * units_per_pixel);
    }
    Some(page_positions)
}
3096
/// Map raster line positions (pixels) onto a descending page axis.
///
/// Pixel `0` maps to `top_edge` and pixel `image_height` maps to
/// `bottom_edge`, so larger pixel values yield smaller page coordinates.
/// Returns `None` when `image_height` is zero.
fn raster_boundaries_to_page_desc(
    lines: &[u32],
    bottom_edge: f64,
    top_edge: f64,
    image_height: u32,
) -> Option<Vec<f64>> {
    if image_height == 0 {
        return None;
    }

    let page_height = top_edge - bottom_edge;
    let height_px = f64::from(image_height);
    let mut page_positions = Vec::with_capacity(lines.len());
    for &line in lines {
        page_positions.push(top_edge - f64::from(line) / height_px * page_height);
    }
    Some(page_positions)
}
3114
3115fn raster_box_to_page_bbox(
3116    image: &ImageChunk,
3117    x1: u32,
3118    y1: u32,
3119    x2: u32,
3120    y2: u32,
3121    image_width: u32,
3122    image_height: u32,
3123) -> Option<BoundingBox> {
3124    if x2 <= x1 || y2 <= y1 || image_width == 0 || image_height == 0 {
3125        return None;
3126    }
3127    let left_x = image.bbox.left_x + image.bbox.width() * (f64::from(x1) / f64::from(image_width));
3128    let right_x = image.bbox.left_x + image.bbox.width() * (f64::from(x2) / f64::from(image_width));
3129    let top_y = image.bbox.top_y - image.bbox.height() * (f64::from(y1) / f64::from(image_height));
3130    let bottom_y =
3131        image.bbox.top_y - image.bbox.height() * (f64::from(y2) / f64::from(image_height));
3132    Some(BoundingBox::new(
3133        image.bbox.page_number,
3134        left_x,
3135        bottom_y,
3136        right_x,
3137        top_y,
3138    ))
3139}
3140
3141fn extract_raster_cell_text(
3142    gray: &GrayImage,
3143    row_idx: usize,
3144    col_idx: usize,
3145    x1: u32,
3146    y1: u32,
3147    x2: u32,
3148    y2: u32,
3149) -> Option<String> {
3150    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
3151    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
3152    let crop_left = x1 + inset_x;
3153    let crop_top = y1 + inset_y;
3154    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
3155    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
3156    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
3157        return Some(String::new());
3158    }
3159
3160    let cropped = gray
3161        .view(crop_left, crop_top, crop_width, crop_height)
3162        .to_image();
3163    let bordered = expand_white_border(&cropped, 12);
3164    let scaled = image::imageops::resize(
3165        &bordered,
3166        bordered.width() * OCR_SCALE_FACTOR,
3167        bordered.height() * OCR_SCALE_FACTOR,
3168        image::imageops::FilterType::Lanczos3,
3169    );
3170    let psm_modes: [&str; 3] = if row_idx == 0 {
3171        ["6", "11", "7"]
3172    } else {
3173        ["7", "6", "11"]
3174    };
3175    let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
3176    Some(normalize_raster_cell_text(row_idx, col_idx, raw_text))
3177}
3178
3179fn expand_white_border(image: &GrayImage, border: u32) -> GrayImage {
3180    let mut expanded = GrayImage::from_pixel(
3181        image.width() + border * 2,
3182        image.height() + border * 2,
3183        Luma([255]),
3184    );
3185    for y in 0..image.height() {
3186        for x in 0..image.width() {
3187            expanded.put_pixel(x + border, y + border, *image.get_pixel(x, y));
3188        }
3189    }
3190    expanded
3191}
3192
3193fn run_tesseract_tsv_words(image: &GrayImage, psm: &str) -> Option<Vec<OcrWord>> {
3194    match selected_ocr_engine() {
3195        OcrEngine::RapidOcr => run_rapidocr_words(image),
3196        OcrEngine::Tesseract => run_tesseract_tsv_words_with_oem(image, psm, "3"),
3197    }
3198}
3199
3200fn run_tesseract_tsv_words_with_oem(
3201    image: &GrayImage,
3202    psm: &str,
3203    oem: &str,
3204) -> Option<Vec<OcrWord>> {
3205    let temp_dir = create_temp_dir(0).ok()?;
3206    let image_path = temp_dir.join("ocr.png");
3207    if image.save(&image_path).is_err() {
3208        let _ = fs::remove_dir_all(&temp_dir);
3209        return None;
3210    }
3211
3212    let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3213    let output = Command::new("tesseract")
3214        .current_dir(&temp_dir)
3215        .arg("ocr.png")
3216        .arg("stdout")
3217        // Tell Tesseract the actual DPI of the scaled image so its character-size
3218        // models are correctly calibrated (avoids ~72 DPI guess).
3219        .arg("--dpi")
3220        .arg(&dpi)
3221        .arg("--oem")
3222        .arg(oem)
3223        .arg("--psm")
3224        .arg(psm)
3225        // Disable word-frequency and system dictionaries: table cells contain
3226        // numeric codes, abbreviations, and domain-specific tokens that the
3227        // dictionary would "correct" into wrong English words.
3228        .arg("-c")
3229        .arg("load_system_dawg=0")
3230        .arg("-c")
3231        .arg("load_freq_dawg=0")
3232        .arg("tsv")
3233        .output()
3234        .ok()?;
3235    let _ = fs::remove_dir_all(&temp_dir);
3236    if !output.status.success() {
3237        return None;
3238    }
3239
3240    let tsv = String::from_utf8_lossy(&output.stdout);
3241    Some(parse_tesseract_tsv(&tsv))
3242}
3243
3244fn run_tesseract_cell_text_best(image: &GrayImage, psm_modes: &[&str]) -> Option<String> {
3245    let mut best: Option<(String, f64)> = None;
3246
3247    if matches!(selected_ocr_engine(), OcrEngine::Tesseract) {
3248        // First pass: collect consensus words across Tesseract perspectives.
3249        let consensus_words = collect_consensus_words(image, psm_modes);
3250        if !consensus_words.is_empty() {
3251            let text = words_to_plain_line_text(&consensus_words);
3252            if !text.is_empty() {
3253                let score = score_ocr_words(&consensus_words, image.width(), image.height());
3254                best = Some((text, score));
3255            }
3256        }
3257    }
3258
3259    // Fallback: standard best-variant approach if no consensus words found
3260    if best.is_none() {
3261        for variant in build_ocr_variants(image) {
3262            for psm in psm_modes {
3263                let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
3264                    continue;
3265                };
3266                if words.is_empty() {
3267                    continue;
3268                }
3269                let text = words_to_plain_line_text(&words);
3270                if text.is_empty() {
3271                    continue;
3272                }
3273                let score = score_ocr_words(&words, variant.width(), variant.height());
3274                match &best {
3275                    Some((_, best_score)) if *best_score >= score => {}
3276                    _ => best = Some((text, score)),
3277                }
3278
3279                if let Some(text) = run_tesseract_plain_text_with_variant(&variant, psm) {
3280                    let norm_len = normalize_text(&text).len() as f64;
3281                    if norm_len > 0.0 {
3282                        match &best {
3283                            Some((_, best_score)) if *best_score >= norm_len => {}
3284                            _ => best = Some((text, norm_len)),
3285                        }
3286                    }
3287                }
3288            }
3289
3290            // Docling-inspired multi-engine path: when RapidOCR is available,
3291            // treat it as an additional OCR engine candidate rather than a hard
3292            // replacement. This keeps Tesseract's stronger word-level geometry
3293            // while allowing a modern detector/recognizer to win on difficult cells.
3294            if let Some(words) = run_rapidocr_words(&variant) {
3295                let text = words_to_plain_line_text(&words);
3296                if !text.is_empty() {
3297                    let score = score_ocr_words(&words, variant.width(), variant.height());
3298                    match &best {
3299                        Some((_, best_score)) if *best_score >= score => {}
3300                        _ => best = Some((text, score)),
3301                    }
3302                }
3303            }
3304        }
3305    }
3306
3307    best.map(|(text, _)| text)
3308}
3309
3310fn collect_consensus_words(image: &GrayImage, psm_modes: &[&str]) -> Vec<OcrWord> {
3311    let variants = build_ocr_variants(image);
3312
3313    // Collect words per (PSM, OEM) perspective. A "perspective" is an independent
3314    // Tesseract configuration; preprocessing variants are replicates of the same
3315    // perspective (same segmentation model, same language model).
3316    //
3317    // First-principles rationale:
3318    //   A real word should be detected by the correct PSM regardless of which
3319    //   preprocessed image variant is used. So consensus = "word appears under
3320    //   ≥2 distinct (PSM, OEM) combinations", NOT "≥25% of (variant×PSM×OEM)".
3321    //   The percentage approach breaks as more variants are added: threshold
3322    //   rises and real words get filtered out.
3323
3324    let oems = ["1", "3"]; // OEM 1 = legacy Tesseract; OEM 3 = LSTM neural
3325
3326    // For each (PSM, OEM) pair, keep the best-confidence word seen in any variant.
3327    let mut perspective_best: HashMap<(String, String, String), OcrWord> = HashMap::new();
3328
3329    for variant in &variants {
3330        for psm in psm_modes {
3331            for oem in oems {
3332                let Some(words) = run_tesseract_tsv_words_with_oem(variant, psm, oem) else {
3333                    continue;
3334                };
3335                for word in words {
3336                    let key = (psm.to_string(), oem.to_string(), word.text.to_lowercase());
3337                    perspective_best
3338                        .entry(key)
3339                        .and_modify(|best| {
3340                            if word.confidence > best.confidence {
3341                                *best = word.clone();
3342                            }
3343                        })
3344                        .or_insert(word);
3345                }
3346            }
3347        }
3348    }
3349
3350    // Count distinct (PSM, OEM) perspectives in which each word text appears.
3351    // Threshold: at least 2 independent configurations must agree.
3352    const MIN_PERSPECTIVES: usize = 2;
3353
3354    let mut text_to_perspectives: HashMap<String, HashSet<(String, String)>> = HashMap::new();
3355    for (psm, oem, norm_text) in perspective_best.keys() {
3356        text_to_perspectives
3357            .entry(norm_text.clone())
3358            .or_default()
3359            .insert((psm.clone(), oem.clone()));
3360    }
3361
3362    // Return the best-confidence word for each text that meets the threshold.
3363    let mut consensus: Vec<OcrWord> = text_to_perspectives
3364        .iter()
3365        .filter(|(_, perspectives)| perspectives.len() >= MIN_PERSPECTIVES)
3366        .filter_map(|(norm_text, _)| {
3367            perspective_best
3368                .iter()
3369                .filter(|((_, _, t), _)| t == norm_text)
3370                .max_by(|(_, a), (_, b)| {
3371                    a.confidence
3372                        .partial_cmp(&b.confidence)
3373                        .unwrap_or(std::cmp::Ordering::Equal)
3374                })
3375                .map(|(_, w)| w.clone())
3376        })
3377        .collect();
3378
3379    consensus.sort_by_key(|w| (w.top, w.left));
3380    consensus
3381}
3382
3383fn filter_words_by_spatial_coherence(words: &[OcrWord]) -> Vec<OcrWord> {
3384    if words.len() <= 1 {
3385        return words.to_vec();
3386    }
3387
3388    // First-principles thresholds derived from the actual character height in this
3389    // image — fully agnostic to DPI and scale factor. Typography conventions:
3390    //   • Word spacing within a line ≈ 0.25–0.33 em (em = cap height ≈ word.height)
3391    //   • A gap larger than 3 em between words on the same Tesseract line is
3392    //     almost certainly a segmentation error, not a legitimate space.
3393    //   • A single-word line smaller than 0.4 em wide is glyph noise.
3394    let median_h: u32 = {
3395        let mut heights: Vec<u32> = words.iter().map(|w| w.height.max(1)).collect();
3396        heights.sort_unstable();
3397        heights[heights.len() / 2]
3398    };
3399    // Gap beyond which two adjacent words on the same line are considered disjoint
3400    let gap_threshold = (median_h * 3).max(8);
3401    // Width below which a word on its own line looks like a glyph artifact
3402    let narrow_threshold = (median_h / 2).max(4);
3403    // Minimum bounding box for a credible isolated single-line word
3404    let min_iso_width = (median_h * 2 / 5).max(4);
3405    let min_iso_height = (median_h * 2 / 5).max(3);
3406
3407    // Split words into lines
3408    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
3409    for word in words {
3410        by_line.entry(word.line_key).or_default().push(word);
3411    }
3412
3413    let mut filtered = Vec::new();
3414
3415    // For each line, filter out isolated words that are far from neighbors
3416    for line_words in by_line.values_mut() {
3417        if line_words.len() <= 1 {
3418            // Single-word lines: only keep if reasonably sized (not a stray pixel)
3419            if let Some(word) = line_words.first() {
3420                if word.width >= min_iso_width && word.height >= min_iso_height {
3421                    filtered.push((*word).clone());
3422                }
3423            }
3424            continue;
3425        }
3426
3427        line_words.sort_by_key(|word| word.left);
3428
3429        // Check spatial coherence within each word-to-word transition
3430        for (i, word) in line_words.iter().enumerate() {
3431            let is_isolated = if i > 0 {
3432                let prev = line_words[i - 1];
3433                let gap = word
3434                    .left
3435                    .saturating_sub(prev.left.saturating_add(prev.width));
3436                gap > gap_threshold && word.width < narrow_threshold
3437            } else if i < line_words.len() - 1 {
3438                let next = line_words[i + 1];
3439                let gap = next
3440                    .left
3441                    .saturating_sub(word.left.saturating_add(word.width));
3442                gap > gap_threshold && word.width < narrow_threshold
3443            } else {
3444                false
3445            };
3446
3447            if !is_isolated {
3448                filtered.push((*word).clone());
3449            }
3450        }
3451    }
3452
3453    filtered
3454}
3455
3456fn cluster_words_by_proximity(words: &[OcrWord], gap_tolerance: u32) -> Vec<Vec<OcrWord>> {
3457    if words.is_empty() {
3458        return Vec::new();
3459    }
3460
3461    let mut sorted_words = words.to_vec();
3462    sorted_words.sort_by_key(|w| (w.top, w.left));
3463
3464    // Vertical tolerance: two words are on the "same line" when their top edges
3465    // differ by less than half the median word height. This is typographically
3466    // correct: legitimate multi-word lines share a common baseline ± leading / 2.
3467    let median_h: i32 = {
3468        let mut heights: Vec<u32> = sorted_words.iter().map(|w| w.height.max(1)).collect();
3469        heights.sort_unstable();
3470        heights[heights.len() / 2] as i32
3471    };
3472    let vertical_tolerance = (median_h / 2).max(2);
3473
3474    let mut clusters: Vec<Vec<OcrWord>> = Vec::new();
3475    let mut current_cluster = vec![sorted_words[0].clone()];
3476
3477    for word in &sorted_words[1..] {
3478        if let Some(last) = current_cluster.last() {
3479            let vertical_gap = (word.top as i32 - last.top as i32).abs();
3480            let horizontal_gap = word
3481                .left
3482                .saturating_sub(last.left.saturating_add(last.width));
3483
3484            if vertical_gap <= vertical_tolerance && horizontal_gap <= gap_tolerance {
3485                current_cluster.push(word.clone());
3486            } else {
3487                clusters.push(current_cluster);
3488                current_cluster = vec![word.clone()];
3489            }
3490        }
3491    }
3492
3493    if !current_cluster.is_empty() {
3494        clusters.push(current_cluster);
3495    }
3496
3497    clusters
3498}
3499
3500fn words_to_plain_line_text(words: &[OcrWord]) -> String {
3501    // Apply spatial coherence filtering to remove isolated artifacts
3502    let filtered_words = filter_words_by_spatial_coherence(words);
3503
3504    if filtered_words.is_empty() {
3505        return String::new();
3506    }
3507
3508    // Cluster words by spatial proximity with adaptive gap tolerance
3509    let avg_word_width =
3510        filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
3511    let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
3512    let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
3513
3514    let mut lines: Vec<String> = Vec::new();
3515    for cluster in clusters {
3516        let mut sorted_cluster = cluster;
3517        sorted_cluster.sort_by_key(|w| w.left);
3518
3519        let line = sorted_cluster
3520            .iter()
3521            .map(|word| word.text.as_str())
3522            .collect::<Vec<_>>()
3523            .join(" ")
3524            .trim()
3525            .to_string();
3526
3527        if !line.is_empty() {
3528            lines.push(line);
3529        }
3530    }
3531
3532    lines.join(" ")
3533}
3534
3535/// Apply common Tesseract OCR character corrections for table content.
3536///
3537/// Only applies corrections that are safe in all contexts — targeting
3538/// purely numeric tokens where digit/letter confusion is certain.
3539fn run_tesseract_tsv_words_best<F>(
3540    image: &GrayImage,
3541    psm_modes: &[&str],
3542    accept: F,
3543) -> Option<Vec<OcrWord>>
3544where
3545    F: Fn(&[OcrWord]) -> bool,
3546{
3547    let variants = build_ocr_variants(image);
3548    let mut best: Option<OcrCandidateScore> = None;
3549
3550    for variant in variants {
3551        for psm in psm_modes {
3552            let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
3553                continue;
3554            };
3555            if !accept(&words) {
3556                continue;
3557            }
3558            let score = score_ocr_words(&words, variant.width(), variant.height());
3559            match &best {
3560                Some(current) if current.score >= score => {}
3561                _ => {
3562                    best = Some(OcrCandidateScore { words, score });
3563                }
3564            }
3565        }
3566    }
3567
3568    best.map(|candidate| candidate.words)
3569}
3570
3571fn score_ocr_words(words: &[OcrWord], width: u32, height: u32) -> f64 {
3572    if words.is_empty() || width == 0 || height == 0 {
3573        return 0.0;
3574    }
3575
3576    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
3577    let mut alpha_words = 0usize;
3578    let mut area_coverage = 0f64;
3579    let mut vertical_spread_top = height;
3580    let mut vertical_spread_bottom = 0u32;
3581    let mut total_confidence = 0f64;
3582
3583    for word in words {
3584        by_line.entry(word.line_key).or_default().push(word);
3585        if word.text.chars().any(|ch| ch.is_alphabetic()) {
3586            alpha_words += 1;
3587        }
3588        area_coverage += f64::from(word.width.saturating_mul(word.height));
3589        vertical_spread_top = vertical_spread_top.min(word.top);
3590        vertical_spread_bottom = vertical_spread_bottom.max(word.top.saturating_add(word.height));
3591        total_confidence += word.confidence;
3592    }
3593
3594    let line_count = by_line.len() as f64;
3595    let alpha_ratio = alpha_words as f64 / words.len() as f64;
3596    let density = (area_coverage / f64::from(width.saturating_mul(height))).clamp(0.0, 1.0);
3597    let spread = if vertical_spread_bottom > vertical_spread_top {
3598        f64::from(vertical_spread_bottom - vertical_spread_top) / f64::from(height)
3599    } else {
3600        0.0
3601    };
3602    let avg_confidence = total_confidence / words.len() as f64;
3603    // Confidence bonus: normalize 0-100 range to 0-1 bonus multiplier
3604    let confidence_bonus = (avg_confidence / 100.0).clamp(0.0, 1.0);
3605
3606    // Horizontal spread bonus: reward words that span the full cell width
3607    let horizontal_spread = if words.is_empty() {
3608        0.0
3609    } else {
3610        let min_left = words.iter().map(|w| w.left).min().unwrap_or(0);
3611        let max_right = words
3612            .iter()
3613            .map(|w| w.left + w.width)
3614            .max()
3615            .unwrap_or(width);
3616        f64::from(max_right.saturating_sub(min_left)) / f64::from(width)
3617    };
3618
3619    words.len() as f64
3620        + line_count * 1.5
3621        + alpha_ratio * 6.0
3622        + density * 25.0
3623        + spread * 3.0
3624        + horizontal_spread * 2.0
3625        + confidence_bonus * 5.0 // High-confidence words get a boost
3626}
3627
3628fn build_ocr_variants(gray: &GrayImage) -> Vec<GrayImage> {
3629    vec![
3630        gray.clone(),
3631        contrast_stretch(gray),
3632        global_otsu_binarize(gray),
3633        local_mean_binarize(gray, LOCAL_BINARIZATION_RADIUS),
3634        // Add morphological cleaning as a variant for denoising
3635        morphological_clean(gray),
3636        // Sharpening (unsharp mask) helps Tesseract detect character boundaries on
3637        // blurry / low-DPI cells that survive from low-resolution source PDFs.
3638        unsharp_mask(gray, 1.5),
3639        // Gamma brightening improves contrast for very light ink cells.
3640        gamma_correct(gray, 0.6),
3641    ]
3642}
3643
3644/// Sharpen a grayscale image using an unsharp mask.
3645/// `amount` controls strength (1.5 = moderate). Uses i32 arithmetic throughout
3646/// to avoid u32 underflow when the 3×3 kernel straddles the x=0 or y=0 boundary.
3647fn unsharp_mask(gray: &GrayImage, amount: f32) -> GrayImage {
3648    let width = gray.width() as i32;
3649    let height = gray.height() as i32;
3650    let mut out = GrayImage::new(gray.width(), gray.height());
3651    for y in 0..height {
3652        for x in 0..width {
3653            let mut sum = 0i32;
3654            let mut count = 0i32;
3655            for dy in -1i32..=1 {
3656                for dx in -1i32..=1 {
3657                    let nx = x + dx;
3658                    let ny = y + dy;
3659                    if nx >= 0 && ny >= 0 && nx < width && ny < height {
3660                        sum += gray.get_pixel(nx as u32, ny as u32).0[0] as i32;
3661                        count += 1;
3662                    }
3663                }
3664            }
3665            let blurred = if count > 0 {
3666                sum / count
3667            } else {
3668                gray.get_pixel(x as u32, y as u32).0[0] as i32
3669            };
3670            let original = gray.get_pixel(x as u32, y as u32).0[0] as i32;
3671            let sharpened = original + ((original - blurred) as f32 * amount) as i32;
3672            out.put_pixel(x as u32, y as u32, Luma([sharpened.clamp(0, 255) as u8]));
3673        }
3674    }
3675    out
3676}
3677
3678/// Apply gamma correction to brighten or darken an image.
3679/// gamma < 1.0 brightens (helps see light ink); gamma > 1.0 darkens.
3680fn gamma_correct(gray: &GrayImage, gamma: f32) -> GrayImage {
3681    let mut out = GrayImage::new(gray.width(), gray.height());
3682    for (x, y, pixel) in gray.enumerate_pixels() {
3683        let v = pixel.0[0] as f32 / 255.0;
3684        let corrected = (v.powf(gamma) * 255.0).round() as u8;
3685        out.put_pixel(x, y, Luma([corrected]));
3686    }
3687    out
3688}
3689
3690fn contrast_stretch(gray: &GrayImage) -> GrayImage {
3691    let mut min_val = u8::MAX;
3692    let mut max_val = u8::MIN;
3693    for pixel in gray.pixels() {
3694        let value = pixel.0[0];
3695        min_val = min_val.min(value);
3696        max_val = max_val.max(value);
3697    }
3698
3699    if max_val <= min_val {
3700        return gray.clone();
3701    }
3702
3703    let in_range = (max_val - min_val) as f64;
3704    let mut out = GrayImage::new(gray.width(), gray.height());
3705    for (x, y, pixel) in gray.enumerate_pixels() {
3706        let value = pixel.0[0];
3707        let normalized = ((value.saturating_sub(min_val)) as f64 / in_range * 255.0).round() as u8;
3708        out.put_pixel(x, y, Luma([normalized]));
3709    }
3710    out
3711}
3712
3713fn global_otsu_binarize(gray: &GrayImage) -> GrayImage {
3714    let threshold = otsu_threshold(gray);
3715    let mut out = GrayImage::new(gray.width(), gray.height());
3716    for (x, y, pixel) in gray.enumerate_pixels() {
3717        let value = if pixel.0[0] <= threshold { 0 } else { 255 };
3718        out.put_pixel(x, y, Luma([value]));
3719    }
3720    out
3721}
3722
3723fn otsu_threshold(gray: &GrayImage) -> u8 {
3724    let mut histogram = [0u64; 256];
3725    for pixel in gray.pixels() {
3726        histogram[pixel.0[0] as usize] += 1;
3727    }
3728
3729    let total = (gray.width() as u64) * (gray.height() as u64);
3730    if total == 0 {
3731        return 127;
3732    }
3733
3734    let sum_total: f64 = histogram
3735        .iter()
3736        .enumerate()
3737        .map(|(idx, count)| idx as f64 * *count as f64)
3738        .sum();
3739
3740    let mut sum_background = 0f64;
3741    let mut weight_background = 0f64;
3742    let mut max_variance = -1f64;
3743    let mut best_threshold = 127u8;
3744
3745    for (idx, count) in histogram.iter().enumerate() {
3746        weight_background += *count as f64;
3747        if weight_background <= 0.0 {
3748            continue;
3749        }
3750
3751        let weight_foreground = total as f64 - weight_background;
3752        if weight_foreground <= 0.0 {
3753            break;
3754        }
3755
3756        sum_background += idx as f64 * *count as f64;
3757        let mean_background = sum_background / weight_background;
3758        let mean_foreground = (sum_total - sum_background) / weight_foreground;
3759        let between_class_variance =
3760            weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
3761
3762        if between_class_variance > max_variance {
3763            max_variance = between_class_variance;
3764            best_threshold = idx as u8;
3765        }
3766    }
3767
3768    best_threshold
3769}
3770
3771fn local_mean_binarize(gray: &GrayImage, radius: u32) -> GrayImage {
3772    if gray.width() == 0 || gray.height() == 0 {
3773        return gray.clone();
3774    }
3775
3776    let width = gray.width() as usize;
3777    let height = gray.height() as usize;
3778    let (integral, stride) = integral_image(gray);
3779    let mut out = GrayImage::new(gray.width(), gray.height());
3780
3781    for y in 0..height {
3782        for x in 0..width {
3783            let x1 = x.saturating_sub(radius as usize);
3784            let y1 = y.saturating_sub(radius as usize);
3785            let x2 = (x + radius as usize).min(width - 1);
3786            let y2 = (y + radius as usize).min(height - 1);
3787
3788            let area = (x2 - x1 + 1) * (y2 - y1 + 1);
3789            let sum = region_sum(&integral, stride, x1, y1, x2, y2);
3790            let local_mean = (sum as f64) / (area as f64);
3791            let offset = if area >= MIN_BINARIZATION_BLOCK_PIXELS {
3792                8.0
3793            } else {
3794                4.0
3795            };
3796            let threshold = (local_mean - offset).clamp(0.0, 255.0);
3797
3798            let pixel_value = gray.get_pixel(x as u32, y as u32).0[0] as f64;
3799            let value = if pixel_value <= threshold { 0 } else { 255 };
3800            out.put_pixel(x as u32, y as u32, Luma([value]));
3801        }
3802    }
3803
3804    out
3805}
3806
3807/// Morphological cleaning via dilation then erosion (closing operation)
3808/// Removes small noise and fills small holes in text
3809fn morphological_clean(gray: &GrayImage) -> GrayImage {
3810    if gray.width() == 0 || gray.height() == 0 {
3811        return gray.clone();
3812    }
3813
3814    // First binarize with otsu
3815    let binary = global_otsu_binarize(gray);
3816
3817    // Close operation: dilate then erode (fills small holes, connects broken text)
3818    let dilated = morphological_dilate(&binary, 2);
3819    morphological_erode(&dilated, 2)
3820}
3821
3822fn morphological_dilate(gray: &GrayImage, iterations: u32) -> GrayImage {
3823    let mut result = gray.clone();
3824    for _ in 0..iterations {
3825        let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3826
3827        for y in 1..gray.height().saturating_sub(1) {
3828            for x in 1..gray.width().saturating_sub(1) {
3829                // Check 3x3 neighborhood
3830                let mut has_black = false;
3831                for dy in 0..3 {
3832                    for dx in 0..3 {
3833                        let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3834                        if px < 128 {
3835                            has_black = true;
3836                            break;
3837                        }
3838                    }
3839                    if has_black {
3840                        break;
3841                    }
3842                }
3843                next.put_pixel(x, y, if has_black { Luma([0]) } else { Luma([255]) });
3844            }
3845        }
3846        result = next;
3847    }
3848    result
3849}
3850
3851fn morphological_erode(gray: &GrayImage, iterations: u32) -> GrayImage {
3852    let mut result = gray.clone();
3853    for _ in 0..iterations {
3854        let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3855
3856        for y in 1..gray.height().saturating_sub(1) {
3857            for x in 1..gray.width().saturating_sub(1) {
3858                // Erode black foreground: any white neighbor breaks the stroke,
3859                // otherwise the pixel remains black.
3860                let mut all_black = true;
3861                for dy in 0..3 {
3862                    for dx in 0..3 {
3863                        let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3864                        if px >= 128 {
3865                            all_black = false;
3866                            break;
3867                        }
3868                    }
3869                    if !all_black {
3870                        break;
3871                    }
3872                }
3873                next.put_pixel(x, y, if all_black { Luma([0]) } else { Luma([255]) });
3874            }
3875        }
3876        result = next;
3877    }
3878    result
3879}
3880
3881fn integral_image(gray: &GrayImage) -> (Vec<u64>, usize) {
3882    let width = gray.width() as usize;
3883    let height = gray.height() as usize;
3884    let stride = width + 1;
3885    let mut integral = vec![0u64; (width + 1) * (height + 1)];
3886
3887    for y in 0..height {
3888        let mut row_sum = 0u64;
3889        for x in 0..width {
3890            row_sum += gray.get_pixel(x as u32, y as u32).0[0] as u64;
3891            let idx = (y + 1) * stride + (x + 1);
3892            integral[idx] = integral[y * stride + (x + 1)] + row_sum;
3893        }
3894    }
3895
3896    (integral, stride)
3897}
3898
/// Sum of the pixels in the inclusive rectangle `(x1, y1)..=(x2, y2)`, read
/// from a summed-area table with one leading zero row and column and the given
/// row `stride`.
fn region_sum(integral: &[u64], stride: usize, x1: usize, y1: usize, x2: usize, y2: usize) -> u64 {
    let at = |col: usize, row: usize| integral[row * stride + col];
    let (past_right, past_bottom) = (x2 + 1, y2 + 1);

    // Whole area up to the far corner, plus the doubly-removed near corner,
    // minus the strips above and to the left of the rectangle.
    at(past_right, past_bottom) + at(x1, y1) - at(past_right, y1) - at(x1, past_bottom)
}
3906
/// Recognize `image` as plain text using page-segmentation mode `psm`.
///
/// Thin entry point: forwards both arguments unchanged to
/// `run_tesseract_plain_text_with_variant` and returns its result.
fn run_tesseract_plain_text(image: &GrayImage, psm: &str) -> Option<String> {
    run_tesseract_plain_text_with_variant(image, psm)
}
3910
3911fn run_tesseract_plain_text_with_variant(image: &GrayImage, psm: &str) -> Option<String> {
3912    if matches!(selected_ocr_engine(), OcrEngine::RapidOcr) {
3913        return run_rapidocr_words(image).map(|words| words_to_plain_line_text(&words));
3914    }
3915
3916    let temp_dir = create_temp_dir(0).ok()?;
3917    let image_path = temp_dir.join("ocr.png");
3918    if image.save(&image_path).is_err() {
3919        let _ = fs::remove_dir_all(&temp_dir);
3920        return None;
3921    }
3922
3923    let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3924    let output = Command::new("tesseract")
3925        .current_dir(&temp_dir)
3926        .arg("ocr.png")
3927        .arg("stdout")
3928        .arg("--dpi")
3929        .arg(&dpi)
3930        .arg("--oem")
3931        .arg("3")
3932        .arg("--psm")
3933        .arg(psm)
3934        .arg("-c")
3935        .arg("load_system_dawg=0")
3936        .arg("-c")
3937        .arg("load_freq_dawg=0")
3938        .output()
3939        .ok()?;
3940    let _ = fs::remove_dir_all(&temp_dir);
3941    if !output.status.success() {
3942        return None;
3943    }
3944
3945    Some(
3946        String::from_utf8_lossy(&output.stdout)
3947            .replace('\n', " ")
3948            .split_whitespace()
3949            .collect::<Vec<_>>()
3950            .join(" "),
3951    )
3952}
3953
3954fn words_to_text_chunks(
3955    words: &[OcrWord],
3956    image: &ImageChunk,
3957    text_chunks: &[TextChunk],
3958) -> Vec<TextChunk> {
3959    let mut image_size = (0u32, 0u32);
3960    for word in words {
3961        image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
3962        image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
3963    }
3964    if image_size.0 == 0 || image_size.1 == 0 {
3965        return Vec::new();
3966    }
3967
3968    let mut dedupe: HashMap<String, usize> = HashMap::new();
3969    for chunk in text_chunks {
3970        dedupe.insert(normalize_text(&chunk.value), dedupe.len());
3971    }
3972
3973    let mut recovered = Vec::new();
3974    for word in words {
3975        let normalized = normalize_text(&word.text);
3976        if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
3977            continue;
3978        }
3979
3980        let left_ratio = f64::from(word.left) / f64::from(image_size.0);
3981        let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
3982        let top_ratio = f64::from(word.top) / f64::from(image_size.1);
3983        let bottom_ratio =
3984            f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
3985
3986        let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
3987        let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
3988        let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
3989        let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
3990        if right_x <= left_x || top_y <= bottom_y {
3991            continue;
3992        }
3993
3994        recovered.push(TextChunk {
3995            value: word.text.clone(),
3996            bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
3997            font_name: "OCR".to_string(),
3998            font_size: (top_y - bottom_y).max(6.0),
3999            font_weight: 400.0,
4000            italic_angle: 0.0,
4001            font_color: "#000000".to_string(),
4002            contrast_ratio: 21.0,
4003            symbol_ends: Vec::new(),
4004            text_format: TextFormat::Normal,
4005            text_type: TextType::Regular,
4006            pdf_layer: PdfLayer::Content,
4007            ocg_visible: true,
4008            index: None,
4009            page_number: image.bbox.page_number,
4010            level: None,
4011            mcid: None,
4012        });
4013    }
4014
4015    recovered
4016}
4017
4018fn lines_from_ocr_words(
4019    words: &[OcrWord],
4020    image: &ImageChunk,
4021    image_width: u32,
4022    image_height: u32,
4023    text_chunks: &[TextChunk],
4024) -> Vec<TextChunk> {
4025    if image_width == 0 || image_height == 0 {
4026        return Vec::new();
4027    }
4028
4029    let mut dedupe: HashMap<String, usize> = HashMap::new();
4030    for chunk in text_chunks {
4031        dedupe.insert(normalize_text(&chunk.value), dedupe.len());
4032    }
4033
4034    let spatial_lines = build_spatial_ocr_lines(words);
4035    if spatial_lines.is_empty() {
4036        return Vec::new();
4037    }
4038
4039    let blocks = merge_spatial_ocr_lines_into_blocks(&spatial_lines, image_width);
4040    if blocks.is_empty() {
4041        return Vec::new();
4042    }
4043
4044    let mut recovered = Vec::new();
4045    for block in blocks {
4046        let normalized = normalize_text(&block.text);
4047        if normalized.len() >= 8 && dedupe.contains_key(&normalized) {
4048            continue;
4049        }
4050
4051        if block.right <= block.left || block.bottom <= block.top {
4052            continue;
4053        }
4054
4055        let left_x = image.bbox.left_x
4056            + image.bbox.width() * (f64::from(block.left) / f64::from(image_width));
4057        let right_x = image.bbox.left_x
4058            + image.bbox.width() * (f64::from(block.right) / f64::from(image_width));
4059        let top_y = image.bbox.top_y
4060            - image.bbox.height() * (f64::from(block.top) / f64::from(image_height));
4061        let bottom_y = image.bbox.top_y
4062            - image.bbox.height() * (f64::from(block.bottom) / f64::from(image_height));
4063        if right_x <= left_x || top_y <= bottom_y {
4064            continue;
4065        }
4066
4067        recovered.push(TextChunk {
4068            value: block.text,
4069            bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
4070            font_name: "OCR".to_string(),
4071            font_size: (f64::from(block.line_height_sum) / block.line_count.max(1) as f64).max(6.0),
4072            font_weight: 400.0,
4073            italic_angle: 0.0,
4074            font_color: "#000000".to_string(),
4075            contrast_ratio: 21.0,
4076            symbol_ends: Vec::new(),
4077            text_format: TextFormat::Normal,
4078            text_type: TextType::Regular,
4079            pdf_layer: PdfLayer::Content,
4080            ocg_visible: true,
4081            index: None,
4082            page_number: image.bbox.page_number,
4083            level: None,
4084            mcid: None,
4085        });
4086    }
4087
4088    recovered
4089}
4090
/// A run of OCR text together with its bounding box in OCR-image pixels.
///
/// The same type represents a single clustered line (as built by
/// `build_spatial_ocr_lines`) and a multi-line block accumulated by
/// `merge_spatial_ocr_lines_into_blocks`.
#[derive(Debug, Clone)]
struct SpatialOcrLine {
    /// Smallest `left` of any word covered.
    left: u32,
    /// Smallest `top` of any word covered.
    top: u32,
    /// Largest `left + width` of any word covered.
    right: u32,
    /// Largest `top + height` of any word covered.
    bottom: u32,
    /// Word texts joined with single spaces; merged lines are appended to it.
    text: String,
    /// Number of OCR words contributing to `text`.
    word_count: usize,
    /// Number of lines merged into this entry (1 for a freshly built line).
    line_count: usize,
    /// Sum of the pixel heights of the merged lines; divided by `line_count`
    /// it gives the mean line height.
    line_height_sum: u32,
}
4102
4103fn build_spatial_ocr_lines(words: &[OcrWord]) -> Vec<SpatialOcrLine> {
4104    let filtered_words = filter_words_by_spatial_coherence(words);
4105    if filtered_words.is_empty() {
4106        return Vec::new();
4107    }
4108
4109    let avg_word_width =
4110        filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
4111    let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
4112    let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
4113
4114    let mut lines = Vec::new();
4115    for mut cluster in clusters {
4116        cluster.sort_by_key(|word| word.left);
4117        let text = cluster
4118            .iter()
4119            .map(|word| word.text.as_str())
4120            .collect::<Vec<_>>()
4121            .join(" ")
4122            .trim()
4123            .to_string();
4124        if text.is_empty() {
4125            continue;
4126        }
4127
4128        let left = cluster.iter().map(|word| word.left).min().unwrap_or(0);
4129        let right = cluster
4130            .iter()
4131            .map(|word| word.left.saturating_add(word.width))
4132            .max()
4133            .unwrap_or(0);
4134        let top = cluster.iter().map(|word| word.top).min().unwrap_or(0);
4135        let bottom = cluster
4136            .iter()
4137            .map(|word| word.top.saturating_add(word.height))
4138            .max()
4139            .unwrap_or(0);
4140        if right <= left || bottom <= top {
4141            continue;
4142        }
4143
4144        lines.push(SpatialOcrLine {
4145            left,
4146            top,
4147            right,
4148            bottom,
4149            text,
4150            word_count: cluster.len(),
4151            line_count: 1,
4152            line_height_sum: bottom.saturating_sub(top).max(1),
4153        });
4154    }
4155
4156    lines.sort_by_key(|line| (line.top, line.left));
4157    lines
4158}
4159
4160fn merge_spatial_ocr_lines_into_blocks(
4161    lines: &[SpatialOcrLine],
4162    image_width: u32,
4163) -> Vec<SpatialOcrLine> {
4164    if lines.is_empty() {
4165        return Vec::new();
4166    }
4167
4168    let median_height = {
4169        let mut heights: Vec<u32> = lines
4170            .iter()
4171            .map(|line| line.bottom.saturating_sub(line.top).max(1))
4172            .collect();
4173        heights.sort_unstable();
4174        heights[heights.len() / 2]
4175    };
4176    let vertical_tolerance = (median_height / 2).max(3);
4177    let max_vertical_gap = median_height.saturating_mul(2).max(8);
4178
4179    let mut blocks: Vec<SpatialOcrLine> = Vec::new();
4180    for line in lines {
4181        let merge_idx = blocks.iter().rposition(|block| {
4182            let vertical_gap = line.top.saturating_sub(block.bottom);
4183            if vertical_gap > max_vertical_gap {
4184                return false;
4185            }
4186            if line.top + vertical_tolerance < block.bottom {
4187                return false;
4188            }
4189
4190            spatial_lines_share_block_geometry(block, line, image_width, median_height)
4191        });
4192
4193        if let Some(merge_idx) = merge_idx {
4194            let block = &mut blocks[merge_idx];
4195            block.left = block.left.min(line.left);
4196            block.top = block.top.min(line.top);
4197            block.right = block.right.max(line.right);
4198            block.bottom = block.bottom.max(line.bottom);
4199            block.word_count += line.word_count;
4200            block.line_count += line.line_count;
4201            block.line_height_sum = block.line_height_sum.saturating_add(line.line_height_sum);
4202            if !block.text.ends_with('-') {
4203                block.text.push(' ');
4204            }
4205            block.text.push_str(&line.text);
4206            continue;
4207        }
4208
4209        blocks.push(line.clone());
4210    }
4211
4212    blocks
4213        .into_iter()
4214        .filter_map(|mut block| {
4215            block.text = block.text.split_whitespace().collect::<Vec<_>>().join(" ");
4216            let alphabetic = block.text.chars().filter(|ch| ch.is_alphabetic()).count();
4217            let min_chars = if block.word_count >= 4 { 10 } else { 16 };
4218            if block.text.len() < min_chars || alphabetic < 4 {
4219                return None;
4220            }
4221            Some(block)
4222        })
4223        .collect()
4224}
4225
4226fn spatial_lines_share_block_geometry(
4227    upper: &SpatialOcrLine,
4228    lower: &SpatialOcrLine,
4229    image_width: u32,
4230    median_height: u32,
4231) -> bool {
4232    let overlap_left = upper.left.max(lower.left);
4233    let overlap_right = upper.right.min(lower.right);
4234    let overlap = overlap_right.saturating_sub(overlap_left);
4235    let upper_width = upper.right.saturating_sub(upper.left).max(1);
4236    let lower_width = lower.right.saturating_sub(lower.left).max(1);
4237    let min_width = upper_width.min(lower_width);
4238    let max_width = upper_width.max(lower_width);
4239    let overlap_ratio = overlap as f64 / min_width as f64;
4240    let width_ratio = min_width as f64 / max_width as f64;
4241    let max_left_shift = ((f64::from(image_width) * 0.045).round() as u32)
4242        .max(median_height.saturating_mul(2))
4243        .max(8);
4244    let left_shift = upper.left.abs_diff(lower.left);
4245
4246    overlap_ratio >= 0.40
4247        || (overlap_ratio >= 0.15 && left_shift <= max_left_shift && width_ratio >= 0.55)
4248}
4249
/// Returns `true` when `text` contains at least one ASCII digit (`0`-`9`).
fn is_numeric_like(text: &str) -> bool {
    text.contains(|ch: char| ch.is_ascii_digit())
}
4253
/// Reduce `text` to a comparison key: every non-alphanumeric character is
/// dropped and the remaining characters are lowercased.
fn normalize_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            key.extend(ch.to_lowercase());
        }
    }
    key
}
4260
/// Tidy an OCR'd caption: rewrite `CarolinaBLUTM` as `CarolinaBLU™`, collapse
/// a doubled trademark sign after `CarolinaBLU` into one, and trim surrounding
/// whitespace.
fn normalize_caption_text(text: &str) -> String {
    let with_symbol = text.replace("CarolinaBLUTM", "CarolinaBLU™");
    let single_symbol = with_symbol.replace("CarolinaBLU™™", "CarolinaBLU™");
    single_symbol.trim().to_owned()
}
4267
/// Clean up the OCR text of a single raster-table cell.
///
/// Steps, in order:
/// 1. Replace known misreads (`|` → space, `—` → `-`, `AorB` / `Aor B` →
///    `A or B`, `H,O` → `H2O`) and collapse whitespace.
/// 2. For rows after the first (`row_idx > 0`), blank the cell when it has no
///    ASCII digit and is at most 2 bytes long, or when it consists solely of
///    the characters `O`, `o`, `S`, `B`.
/// 3. Rewrite the unit misreads ` ywL`, ` yuL`, ` yL`, ` wL`, ` uL`, ` pL` as
///    ` μL` and trim.
///
/// `_col_idx` is accepted but not used.
fn normalize_raster_cell_text(row_idx: usize, _col_idx: usize, text: String) -> String {
    const GLYPH_FIXES: [(&str, &str); 5] = [
        ("|", " "),
        ("—", "-"),
        ("AorB", "A or B"),
        ("Aor B", "A or B"),
        ("H,O", "H2O"),
    ];
    const MICROLITRE_MISREADS: [&str; 6] = [" ywL", " yuL", " yL", " wL", " uL", " pL"];

    let mut repaired = text;
    for &(wrong, right) in GLYPH_FIXES.iter() {
        repaired = repaired.replace(wrong, right);
    }
    let mut cell = repaired.split_whitespace().collect::<Vec<_>>().join(" ");

    if row_idx > 0 {
        let has_digit = cell.chars().any(|ch| ch.is_ascii_digit());
        if !has_digit && cell.len() <= 2 {
            return String::new();
        }
        if cell.chars().all(|ch| matches!(ch, 'O' | 'o' | 'S' | 'B')) {
            return String::new();
        }
    }

    for misread in MICROLITRE_MISREADS.iter() {
        cell = cell.replace(misread, " μL");
    }

    cell.trim().to_string()
}
4300
/// Create a scratch directory under the system temp directory and return its
/// path.
///
/// The directory name is
/// `edgeparse-raster-ocr-<process id>-<page_number>-<nanoseconds since the Unix epoch>`;
/// the timestamp falls back to 0 if the system clock reads earlier than the
/// epoch. The caller is responsible for removing the directory.
///
/// # Errors
///
/// Returns the underlying I/O error when the directory cannot be created.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    let nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let pid = std::process::id();

    let mut dir = std::env::temp_dir();
    dir.push(format!("edgeparse-raster-ocr-{pid}-{page_number}-{nanos}"));

    fs::create_dir_all(&dir)?;
    Ok(dir)
}
4315
4316fn extract_visible_page_image_files(
4317    input_path: &Path,
4318    page_number: u32,
4319    temp_dir: &Path,
4320) -> Option<Vec<PathBuf>> {
4321    let list_output = Command::new("pdfimages")
4322        .arg("-f")
4323        .arg(page_number.to_string())
4324        .arg("-l")
4325        .arg(page_number.to_string())
4326        .arg("-list")
4327        .arg(input_path)
4328        .output()
4329        .ok()?;
4330    if !list_output.status.success() {
4331        return None;
4332    }
4333
4334    let entries = parse_pdfimages_list(&String::from_utf8_lossy(&list_output.stdout));
4335    let visible_indices: Vec<usize> = entries
4336        .iter()
4337        .enumerate()
4338        .filter_map(|(idx, entry)| (entry.image_type == "image").then_some(idx))
4339        .collect();
4340    if visible_indices.is_empty() {
4341        return Some(Vec::new());
4342    }
4343
4344    let prefix = temp_dir.join("img");
4345    let status = Command::new("pdfimages")
4346        .arg("-f")
4347        .arg(page_number.to_string())
4348        .arg("-l")
4349        .arg(page_number.to_string())
4350        .arg("-png")
4351        .arg(input_path)
4352        .arg(&prefix)
4353        .status()
4354        .ok()?;
4355    if !status.success() {
4356        return None;
4357    }
4358
4359    let mut image_files: Vec<PathBuf> = fs::read_dir(temp_dir)
4360        .ok()?
4361        .filter_map(|entry| entry.ok().map(|e| e.path()))
4362        .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
4363        .collect();
4364    image_files.sort();
4365
4366    let visible_files: Vec<PathBuf> = visible_indices
4367        .into_iter()
4368        .filter_map(|idx| image_files.get(idx).cloned())
4369        .collect();
4370    Some(visible_files)
4371}
4372
4373fn parse_pdfimages_list(output: &str) -> Vec<PdfImagesListEntry> {
4374    let mut entries = Vec::new();
4375    let mut in_rows = false;
4376
4377    for line in output.lines() {
4378        let trimmed = line.trim();
4379        if trimmed.is_empty() {
4380            continue;
4381        }
4382        if trimmed.starts_with("---") {
4383            in_rows = true;
4384            continue;
4385        }
4386        if !in_rows {
4387            continue;
4388        }
4389
4390        let mut cols = trimmed.split_whitespace();
4391        let Some(_page) = cols.next() else {
4392            continue;
4393        };
4394        let Some(_num) = cols.next() else {
4395            continue;
4396        };
4397        let Some(image_type) = cols.next() else {
4398            continue;
4399        };
4400
4401        entries.push(PdfImagesListEntry {
4402            image_type: image_type.to_string(),
4403        });
4404    }
4405
4406    entries
4407}
4408
4409#[cfg(test)]
4410mod tests {
4411    use super::*;
4412    use image::GrayImage;
4413    use crate::models::enums::{PdfLayer, TextFormat, TextType};
4414
4415    fn image_chunk() -> ImageChunk {
4416        ImageChunk {
4417            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 400.0, 400.0),
4418            index: Some(1),
4419            level: None,
4420        }
4421    }
4422
4423    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
4424        OcrWord {
4425            line_key: line,
4426            left,
4427            top: 0,
4428            width: 40,
4429            height: 12,
4430            text: text.to_string(),
4431            confidence: 90.0,
4432        }
4433    }
4434
4435    fn word_at(line: (u32, u32, u32), left: u32, top: u32, width: u32, text: &str) -> OcrWord {
4436        OcrWord {
4437            line_key: line,
4438            left,
4439            top,
4440            width,
4441            height: 12,
4442            text: text.to_string(),
4443            confidence: 90.0,
4444        }
4445    }
4446
4447    fn text_chunk(value: &str, bbox: BoundingBox) -> TextChunk {
4448        TextChunk {
4449            value: value.to_string(),
4450            bbox,
4451            font_name: "Helvetica".to_string(),
4452            font_size: 12.0,
4453            font_weight: 400.0,
4454            italic_angle: 0.0,
4455            font_color: "#000000".to_string(),
4456            contrast_ratio: 21.0,
4457            symbol_ends: Vec::new(),
4458            text_format: TextFormat::Normal,
4459            text_type: TextType::Regular,
4460            pdf_layer: PdfLayer::Main,
4461            ocg_visible: true,
4462            index: None,
4463            page_number: Some(1),
4464            level: None,
4465            mcid: None,
4466        }
4467    }
4468
4469    fn test_cell_text(cell: &TableBorderCell) -> String {
4470        cell.content
4471            .iter()
4472            .map(|token| token.base.value.trim())
4473            .filter(|value| !value.is_empty())
4474            .collect::<Vec<_>>()
4475            .join(" ")
4476    }
4477
4478    #[test]
4479    fn test_table_like_ocr_detects_repeated_columns() {
4480        let words = vec![
4481            word((1, 1, 1), 10, "Temperature"),
4482            word((1, 1, 1), 120, "Viscosity"),
4483            word((1, 1, 1), 240, "Temperature"),
4484            word((1, 1, 1), 360, "Viscosity"),
4485            word((1, 1, 2), 10, "0"),
4486            word((1, 1, 2), 120, "1.793E-06"),
4487            word((1, 1, 2), 240, "25"),
4488            word((1, 1, 2), 360, "8.930E-07"),
4489            word((1, 1, 3), 10, "1"),
4490            word((1, 1, 3), 120, "1.732E-06"),
4491            word((1, 1, 3), 240, "26"),
4492            word((1, 1, 3), 360, "8.760E-07"),
4493        ];
4494        assert!(!looks_like_chart_label_ocr(&words));
4495        assert!(looks_like_table_ocr(&words));
4496    }
4497
4498    #[test]
4499    fn test_structured_ocr_table_border_recovers_non_numeric_table() {
4500        let image = image_chunk();
4501        let words = vec![
4502            word_at((1, 1, 1), 10, 10, 80, "Tube"),
4503            word_at((1, 1, 1), 145, 10, 110, "Enzyme"),
4504            word_at((1, 1, 1), 305, 10, 70, "DNA"),
4505            word_at((1, 1, 2), 10, 42, 80, "1"),
4506            word_at((1, 1, 2), 145, 42, 110, "BamHI"),
4507            word_at((1, 1, 2), 305, 42, 70, "pUC19"),
4508            word_at((1, 1, 3), 10, 74, 80, "2"),
4509            word_at((1, 1, 3), 145, 74, 110, "HindIII"),
4510            word_at((1, 1, 3), 305, 74, 70, "lambda"),
4511            word_at((1, 1, 4), 10, 106, 80, "3"),
4512            word_at((1, 1, 4), 145, 106, 110, "EcoRI"),
4513            word_at((1, 1, 4), 305, 106, 70, "control"),
4514        ];
4515
4516        assert!(!looks_like_chart_label_ocr(&words));
4517        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
4518        assert_eq!(table.num_columns, 3);
4519        assert_eq!(table.num_rows, 4);
4520        assert_eq!(test_cell_text(&table.rows[0].cells[0]), "Tube");
4521        assert_eq!(test_cell_text(&table.rows[1].cells[1]), "BamHI");
4522        assert_eq!(test_cell_text(&table.rows[3].cells[2]), "control");
4523    }
4524
4525    #[test]
4526    fn test_structured_ocr_table_border_scales_column_boundaries_to_page_bbox() {
4527        let image = ImageChunk {
4528            bbox: BoundingBox::new(Some(1), 56.6929, 163.6519, 555.3071, 442.0069),
4529            index: Some(1),
4530            level: None,
4531        };
4532        let words = vec![
4533            word_at((1, 1, 1), 10, 10, 110, "TempC"),
4534            word_at((1, 1, 1), 255, 10, 150, "KinViscA"),
4535            word_at((1, 1, 1), 520, 10, 110, "TempC"),
4536            word_at((1, 1, 1), 760, 10, 170, "KinViscB"),
4537            word_at((1, 1, 2), 10, 44, 24, "0"),
4538            word_at((1, 1, 2), 255, 44, 130, "1.793E-06"),
4539            word_at((1, 1, 2), 520, 44, 28, "25"),
4540            word_at((1, 1, 2), 760, 44, 130, "8.930E-07"),
4541            word_at((1, 1, 3), 10, 78, 24, "1"),
4542            word_at((1, 1, 3), 255, 78, 130, "1.732E-06"),
4543            word_at((1, 1, 3), 520, 78, 28, "26"),
4544            word_at((1, 1, 3), 760, 78, 130, "8.760E-07"),
4545        ];
4546
4547        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
4548
4549        assert_eq!(table.num_columns, 4);
4550        assert_eq!(table.num_rows, 3);
4551        assert_eq!(test_cell_text(&table.rows[1].cells[1]), "1.793E-06");
4552        assert!(table.x_coordinates.windows(2).all(|pair| pair[1] >= pair[0]));
4553        assert!(table
4554            .x_coordinates
4555            .iter()
4556            .all(|x| *x >= image.bbox.left_x && *x <= image.bbox.right_x));
4557    }
4558
4559    #[test]
4560    fn test_chart_label_ocr_does_not_reject_five_row_table() {
4561        let words = vec![
4562            word_at((1, 1, 1), 10, 10, 80, "Tube"),
4563            word_at((1, 1, 1), 145, 10, 110, "Enzyme"),
4564            word_at((1, 1, 1), 305, 10, 70, "DNA"),
4565            word_at((1, 1, 2), 10, 42, 80, "1"),
4566            word_at((1, 1, 2), 145, 42, 110, "BamHI"),
4567            word_at((1, 1, 2), 305, 42, 70, "pUC19"),
4568            word_at((1, 1, 3), 10, 74, 80, "2"),
4569            word_at((1, 1, 3), 145, 74, 110, "HindIII"),
4570            word_at((1, 1, 3), 305, 74, 70, "lambda"),
4571            word_at((1, 1, 4), 10, 106, 80, "3"),
4572            word_at((1, 1, 4), 145, 106, 110, "EcoRI"),
4573            word_at((1, 1, 4), 305, 106, 70, "control"),
4574            word_at((1, 1, 5), 10, 138, 80, "4"),
4575            word_at((1, 1, 5), 145, 138, 110, "NotI"),
4576            word_at((1, 1, 5), 305, 138, 70, "sample"),
4577        ];
4578
4579        assert!(!looks_like_chart_label_ocr(&words));
4580        assert!(looks_like_table_ocr(&words));
4581    }
4582
4583    #[test]
4584    fn test_structured_ocr_table_border_rejects_two_column_prose_layout() {
4585        let image = image_chunk();
4586        let words = vec![
4587            word_at((1, 1, 1), 10, 10, 90, "Summary"),
4588            word_at((1, 1, 1), 220, 10, 120, "Detailed findings"),
4589            word_at((1, 1, 2), 10, 42, 90, "Background"),
4590            word_at((1, 1, 2), 220, 42, 120, "Additional context"),
4591            word_at((1, 1, 3), 10, 74, 90, "Notes"),
4592            word_at((1, 1, 3), 220, 74, 120, "Further explanation"),
4593        ];
4594
4595        assert!(build_structured_ocr_table_border(&words, &image).is_none());
4596    }
4597
4598    #[test]
4599    fn test_parse_pdfimages_list_ignores_smask_entries() {
4600        let output = "page   num  type   width height color comp bpc  enc interp  object ID x-ppi y-ppi size ratio\n--------------------------------------------------------------------------------------------\n   1     0 image    1320   358  icc     3   8  image  no        46  0   208   208 63.5K 4.6%\n   1     1 smask    1320   358  gray    1   8  image  no        46  0   208   208  483B 0.1%\n";
4601
4602        let entries = parse_pdfimages_list(output);
4603        assert_eq!(entries.len(), 2);
4604        assert_eq!(entries[0].image_type, "image");
4605        assert_eq!(entries[1].image_type, "smask");
4606    }
4607
4608    #[test]
4609    fn test_table_like_ocr_rejects_single_line_caption() {
4610        let words = vec![
4611            word((1, 1, 1), 10, "Figure"),
4612            word((1, 1, 1), 90, "7.2"),
4613            word((1, 1, 1), 150, "Viscosity"),
4614            word((1, 1, 1), 260, "of"),
4615            word((1, 1, 1), 300, "Water"),
4616        ];
4617        assert!(!looks_like_table_ocr(&words));
4618    }
4619
4620    #[test]
4621    fn test_normalize_raster_cell_text_fixes_units_and_artifacts() {
4622        assert_eq!(
4623            normalize_raster_cell_text(1, 1, "3 ywL".to_string()),
4624            "3 μL"
4625        );
4626        assert_eq!(normalize_raster_cell_text(1, 4, "OS".to_string()), "");
4627        assert_eq!(normalize_raster_cell_text(0, 6, "H,O".to_string()), "H2O");
4628    }
4629
4630    #[test]
4631    fn test_detect_bordered_raster_grid_finds_strong_lines() {
4632        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
4633        for x in [10, 40, 80, 110] {
4634            for y in 10..71 {
4635                image.put_pixel(x, y, Luma([0]));
4636            }
4637        }
4638        for y in [10, 30, 50, 70] {
4639            for x in 10..111 {
4640                image.put_pixel(x, y, Luma([0]));
4641            }
4642        }
4643
4644        let grid = detect_bordered_raster_grid(&image).expect("grid");
4645        assert_eq!(grid.vertical_lines.len(), 4);
4646        assert_eq!(grid.horizontal_lines.len(), 4);
4647    }
4648
4649    #[test]
4650    fn test_obvious_bar_chart_raster_is_rejected() {
4651        let mut image = GrayImage::from_pixel(320, 200, Luma([255]));
4652        for &(y1, y2) in &[(25, 40), (70, 85), (115, 130), (160, 175)] {
4653            for y in y1..y2 {
4654                for x in 40..280 {
4655                    image.put_pixel(x, y, Luma([80]));
4656                }
4657            }
4658        }
4659
4660        assert!(is_obvious_bar_chart_raster(&image));
4661    }
4662
4663    #[test]
4664    fn test_vertical_bar_chart_raster_is_rejected() {
4665        let mut image = GrayImage::from_pixel(360, 240, Luma([255]));
4666        for &(x1, x2, y1) in &[
4667            (40, 78, 52),
4668            (92, 126, 118),
4669            (140, 170, 146),
4670            (184, 210, 162),
4671        ] {
4672            for x in x1..x2 {
4673                for y in y1..212 {
4674                    image.put_pixel(x, y, Luma([90]));
4675                }
4676            }
4677        }
4678
4679        assert!(is_obvious_bar_chart_raster(&image));
4680    }
4681
4682    #[test]
4683    fn test_light_fill_vertical_bar_chart_raster_is_rejected() {
4684        let mut image = GrayImage::from_pixel(420, 260, Luma([255]));
4685        for x in 24..396 {
4686            image.put_pixel(x, 222, Luma([170]));
4687        }
4688        for &(x1, x2, y1, shade) in &[
4689            (46, 82, 132, 222),
4690            (104, 140, 84, 214),
4691            (162, 198, 62, 206),
4692            (220, 256, 144, 228),
4693        ] {
4694            for x in x1..x2 {
4695                for y in y1..222 {
4696                    image.put_pixel(x, y, Luma([shade]));
4697                }
4698            }
4699        }
4700
4701        assert!(is_obvious_bar_chart_raster(&image));
4702    }
4703
4704    #[test]
4705    fn test_grouped_vertical_bar_chart_raster_is_rejected() {
4706        let mut image = GrayImage::from_pixel(420, 240, Luma([255]));
4707        for x in 28..392 {
4708            image.put_pixel(x, 214, Luma([175]));
4709        }
4710        for &(x1, x2, y1, shade) in &[
4711            (44, 60, 98, 210),
4712            (64, 80, 140, 225),
4713            (108, 124, 116, 214),
4714            (128, 144, 148, 229),
4715            (172, 188, 88, 206),
4716            (192, 208, 128, 222),
4717            (236, 252, 104, 212),
4718            (256, 272, 156, 228),
4719        ] {
4720            for x in x1..x2 {
4721                for y in y1..214 {
4722                    image.put_pixel(x, y, Luma([shade]));
4723                }
4724            }
4725        }
4726
4727        assert!(is_obvious_bar_chart_raster(&image));
4728    }
4729
4730    #[test]
4731    fn test_natural_photograph_raster_is_detected() {
4732        // Create a photo-like image: wide histogram spread across [20, 230] mid-tones
4733        let w = 100u32;
4734        let h = 100u32;
4735        let mut image = GrayImage::new(w, h);
4736        // Fill with a gradient covering the full range — most pixels will be mid-tone
4737        for y in 0..h {
4738            for x in 0..w {
4739                let v = ((x + y) * 255 / (w + h - 2)) as u8;
4740                image.put_pixel(x, y, Luma([v]));
4741            }
4742        }
4743        // Should be classified as photographic (≥30% mid-tone pixels)
4744        assert!(is_natural_photograph_raster(&image));
4745    }
4746
4747    #[test]
4748    fn test_chart_image_is_not_classified_as_photograph() {
4749        // Chart-like image: mostly white with a few dark lines (no mid-tone content)
4750        let mut image = GrayImage::from_pixel(200, 160, Luma([255]));
4751        // A few thin dark lines (table borders or chart axes)
4752        for x in 20..180 {
4753            image.put_pixel(x, 20, Luma([0]));
4754            image.put_pixel(x, 80, Luma([0]));
4755            image.put_pixel(x, 140, Luma([0]));
4756        }
4757        for y in 20..141 {
4758            image.put_pixel(20, y, Luma([0]));
4759            image.put_pixel(180, y, Luma([0]));
4760        }
4761        // Very few mid-tone pixels — should NOT be classified as photograph
4762        assert!(!is_natural_photograph_raster(&image));
4763        assert!(!is_dark_ui_screenshot_raster(&image));
4764    }
4765
4766    #[test]
4767    fn test_bright_natural_photograph_raster_is_detected() {
4768        let mut image = GrayImage::from_pixel(240, 180, Luma([250]));
4769        for y in 24..148 {
4770            for x in 52..156 {
4771                let tone = 72 + (((x - 52) * 11 + (y - 24) * 7) % 132) as u8;
4772                image.put_pixel(x, y, Luma([tone]));
4773            }
4774        }
4775
4776        assert!(is_natural_photograph_raster(&image));
4777    }
4778
4779    #[test]
4780    fn test_dark_ui_screenshot_raster_is_detected() {
4781        let mut image = GrayImage::from_pixel(260, 180, Luma([20]));
4782        for x in 18..242 {
4783            for y in 18..34 {
4784                image.put_pixel(x, y, Luma([210]));
4785            }
4786        }
4787        for &(x1, y1, x2, y2, shade) in &[
4788            (26, 58, 84, 108, 198),
4789            (94, 58, 152, 108, 210),
4790            (162, 58, 220, 108, 192),
4791            (26, 118, 220, 134, 224),
4792        ] {
4793            for x in x1..x2 {
4794                for y in y1..y2 {
4795                    image.put_pixel(x, y, Luma([shade]));
4796                }
4797            }
4798        }
4799
4800        assert!(is_dark_ui_screenshot_raster(&image));
4801    }
4802
4803    #[test]
4804    fn test_table_like_ocr_rejects_matrix_formula_layout() {
4805        let words = vec![
4806            word_at((1, 1, 1), 14, 10, 36, "B23"),
4807            word_at((1, 1, 1), 160, 10, 22, "C1"),
4808            word_at((1, 1, 1), 230, 10, 22, "C2"),
4809            word_at((1, 1, 1), 300, 10, 22, "C3"),
4810            word_at((1, 1, 2), 20, 44, 24, "0/0"),
4811            word_at((1, 1, 2), 150, 44, 18, "0"),
4812            word_at((1, 1, 2), 220, 44, 28, "001"),
4813            word_at((1, 1, 2), 300, 44, 28, "000"),
4814            word_at((1, 1, 3), 20, 76, 24, "0/1"),
4815            word_at((1, 1, 3), 150, 76, 28, "000"),
4816            word_at((1, 1, 3), 220, 76, 28, "010"),
4817            word_at((1, 1, 3), 300, 76, 28, "000"),
4818        ];
4819
4820        assert!(looks_like_matrix_formula_ocr(&words));
4821        assert!(!looks_like_table_ocr(&words));
4822    }
4823
4824    #[test]
4825    fn test_table_like_ocr_keeps_small_numeric_table_with_real_headers() {
4826        let words = vec![
4827            word_at((1, 1, 1), 10, 10, 64, "Year"),
4828            word_at((1, 1, 1), 130, 10, 28, "Q1"),
4829            word_at((1, 1, 1), 220, 10, 28, "Q2"),
4830            word_at((1, 1, 1), 310, 10, 28, "Q3"),
4831            word_at((1, 1, 2), 10, 42, 64, "2022"),
4832            word_at((1, 1, 2), 130, 42, 24, "10"),
4833            word_at((1, 1, 2), 220, 42, 24, "25"),
4834            word_at((1, 1, 2), 310, 42, 24, "30"),
4835            word_at((1, 1, 3), 10, 74, 64, "2023"),
4836            word_at((1, 1, 3), 130, 74, 24, "11"),
4837            word_at((1, 1, 3), 220, 74, 24, "26"),
4838            word_at((1, 1, 3), 310, 74, 24, "31"),
4839        ];
4840
4841        assert!(!looks_like_matrix_formula_ocr(&words));
4842        assert!(looks_like_table_ocr(&words));
4843    }
4844
4845    #[test]
4846    fn test_matrixish_small_ocr_table_is_rejected_after_build() {
4847        let image = ImageChunk {
4848            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 120.0),
4849            index: Some(1),
4850            level: None,
4851        };
4852        let words = vec![
4853            word_at((1, 1, 1), 14, 10, 36, "B23"),
4854            word_at((1, 1, 1), 160, 10, 22, "C1"),
4855            word_at((1, 1, 1), 230, 10, 22, "C2"),
4856            word_at((1, 1, 1), 300, 10, 22, "C3"),
4857            word_at((1, 1, 2), 20, 44, 24, "0/0"),
4858            word_at((1, 1, 2), 150, 44, 18, "0"),
4859            word_at((1, 1, 2), 220, 44, 28, "001"),
4860            word_at((1, 1, 2), 300, 44, 28, "000"),
4861            word_at((1, 1, 3), 20, 76, 24, "0/1"),
4862            word_at((1, 1, 3), 150, 76, 28, "000"),
4863            word_at((1, 1, 3), 220, 76, 28, "010"),
4864            word_at((1, 1, 3), 300, 76, 28, "000"),
4865        ];
4866
4867        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
4868        assert!(is_matrixish_ocr_artifact_table(&table));
4869    }
4870
4871    #[test]
4872    fn test_small_numeric_table_with_real_headers_is_not_rejected_after_build() {
4873        let image = ImageChunk {
4874            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 140.0),
4875            index: Some(1),
4876            level: None,
4877        };
4878        let words = vec![
4879            word_at((1, 1, 1), 10, 10, 64, "Year"),
4880            word_at((1, 1, 1), 130, 10, 28, "Q1"),
4881            word_at((1, 1, 1), 220, 10, 28, "Q2"),
4882            word_at((1, 1, 1), 310, 10, 28, "Q3"),
4883            word_at((1, 1, 2), 10, 42, 64, "2022"),
4884            word_at((1, 1, 2), 130, 42, 24, "10"),
4885            word_at((1, 1, 2), 220, 42, 24, "25"),
4886            word_at((1, 1, 2), 310, 42, 24, "30"),
4887            word_at((1, 1, 3), 10, 74, 64, "2023"),
4888            word_at((1, 1, 3), 130, 74, 24, "11"),
4889            word_at((1, 1, 3), 220, 74, 24, "26"),
4890            word_at((1, 1, 3), 310, 74, 24, "31"),
4891        ];
4892
4893        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
4894        assert!(!is_matrixish_ocr_artifact_table(&table));
4895    }
4896
4897    #[test]
4898    fn test_bordered_table_raster_is_not_rejected_as_chart() {
4899        let mut image = GrayImage::from_pixel(320, 200, Luma([255]));
4900        for x in [20, 110, 210, 300] {
4901            for y in 20..181 {
4902                image.put_pixel(x, y, Luma([0]));
4903            }
4904        }
4905        for y in [20, 70, 120, 180] {
4906            for x in 20..301 {
4907                image.put_pixel(x, y, Luma([0]));
4908            }
4909        }
4910
4911        assert!(!is_obvious_bar_chart_raster(&image));
4912    }
4913
4914    #[test]
4915    fn test_morphological_erode_preserves_white_background() {
4916        let image = GrayImage::from_fn(9, 9, |x, y| {
4917            if x == 4 || y == 4 {
4918                Luma([0])
4919            } else {
4920                Luma([255])
4921            }
4922        });
4923
4924        let eroded = morphological_erode(&image, 1);
4925
4926        assert_eq!(eroded.get_pixel(0, 0).0[0], 255);
4927        assert_eq!(eroded.get_pixel(8, 8).0[0], 255);
4928        assert_eq!(eroded.get_pixel(4, 4).0[0], 255);
4929    }
4930
4931    #[test]
4932    fn test_dense_prose_image_ocr_detects_infographic_text() {
4933        let mut words = Vec::new();
4934        let mut top = 20;
4935        for line_num in 1..=8 {
4936            for (idx, (left, text)) in [
4937                (20, "Copyright"),
4938                (120, "protects"),
4939                (240, "creative"),
4940                (350, "work"),
4941            ]
4942            .into_iter()
4943            .enumerate()
4944            {
4945                words.push(OcrWord {
4946                    line_key: (1, 1, line_num),
4947                    left,
4948                    top,
4949                    width: 60,
4950                    height: 14,
4951                    confidence: 85.0,
4952                    text: if idx == 0 && line_num % 2 == 0 {
4953                        "Creators".to_string()
4954                    } else {
4955                        text.to_string()
4956                    },
4957                });
4958            }
4959            top += 22;
4960        }
4961
4962        assert!(looks_like_dense_prose_image_ocr(&words));
4963    }
4964
4965    #[test]
4966    fn test_dense_prose_image_ocr_rejects_chart_like_words() {
4967        let words = vec![
4968            word((1, 1, 1), 10, "70.2"),
4969            word((1, 1, 1), 90, "75.6"),
4970            word((1, 1, 1), 170, "92.4"),
4971            word((1, 1, 2), 10, "80.4"),
4972            word((1, 1, 2), 90, "94.2"),
4973            word((1, 1, 2), 170, "95.5"),
4974            word((1, 1, 3), 10, "Company"),
4975            word((1, 1, 3), 90, "A"),
4976            word((1, 1, 3), 170, "B"),
4977            word((1, 1, 4), 10, "Scene"),
4978            word((1, 1, 4), 90, "Document"),
4979            word((1, 1, 5), 10, "65"),
4980            word((1, 1, 5), 90, "70"),
4981            word((1, 1, 5), 170, "75"),
4982            word((1, 1, 6), 10, "80"),
4983            word((1, 1, 6), 90, "85"),
4984            word((1, 1, 6), 170, "90"),
4985            word((1, 1, 7), 10, "95"),
4986            word((1, 1, 7), 90, "100"),
4987        ];
4988
4989        assert!(!looks_like_dense_prose_image_ocr(&words));
4990    }
4991
4992    #[test]
4993    fn test_dense_prose_image_ocr_rejects_scattered_chart_labels() {
4994        let words = vec![
4995            word_at((1, 1, 1), 20, 20, 80, "Participation"),
4996            word_at((1, 1, 1), 120, 20, 70, "of"),
4997            word_at((1, 1, 1), 210, 20, 90, "Institutions"),
4998            word_at((1, 1, 2), 310, 50, 50, "57"),
4999            word_at((1, 1, 2), 380, 50, 60, "(24%)"),
5000            word_at((1, 1, 3), 290, 86, 40, "20"),
5001            word_at((1, 1, 3), 345, 86, 50, "(8%)"),
5002            word_at((1, 1, 4), 80, 124, 120, "Government"),
5003            word_at((1, 1, 4), 260, 124, 90, "Other"),
5004            word_at((1, 1, 4), 360, 124, 60, "State"),
5005            word_at((1, 1, 5), 70, 160, 80, "Civil"),
5006            word_at((1, 1, 5), 170, 160, 80, "Society"),
5007            word_at((1, 1, 5), 280, 160, 110, "Organizations"),
5008            word_at((1, 1, 6), 300, 194, 50, "31"),
5009            word_at((1, 1, 6), 365, 194, 60, "(13%)"),
5010            word_at((1, 1, 7), 35, 228, 120, "Educational"),
5011            word_at((1, 1, 7), 180, 228, 100, "Institution"),
5012            word_at((1, 1, 8), 250, 262, 40, "16"),
5013            word_at((1, 1, 8), 305, 262, 50, "(7%)"),
5014        ];
5015
5016        assert!(looks_like_chart_label_ocr(&words));
5017        assert!(!looks_like_table_ocr(&words));
5018        assert!(!looks_like_dense_prose_image_ocr(&words));
5019    }
5020
5021    #[test]
5022    fn test_chart_label_ocr_detects_stacked_bar_chart_legend_layout() {
5023        let words = vec![
5024            word_at((1, 1, 1), 10, 15, 22, "ano"),
5025            word_at((1, 1, 1), 10, 8, 24, "MW."),
5026            word_at((1, 1, 2), 410, 25, 38, "Waste"),
5027            word_at((1, 1, 2), 452, 25, 55, "materials"),
5028            word_at((1, 1, 3), 11, 38, 21, "350"),
5029            word_at((1, 1, 4), 11, 61, 21, "300"),
5030            word_at((1, 1, 4), 411, 56, 38, "Biogas"),
5031            word_at((1, 1, 5), 7, 79, 25, "250"),
5032            word_at((1, 1, 5), 399, 87, 8, "'™"),
5033            word_at((1, 1, 5), 411, 87, 75, "Construction"),
5034            word_at((1, 1, 5), 490, 86, 33, "wood"),
5035            word_at((1, 1, 5), 527, 87, 35, "waste"),
5036            word_at((1, 1, 6), 11, 106, 21, "200"),
5037            word_at((1, 1, 7), 411, 117, 59, "General"),
5038            word_at((1, 1, 7), 467, 116, 27, "wood"),
5039            word_at((1, 1, 7), 499, 116, 54, "(10MWs)"),
5040            word_at((1, 1, 8), 11, 129, 21, "150"),
5041            word_at((1, 1, 9), 11, 152, 21, "100"),
5042            word_at((1, 1, 9), 399, 148, 7, "="),
5043            word_at((1, 1, 9), 411, 135, 46, "General"),
5044            word_at((1, 1, 9), 464, 135, 27, "wood"),
5045            word_at((1, 1, 9), 498, 146, 56, "(<LOMW)"),
5046            word_at((1, 1, 10), 13, 163, 18, "50"),
5047            word_at((1, 1, 10), 399, 178, 7, "="),
5048            word_at((1, 1, 10), 411, 176, 73, "Unutilised"),
5049            word_at((1, 1, 10), 480, 166, 29, "wood"),
5050            word_at((1, 1, 10), 516, 176, 45, "(2MWs)"),
5051            word_at((1, 1, 11), 24, 197, 7, "o"),
5052            word_at((1, 1, 12), 399, 208, 8, "m="),
5053            word_at((1, 1, 12), 411, 206, 59, "Unutilised"),
5054            word_at((1, 1, 12), 474, 206, 33, "wood"),
5055            word_at((1, 1, 12), 512, 206, 48, "(<2MW)"),
5056            word_at((1, 1, 13), 51, 217, 32, "12-13"),
5057            word_at((1, 1, 13), 96, 217, 28, "2014"),
5058            word_at((1, 1, 13), 139, 217, 28, "2015"),
5059            word_at((1, 1, 13), 182, 217, 28, "2016"),
5060            word_at((1, 1, 13), 225, 217, 28, "2017"),
5061            word_at((1, 1, 13), 268, 217, 28, "2018"),
5062            word_at((1, 1, 13), 311, 217, 28, "2019"),
5063            word_at((1, 1, 13), 354, 217, 28, "2020"),
5064        ];
5065
5066        assert!(looks_like_chart_label_ocr(&words));
5067        assert!(!looks_like_table_ocr(&words));
5068    }
5069
5070    #[test]
5071    fn test_build_numeric_table_border_rejects_sparse_chart_layout() {
5072        let image = image_chunk();
5073        let mut words = Vec::new();
5074        let columns = [20, 55, 90, 125, 160, 195, 230, 265, 300, 335, 370, 405];
5075
5076        for (idx, left) in columns.iter().enumerate() {
5077            words.push(word_at((1, 1, 1), *left, 20, 22, &format!("H{}", idx + 1)));
5078        }
5079        for (idx, left) in [20, 160, 300].into_iter().enumerate() {
5080            words.push(word_at((1, 1, 2), left, 52, 22, &format!("{}", idx + 1)));
5081        }
5082        for (idx, left) in [55, 195, 335].into_iter().enumerate() {
5083            words.push(word_at((1, 1, 3), left, 84, 22, &format!("{}", idx + 4)));
5084        }
5085        for (idx, left) in [90, 230, 370].into_iter().enumerate() {
5086            words.push(word_at((1, 1, 4), left, 116, 22, &format!("{}", idx + 7)));
5087        }
5088        for (idx, left) in columns.iter().enumerate() {
5089            words.push(word_at((1, 1, 5), *left, 148, 22, &format!("{}", idx + 10)));
5090        }
5091
5092        assert!(looks_like_chart_label_ocr(&words));
5093        assert!(!looks_like_table_ocr(&words));
5094        assert!(!looks_like_numeric_table_ocr(&words));
5095        assert!(build_numeric_table_border(&words, &image).is_none());
5096    }
5097
5098    #[test]
5099    fn test_lines_from_ocr_words_merges_wrapped_lines_into_blocks() {
5100        let words = vec![
5101            word_at((1, 1, 1), 20, 20, 64, "Copyright"),
5102            word_at((1, 1, 1), 100, 20, 56, "protects"),
5103            word_at((1, 1, 2), 20, 38, 52, "creative"),
5104            word_at((1, 1, 2), 84, 38, 36, "work"),
5105            word_at((1, 1, 3), 240, 20, 52, "Public"),
5106            word_at((1, 1, 3), 304, 20, 40, "domain"),
5107            word_at((1, 1, 4), 240, 38, 60, "expires"),
5108            word_at((1, 1, 4), 312, 38, 44, "later"),
5109        ];
5110
5111        let recovered = lines_from_ocr_words(&words, &image_chunk(), 400, 400, &[]);
5112
5113        assert_eq!(recovered.len(), 2);
5114        assert_eq!(recovered[0].value, "Copyright protects creative work");
5115        assert_eq!(recovered[1].value, "Public domain expires later");
5116    }
5117
5118    #[test]
        // Draws a synthetic bar chart into a grayscale raster, wraps it in an empty
        // 2x2 `TableBorder`, runs `enrich_empty_table_from_page_raster`, and asserts
        // that every cell's `content` is still empty afterwards.
5119    fn test_page_raster_ocr_skips_bar_chart_tables() {
            // 420x260 canvas filled with luma 255.
5120        let mut chart = GrayImage::from_pixel(420, 260, Luma([255]));
            // One horizontal line on row 222, columns 24..396, luma 170.
5121        for x in 24..396 {
5122            chart.put_pixel(x, 222, Luma([170]));
5123        }
            // Four filled rectangles given as (x1, x2, y1, shade): each covers columns
            // x1..x2 and rows y1..222, i.e. it stops on the row just above the line.
            // NOTE(review): the bar shades (206-228) sit between RASTER_DARK_THRESHOLD
            // (180) and RASTER_CHART_INK_THRESHOLD (240), while the line (170) is below
            // 180 — presumably what the chart detection keys on; confirm in the detector.
5124        for &(x1, x2, y1, shade) in &[
5125            (46, 82, 132, 222),
5126            (104, 140, 84, 214),
5127            (162, 198, 62, 206),
5128            (220, 256, 144, 228),
5129        ] {
5130            for x in x1..x2 {
5131                for y in y1..222 {
5132                    chart.put_pixel(x, y, Luma([shade]));
5133                }
5134            }
5135        }
5136
            // The page box and the table box use identical values, matching the raster
            // dimensions (420 x 260).
            // NOTE(review): `BoundingBox::new` presumably takes (page, left, bottom,
            // right, top) — confirm against the model definition.
5137        let page_bbox = BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0);
5138        let mut table = TableBorder {
5139            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0),
5140            index: None,
5141            level: None,
                // Three grid coordinates per axis describe a 2x2 grid; the y list is
                // written in descending order (260, 130, 0).
5142            x_coordinates: vec![0.0, 210.0, 420.0],
5143            x_widths: vec![0.0; 3],
5144            y_coordinates: vec![260.0, 130.0, 0.0],
5145            y_widths: vec![0.0; 3],
5146            rows: vec![
                    // Row 0 uses the 130.0..260.0 band; both of its cells start empty.
5147                TableBorderRow {
5148                    bbox: BoundingBox::new(Some(1), 0.0, 130.0, 420.0, 260.0),
5149                    index: None,
5150                    level: None,
5151                    row_number: 0,
5152                    cells: vec![
5153                        TableBorderCell {
5154                            bbox: BoundingBox::new(Some(1), 0.0, 130.0, 210.0, 260.0),
5155                            index: None,
5156                            level: None,
5157                            row_number: 0,
5158                            col_number: 0,
5159                            row_span: 1,
5160                            col_span: 1,
5161                            content: Vec::new(),
5162                            contents: Vec::new(),
5163                            semantic_type: None,
5164                        },
5165                        TableBorderCell {
5166                            bbox: BoundingBox::new(Some(1), 210.0, 130.0, 420.0, 260.0),
5167                            index: None,
5168                            level: None,
5169                            row_number: 0,
5170                            col_number: 1,
5171                            row_span: 1,
5172                            col_span: 1,
5173                            content: Vec::new(),
5174                            contents: Vec::new(),
5175                            semantic_type: None,
5176                        },
5177                    ],
5178                    semantic_type: None,
5179                },
                    // Row 1 uses the 0.0..130.0 band; both of its cells start empty.
5180                TableBorderRow {
5181                    bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 130.0),
5182                    index: None,
5183                    level: None,
5184                    row_number: 1,
5185                    cells: vec![
5186                        TableBorderCell {
5187                            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 210.0, 130.0),
5188                            index: None,
5189                            level: None,
5190                            row_number: 1,
5191                            col_number: 0,
5192                            row_span: 1,
5193                            col_span: 1,
5194                            content: Vec::new(),
5195                            contents: Vec::new(),
5196                            semantic_type: None,
5197                        },
5198                        TableBorderCell {
5199                            bbox: BoundingBox::new(Some(1), 210.0, 0.0, 420.0, 130.0),
5200                            index: None,
5201                            level: None,
5202                            row_number: 1,
5203                            col_number: 1,
5204                            row_span: 1,
5205                            col_span: 1,
5206                            content: Vec::new(),
5207                            contents: Vec::new(),
5208                            semantic_type: None,
5209                        },
5210                    ],
5211                    semantic_type: None,
5212                },
5213            ],
5214            num_rows: 2,
5215            num_columns: 2,
5216            is_bad_table: false,
5217            is_table_transformer: true,
5218            previous_table: None,
5219            next_table: None,
5220        };
5221
            // The function under test receives the table mutably, so it could fill
            // cells; the assertion below checks that it did not.
5222        enrich_empty_table_from_page_raster(&chart, &page_bbox, &mut table);
5223
            // Every cell in every row must still have an empty `content` vector.
5224        assert!(table
5225            .rows
5226            .iter()
5227            .flat_map(|row| row.cells.iter())
5228            .all(|cell| cell.content.is_empty()));
5229    }
5230
5231    #[test]
        // Builds a page with one very long text chunk placed away from the table box
        // and one 4-character chunk placed inside it, then checks that the page-wide
        // count exceeds the page limit while the region count is exactly 4.
5232    fn test_native_text_chars_in_region_ignores_distant_page_text() {
5233        let table_bbox = BoundingBox::new(Some(1), 40.0, 120.0, 360.0, 280.0);
            // "A" repeated 40 times more than MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR,
            // with box values 500.0..560.0 that lie outside the table's 120.0..280.0.
            // NOTE(review): `text_chunk` is a helper defined outside this chunk; it is
            // called here with (text, bounding box).
5234        let distant_text = ContentElement::TextChunk(text_chunk(
5235            &"A".repeat(MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR + 40),
5236            BoundingBox::new(Some(1), 40.0, 500.0, 380.0, 560.0),
5237        ));
            // Four characters whose box values (60..100, 160..176) fall within the
            // table box values (40..360, 120..280).
5238        let overlapping_text = ContentElement::TextChunk(text_chunk(
5239            "1234",
5240            BoundingBox::new(Some(1), 60.0, 160.0, 100.0, 176.0),
5241        ));
5242        let elements = vec![distant_text, overlapping_text];
5243
            // Page-wide count is above the limit ...
5244        assert!(page_native_text_chars(&elements) > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR);
            // ... but the region count equals the length of "1234" only.
5245        assert_eq!(native_text_chars_in_region(&elements, &table_bbox), 4);
5246    }
5247
5248    #[test]
        // Builds a 5x5 table in which a single cell holds one short text token and
        // asserts that `table_needs_page_raster_ocr` returns true for it.
5249    fn test_table_needs_page_raster_ocr_for_sparse_partial_table() {
            // Six coordinates per axis with num_rows / num_columns of 5; `rows` starts
            // empty and is populated by the loop below.
5250        let mut table = TableBorder {
5251            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 300.0, 200.0),
5252            index: None,
5253            level: None,
5254            x_coordinates: vec![0.0, 60.0, 120.0, 180.0, 240.0, 300.0],
5255            x_widths: vec![0.0; 6],
5256            y_coordinates: vec![200.0, 160.0, 120.0, 80.0, 40.0, 0.0],
5257            y_widths: vec![0.0; 6],
5258            rows: Vec::new(),
5259            num_rows: 5,
5260            num_columns: 5,
5261            is_bad_table: false,
5262            is_table_transformer: true,
5263            previous_table: None,
5264            next_table: None,
5265        };
5266
            // Adds 5 rows of 5 empty cells. Only row_number / col_number vary: every row
            // reuses the whole-table box values and every cell reuses the same
            // (0, 0, 60, 40) box values, so the fixture carries no per-cell geometry.
5267        for row_idx in 0..5 {
5268            let mut row = TableBorderRow {
5269                bbox: BoundingBox::new(Some(1), 0.0, 0.0, 300.0, 200.0),
5270                index: None,
5271                level: None,
5272                row_number: row_idx,
5273                cells: Vec::new(),
5274                semantic_type: None,
5275            };
5276            for col_idx in 0..5 {
5277                row.cells.push(TableBorderCell {
5278                    bbox: BoundingBox::new(Some(1), 0.0, 0.0, 60.0, 40.0),
5279                    index: None,
5280                    level: None,
5281                    row_number: row_idx,
5282                    col_number: col_idx,
5283                    row_span: 1,
5284                    col_span: 1,
5285                    content: Vec::new(),
5286                    contents: Vec::new(),
5287                    semantic_type: None,
5288                });
5289            }
5290            table.rows.push(row);
5291        }
5292
            // Populate exactly one of the 25 cells with a single two-character text token.
5293        table.rows[0].cells[0].content.push(TableToken {
5294            base: text_chunk("12", BoundingBox::new(Some(1), 0.0, 0.0, 20.0, 10.0)),
5295            token_type: TableTokenType::Text,
5296        });
5297
            // A table with this little content must still be flagged for page raster OCR.
5298        assert!(table_needs_page_raster_ocr(&table));
5299    }
5300
5301    #[test]
        // Passes `lines_from_ocr_words` four OCR words together with a native
        // `TextChunk` whose value is those same four words joined by spaces, and
        // asserts that nothing is recovered.
5302    fn test_lines_from_ocr_words_dedupes_against_native_text() {
            // NOTE(review): `word_at` is defined outside this chunk; its arguments look
            // like a line key tuple, left / top / width values, and the word text —
            // confirm against the helper.
5303        let words = vec![
5304            word_at((1, 1, 1), 20, 20, 64, "Copyright"),
5305            word_at((1, 1, 1), 100, 20, 56, "protects"),
5306            word_at((1, 1, 2), 20, 38, 52, "creative"),
5307            word_at((1, 1, 2), 84, 38, 36, "work"),
5308        ];
            // Hand-built native chunk; its `value` equals the OCR words in order.
            // NOTE(review): its box values (0..10) do not obviously coincide with the
            // OCR word positions above, so the dedupe presumably compares text rather
            // than geometry — confirm in `lines_from_ocr_words`.
5309        let native = vec![TextChunk {
5310            value: "Copyright protects creative work".to_string(),
5311            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 10.0, 10.0),
5312            font_name: "Native".to_string(),
5313            font_size: 12.0,
5314            font_weight: 400.0,
5315            italic_angle: 0.0,
5316            font_color: "#000000".to_string(),
5317            contrast_ratio: 21.0,
5318            symbol_ends: Vec::new(),
5319            text_format: TextFormat::Normal,
5320            text_type: TextType::Regular,
5321            pdf_layer: PdfLayer::Content,
5322            ocg_visible: true,
5323            index: None,
5324            page_number: Some(1),
5325            level: None,
5326            mcid: None,
5327        }];
5328
            // Same call shape as the no-native case, but with the native chunk slice
            // supplied as the last argument.
5329        let recovered = lines_from_ocr_words(&words, &image_chunk(), 400, 400, &native);
5330
            // The OCR text duplicates the native text, so no entry may be produced.
5331        assert!(recovered.is_empty());
5332    }
5333}