Skip to main content

edgeparse_core/pdf/
raster_table_ocr.rs

1//! Recover text signal from raster table images using local OCR.
2
3use std::collections::{BTreeMap, HashMap, HashSet};
4use std::env;
5use std::fs;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8use std::sync::OnceLock;
9use std::time::{SystemTime, UNIX_EPOCH};
10
11use image::{GenericImageView, GrayImage, Luma};
12use serde::Deserialize;
13
14use crate::models::bbox::BoundingBox;
15use crate::models::chunks::{ImageChunk, TextChunk};
16use crate::models::content::ContentElement;
17use crate::models::enums::{PdfLayer, TextFormat, TextType};
18use crate::models::table::{
19    TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
20};
21
// Broaden image eligibility so moderately cropped tables are considered.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.40;
const MIN_IMAGE_AREA_RATIO: f64 = 0.035;
// Native-text caps for OCR candidacy — images already overlaid with plenty of
// real PDF text are left to the normal text pipeline.
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
// Accuracy-first: accept degraded glyphs at lower confidence —
// dual-OEM consensus and spatial coherence filtering will eliminate noise.
const MIN_OCR_WORD_CONFIDENCE: f64 = 6.0;
// Reject artificially-high confidence noise (Tesseract artefacts above 100).
const MAX_OCR_WORD_CONFIDENCE: f64 = 101.0;
// Grayscale cutoffs used by the ink/darkness detectors below.
const RASTER_DARK_THRESHOLD: u8 = 180;
const RASTER_CHART_INK_THRESHOLD: u8 = 240;
// Minimum ruling-line counts for an image to qualify as a bordered table.
const MIN_BORDERED_VERTICAL_LINES: usize = 3;
const MIN_BORDERED_HORIZONTAL_LINES: usize = 3;
// Accuracy-first: lighter lines are still valid table borders.
const MIN_LINE_DARK_RATIO: f64 = 0.28;
// Cell-geometry floor and paddings applied around OCR crops.
const MIN_CELL_SIZE_PX: u32 = 10;
const CELL_INSET_PX: u32 = 5;
const TABLE_RASTER_OCR_BORDER_PX: u32 = 14;
// Typography-grounded scale: pdftoppm renders at PDFTOPPM_DPI (150). Scaling by 2
// gives 300 DPI effective — the Tesseract-documented optimum. At 12pt body text,
// cap height ≈ 25px raw → 50px scaled, squarely in Tesseract's 32-40px sweet spot.
// Over-scaling (×5 = 125px) amplifies anti-aliasing and hurts LSTM segmentation.
const PDFTOPPM_DPI: u32 = 150;
const OCR_SCALE_FACTOR: u32 = 2;
/// Effective DPI seen by Tesseract = PDFTOPPM_DPI × OCR_SCALE_FACTOR.
const TESSERACT_EFFECTIVE_DPI: u32 = PDFTOPPM_DPI * OCR_SCALE_FACTOR;
// "Dominant image" path thresholds: a single large raster that carries the
// page's text content (infographic-style pages).
const MIN_DOMINANT_IMAGE_WIDTH_RATIO: f64 = 0.65;
const MIN_DOMINANT_IMAGE_AREA_RATIO: f64 = 0.40;
const MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE: usize = 80;
const MIN_DOMINANT_IMAGE_OCR_WORDS: usize = 18;
const MIN_DOMINANT_IMAGE_TEXT_LINES: usize = 6;
const MIN_DENSE_PROSE_BLOCK_LINES: usize = 3;
const MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO: f64 = 0.32;
// Permit minor breaks in rasterized lines while still enforcing structure.
const MIN_TRUE_GRID_LINE_CONTINUITY: f64 = 0.60;
// Gates for whole-page raster OCR of empty bordered tables.
const MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR: usize = 180;
const MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR: f64 = 0.08;
const MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR: usize = 24;
// Local (adaptive) binarization window parameters.
const LOCAL_BINARIZATION_RADIUS: u32 = 14;
const MIN_BINARIZATION_BLOCK_PIXELS: usize = 81;
// Handle sparse numeric tables where only a few cells OCR cleanly.
const MIN_RASTER_TABLE_TEXT_CELL_RATIO: f64 = 0.05;
const MIN_RASTER_TABLE_ROWS_WITH_TEXT: usize = 1;
const MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO: f64 = 0.40;
// Ink-based acceptance thresholds for bordered-table detection.
const MIN_BORDERED_CELL_DARK_RATIO: f64 = 0.03;
const MIN_BORDERED_INKED_CELL_RATIO: f64 = 0.18;
const MIN_BORDERED_ROWS_WITH_INK: usize = 2;
// Histogram heuristics that classify bright natural photographs (skipped by
// all OCR paths).
const MIN_BRIGHT_PHOTO_MID_TONE_RATIO: f64 = 0.24;
const MIN_BRIGHT_PHOTO_HISTOGRAM_BINS: usize = 8;
const MIN_BRIGHT_PHOTO_ENTROPY: f64 = 1.6;
73
/// A single OCR word with its pixel-space box inside the OCR'd image.
#[derive(Debug, Clone)]
struct OcrWord {
    /// Key identifying the OCR line this word belongs to. RapidOCR emits a
    /// synthetic `(0, line_idx, 0)`; for Tesseract the tuple comes from
    /// `parse_tesseract_tsv` (outside this view — confirm the field meaning
    /// there, presumably block/paragraph/line numbers).
    line_key: (u32, u32, u32),
    // Word bounding box, in pixels of the OCR input image.
    left: u32,
    top: u32,
    width: u32,
    height: u32,
    /// Recognized text for this word.
    text: String,
    /// Engine confidence. Tesseract's is on a 0-100 scale (see the MIN/MAX
    /// word-confidence thresholds above); RapidOCR's float score is passed
    /// through unchanged.
    confidence: f64,
}
84
/// A cluster of OCR word x-positions.
///
/// NOTE(review): by its fields this looks like a candidate table column built
/// from horizontally aligned words — confirm against the clustering code
/// (outside this view).
#[derive(Debug, Clone)]
struct XCluster {
    /// Representative x-coordinate of the cluster.
    center: f64,
    /// Number of words merged into this cluster.
    count: usize,
    /// `line_key`s of the OCR lines contributing to the cluster.
    lines: HashSet<(u32, u32, u32)>,
}
91
/// Intermediate accumulator for one table row while assembling OCR output.
///
/// Derives `Debug` for consistency with every sibling struct in this module
/// (all other helper structs are `#[derive(Debug, Clone)]`).
#[derive(Debug, Clone)]
struct OcrRowBuild {
    /// Vertical extent of the row band (top and bottom y-coordinates).
    top_y: f64,
    bottom_y: f64,
    /// Per-column cell texts accumulated for this row.
    cell_texts: Vec<String>,
}
98
/// An empty table cell together with its rectangle on the page raster,
/// queued for OCR back-filling.
#[derive(Debug, Clone)]
struct EmptyCellRaster {
    /// Index into `TableBorder::rows`.
    row_idx: usize,
    /// Index into that row's `cells`.
    cell_idx: usize,
    // Raster-pixel bounds of the cell. Word-bucketing treats x2/y2 as
    // exclusive (see `enrich_empty_table_from_page_raster`).
    x1: u32,
    y1: u32,
    x2: u32,
    y2: u32,
}
108
/// Detected ruling lines of a bordered raster table, in pixel coordinates.
///
/// NOTE(review): the producer is outside this view — confirm whether the
/// positions are sorted/deduplicated before use.
#[derive(Debug, Clone)]
struct RasterTableGrid {
    /// x positions of vertical border lines.
    vertical_lines: Vec<u32>,
    /// y positions of horizontal border lines.
    horizontal_lines: Vec<u32>,
}
114
/// One OCR attempt (e.g. a specific page-segmentation mode) paired with a
/// quality score so the best candidate can be selected.
#[derive(Debug, Clone)]
struct OcrCandidateScore {
    /// Words produced by this OCR pass.
    words: Vec<OcrWord>,
    /// Relative quality score. NOTE(review): the scoring code is outside this
    /// view — confirm scale and direction (presumably higher is better).
    score: f64,
}
120
/// A single entry parsed from `pdfimages`-style image listings.
///
/// NOTE(review): the parser lives outside this view — confirm the exact
/// source command and column.
#[derive(Debug, Clone)]
struct PdfImagesListEntry {
    /// Reported image type for the entry.
    image_type: String,
}
125
/// Available local OCR backends; chosen once per process from the
/// `EDGEPARSE_OCR_ENGINE` environment variable (see `selected_ocr_engine`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OcrEngine {
    /// Default engine; used whenever RapidOCR is not explicitly requested
    /// and available.
    Tesseract,
    /// Python RapidOCR, used only when an interpreter that can
    /// `import rapidocr` is found.
    RapidOcr,
}
131
/// One detected text line, deserialized from the JSON array emitted by the
/// embedded `RAPIDOCR_RUNNER` python script.
#[derive(Debug, Deserialize)]
struct RapidOcrLine {
    // Axis-aligned bounding box of the detected line, in image pixels.
    left: u32,
    top: u32,
    width: u32,
    height: u32,
    /// Recognized line text.
    text: String,
    /// RapidOCR recognition score, passed through from the engine as-is.
    confidence: f64,
}
141
// Process-wide OCR configuration, each computed lazily at most once.
/// Selected backend (see `selected_ocr_engine`).
static OCR_ENGINE: OnceLock<OcrEngine> = OnceLock::new();
/// First python command able to `import rapidocr`, or `None` when unavailable.
static RAPIDOCR_PYTHON: OnceLock<Option<String>> = OnceLock::new();
144
/// Inline python program that runs RapidOCR on the image path given as
/// `argv[1]` and prints a JSON array of objects matching [`RapidOcrLine`]
/// (`left`/`top`/`width`/`height`/`text`/`confidence`). Prints `[]` when the
/// engine returns no result; empty/whitespace-only detections are dropped.
const RAPIDOCR_RUNNER: &str = r#"
import json, sys
from rapidocr import RapidOCR

engine = RapidOCR()
result = engine(sys.argv[1], use_det=True, use_cls=True, use_rec=True)

if result is None:
    print('[]')
    raise SystemExit(0)

boxes = getattr(result, 'boxes', []) or []
txts = getattr(result, 'txts', []) or []
scores = getattr(result, 'scores', []) or []
out = []
for box, text, score in zip(boxes, txts, scores):
    if not text or not str(text).strip():
        continue
    xs = [pt[0] for pt in box]
    ys = [pt[1] for pt in box]
    out.append({
        'left': int(min(xs)),
        'top': int(min(ys)),
        'width': max(1, int(max(xs) - min(xs))),
        'height': max(1, int(max(ys) - min(ys))),
        'text': str(text),
        'confidence': float(score),
    })
print(json.dumps(out, ensure_ascii=False))
"#;
175
176fn selected_ocr_engine() -> OcrEngine {
177    *OCR_ENGINE.get_or_init(|| match env::var("EDGEPARSE_OCR_ENGINE") {
178        Ok(value) => match value.to_ascii_lowercase().as_str() {
179            "rapidocr" if rapidocr_python_command().is_some() => OcrEngine::RapidOcr,
180            "rapidocr" => OcrEngine::Tesseract,
181            _ => OcrEngine::Tesseract,
182        },
183        Err(_) => OcrEngine::Tesseract,
184    })
185}
186
187fn rapidocr_python_command() -> Option<&'static str> {
188    RAPIDOCR_PYTHON
189        .get_or_init(|| {
190            let preferred = env::var("EDGEPARSE_OCR_PYTHON").ok();
191            let mut candidates = Vec::new();
192            if let Some(cmd) = preferred {
193                candidates.push(cmd);
194            }
195            candidates.push("python3".to_string());
196            candidates.push("python".to_string());
197
198            for candidate in candidates {
199                let ok = Command::new(&candidate)
200                    .arg("-c")
201                    .arg("import rapidocr")
202                    .output()
203                    .ok()
204                    .is_some_and(|out| out.status.success());
205                if ok {
206                    return Some(candidate);
207                }
208            }
209            None
210        })
211        .as_deref()
212}
213
214fn rapidocr_lines_to_words(lines: Vec<RapidOcrLine>) -> Vec<OcrWord> {
215    let mut words = Vec::new();
216
217    for (line_idx, line) in lines.into_iter().enumerate() {
218        let tokens: Vec<&str> = line.text.split_whitespace().collect();
219        if tokens.is_empty() {
220            continue;
221        }
222
223        let total_chars: u32 = tokens
224            .iter()
225            .map(|token| token.chars().count() as u32)
226            .sum();
227        if total_chars == 0 {
228            continue;
229        }
230
231        let mut cursor = line.left;
232        let mut remaining_width = line.width.max(tokens.len() as u32);
233        let mut remaining_chars = total_chars;
234
235        for (token_idx, token) in tokens.iter().enumerate() {
236            let token_chars = token.chars().count() as u32;
237            let width = if token_idx == tokens.len() - 1 || remaining_chars <= token_chars {
238                remaining_width.max(1)
239            } else {
240                let proportional = ((remaining_width as f64) * (token_chars as f64)
241                    / (remaining_chars as f64))
242                    .round() as u32;
243                proportional.max(1).min(remaining_width)
244            };
245
246            words.push(OcrWord {
247                line_key: (0, line_idx as u32, 0),
248                left: cursor,
249                top: line.top,
250                width,
251                height: line.height.max(1),
252                text: (*token).to_string(),
253                confidence: line.confidence,
254            });
255
256            cursor = cursor.saturating_add(width);
257            remaining_width = remaining_width.saturating_sub(width);
258            remaining_chars = remaining_chars.saturating_sub(token_chars);
259        }
260    }
261
262    words
263}
264
265fn run_rapidocr_words(image: &GrayImage) -> Option<Vec<OcrWord>> {
266    let python = rapidocr_python_command()?;
267    let temp_dir = create_temp_dir(0).ok()?;
268    let image_path = temp_dir.join("ocr.png");
269    if image.save(&image_path).is_err() {
270        let _ = fs::remove_dir_all(&temp_dir);
271        return None;
272    }
273
274    let output = Command::new(python)
275        .current_dir(&temp_dir)
276        .arg("-c")
277        .arg(RAPIDOCR_RUNNER)
278        .arg("ocr.png")
279        .output()
280        .ok()?;
281    let _ = fs::remove_dir_all(&temp_dir);
282    if !output.status.success() {
283        return None;
284    }
285
286    let json = String::from_utf8_lossy(&output.stdout);
287    let lines: Vec<RapidOcrLine> = serde_json::from_str(&json).ok()?;
288    let words = rapidocr_lines_to_words(lines);
289    (!words.is_empty()).then_some(words)
290}
291
292/// Recover OCR text chunks for image-backed table regions on a single page.
293pub fn recover_raster_table_text_chunks(
294    input_path: &Path,
295    page_bbox: &BoundingBox,
296    page_number: u32,
297    text_chunks: &[TextChunk],
298    image_chunks: &[ImageChunk],
299) -> Vec<TextChunk> {
300    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
301        return Vec::new();
302    }
303
304    let candidates: Vec<&ImageChunk> = image_chunks
305        .iter()
306        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
307        .collect();
308    if candidates.is_empty() {
309        return Vec::new();
310    }
311
312    let temp_dir = match create_temp_dir(page_number) {
313        Ok(dir) => dir,
314        Err(_) => return Vec::new(),
315    };
316
317    let result =
318        recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
319
320    let _ = fs::remove_dir_all(&temp_dir);
321    result
322}
323
/// Recover OCR text lines from dominant non-table page images.
///
/// This is for infographic-like pages where the PDF contains a large raster
/// image but little or no native text. The extracted OCR signal is injected
/// back into the normal text pipeline as line chunks so downstream grouping can
/// rebuild headings, paragraphs, and lists.
pub fn recover_dominant_image_text_chunks(
    input_path: &Path,
    page_bbox: &BoundingBox,
    page_number: u32,
    text_chunks: &[TextChunk],
    image_chunks: &[ImageChunk],
) -> Vec<TextChunk> {
    // Degenerate page geometry or no images: nothing to recover.
    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
        return Vec::new();
    }

    let candidates: Vec<&ImageChunk> = image_chunks
        .iter()
        .filter(|image| is_dominant_image_text_candidate(image, page_bbox, text_chunks))
        .collect();
    if candidates.is_empty() {
        return Vec::new();
    }

    let temp_dir = match create_temp_dir(page_number) {
        Ok(dir) => dir,
        Err(_) => return Vec::new(),
    };

    let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
        Some(files) => files,
        None => {
            let _ = fs::remove_dir_all(&temp_dir);
            return Vec::new();
        }
    };

    let mut recovered = Vec::new();
    for image in candidates {
        let Some(image_index) = image.index else {
            continue;
        };
        // `image.index` is treated as 1-based here (note the saturating_sub);
        // an index of 0 therefore also maps to the first extracted file.
        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
            continue;
        };
        let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
            continue;
        };
        // Skip images that are really bordered tables, bar charts, photos,
        // or dark UI screenshots — handled elsewhere or pure OCR noise.
        if recover_bordered_raster_table_from_gray(&gray, image).is_some()
            || is_obvious_bar_chart_raster(&gray)
            || is_natural_photograph_raster(&gray)
            || is_dark_ui_screenshot_raster(&gray)
        {
            continue;
        }

        // Try Tesseract psm 11 (sparse text) then psm 6 (uniform block); keep
        // whichever candidate reads like dense prose.
        let Some(words) = run_tesseract_tsv_words_best(&gray, &["11", "6"], |candidate| {
            looks_like_dense_prose_image_ocr(candidate)
        }) else {
            continue;
        };

        // Convert the surviving words into page-space text line chunks.
        recovered.extend(lines_from_ocr_words(
            &words,
            image,
            gray.width(),
            gray.height(),
            text_chunks,
        ));
    }

    let _ = fs::remove_dir_all(&temp_dir);
    recovered
}
399
/// Recover synthetic table borders for strongly numeric raster tables.
///
/// For each eligible page image: prefer geometry-detected bordered tables,
/// then fall back to tables inferred from OCR word layout. Chart-like and
/// matrix-artifact results are rejected at each stage.
pub fn recover_raster_table_borders(
    input_path: &Path,
    page_bbox: &BoundingBox,
    page_number: u32,
    text_chunks: &[TextChunk],
    image_chunks: &[ImageChunk],
) -> Vec<TableBorder> {
    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
        return Vec::new();
    }

    // Same eligibility filter as the table-text recovery path.
    let candidates: Vec<&ImageChunk> = image_chunks
        .iter()
        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
        .collect();
    if candidates.is_empty() {
        return Vec::new();
    }

    let temp_dir = match create_temp_dir(page_number) {
        Ok(dir) => dir,
        Err(_) => return Vec::new(),
    };

    let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
        Some(files) => files,
        None => {
            let _ = fs::remove_dir_all(&temp_dir);
            return Vec::new();
        }
    };

    let mut tables = Vec::new();
    for image in candidates {
        let Some(image_index) = image.index else {
            continue;
        };
        // `image.index` is treated as 1-based (note the saturating_sub).
        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
            continue;
        };
        let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
            continue;
        };
        // Charts, photographs, and dark UI screenshots never yield tables.
        if is_obvious_bar_chart_raster(&gray)
            || is_natural_photograph_raster(&gray)
            || is_dark_ui_screenshot_raster(&gray)
        {
            continue;
        }
        // Preferred path: a geometry-detected bordered table — unless its OCR
        // content reads like chart axis labels (a false-positive grid).
        if let Some(table) = recover_bordered_raster_table_from_gray(&gray, image) {
            let chart_words = run_tesseract_tsv_words_best(&gray, &["6", "11"], |_| true);
            if chart_words
                .as_deref()
                .is_some_and(looks_like_chart_label_ocr)
            {
                continue;
            }
            tables.push(table);
            continue;
        }
        // No drawn borders: infer structure from OCR word geometry instead.
        let Some(words) = run_tesseract_tsv_words_best(&gray, &["6", "11"], |candidate| {
            looks_like_table_ocr(candidate)
        }) else {
            continue;
        };

        // Numeric-looking tables get the specialized builder first; both
        // builders discard matrix-like OCR artifacts.
        if looks_like_numeric_table_ocr(&words) {
            if let Some(table) = build_numeric_table_border(&words, image) {
                if is_matrixish_ocr_artifact_table(&table) {
                    continue;
                }
                tables.push(table);
                continue;
            }
        }

        if let Some(table) = build_structured_ocr_table_border(&words, image) {
            if is_matrixish_ocr_artifact_table(&table) {
                continue;
            }
            tables.push(table);
        }
    }

    let _ = fs::remove_dir_all(&temp_dir);
    tables
}
488
489/// Recover OCR text into empty bordered tables by rasterizing the full page.
490///
491/// This targets graphics-dominant pages where native PDF text is sparse but the
492/// page still exposes strong bordered geometry. It enriches existing empty
493/// `TableBorder` cells directly from the rendered page appearance.
494pub fn recover_page_raster_table_cell_text(
495    input_path: &Path,
496    page_bbox: &BoundingBox,
497    page_number: u32,
498    elements: &mut [ContentElement],
499) {
500    if page_bbox.area() <= 0.0 {
501        return;
502    }
503
504    let native_text_chars = page_native_text_chars(elements);
505    if native_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR {
506        return;
507    }
508
509    let candidate_indices: Vec<usize> = elements
510        .iter()
511        .enumerate()
512        .filter_map(|(idx, elem)| {
513            table_candidate_ref(elem)
514                .filter(|table| table_needs_page_raster_ocr(table))
515                .map(|_| idx)
516        })
517        .take(MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR)
518        .collect();
519    if candidate_indices.is_empty() {
520        return;
521    }
522
523    let coverage: f64 = candidate_indices
524        .iter()
525        .filter_map(|idx| table_candidate_ref(&elements[*idx]).map(|table| table.bbox.area()))
526        .sum::<f64>()
527        / page_bbox.area().max(1.0);
528    if coverage < MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR {
529        return;
530    }
531
532    let temp_dir = match create_temp_dir(page_number) {
533        Ok(dir) => dir,
534        Err(_) => return,
535    };
536    let prefix = temp_dir.join("page");
537    let status = Command::new("pdftoppm")
538        .arg("-png")
539        .arg("-f")
540        .arg(page_number.to_string())
541        .arg("-l")
542        .arg(page_number.to_string())
543        .arg("-singlefile")
544        .arg(input_path)
545        .arg(&prefix)
546        .status();
547    match status {
548        Ok(s) if s.success() => {}
549        _ => {
550            let _ = fs::remove_dir_all(&temp_dir);
551            return;
552        }
553    }
554
555    let page_image_path = prefix.with_extension("png");
556    let gray = match image::open(&page_image_path) {
557        Ok(img) => img.to_luma8(),
558        Err(_) => {
559            let _ = fs::remove_dir_all(&temp_dir);
560            return;
561        }
562    };
563
564    for idx in candidate_indices {
565        let Some(elem) = elements.get_mut(idx) else {
566            continue;
567        };
568        let Some(table) = table_candidate_mut(elem) else {
569            continue;
570        };
571        enrich_empty_table_from_page_raster(&gray, page_bbox, table);
572    }
573
574    let _ = fs::remove_dir_all(&temp_dir);
575}
576
577fn table_candidate_ref(elem: &ContentElement) -> Option<&TableBorder> {
578    match elem {
579        ContentElement::TableBorder(table) => Some(table),
580        ContentElement::Table(table) => Some(&table.table_border),
581        _ => None,
582    }
583}
584
585fn table_candidate_mut(elem: &mut ContentElement) -> Option<&mut TableBorder> {
586    match elem {
587        ContentElement::TableBorder(table) => Some(table),
588        ContentElement::Table(table) => Some(&mut table.table_border),
589        _ => None,
590    }
591}
592
/// OCR each candidate image on a page and convert table-like OCR output into
/// `TextChunk`s. Bordered-table images contribute only a caption here; their
/// cell text is recovered by the border pipeline instead.
fn recover_from_page_images(
    input_path: &Path,
    temp_dir: &Path,
    page_number: u32,
    candidates: Vec<&ImageChunk>,
    text_chunks: &[TextChunk],
) -> Vec<TextChunk> {
    let image_files = match extract_visible_page_image_files(input_path, page_number, temp_dir) {
        Some(files) => files,
        None => return Vec::new(),
    };
    if image_files.is_empty() {
        return Vec::new();
    }

    let mut recovered = Vec::new();
    for image in candidates {
        let Some(image_index) = image.index else {
            continue;
        };
        // `image.index` is treated as 1-based (note the saturating_sub).
        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
            continue;
        };
        let bordered_table = recover_bordered_raster_table(image_path, image);
        if let Some(caption) = recover_bordered_raster_caption(image_path, image) {
            recovered.push(caption);
        }
        // Bordered tables are fully handled elsewhere — skip word recovery.
        if bordered_table.is_some() {
            continue;
        }
        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
            continue;
        };
        // Images extracted via pdfimages are at their native PDF DPI.
        // We pass PDFTOPPM_DPI as a reasonable hint; Tesseract uses this only for
        // geometry heuristics, not LSTM recognition, so approximate is fine.
        let native_dpi = PDFTOPPM_DPI.to_string();
        // psm 6 = single uniform block; the dawg options disable dictionary
        // correction so table codes/numbers aren't rewritten into near words.
        let Ok(tsv_output) = Command::new("tesseract")
            .current_dir(temp_dir)
            .arg(file_name)
            .arg("stdout")
            .arg("--dpi")
            .arg(&native_dpi)
            .arg("--psm")
            .arg("6")
            .arg("-c")
            .arg("load_system_dawg=0")
            .arg("-c")
            .arg("load_freq_dawg=0")
            .arg("tsv")
            .output()
        else {
            continue;
        };
        if !tsv_output.status.success() {
            continue;
        }

        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
        let words = parse_tesseract_tsv(&tsv);
        // Only keep OCR output that is spatially organized like a table.
        if !looks_like_table_ocr(&words) {
            continue;
        }

        recovered.extend(words_to_text_chunks(&words, image, text_chunks));
    }

    recovered
}
662
663fn page_native_text_chars(elements: &[ContentElement]) -> usize {
664    elements
665        .iter()
666        .map(|elem| match elem {
667            ContentElement::Paragraph(p) => p.base.value().chars().count(),
668            ContentElement::Heading(h) => h.base.base.value().chars().count(),
669            ContentElement::NumberHeading(h) => h.base.base.base.value().chars().count(),
670            ContentElement::TextBlock(tb) => tb.value().chars().count(),
671            ContentElement::TextLine(tl) => tl.value().chars().count(),
672            ContentElement::TextChunk(tc) => tc.value.chars().count(),
673            ContentElement::List(list) => list
674                .list_items
675                .iter()
676                .flat_map(|item| item.contents.iter())
677                .map(|content| match content {
678                    ContentElement::Paragraph(p) => p.base.value().chars().count(),
679                    ContentElement::TextBlock(tb) => tb.value().chars().count(),
680                    ContentElement::TextLine(tl) => tl.value().chars().count(),
681                    ContentElement::TextChunk(tc) => tc.value.chars().count(),
682                    _ => 0,
683                })
684                .sum(),
685            _ => 0,
686        })
687        .sum()
688}
689
690fn table_needs_page_raster_ocr(table: &TableBorder) -> bool {
691    table.num_rows >= 1
692        && table.num_columns >= 2
693        && table
694            .rows
695            .iter()
696            .flat_map(|row| row.cells.iter())
697            .all(|cell| {
698                !cell
699                    .content
700                    .iter()
701                    .any(|token| matches!(token.token_type, TableTokenType::Text))
702            })
703}
704
/// Fill text into the empty cells of `table` by cropping its region out of
/// the rendered page raster, OCR-ing the crop once, and distributing the
/// recognized words back into cells by their page-space centers.
fn enrich_empty_table_from_page_raster(
    gray: &GrayImage,
    page_bbox: &BoundingBox,
    table: &mut TableBorder,
) {
    // Collect empty cells first, so we can OCR the whole table once and then
    // distribute words into cells. This avoids calling tesseract per cell.
    let mut empty_cells: Vec<EmptyCellRaster> = Vec::new();
    for (row_idx, row) in table.rows.iter().enumerate() {
        for (cell_idx, cell) in row.cells.iter().enumerate() {
            // Cells that already carry text tokens are left alone.
            if cell
                .content
                .iter()
                .any(|token| matches!(token.token_type, TableTokenType::Text))
            {
                continue;
            }
            let Some((x1, y1, x2, y2)) = page_bbox_to_raster_box(gray, page_bbox, &cell.bbox)
            else {
                continue;
            };
            empty_cells.push(EmptyCellRaster {
                row_idx,
                cell_idx,
                x1,
                y1,
                x2,
                y2,
            });
        }
    }
    if empty_cells.is_empty() {
        return;
    }

    // Fallback to legacy per-cell OCR when we can't build a stable table crop.
    let Some((tx1, ty1, tx2, ty2)) = page_bbox_to_raster_box(gray, page_bbox, &table.bbox) else {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    };

    // Pad the table crop slightly so edge glyphs aren't clipped.
    let pad = CELL_INSET_PX * 2;
    let crop_left = tx1.saturating_sub(pad);
    let crop_top = ty1.saturating_sub(pad);
    let crop_right = (tx2 + pad).min(gray.width());
    let crop_bottom = (ty2 + pad).min(gray.height());
    if crop_right <= crop_left || crop_bottom <= crop_top {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    }

    let crop_width = crop_right - crop_left;
    let crop_height = crop_bottom - crop_top;
    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    }

    let cropped = gray
        .view(crop_left, crop_top, crop_width, crop_height)
        .to_image();
    // Charts, photos, and dark UI screenshots would only produce OCR noise.
    if is_obvious_bar_chart_raster(&cropped)
        || is_natural_photograph_raster(&cropped)
        || is_dark_ui_screenshot_raster(&cropped)
    {
        return;
    }
    // Add a white margin and upscale (Lanczos) before OCR — see the
    // OCR_SCALE_FACTOR rationale in the constants above.
    let bordered = expand_white_border(&cropped, TABLE_RASTER_OCR_BORDER_PX);
    let scaled = image::imageops::resize(
        &bordered,
        bordered.width() * OCR_SCALE_FACTOR,
        bordered.height() * OCR_SCALE_FACTOR,
        image::imageops::FilterType::Lanczos3,
    );

    let Some(words) = run_tesseract_tsv_words(&scaled, "6") else {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    };
    if words.is_empty() {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    }
    // If the output reads like chart axis labels, the "table" is a chart.
    if looks_like_chart_label_ocr(&words) {
        return;
    }

    // One bucket of (y, x, text) triples per empty cell.
    let mut buckets: Vec<Vec<(u32, u32, String)>> = vec![Vec::new(); empty_cells.len()];
    let scale = f64::from(OCR_SCALE_FACTOR);
    let border = f64::from(TABLE_RASTER_OCR_BORDER_PX);

    for word in &words {
        // Word center in scaled-crop coordinates…
        let cx_scaled = f64::from(word.left) + f64::from(word.width) / 2.0;
        let cy_scaled = f64::from(word.top) + f64::from(word.height) / 2.0;

        // …mapped back through the scale and white border to crop space…
        let cx_crop = cx_scaled / scale - border;
        let cy_crop = cy_scaled / scale - border;
        if cx_crop < 0.0 || cy_crop < 0.0 {
            continue;
        }

        // …and finally to page-raster coordinates.
        let cx_page = match u32::try_from(cx_crop.round() as i64) {
            Ok(v) => crop_left.saturating_add(v),
            Err(_) => continue,
        };
        let cy_page = match u32::try_from(cy_crop.round() as i64) {
            Ok(v) => crop_top.saturating_add(v),
            Err(_) => continue,
        };

        // Assign the word to the first empty cell containing its center
        // (cell bounds are half-open on the right/bottom).
        for (idx, cell) in empty_cells.iter().enumerate() {
            if cx_page >= cell.x1 && cx_page < cell.x2 && cy_page >= cell.y1 && cy_page < cell.y2 {
                buckets[idx].push((cy_page, cx_page, word.text.clone()));
                break;
            }
        }
    }

    for (idx, cell) in empty_cells.iter().enumerate() {
        let Some(row) = table.rows.get_mut(cell.row_idx) else {
            continue;
        };
        let Some(target) = row.cells.get_mut(cell.cell_idx) else {
            continue;
        };
        // Re-check emptiness: the cell may have been filled meanwhile.
        if target
            .content
            .iter()
            .any(|token| matches!(token.token_type, TableTokenType::Text))
        {
            continue;
        }
        let mut parts = std::mem::take(&mut buckets[idx]);
        if parts.is_empty() {
            continue;
        }
        // Reading order: top-to-bottom, then left-to-right.
        parts.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
        let raw = parts
            .into_iter()
            .map(|(_, _, t)| t)
            .collect::<Vec<_>>()
            .join(" ");
        let text = normalize_page_raster_cell_text(&target.bbox, raw);
        if text.is_empty() {
            continue;
        }
        // Synthetic text token: font metadata is placeholder ("OCR", black,
        // max contrast); font size is approximated from the cell height.
        target.content.push(TableToken {
            base: TextChunk {
                value: text,
                bbox: target.bbox.clone(),
                font_name: "OCR".to_string(),
                font_size: target.bbox.height().max(6.0),
                font_weight: 400.0,
                italic_angle: 0.0,
                font_color: "#000000".to_string(),
                contrast_ratio: 21.0,
                symbol_ends: Vec::new(),
                text_format: TextFormat::Normal,
                text_type: TextType::Regular,
                pdf_layer: PdfLayer::Content,
                ocg_visible: true,
                index: None,
                page_number: target.bbox.page_number,
                level: None,
                mcid: None,
            },
            token_type: TableTokenType::Text,
        });
    }
}
875
/// Legacy fallback: OCR each empty cell's raster crop individually.
///
/// Used when a stable whole-table crop cannot be built (see
/// `enrich_empty_table_from_page_raster`); slower, since it may invoke OCR
/// once per cell.
fn fill_cells_with_per_cell_ocr(
    gray: &GrayImage,
    table: &mut TableBorder,
    empty_cells: &[EmptyCellRaster],
) {
    for cell in empty_cells {
        let Some(row) = table.rows.get_mut(cell.row_idx) else {
            continue;
        };
        let Some(target) = row.cells.get_mut(cell.cell_idx) else {
            continue;
        };
        // Skip cells that already carry a text token.
        if target
            .content
            .iter()
            .any(|token| matches!(token.token_type, TableTokenType::Text))
        {
            continue;
        }
        let Some(text) =
            extract_page_raster_cell_text(gray, &target.bbox, cell.x1, cell.y1, cell.x2, cell.y2)
        else {
            continue;
        };
        if text.is_empty() {
            continue;
        }
        // Synthetic text token: font metadata is placeholder ("OCR", black,
        // max contrast); font size is approximated from the cell height.
        target.content.push(TableToken {
            base: TextChunk {
                value: text,
                bbox: target.bbox.clone(),
                font_name: "OCR".to_string(),
                font_size: target.bbox.height().max(6.0),
                font_weight: 400.0,
                italic_angle: 0.0,
                font_color: "#000000".to_string(),
                contrast_ratio: 21.0,
                symbol_ends: Vec::new(),
                text_format: TextFormat::Normal,
                text_type: TextType::Regular,
                pdf_layer: PdfLayer::Content,
                ocg_visible: true,
                index: None,
                page_number: target.bbox.page_number,
                level: None,
                mcid: None,
            },
            token_type: TableTokenType::Text,
        });
    }
}
927
928fn page_bbox_to_raster_box(
929    gray: &GrayImage,
930    page_bbox: &BoundingBox,
931    bbox: &BoundingBox,
932) -> Option<(u32, u32, u32, u32)> {
933    if page_bbox.width() <= 0.0 || page_bbox.height() <= 0.0 {
934        return None;
935    }
936
937    let left = ((bbox.left_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
938        .clamp(0.0, f64::from(gray.width()));
939    let right = ((bbox.right_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
940        .clamp(0.0, f64::from(gray.width()));
941    let top = ((page_bbox.top_y - bbox.top_y) / page_bbox.height() * f64::from(gray.height()))
942        .clamp(0.0, f64::from(gray.height()));
943    let bottom = ((page_bbox.top_y - bbox.bottom_y) / page_bbox.height()
944        * f64::from(gray.height()))
945    .clamp(0.0, f64::from(gray.height()));
946
947    let x1 = left.floor() as u32;
948    let x2 = right.ceil() as u32;
949    let y1 = top.floor() as u32;
950    let y2 = bottom.ceil() as u32;
951    (x2 > x1 && y2 > y1).then_some((x1, y1, x2, y2))
952}
953
954fn extract_page_raster_cell_text(
955    gray: &GrayImage,
956    cell_bbox: &BoundingBox,
957    x1: u32,
958    y1: u32,
959    x2: u32,
960    y2: u32,
961) -> Option<String> {
962    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
963    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
964    let crop_left = x1 + inset_x;
965    let crop_top = y1 + inset_y;
966    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
967    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
968    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
969        return Some(String::new());
970    }
971
972    let cropped = gray
973        .view(crop_left, crop_top, crop_width, crop_height)
974        .to_image();
975    let bordered = expand_white_border(&cropped, 12);
976    let scaled = image::imageops::resize(
977        &bordered,
978        bordered.width() * OCR_SCALE_FACTOR,
979        bordered.height() * OCR_SCALE_FACTOR,
980        image::imageops::FilterType::Lanczos3,
981    );
982
983    // Improved PSM selection based on cell aspect ratio
984    let aspect_ratio = cell_bbox.width() / cell_bbox.height();
985    let is_vertical = aspect_ratio < 0.8;
986
987    // PSM modes ordered by likelihood of success for each cell shape.
988    // Typography rationale:
989    //   PSM 6  — single uniform text block (multi-line header/paragraph cells)
990    //   PSM 7  — single text line (most data cells; one baseline)
991    //   PSM 8  — single word (numeric data, codes, percentages — one token)
992    //   PSM 11 — sparse text (cells with scattered numbers / partial fills)
993    //   PSM 13 — raw line (bypasses heuristics; last resort for oddly typeset cells)
994    // PSM 10 (single character) is intentionally excluded: table cells always
995    // contain at least one full token, so char-level segmentation yields fragments.
996    let psm_modes: [&str; 5] = if is_vertical {
997        ["7", "8", "6", "11", "13"]
998    } else {
999        ["6", "7", "8", "11", "13"]
1000    };
1001
1002    let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
1003    Some(normalize_page_raster_cell_text(cell_bbox, raw_text))
1004}
1005
1006fn normalize_page_raster_cell_text(cell_bbox: &BoundingBox, text: String) -> String {
1007    let normalized = text
1008        .replace('|', " ")
1009        .replace('—', "-")
1010        .replace(['“', '”'], "\"")
1011        .replace('’', "'")
1012        .split_whitespace()
1013        .collect::<Vec<_>>()
1014        .join(" ");
1015
1016    if normalized.is_empty() {
1017        return normalized;
1018    }
1019
1020    let narrow_cell = cell_bbox.width() <= cell_bbox.height() * 1.15;
1021    if narrow_cell && normalized.len() <= 3 && !normalized.chars().any(|ch| ch.is_ascii_digit()) {
1022        return String::new();
1023    }
1024
1025    normalized
1026}
1027
1028fn is_ocr_candidate(
1029    image: &ImageChunk,
1030    page_bbox: &BoundingBox,
1031    text_chunks: &[TextChunk],
1032) -> bool {
1033    let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1034    let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1035    if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
1036        return false;
1037    }
1038
1039    let overlapping_chunks: Vec<&TextChunk> = text_chunks
1040        .iter()
1041        .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1042        .collect();
1043    let native_text_chars: usize = overlapping_chunks
1044        .iter()
1045        .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1046        .sum();
1047
1048    native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
1049        || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
1050}
1051
1052fn is_dominant_image_text_candidate(
1053    image: &ImageChunk,
1054    page_bbox: &BoundingBox,
1055    text_chunks: &[TextChunk],
1056) -> bool {
1057    let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1058    let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1059    if width_ratio < MIN_DOMINANT_IMAGE_WIDTH_RATIO || area_ratio < MIN_DOMINANT_IMAGE_AREA_RATIO {
1060        return false;
1061    }
1062
1063    let native_text_chars: usize = text_chunks
1064        .iter()
1065        .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1066        .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1067        .sum();
1068
1069    native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE
1070}
1071
1072fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
1073    let mut words = Vec::new();
1074    for line in tsv.lines().skip(1) {
1075        let mut cols = line.splitn(12, '\t');
1076        let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1077        if level != 5 {
1078            continue;
1079        }
1080        let _page_num = cols.next();
1081        let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1082        let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1083        let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1084        let _word_num = cols.next();
1085        let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1086        let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1087        let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1088        let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1089        let confidence = cols
1090            .next()
1091            .and_then(|s| s.parse::<f64>().ok())
1092            .unwrap_or(-1.0);
1093        let text = cols.next().unwrap_or("").trim().to_string();
1094        if !(MIN_OCR_WORD_CONFIDENCE..=MAX_OCR_WORD_CONFIDENCE).contains(&confidence)
1095            || text.is_empty()
1096            || width == 0
1097            || height == 0
1098            || !text.chars().any(|ch| ch.is_alphanumeric())
1099        {
1100            continue;
1101        }
1102        words.push(OcrWord {
1103            line_key: (block_num, par_num, line_num),
1104            left,
1105            top,
1106            width,
1107            height,
1108            text,
1109            confidence,
1110        });
1111    }
1112    words
1113}
1114
/// Heuristic: does this OCR word set look like chart labelling (axis ticks,
/// legends, point labels) rather than table content?
///
/// Charts concentrate short, often numeric labels around the periphery and
/// leave the plot interior mostly empty. The function first rules out word
/// sets whose x-centers form stable full-width columns (a strong table
/// signature), then requires: words concentrated in the outer band, a sparse
/// interior, mostly short peripheral lines, at least two numeric axis-like
/// lines, and at most two wide prose lines.
fn looks_like_chart_label_ocr(words: &[OcrWord]) -> bool {
    if words.len() < 8 {
        return false;
    }

    // Bounding extent of all recognized words, in raster pixels.
    let min_left = words.iter().map(|word| word.left).min().unwrap_or(0);
    let min_top = words.iter().map(|word| word.top).min().unwrap_or(0);
    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    let max_bottom = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()
        .unwrap_or(0);
    let image_width = max_right.saturating_sub(min_left);
    let image_height = max_bottom.saturating_sub(min_top);
    // Too small an extent to be a meaningful chart; don't classify.
    if image_width < 160 || image_height < 120 {
        return false;
    }

    let width_f = f64::from(image_width);
    let height_f = f64::from(image_height);
    // Outer band: 18% margins where axis labels typically live.
    let outer_x = width_f * 0.18;
    let outer_y = height_f * 0.18;
    // Inner region: central 56% × 56% area, mostly text-free in charts.
    let inner_left = width_f * 0.22;
    let inner_right = width_f * 0.78;
    let inner_top = height_f * 0.22;
    let inner_bottom = height_f * 0.78;

    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    let mut outer_words = 0usize;
    let mut inner_words = 0usize;

    // Group words by Tesseract (block, paragraph, line) key while counting
    // how word centers distribute between the outer band and the interior.
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);

        let center_x = f64::from(word.left.saturating_sub(min_left)) + f64::from(word.width) / 2.0;
        let center_y = f64::from(word.top.saturating_sub(min_top)) + f64::from(word.height) / 2.0;

        if center_x <= outer_x
            || center_x >= width_f - outer_x
            || center_y <= outer_y
            || center_y >= height_f - outer_y
        {
            outer_words += 1;
        }

        if center_x >= inner_left
            && center_x <= inner_right
            && center_y >= inner_top
            && center_y <= inner_bottom
        {
            inner_words += 1;
        }
    }

    if by_line.len() < 5 {
        return false;
    }

    // Cluster word x-centers with a tolerance of 3.5% of the widest extent
    // (min 18px); stable clusters across many lines indicate table columns.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                // Running mean keeps the cluster center representative.
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // "Stable" clusters recur on at least four lines — column candidates.
    let stable_centers: Vec<f64> = clusters
        .iter()
        .filter(|cluster| cluster.lines.len() >= 4 && cluster.count >= 4)
        .map(|cluster| cluster.center)
        .collect();
    let mut sorted_stable_centers = stable_centers.clone();
    sorted_stable_centers
        .sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let max_stable_gap = sorted_stable_centers
        .windows(2)
        .map(|pair| pair[1] - pair[0])
        .fold(0.0, f64::max);
    // Columns spanning the full width (left quarter, right quarter, and the
    // middle, with no huge gap) are a table signature, not chart labels.
    let spans_full_table_width = stable_centers.len() >= 3
        && stable_centers
            .iter()
            .any(|center| *center - f64::from(min_left) <= width_f * 0.25)
        && stable_centers
            .iter()
            .any(|center| *center - f64::from(min_left) >= width_f * 0.75)
        && stable_centers.iter().any(|center| {
            let rel = *center - f64::from(min_left);
            rel >= inner_left && rel <= inner_right
        })
        && max_stable_gap <= width_f * 0.45;
    if spans_full_table_width {
        // If enough lines align with at least three stable columns, treat
        // the image as a table and bail out of the chart classification.
        let table_like_lines = by_line
            .values()
            .filter(|line_words| {
                let mut seen = HashSet::<usize>::new();
                for word in *line_words {
                    let center = f64::from(word.left) + f64::from(word.width) / 2.0;
                    for (idx, stable_center) in stable_centers.iter().enumerate() {
                        if (center - stable_center).abs() <= tolerance {
                            seen.insert(idx);
                        }
                    }
                }
                seen.len() >= 3
            })
            .count();
        if table_like_lines >= 4 {
            return false;
        }
    }

    // Per-line shape statistics used by the final chart decision.
    let mut short_lines = 0usize;
    let mut peripheral_label_lines = 0usize;
    let mut wide_sentence_lines = 0usize;
    let mut axisish_numeric_lines = 0usize;

    for line_words in by_line.values() {
        let line_left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
        let line_top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
        let line_right = line_words
            .iter()
            .map(|word| word.left.saturating_add(word.width))
            .max()
            .unwrap_or(0);
        let line_bottom = line_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        if line_right <= line_left || line_bottom <= line_top {
            continue;
        }

        let word_count = line_words.len();
        let numeric_in_line = line_words
            .iter()
            .filter(|word| is_numeric_like(&word.text))
            .count();
        let line_width_ratio =
            f64::from(line_right.saturating_sub(line_left)) / f64::from(image_width.max(1));
        // True when any edge of the line's box reaches into the outer band.
        let touches_outer_band = f64::from(line_left.saturating_sub(min_left)) <= outer_x
            || f64::from(line_right.saturating_sub(min_left)) >= width_f - outer_x
            || f64::from(line_top.saturating_sub(min_top)) <= outer_y
            || f64::from(line_bottom.saturating_sub(min_top)) >= height_f - outer_y;

        if word_count <= 3 {
            short_lines += 1;
        }
        if touches_outer_band && word_count <= 4 {
            peripheral_label_lines += 1;
        }
        if touches_outer_band && word_count <= 3 && numeric_in_line > 0 {
            // Short numeric lines at the edges look like axis tick labels.
            axisish_numeric_lines += 1;
        }
        if word_count >= 4 && line_width_ratio >= 0.45 && numeric_in_line == 0 {
            wide_sentence_lines += 1;
        }
    }

    // Integer-ratio thresholds (×10 to avoid float comparisons):
    // ≥50% of words in the outer band, ≤50% in the interior, ≥60% of lines
    // peripheral labels, ≥60% short, plus axis evidence and little prose.
    let total_lines = by_line.len();
    let outer_dominant = outer_words * 10 >= words.len() * 5;
    let inner_sparse = inner_words * 10 <= words.len() * 5;
    let label_dominant = peripheral_label_lines * 10 >= total_lines * 6;
    let short_line_dominant = short_lines * 10 >= total_lines * 6;
    let axis_signal = axisish_numeric_lines >= 2;

    outer_dominant
        && inner_sparse
        && label_dominant
        && short_line_dominant
        && axis_signal
        && wide_sentence_lines <= 2
}
1310
1311fn looks_like_matrix_formula_ocr(words: &[OcrWord]) -> bool {
1312    if words.len() < 6 {
1313        return false;
1314    }
1315
1316    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1317    for word in words {
1318        by_line.entry(word.line_key).or_default().push(word);
1319    }
1320
1321    if by_line.len() < 2 || by_line.len() > 4 {
1322        return false;
1323    }
1324
1325    let substantive_words = words
1326        .iter()
1327        .filter(|word| is_substantive_table_word(&word.text))
1328        .count();
1329    let short_formulaish_words = words
1330        .iter()
1331        .filter(|word| is_short_formulaish_word(&word.text))
1332        .count();
1333    let slash_words = words.iter().filter(|word| word.text.contains('/')).count();
1334    let equation_label_words = words
1335        .iter()
1336        .filter(|word| looks_like_equation_label_word(&word.text))
1337        .count();
1338    let dense_lines = by_line.values().filter(|line| line.len() >= 3).count();
1339    let short_lines = by_line
1340        .values()
1341        .filter(|line| line.iter().all(|word| is_short_formulaish_word(&word.text)))
1342        .count();
1343
1344    substantive_words == 0
1345        && dense_lines >= 2
1346        && short_lines * 10 >= by_line.len() * 7
1347        && short_formulaish_words * 10 >= words.len() * 7
1348        && (slash_words > 0 || equation_label_words >= 2)
1349}
1350
/// True when an OCR token looks like real table content — a word of several
/// letters or a multi-digit number — rather than formula/operator noise.
fn is_substantive_table_word(text: &str) -> bool {
    // Keep only alphanumeric characters, lowercased (multi-char lowercase
    // expansions included), so punctuation never influences the decision.
    let normalized: String = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
        .collect();
    if normalized.is_empty() {
        return false;
    }

    let mut alpha = 0usize;
    let mut digits = 0usize;
    let mut non_binary_digit = false;
    for ch in normalized.chars() {
        if ch.is_alphabetic() {
            alpha += 1;
        } else if ch.is_ascii_digit() {
            digits += 1;
            // Runs of only 0/1 are often misread table rules, not numbers.
            if !matches!(ch, '0' | '1') {
                non_binary_digit = true;
            }
        }
    }

    alpha >= 4
        || (digits >= 2 && alpha == 0 && non_binary_digit)
        || (normalized.len() >= 5 && alpha >= 2)
}
1371
/// True for tokens typical of rendered formulas: pure punctuation, very
/// short symbols, or short fraction-like tokens containing '/'.
fn is_short_formulaish_word(text: &str) -> bool {
    let normalized: String = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
        .collect();
    // Nothing alphanumeric at all (operators only) counts as formula-ish.
    if normalized.is_empty() {
        return true;
    }

    let len = normalized.len();
    len <= 3 || (len <= 4 && text.contains('/'))
}
1384
/// Detects equation-label tokens such as "A1" or "B12": a single uppercase
/// ASCII letter followed by one to three ASCII digits, ignoring any
/// surrounding punctuation (e.g. "(B12)").
fn looks_like_equation_label_word(text: &str) -> bool {
    let core = text.trim_matches(|ch: char| !ch.is_alphanumeric());
    let mut chars = core.chars();
    match chars.next() {
        Some(first) if first.is_ascii_alphabetic() && first.is_ascii_uppercase() => {
            let digits: String = chars.collect();
            (1..=3).contains(&digits.len()) && digits.chars().all(|ch| ch.is_ascii_digit())
        }
        _ => false,
    }
}
1398
/// Heuristic: does this OCR word set exhibit table structure?
///
/// Requires enough words, rejects chart-label and matrix/formula layouts,
/// then looks for lines whose word x-centers align into repeated vertical
/// clusters (columns). A final numeric/alphabetic guard filters dense prose
/// that happens to form stable x-clusters.
fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
    if words.len() < 8 {
        return false;
    }

    // Delegate to the dedicated detectors for chart and formula layouts.
    if looks_like_chart_label_ocr(words) {
        return false;
    }

    if looks_like_matrix_formula_ocr(words) {
        return false;
    }

    // Group words by Tesseract (block, paragraph, line) key.
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // "Qualifying" lines are plausible table rows: at least three words, or
    // at least two numeric-like tokens.
    let mut qualifying_lines = Vec::new();
    let mut numeric_like_count = 0usize;
    let mut max_right = 0u32;
    for line_words in by_line.values_mut() {
        line_words.sort_by_key(|word| word.left);
        let numeric_words = line_words
            .iter()
            .filter(|word| is_numeric_like(&word.text))
            .count();
        numeric_like_count += numeric_words;
        if line_words.len() >= 3 || numeric_words >= 2 {
            max_right = max_right.max(
                line_words
                    .iter()
                    .map(|word| word.left.saturating_add(word.width))
                    .max()
                    .unwrap_or(0),
            );
            qualifying_lines.push(line_words.clone());
        }
    }

    if qualifying_lines.len() < 2 {
        return false;
    }

    // Cluster word x-centers across qualifying lines; tolerance is 3.5% of
    // the rightmost extent (min 18px). Clusters recurring on multiple lines
    // behave like table columns.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
    let mut clusters: Vec<XCluster> = Vec::new();
    for line in &qualifying_lines {
        for word in line {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                // Running mean keeps the cluster center representative.
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // At least three column-like clusters must recur on two or more lines.
    let repeated_clusters: Vec<&XCluster> = clusters
        .iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .collect();
    if repeated_clusters.len() < 3 {
        return false;
    }

    // A line is "structured" when it hits at least three repeated columns,
    // or two columns plus two numeric-like tokens.
    let repeated_centers: Vec<f64> = repeated_clusters
        .iter()
        .map(|cluster| cluster.center)
        .collect();
    let structured_lines = qualifying_lines
        .iter()
        .filter(|line| {
            let mut seen = HashSet::<usize>::new();
            for word in *line {
                let center = f64::from(word.left) + f64::from(word.width) / 2.0;
                for (idx, repeated_center) in repeated_centers.iter().enumerate() {
                    if (center - repeated_center).abs() <= tolerance {
                        seen.insert(idx);
                    }
                }
            }
            seen.len() >= 3
                || (seen.len() >= 2
                    && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
        })
        .count();

    let alphabetic_words = words
        .iter()
        .filter(|word| word.text.chars().any(|ch| ch.is_alphabetic()))
        .count();

    // Geometric guard: repeated vertical bands alone are not enough for tables.
    // Dense prose in infographics often forms stable x-clusters but lacks numeric
    // signal. Require either numeric evidence or stronger column multiplicity.
    if numeric_like_count == 0
        && alphabetic_words * 10 >= words.len() * 9
        && repeated_clusters.len() <= 4
    {
        return false;
    }

    structured_lines >= 3
        || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
}
1516
1517fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
1518    if !looks_like_table_ocr(words) {
1519        return false;
1520    }
1521
1522    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1523    for word in words {
1524        by_line.entry(word.line_key).or_default().push(word);
1525    }
1526
1527    let numeric_like_count = words
1528        .iter()
1529        .filter(|word| is_numeric_like(&word.text))
1530        .count();
1531    let numeric_lines = by_line
1532        .values()
1533        .filter(|line| {
1534            line.iter()
1535                .filter(|word| is_numeric_like(&word.text))
1536                .count()
1537                >= 2
1538        })
1539        .count();
1540
1541    numeric_like_count >= 12 && numeric_lines >= 3
1542}
1543
1544fn looks_like_dense_prose_image_ocr(words: &[OcrWord]) -> bool {
1545    if words.len() < MIN_DOMINANT_IMAGE_OCR_WORDS || looks_like_table_ocr(words) {
1546        return false;
1547    }
1548
1549    if looks_like_chart_label_ocr(words) {
1550        return false;
1551    }
1552
1553    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1554    let mut alphabetic_words = 0usize;
1555    let mut numeric_like_words = 0usize;
1556    for word in words {
1557        by_line.entry(word.line_key).or_default().push(word);
1558        if word.text.chars().any(|ch| ch.is_alphabetic()) {
1559            alphabetic_words += 1;
1560        }
1561        if is_numeric_like(&word.text) {
1562            numeric_like_words += 1;
1563        }
1564    }
1565
1566    if by_line.len() < MIN_DOMINANT_IMAGE_TEXT_LINES || alphabetic_words * 3 < words.len() * 2 {
1567        return false;
1568    }
1569    if numeric_like_words * 4 > words.len() {
1570        return false;
1571    }
1572
1573    let multiword_lines = by_line
1574        .values()
1575        .filter(|line| line.iter().filter(|word| word.text.len() >= 2).count() >= 3)
1576        .count();
1577    multiword_lines >= 4 && has_dense_prose_block_geometry(words)
1578}
1579
/// Checks whether the OCR lines form one geometrically coherent prose block:
/// a run of consecutive lines sharing block geometry (as judged by
/// `spatial_lines_share_block_geometry`) that is both long enough and wide
/// enough relative to the image.
fn has_dense_prose_block_geometry(words: &[OcrWord]) -> bool {
    // Group words by Tesseract (block, paragraph, line) key.
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // Build a pixel-space box per line with at least three words; shorter
    // lines (captions, stray labels) are ignored.
    let mut spatial_lines = Vec::new();
    for line_words in by_line.values() {
        if line_words.len() < 3 {
            continue;
        }

        let left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
        let right = line_words
            .iter()
            .map(|word| word.left.saturating_add(word.width))
            .max()
            .unwrap_or(0);
        let top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom = line_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);

        // Skip degenerate boxes.
        if right <= left || bottom <= top {
            continue;
        }

        spatial_lines.push(SpatialOcrLine {
            left,
            top,
            right,
            bottom,
            text: String::new(),
            word_count: line_words.len(),
            line_count: 1,
            line_height_sum: bottom.saturating_sub(top).max(1),
        });
    }

    // Reading order: top-to-bottom, then left-to-right.
    spatial_lines.sort_by_key(|line| (line.top, line.left));
    if spatial_lines.len() < MIN_DENSE_PROSE_BLOCK_LINES {
        return false;
    }

    let image_width = spatial_lines
        .iter()
        .map(|line| line.right)
        .max()
        .unwrap_or(0);
    if image_width == 0 {
        return false;
    }

    // Median line height (upper median), used as the vertical-gap yardstick
    // when deciding whether adjacent lines belong to the same block.
    let median_height = {
        let mut heights: Vec<u32> = spatial_lines
            .iter()
            .map(|line| line.bottom.saturating_sub(line.top).max(1))
            .collect();
        heights.sort_unstable();
        heights[heights.len() / 2]
    };

    // Single forward scan tracking the current run of geometry-sharing
    // lines and the best (longest) run seen so far, with its x-extent.
    let mut best_line_count = 1usize;
    let mut best_left = spatial_lines[0].left;
    let mut best_right = spatial_lines[0].right;
    let mut current_line_count = 1usize;
    let mut current_left = spatial_lines[0].left;
    let mut current_right = spatial_lines[0].right;

    for pair in spatial_lines.windows(2) {
        let prev = &pair[0];
        let curr = &pair[1];
        if spatial_lines_share_block_geometry(prev, curr, image_width, median_height) {
            // Extend the current run and widen its x-extent.
            current_line_count += 1;
            current_left = current_left.min(curr.left);
            current_right = current_right.max(curr.right);
        } else {
            // Run broke: remember it if it is the longest so far, restart.
            if current_line_count > best_line_count {
                best_line_count = current_line_count;
                best_left = current_left;
                best_right = current_right;
            }
            current_line_count = 1;
            current_left = curr.left;
            current_right = curr.right;
        }
    }

    // The final run is never flushed inside the loop; compare it here.
    if current_line_count > best_line_count {
        best_line_count = current_line_count;
        best_left = current_left;
        best_right = current_right;
    }

    // A prose block must be both tall (enough lines) and wide (relative to
    // the full image width).
    let block_width_ratio =
        f64::from(best_right.saturating_sub(best_left)) / f64::from(image_width);
    best_line_count >= MIN_DENSE_PROSE_BLOCK_LINES
        && block_width_ratio >= MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO
}
1681
1682fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
1683    let image_width = words
1684        .iter()
1685        .map(|word| word.left.saturating_add(word.width))
1686        .max()?;
1687    let image_height = words
1688        .iter()
1689        .map(|word| word.top.saturating_add(word.height))
1690        .max()?;
1691    if image_width == 0 || image_height == 0 {
1692        return None;
1693    }
1694
1695    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1696    for word in words {
1697        by_line.entry(word.line_key).or_default().push(word);
1698    }
1699
1700    let max_right = words
1701        .iter()
1702        .map(|word| word.left.saturating_add(word.width))
1703        .max()
1704        .unwrap_or(0);
1705    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1706
1707    let mut clusters: Vec<XCluster> = Vec::new();
1708    for line_words in by_line.values() {
1709        for word in line_words {
1710            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1711            if let Some(cluster) = clusters
1712                .iter_mut()
1713                .find(|cluster| (cluster.center - center).abs() <= tolerance)
1714            {
1715                cluster.center =
1716                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1717                cluster.count += 1;
1718                cluster.lines.insert(word.line_key);
1719            } else {
1720                let mut lines = HashSet::new();
1721                lines.insert(word.line_key);
1722                clusters.push(XCluster {
1723                    center,
1724                    count: 1,
1725                    lines,
1726                });
1727            }
1728        }
1729    }
1730    let mut centers: Vec<f64> = clusters
1731        .into_iter()
1732        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
1733        .map(|cluster| cluster.center)
1734        .collect();
1735    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1736    if centers.len() < 3 {
1737        return None;
1738    }
1739
1740    let mut built_rows = Vec::<OcrRowBuild>::new();
1741    let mut row_fill_counts = Vec::<usize>::new();
1742    for line_words in by_line.values() {
1743        let mut sorted_words = line_words.clone();
1744        sorted_words.sort_by_key(|word| word.left);
1745
1746        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
1747        for word in &sorted_words {
1748            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1749            if let Some((col_idx, distance)) = centers
1750                .iter()
1751                .enumerate()
1752                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
1753                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
1754            {
1755                if distance <= tolerance {
1756                    cells[col_idx].push(word);
1757                }
1758            }
1759        }
1760
1761        let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
1762        let numeric_cells = cells
1763            .iter()
1764            .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
1765            .count();
1766        if filled_cells < 3 && numeric_cells < 2 {
1767            continue;
1768        }
1769        row_fill_counts.push(filled_cells);
1770
1771        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
1772        let bottom_px = sorted_words
1773            .iter()
1774            .map(|word| word.top.saturating_add(word.height))
1775            .max()
1776            .unwrap_or(0);
1777        let top_y =
1778            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
1779        let bottom_y = image.bbox.top_y
1780            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
1781        let cell_texts = cells
1782            .iter()
1783            .map(|cell_words| {
1784                cell_words
1785                    .iter()
1786                    .map(|word| word.text.as_str())
1787                    .collect::<Vec<_>>()
1788                    .join(" ")
1789            })
1790            .collect();
1791        built_rows.push(OcrRowBuild {
1792            top_y,
1793            bottom_y,
1794            cell_texts,
1795        });
1796    }
1797
1798    if built_rows.len() < 2 {
1799        return None;
1800    }
1801    if row_fill_counts.is_empty() {
1802        return None;
1803    }
1804
1805    let mut sorted_fill_counts = row_fill_counts.clone();
1806    sorted_fill_counts.sort_unstable();
1807    let median_fill_ratio =
1808        sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
1809    if median_fill_ratio < MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO {
1810        return None;
1811    }
1812
1813    built_rows.sort_by(|a, b| {
1814        b.top_y
1815            .partial_cmp(&a.top_y)
1816            .unwrap_or(std::cmp::Ordering::Equal)
1817    });
1818    let x_coordinates =
1819        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
1820    let row_bounds: Vec<(f64, f64)> = built_rows
1821        .iter()
1822        .map(|row| (row.top_y, row.bottom_y))
1823        .collect();
1824    let y_coordinates = build_row_boundaries(&row_bounds);
1825    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
1826        return None;
1827    }
1828
1829    let mut rows = Vec::new();
1830    for (row_idx, row_build) in built_rows.iter().enumerate() {
1831        let row_bbox = BoundingBox::new(
1832            image.bbox.page_number,
1833            image.bbox.left_x,
1834            y_coordinates[row_idx + 1],
1835            image.bbox.right_x,
1836            y_coordinates[row_idx],
1837        );
1838        let mut cells = Vec::new();
1839        for col_idx in 0..centers.len() {
1840            let cell_bbox = BoundingBox::new(
1841                image.bbox.page_number,
1842                x_coordinates[col_idx],
1843                y_coordinates[row_idx + 1],
1844                x_coordinates[col_idx + 1],
1845                y_coordinates[row_idx],
1846            );
1847            let text = row_build
1848                .cell_texts
1849                .get(col_idx)
1850                .cloned()
1851                .unwrap_or_default();
1852            let mut content = Vec::new();
1853            if !text.trim().is_empty() {
1854                content.push(TableToken {
1855                    base: TextChunk {
1856                        value: text.trim().to_string(),
1857                        bbox: cell_bbox.clone(),
1858                        font_name: "OCR".to_string(),
1859                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
1860                        font_weight: 400.0,
1861                        italic_angle: 0.0,
1862                        font_color: "#000000".to_string(),
1863                        contrast_ratio: 21.0,
1864                        symbol_ends: Vec::new(),
1865                        text_format: TextFormat::Normal,
1866                        text_type: TextType::Regular,
1867                        pdf_layer: PdfLayer::Content,
1868                        ocg_visible: true,
1869                        index: None,
1870                        page_number: image.bbox.page_number,
1871                        level: None,
1872                        mcid: None,
1873                    },
1874                    token_type: TableTokenType::Text,
1875                });
1876            }
1877            cells.push(TableBorderCell {
1878                bbox: cell_bbox,
1879                index: None,
1880                level: None,
1881                row_number: row_idx,
1882                col_number: col_idx,
1883                row_span: 1,
1884                col_span: 1,
1885                content,
1886                contents: Vec::new(),
1887                semantic_type: None,
1888            });
1889        }
1890        rows.push(TableBorderRow {
1891            bbox: row_bbox,
1892            index: None,
1893            level: None,
1894            row_number: row_idx,
1895            cells,
1896            semantic_type: None,
1897        });
1898    }
1899
1900    Some(TableBorder {
1901        bbox: image.bbox.clone(),
1902        index: None,
1903        level: None,
1904        x_coordinates: x_coordinates.clone(),
1905        x_widths: vec![0.0; x_coordinates.len()],
1906        y_coordinates: y_coordinates.clone(),
1907        y_widths: vec![0.0; y_coordinates.len()],
1908        rows,
1909        num_rows: built_rows.len(),
1910        num_columns: centers.len(),
1911        is_bad_table: false,
1912        is_table_transformer: true,
1913        previous_table: None,
1914        next_table: None,
1915    })
1916}
1917
1918fn build_structured_ocr_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
1919    let image_width = words
1920        .iter()
1921        .map(|word| word.left.saturating_add(word.width))
1922        .max()?;
1923    let image_height = words
1924        .iter()
1925        .map(|word| word.top.saturating_add(word.height))
1926        .max()?;
1927    if image_width == 0 || image_height == 0 {
1928        return None;
1929    }
1930
1931    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1932    for word in words {
1933        by_line.entry(word.line_key).or_default().push(word);
1934    }
1935
1936    let max_right = words
1937        .iter()
1938        .map(|word| word.left.saturating_add(word.width))
1939        .max()
1940        .unwrap_or(0);
1941    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1942
1943    let mut clusters: Vec<XCluster> = Vec::new();
1944    for line_words in by_line.values() {
1945        for word in line_words {
1946            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1947            if let Some(cluster) = clusters
1948                .iter_mut()
1949                .find(|cluster| (cluster.center - center).abs() <= tolerance)
1950            {
1951                cluster.center =
1952                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1953                cluster.count += 1;
1954                cluster.lines.insert(word.line_key);
1955            } else {
1956                let mut lines = HashSet::new();
1957                lines.insert(word.line_key);
1958                clusters.push(XCluster {
1959                    center,
1960                    count: 1,
1961                    lines,
1962                });
1963            }
1964        }
1965    }
1966
1967    let mut centers: Vec<f64> = clusters
1968        .into_iter()
1969        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
1970        .map(|cluster| cluster.center)
1971        .collect();
1972    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1973    if centers.len() < 3 {
1974        return None;
1975    }
1976
1977    let mut built_rows = Vec::<OcrRowBuild>::new();
1978    let mut row_fill_counts = Vec::<usize>::new();
1979    let mut occupied_columns = vec![0usize; centers.len()];
1980
1981    for line_words in by_line.values() {
1982        let mut sorted_words = line_words.clone();
1983        sorted_words.sort_by_key(|word| word.left);
1984
1985        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
1986        for word in &sorted_words {
1987            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1988            if let Some((col_idx, distance)) = centers
1989                .iter()
1990                .enumerate()
1991                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
1992                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
1993            {
1994                if distance <= tolerance {
1995                    cells[col_idx].push(word);
1996                }
1997            }
1998        }
1999
2000        let filled_indices: Vec<usize> = cells
2001            .iter()
2002            .enumerate()
2003            .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
2004            .collect();
2005        if filled_indices.len() < 2 {
2006            continue;
2007        }
2008
2009        let span = filled_indices.last().unwrap_or(&0) - filled_indices.first().unwrap_or(&0) + 1;
2010        if filled_indices.len() < 3 && span < 3 {
2011            continue;
2012        }
2013
2014        row_fill_counts.push(filled_indices.len());
2015        for idx in &filled_indices {
2016            if let Some(count) = occupied_columns.get_mut(*idx) {
2017                *count += 1;
2018            }
2019        }
2020
2021        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
2022        let bottom_px = sorted_words
2023            .iter()
2024            .map(|word| word.top.saturating_add(word.height))
2025            .max()
2026            .unwrap_or(0);
2027        let top_y =
2028            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
2029        let bottom_y = image.bbox.top_y
2030            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
2031        let cell_texts = cells
2032            .iter()
2033            .map(|cell_words| {
2034                let mut sorted_cell_words = cell_words.clone();
2035                sorted_cell_words.sort_by_key(|word| word.left);
2036                sorted_cell_words
2037                    .iter()
2038                    .map(|word| word.text.as_str())
2039                    .collect::<Vec<_>>()
2040                    .join(" ")
2041            })
2042            .collect();
2043        built_rows.push(OcrRowBuild {
2044            top_y,
2045            bottom_y,
2046            cell_texts,
2047        });
2048    }
2049
2050    if built_rows.len() < 3 || row_fill_counts.is_empty() {
2051        return None;
2052    }
2053
2054    let repeated_columns = occupied_columns.iter().filter(|count| **count >= 2).count();
2055    if repeated_columns < 3 {
2056        return None;
2057    }
2058
2059    let mut sorted_fill_counts = row_fill_counts.clone();
2060    sorted_fill_counts.sort_unstable();
2061    let median_fill_ratio =
2062        sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
2063    if median_fill_ratio < 0.5 {
2064        return None;
2065    }
2066
2067    built_rows.sort_by(|a, b| {
2068        b.top_y
2069            .partial_cmp(&a.top_y)
2070            .unwrap_or(std::cmp::Ordering::Equal)
2071    });
2072    let x_coordinates =
2073        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
2074    let row_bounds: Vec<(f64, f64)> = built_rows
2075        .iter()
2076        .map(|row| (row.top_y, row.bottom_y))
2077        .collect();
2078    let y_coordinates = build_row_boundaries(&row_bounds);
2079    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
2080        return None;
2081    }
2082
2083    let mut rows = Vec::new();
2084    for (row_idx, row_build) in built_rows.iter().enumerate() {
2085        let row_bbox = BoundingBox::new(
2086            image.bbox.page_number,
2087            image.bbox.left_x,
2088            y_coordinates[row_idx + 1],
2089            image.bbox.right_x,
2090            y_coordinates[row_idx],
2091        );
2092        let mut cells = Vec::new();
2093        for col_idx in 0..centers.len() {
2094            let cell_bbox = BoundingBox::new(
2095                image.bbox.page_number,
2096                x_coordinates[col_idx],
2097                y_coordinates[row_idx + 1],
2098                x_coordinates[col_idx + 1],
2099                y_coordinates[row_idx],
2100            );
2101            let text = row_build
2102                .cell_texts
2103                .get(col_idx)
2104                .cloned()
2105                .unwrap_or_default();
2106            let mut content = Vec::new();
2107            if !text.trim().is_empty() {
2108                content.push(TableToken {
2109                    base: TextChunk {
2110                        value: text.trim().to_string(),
2111                        bbox: cell_bbox.clone(),
2112                        font_name: "OCR".to_string(),
2113                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
2114                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
2115                        italic_angle: 0.0,
2116                        font_color: "#000000".to_string(),
2117                        contrast_ratio: 21.0,
2118                        symbol_ends: Vec::new(),
2119                        text_format: TextFormat::Normal,
2120                        text_type: TextType::Regular,
2121                        pdf_layer: PdfLayer::Content,
2122                        ocg_visible: true,
2123                        index: None,
2124                        page_number: image.bbox.page_number,
2125                        level: None,
2126                        mcid: None,
2127                    },
2128                    token_type: TableTokenType::Text,
2129                });
2130            }
2131            cells.push(TableBorderCell {
2132                bbox: cell_bbox,
2133                index: None,
2134                level: None,
2135                row_number: row_idx,
2136                col_number: col_idx,
2137                row_span: 1,
2138                col_span: 1,
2139                content,
2140                contents: Vec::new(),
2141                semantic_type: None,
2142            });
2143        }
2144        rows.push(TableBorderRow {
2145            bbox: row_bbox,
2146            index: None,
2147            level: None,
2148            row_number: row_idx,
2149            cells,
2150            semantic_type: None,
2151        });
2152    }
2153
2154    Some(TableBorder {
2155        bbox: image.bbox.clone(),
2156        index: None,
2157        level: None,
2158        x_coordinates: x_coordinates.clone(),
2159        x_widths: vec![0.0; x_coordinates.len()],
2160        y_coordinates: y_coordinates.clone(),
2161        y_widths: vec![0.0; y_coordinates.len()],
2162        rows,
2163        num_rows: built_rows.len(),
2164        num_columns: centers.len(),
2165        is_bad_table: false,
2166        is_table_transformer: true,
2167        previous_table: None,
2168        next_table: None,
2169    })
2170}
2171
2172fn is_matrixish_ocr_artifact_table(table: &TableBorder) -> bool {
2173    if !table.is_table_transformer
2174        || table.num_rows < 2
2175        || table.num_rows > 4
2176        || table.num_columns < 3
2177        || table.bbox.height() > table.bbox.width() * 0.55
2178    {
2179        return false;
2180    }
2181
2182    let texts: Vec<String> = table
2183        .rows
2184        .iter()
2185        .flat_map(|row| row.cells.iter())
2186        .map(table_cell_text)
2187        .filter(|text| !text.is_empty())
2188        .collect();
2189    if texts.len() < 6 {
2190        return false;
2191    }
2192
2193    let substantive_cells = texts
2194        .iter()
2195        .filter(|text| is_substantive_ocr_cell_text(text))
2196        .count();
2197    let short_cells = texts
2198        .iter()
2199        .filter(|text| is_short_ocr_cell_text(text))
2200        .count();
2201    let ambiguous_cells = texts
2202        .iter()
2203        .filter(|text| is_ambiguous_matrix_cell_text(text))
2204        .count();
2205
2206    substantive_cells == 0
2207        && short_cells * 10 >= texts.len() * 8
2208        && ambiguous_cells * 10 >= texts.len() * 5
2209}
2210
2211fn table_cell_text(cell: &TableBorderCell) -> String {
2212    cell.content
2213        .iter()
2214        .map(|token| token.base.value.trim())
2215        .filter(|value| !value.is_empty())
2216        .collect::<Vec<_>>()
2217        .join(" ")
2218}
2219
2220fn is_substantive_ocr_cell_text(text: &str) -> bool {
2221    text.split_whitespace().any(is_substantive_table_word)
2222}
2223
/// True when the cell's alphanumeric content, lowercased, is a short token:
/// between one and four bytes long (e.g. "12", "ab", "x1").
fn is_short_ocr_cell_text(text: &str) -> bool {
    let mut normalized = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            for lower in ch.to_lowercase() {
                normalized.push(lower);
            }
        }
    }
    matches!(normalized.len(), 1..=4)
}
2232
/// True when the cell text is likely matrix/diagram noise rather than data:
/// either it contains bracket/operator glyphs, or its alphanumeric content is
/// a short run of characters OCR commonly confuses with `0`/`1` strokes.
fn is_ambiguous_matrix_cell_text(text: &str) -> bool {
    const STRUCTURAL_GLYPHS: [char; 10] = ['/', '\\', '=', '|', '[', ']', '{', '}', '(', ')'];
    if text.chars().any(|ch| STRUCTURAL_GLYPHS.contains(&ch)) {
        return true;
    }

    let normalized: String = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
        .collect();
    if normalized.is_empty() || normalized.len() > 4 {
        return false;
    }
    // Characters that degrade into vertical/round strokes under OCR.
    normalized.chars().all(|ch| "01odqil".contains(ch))
}
2249
2250fn recover_bordered_raster_caption(image_path: &Path, image: &ImageChunk) -> Option<TextChunk> {
2251    let gray = image::open(image_path).ok()?.to_luma8();
2252    recover_bordered_raster_caption_from_gray(&gray, image)
2253}
2254
2255fn recover_bordered_raster_caption_from_gray(
2256    gray: &GrayImage,
2257    image: &ImageChunk,
2258) -> Option<TextChunk> {
2259    let grid = detect_bordered_raster_grid(gray)?;
2260    let first_h = *grid.horizontal_lines.first()?;
2261    if first_h <= 2 {
2262        return None;
2263    }
2264
2265    let crop = gray.view(0, 0, gray.width(), first_h).to_image();
2266    let caption_text = normalize_caption_text(&run_tesseract_plain_text(&crop, "7")?);
2267    if caption_text.is_empty() || !caption_text.chars().any(|ch| ch.is_alphabetic()) {
2268        return None;
2269    }
2270
2271    let bbox = raster_box_to_page_bbox(
2272        image,
2273        0,
2274        0,
2275        gray.width(),
2276        first_h.max(1),
2277        gray.width().max(1),
2278        gray.height().max(1),
2279    )?;
2280    let font_size = (bbox.height() * 0.55).clamp(10.0, 16.0);
2281    Some(TextChunk {
2282        value: caption_text,
2283        bbox,
2284        font_name: "OCR".to_string(),
2285        font_size,
2286        font_weight: 700.0,
2287        italic_angle: 0.0,
2288        font_color: "#000000".to_string(),
2289        contrast_ratio: 21.0,
2290        symbol_ends: Vec::new(),
2291        text_format: TextFormat::Normal,
2292        text_type: TextType::Regular,
2293        pdf_layer: PdfLayer::Content,
2294        ocg_visible: true,
2295        index: None,
2296        page_number: image.bbox.page_number,
2297        level: None,
2298        mcid: None,
2299    })
2300}
2301
2302fn recover_bordered_raster_table(image_path: &Path, image: &ImageChunk) -> Option<TableBorder> {
2303    let gray = image::open(image_path).ok()?.to_luma8();
2304    recover_bordered_raster_table_from_gray(&gray, image)
2305}
2306
/// Recover a full table structure from a grayscale raster containing a ruled
/// (bordered) grid.
///
/// Pipeline: detect the border grid, map the detected line positions from
/// raster pixels into page coordinates, OCR every cell interior, and
/// assemble a `TableBorder`. Returns `None` when the grid has fewer than
/// 2x2 cells, when no cell interior carries ink, or when too few cells/rows
/// yield OCR text (thresholded by `MIN_RASTER_TABLE_TEXT_CELL_RATIO` and
/// `MIN_RASTER_TABLE_ROWS_WITH_TEXT`).
fn recover_bordered_raster_table_from_gray(
    gray: &GrayImage,
    image: &ImageChunk,
) -> Option<TableBorder> {
    let grid = detect_bordered_raster_grid(gray)?;
    // N grid lines delimit N-1 cells per axis.
    let num_cols = grid.vertical_lines.len().checked_sub(1)?;
    let num_rows = grid.horizontal_lines.len().checked_sub(1)?;
    if num_cols < 2 || num_rows < 2 {
        return None;
    }
    // Outer table bbox spans from the first to the last detected grid line,
    // mapped from raster pixels into page coordinates.
    let table_bbox = raster_box_to_page_bbox(
        image,
        *grid.vertical_lines.first()?,
        *grid.horizontal_lines.first()?,
        *grid.vertical_lines.last()?,
        *grid.horizontal_lines.last()?,
        gray.width(),
        gray.height(),
    )?;

    let x_coordinates = raster_boundaries_to_page(
        &grid.vertical_lines,
        image.bbox.left_x,
        image.bbox.right_x,
        gray.width(),
    )?;
    // Raster rows run top-down while page Y runs bottom-up, hence the
    // descending conversion for the horizontal boundaries.
    let y_coordinates = raster_boundaries_to_page_desc(
        &grid.horizontal_lines,
        image.bbox.bottom_y,
        image.bbox.top_y,
        gray.height(),
    )?;

    // Cheap pre-check: skip the per-cell OCR entirely when no cell interior
    // contains any dark pixels.
    if !bordered_grid_has_cell_ink(gray, &grid) {
        return None;
    }

    let mut rows = Vec::with_capacity(num_rows);
    let mut non_empty_cells = 0usize;
    let mut rows_with_text = 0usize;
    let mut total_cells = 0usize;
    for row_idx in 0..num_rows {
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::with_capacity(num_cols);
        let mut row_has_text = false;

        for col_idx in 0..num_cols {
            // Raster-pixel bounds of this cell, used for the OCR crop.
            let x1 = grid.vertical_lines[col_idx];
            let x2 = grid.vertical_lines[col_idx + 1];
            let y1 = grid.horizontal_lines[row_idx];
            let y2 = grid.horizontal_lines[row_idx + 1];
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            // OCR failure is treated the same as an empty cell.
            let text = extract_raster_cell_text(gray, row_idx, col_idx, x1, y1, x2, y2)
                .unwrap_or_default();
            total_cells += 1;

            let mut content = Vec::new();
            if !text.is_empty() {
                row_has_text = true;
                non_empty_cells += 1;
                content.push(TableToken {
                    base: TextChunk {
                        value: text,
                        bbox: cell_bbox.clone(),
                        font_name: "OCR".to_string(),
                        // Synthetic font size proportional to the cell height.
                        font_size: (cell_bbox.height() * 0.55).max(6.0),
                        // First row is treated as a header and rendered bold.
                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }

            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }

        if row_has_text {
            rows_with_text += 1;
        }

        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    if total_cells == 0 {
        return None;
    }
    // Reject mostly-empty recoveries: they are usually grid-like artwork,
    // not data tables.
    let text_cell_ratio = non_empty_cells as f64 / total_cells as f64;
    if text_cell_ratio < MIN_RASTER_TABLE_TEXT_CELL_RATIO
        || rows_with_text < MIN_RASTER_TABLE_ROWS_WITH_TEXT
    {
        return None;
    }

    Some(TableBorder {
        bbox: table_bbox,
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows,
        num_columns: num_cols,
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
2458
2459fn is_obvious_bar_chart_raster(gray: &GrayImage) -> bool {
2460    let width = gray.width();
2461    let height = gray.height();
2462    if width < 160 || height < 120 {
2463        return false;
2464    }
2465
2466    let min_ink_pixels = (f64::from(width) * 0.35).ceil() as u32;
2467    let min_run_height = (height / 80).max(6);
2468    let wide_ink_row_runs = merge_runs(
2469        (0..height)
2470            .filter(|&y| count_ink_in_row(gray, y, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels),
2471    );
2472    let thick_runs = wide_ink_row_runs
2473        .into_iter()
2474        .filter(|(start, end)| end.saturating_sub(*start) + 1 >= min_run_height)
2475        .count();
2476
2477    thick_runs >= 3 || is_obvious_vertical_bar_chart_raster(gray)
2478}
2479
2480fn is_obvious_vertical_bar_chart_raster(gray: &GrayImage) -> bool {
2481    let width = gray.width();
2482    let height = gray.height();
2483    if width < 160 || height < 120 {
2484        return false;
2485    }
2486
2487    let min_ink_pixels = (f64::from(height) * 0.08).ceil() as u32;
2488    let min_bar_width = (width / 28).max(10);
2489    let min_bar_height = (height / 8).max(16);
2490    let max_baseline_delta = (height / 14).max(8);
2491    let min_fill_ratio = 0.10;
2492
2493    let candidate_runs =
2494        merge_runs((0..width).filter(|&x| {
2495            count_ink_in_column(gray, x, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels
2496        }));
2497    let mut baselines = Vec::new();
2498    let mut has_dominant_bar = false;
2499    let mut qualifying_bars = 0usize;
2500
2501    for (start, end) in candidate_runs {
2502        let run_width = end.saturating_sub(start) + 1;
2503        if run_width < min_bar_width {
2504            continue;
2505        }
2506
2507        let mut top = height;
2508        let mut bottom = 0u32;
2509        let mut ink_pixels = 0usize;
2510        for x in start..=end {
2511            for y in 0..height {
2512                if gray.get_pixel(x, y).0[0] < RASTER_CHART_INK_THRESHOLD {
2513                    top = top.min(y);
2514                    bottom = bottom.max(y);
2515                    ink_pixels += 1;
2516                }
2517            }
2518        }
2519
2520        if top >= height || bottom <= top {
2521            continue;
2522        }
2523
2524        let run_height = bottom.saturating_sub(top) + 1;
2525        if run_height < min_bar_height {
2526            continue;
2527        }
2528
2529        let bbox_area = run_width as usize * run_height as usize;
2530        if bbox_area == 0 {
2531            continue;
2532        }
2533
2534        let fill_ratio = ink_pixels as f64 / bbox_area as f64;
2535        if fill_ratio < min_fill_ratio {
2536            continue;
2537        }
2538
2539        qualifying_bars += 1;
2540        if run_width >= min_bar_width.saturating_mul(2) {
2541            has_dominant_bar = true;
2542        }
2543        baselines.push(bottom);
2544    }
2545
2546    if baselines.len() < 2 {
2547        return false;
2548    }
2549
2550    baselines.sort_unstable();
2551    let median_baseline = baselines[baselines.len() / 2];
2552    let aligned_baselines = baselines
2553        .iter()
2554        .filter(|baseline| baseline.abs_diff(median_baseline) <= max_baseline_delta)
2555        .count();
2556
2557    aligned_baselines >= 2 && (has_dominant_bar || (qualifying_bars >= 4 && aligned_baselines >= 4))
2558}
2559
2560/// Return true when the image appears to be a natural photograph rather than a
2561/// synthetic chart, diagram, or scanned document page.
2562///
2563/// Photographs have a broadly distributed pixel histogram — many mid-tone pixels
2564/// (neither pure white nor pure black).  Synthetic images (charts, tables,
2565/// diagrams) consist mostly of a white background (~255) with sparse dark ink
2566/// (~0-50).  We classify an image as photographic when either at least 30% of
2567/// its pixels fall in the mid-tone band [40, 215], or when a bright image still
2568/// shows photo-like tonal diversity via a wide histogram support and high
2569/// entropy. Numeric table recovery should be skipped for photographic images
2570/// because OCR'd annotation labels (axis ticks, caption fragments) are not table
2571/// data.
2572fn is_natural_photograph_raster(gray: &GrayImage) -> bool {
2573    let total = (gray.width() * gray.height()) as usize;
2574    if total < 400 {
2575        return false;
2576    }
2577
2578    let mut histogram = [0usize; 256];
2579    for pixel in gray.pixels() {
2580        histogram[pixel[0] as usize] += 1;
2581    }
2582
2583    let mid_tone_count: usize = histogram[40..=215].iter().sum();
2584    if mid_tone_count * 10 >= total * 3 {
2585        return true;
2586    }
2587
2588    let mut coarse_histogram = [0usize; 16];
2589    for (value, count) in histogram.iter().enumerate() {
2590        coarse_histogram[value / 16] += count;
2591    }
2592
2593    let occupied_bins = coarse_histogram
2594        .iter()
2595        .filter(|count| **count as f64 >= total as f64 * 0.01)
2596        .count();
2597    let entropy = coarse_histogram.iter().fold(0.0, |acc, count| {
2598        if *count == 0 {
2599            return acc;
2600        }
2601        let probability = *count as f64 / total as f64;
2602        acc - probability * probability.log2()
2603    });
2604
2605    mid_tone_count as f64 / total as f64 >= MIN_BRIGHT_PHOTO_MID_TONE_RATIO
2606        && occupied_bins >= MIN_BRIGHT_PHOTO_HISTOGRAM_BINS
2607        && entropy >= MIN_BRIGHT_PHOTO_ENTROPY
2608}
2609
2610/// Return true for dark UI or video-player screenshots that are visually rich
2611/// but not document tables.
2612fn is_dark_ui_screenshot_raster(gray: &GrayImage) -> bool {
2613    let total = (gray.width() * gray.height()) as usize;
2614    if total < 400 {
2615        return false;
2616    }
2617
2618    let very_dark_count = gray.pixels().filter(|p| p[0] <= 39).count();
2619    let non_extreme_count = gray.pixels().filter(|p| p[0] >= 15 && p[0] <= 240).count();
2620    let bright_detail_count = gray.pixels().filter(|p| p[0] >= 180 && p[0] <= 245).count();
2621
2622    very_dark_count * 20 >= total * 13
2623        && non_extreme_count * 2 >= total
2624        && bright_detail_count * 20 >= total
2625}
2626
/// Return true when a detected bordered grid actually contains cell content:
/// a sufficient fraction of interior cells hold dark "ink" pixels, spread
/// across enough distinct rows. Guards against empty frames or chart axis
/// boxes being parsed as tables.
fn bordered_grid_has_cell_ink(gray: &GrayImage, grid: &RasterTableGrid) -> bool {
    // N grid lines delimit N-1 cells along each axis.
    let num_cols = match grid.vertical_lines.len().checked_sub(1) {
        Some(value) => value,
        None => return false,
    };
    let num_rows = match grid.horizontal_lines.len().checked_sub(1) {
        Some(value) => value,
        None => return false,
    };
    if num_cols == 0 || num_rows == 0 {
        return false;
    }

    let mut total_cells = 0usize;
    let mut inked_cells = 0usize;
    let mut rows_with_ink = 0usize;

    for row_idx in 0..num_rows {
        let mut row_has_ink = false;
        for col_idx in 0..num_cols {
            total_cells += 1;
            let x1 = grid.vertical_lines[col_idx];
            let x2 = grid.vertical_lines[col_idx + 1];
            let y1 = grid.horizontal_lines[row_idx];
            let y2 = grid.horizontal_lines[row_idx + 1];

            // Inset the sampling window so the border strokes themselves are
            // not counted as cell ink (inset capped at a quarter of the span).
            let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
            let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
            let crop_left = x1 + inset_x;
            let crop_top = y1 + inset_y;
            let crop_width = x2.saturating_sub(x1 + inset_x * 2);
            let crop_height = y2.saturating_sub(y1 + inset_y * 2);
            // Cells too small for legible glyphs are skipped, but still count
            // toward `total_cells`, diluting the inked ratio.
            if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
                continue;
            }

            let dark_pixels = (crop_top..crop_top + crop_height)
                .flat_map(|y| (crop_left..crop_left + crop_width).map(move |x| (x, y)))
                .filter(|&(x, y)| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
                .count();
            let area = (crop_width as usize) * (crop_height as usize);
            if area == 0 {
                continue;
            }

            let dark_ratio = dark_pixels as f64 / area as f64;
            if dark_ratio >= MIN_BORDERED_CELL_DARK_RATIO {
                inked_cells += 1;
                row_has_ink = true;
            }
        }
        if row_has_ink {
            rows_with_ink += 1;
        }
    }

    if total_cells == 0 {
        return false;
    }

    // Require both overall cell-ink coverage and spread over a minimum number
    // of distinct rows.
    (inked_cells as f64 / total_cells as f64) >= MIN_BORDERED_INKED_CELL_RATIO
        && rows_with_ink >= MIN_BORDERED_ROWS_WITH_INK
}
2690
2691fn detect_bordered_raster_grid(gray: &GrayImage) -> Option<RasterTableGrid> {
2692    let mut best_grid: Option<(RasterTableGrid, f64)> = None;
2693    for variant in build_ocr_variants(gray) {
2694        let Some((grid, score)) = detect_bordered_raster_grid_single(&variant) else {
2695            continue;
2696        };
2697        match &best_grid {
2698            Some((_, best_score)) if *best_score >= score => {}
2699            _ => best_grid = Some((grid, score)),
2700        }
2701    }
2702    best_grid.map(|(grid, _)| grid)
2703}
2704
/// Detect a bordered table grid in a single preprocessed image variant.
///
/// Returns the grid line positions plus a quality score (continuity-dominated,
/// with line count as a tie-breaker) so the caller can compare variants.
fn detect_bordered_raster_grid_single(gray: &GrayImage) -> Option<(RasterTableGrid, f64)> {
    let width = gray.width();
    let height = gray.height();
    // Images this small cannot hold a legible multi-cell table.
    if width < 100 || height < 80 {
        return None;
    }

    // First pass: a candidate line must darken at least MIN_LINE_DARK_RATIO of
    // its full column/row.
    let min_vertical_dark = (f64::from(height) * MIN_LINE_DARK_RATIO).ceil() as u32;
    let min_horizontal_dark = (f64::from(width) * MIN_LINE_DARK_RATIO).ceil() as u32;

    // Adjacent dark columns/rows belong to one thick stroke: merge them into
    // runs, then take each run's center as the line position.
    let vertical_runs =
        merge_runs((0..width).filter(|&x| count_dark_in_column(gray, x) >= min_vertical_dark));
    let horizontal_runs =
        merge_runs((0..height).filter(|&y| count_dark_in_row(gray, y) >= min_horizontal_dark));
    if vertical_runs.len() < MIN_BORDERED_VERTICAL_LINES
        || horizontal_runs.len() < MIN_BORDERED_HORIZONTAL_LINES
    {
        return None;
    }

    let mut vertical_lines: Vec<u32> = vertical_runs
        .into_iter()
        .map(|(start, end)| (start + end) / 2)
        .collect();
    let mut horizontal_lines: Vec<u32> = horizontal_runs
        .into_iter()
        .map(|(start, end)| (start + end) / 2)
        .collect();

    // Rough grid extent from the outermost candidate lines.
    let (&rough_min_x, &rough_max_x) = vertical_lines.first().zip(vertical_lines.last())?;
    let (&rough_min_y, &rough_max_y) = horizontal_lines.first().zip(horizontal_lines.last())?;
    if rough_max_x <= rough_min_x || rough_max_y <= rough_min_y {
        return None;
    }

    // Second pass: keep only lines that are continuous within the rough grid
    // extent, rejecting rows/columns that are merely dense with glyph ink.
    vertical_lines.retain(|&x| {
        dark_ratio_in_column(gray, x, rough_min_y, rough_max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY
    });
    horizontal_lines.retain(|&y| {
        dark_ratio_in_row(gray, y, rough_min_x, rough_max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY
    });
    if vertical_lines.len() < MIN_BORDERED_VERTICAL_LINES
        || horizontal_lines.len() < MIN_BORDERED_HORIZONTAL_LINES
    {
        return None;
    }

    // Reject grids whose cells would be narrower/shorter than MIN_CELL_SIZE_PX.
    if vertical_lines
        .windows(2)
        .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
        || horizontal_lines
            .windows(2)
            .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
    {
        return None;
    }
    // Final continuity check over the surviving lines' tighter extent.
    if !grid_lines_are_continuous(&vertical_lines, &horizontal_lines, gray) {
        return None;
    }

    // Continuity dominates the score (×100); line count breaks near-ties.
    let continuity = grid_continuity_score(&vertical_lines, &horizontal_lines, gray);
    let line_score = vertical_lines.len() as f64 + horizontal_lines.len() as f64;
    let score = continuity * 100.0 + line_score;

    Some((
        RasterTableGrid {
            vertical_lines,
            horizontal_lines,
        },
        score,
    ))
}
2777
2778fn grid_lines_are_continuous(
2779    vertical_lines: &[u32],
2780    horizontal_lines: &[u32],
2781    gray: &GrayImage,
2782) -> bool {
2783    let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2784        return false;
2785    };
2786    let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2787        return false;
2788    };
2789    if max_x <= min_x || max_y <= min_y {
2790        return false;
2791    }
2792
2793    vertical_lines
2794        .iter()
2795        .all(|&x| dark_ratio_in_column(gray, x, min_y, max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2796        && horizontal_lines
2797            .iter()
2798            .all(|&y| dark_ratio_in_row(gray, y, min_x, max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2799}
2800
2801fn grid_continuity_score(
2802    vertical_lines: &[u32],
2803    horizontal_lines: &[u32],
2804    gray: &GrayImage,
2805) -> f64 {
2806    let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2807        return 0.0;
2808    };
2809    let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2810        return 0.0;
2811    };
2812    if max_x <= min_x || max_y <= min_y {
2813        return 0.0;
2814    }
2815
2816    let mut samples = 0usize;
2817    let mut sum = 0.0;
2818    for &x in vertical_lines {
2819        sum += dark_ratio_in_column(gray, x, min_y, max_y);
2820        samples += 1;
2821    }
2822    for &y in horizontal_lines {
2823        sum += dark_ratio_in_row(gray, y, min_x, max_x);
2824        samples += 1;
2825    }
2826    if samples == 0 {
2827        0.0
2828    } else {
2829        sum / samples as f64
2830    }
2831}
2832
/// Count pixels in column `x` darker than the table-border threshold.
fn count_dark_in_column(gray: &GrayImage, x: u32) -> u32 {
    count_ink_in_column(gray, x, RASTER_DARK_THRESHOLD)
}
2836
2837fn count_ink_in_column(gray: &GrayImage, x: u32, threshold: u8) -> u32 {
2838    (0..gray.height())
2839        .filter(|&y| gray.get_pixel(x, y).0[0] < threshold)
2840        .count() as u32
2841}
2842
/// Count pixels in row `y` darker than the table-border threshold.
fn count_dark_in_row(gray: &GrayImage, y: u32) -> u32 {
    count_ink_in_row(gray, y, RASTER_DARK_THRESHOLD)
}
2846
2847fn count_ink_in_row(gray: &GrayImage, y: u32, threshold: u8) -> u32 {
2848    (0..gray.width())
2849        .filter(|&x| gray.get_pixel(x, y).0[0] < threshold)
2850        .count() as u32
2851}
2852
2853fn dark_ratio_in_column(gray: &GrayImage, x: u32, y1: u32, y2: u32) -> f64 {
2854    if y2 <= y1 || x >= gray.width() {
2855        return 0.0;
2856    }
2857    let dark = (y1..=y2)
2858        .filter(|&y| y < gray.height() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
2859        .count();
2860    dark as f64 / f64::from(y2 - y1 + 1)
2861}
2862
2863fn dark_ratio_in_row(gray: &GrayImage, y: u32, x1: u32, x2: u32) -> f64 {
2864    if x2 <= x1 || y >= gray.height() {
2865        return 0.0;
2866    }
2867    let dark = (x1..=x2)
2868        .filter(|&x| x < gray.width() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
2869        .count();
2870    dark as f64 / f64::from(x2 - x1 + 1)
2871}
2872
/// Collapse a stream of coordinates into inclusive runs of consecutive values.
/// `1, 2, 3, 7, 8` becomes `[(1, 3), (7, 8)]`; a value that does not extend
/// the current run (including duplicates or out-of-order input) starts a new
/// run, matching the streaming behavior of the previous implementation.
fn merge_runs(values: impl Iterator<Item = u32>) -> Vec<(u32, u32)> {
    let mut runs: Vec<(u32, u32)> = Vec::new();
    for value in values {
        match runs.last_mut() {
            // Exactly one past the current run's end: extend it in place.
            Some((_, end)) if value == *end + 1 => *end = value,
            // Otherwise open a fresh single-value run.
            _ => runs.push((value, value)),
        }
    }
    runs
}
2899
/// Turn column centers into cell boundaries: the outer edges plus the midpoint
/// between every pair of adjacent centers.
fn build_boundaries_from_centers(centers: &[f64], left_edge: f64, right_edge: f64) -> Vec<f64> {
    let midpoints = centers.windows(2).map(|pair| (pair[0] + pair[1]) / 2.0);
    std::iter::once(left_edge)
        .chain(midpoints)
        .chain(std::iter::once(right_edge))
        .collect()
}
2909
/// Turn row (top, bottom) spans into row boundaries: the first row's top, the
/// midpoint of each inter-row gap, and the last row's bottom.
///
/// Returns an empty vector for an empty slice (the previous implementation
/// panicked on `rows[0]`).
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let (Some(first), Some(last)) = (rows.first(), rows.last()) else {
        return Vec::new();
    };
    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    for pair in rows.windows(2) {
        // Boundary between two rows = midpoint of the gap between them.
        boundaries.push((pair[0].1 + pair[1].0) / 2.0);
    }
    boundaries.push(last.1);
    boundaries
}
2919
/// Map pixel x-coordinates of grid lines into page coordinates spanning
/// `[left_edge, right_edge]`. Returns `None` when the image has zero width.
fn raster_boundaries_to_page(
    lines: &[u32],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Option<Vec<f64>> {
    if image_width == 0 {
        return None;
    }
    let units_per_pixel = (right_edge - left_edge) / f64::from(image_width);
    let mapped: Vec<f64> = lines
        .iter()
        .map(|&pixel_x| left_edge + f64::from(pixel_x) * units_per_pixel)
        .collect();
    Some(mapped)
}
2937
/// Map pixel y-coordinates (top-down raster rows) into page coordinates,
/// which run bottom-up: row 0 maps to `top_edge` and the last row toward
/// `bottom_edge`. Returns `None` for a zero-height image.
fn raster_boundaries_to_page_desc(
    lines: &[u32],
    bottom_edge: f64,
    top_edge: f64,
    image_height: u32,
) -> Option<Vec<f64>> {
    if image_height == 0 {
        return None;
    }
    let page_span = top_edge - bottom_edge;
    let mapped: Vec<f64> = lines
        .iter()
        .map(|&pixel_y| top_edge - f64::from(pixel_y) / f64::from(image_height) * page_span)
        .collect();
    Some(mapped)
}
2955
2956fn raster_box_to_page_bbox(
2957    image: &ImageChunk,
2958    x1: u32,
2959    y1: u32,
2960    x2: u32,
2961    y2: u32,
2962    image_width: u32,
2963    image_height: u32,
2964) -> Option<BoundingBox> {
2965    if x2 <= x1 || y2 <= y1 || image_width == 0 || image_height == 0 {
2966        return None;
2967    }
2968    let left_x = image.bbox.left_x + image.bbox.width() * (f64::from(x1) / f64::from(image_width));
2969    let right_x = image.bbox.left_x + image.bbox.width() * (f64::from(x2) / f64::from(image_width));
2970    let top_y = image.bbox.top_y - image.bbox.height() * (f64::from(y1) / f64::from(image_height));
2971    let bottom_y =
2972        image.bbox.top_y - image.bbox.height() * (f64::from(y2) / f64::from(image_height));
2973    Some(BoundingBox::new(
2974        image.bbox.page_number,
2975        left_x,
2976        bottom_y,
2977        right_x,
2978        top_y,
2979    ))
2980}
2981
/// OCR one bordered-grid cell: crop it from `gray` with an inset that excludes
/// the border strokes, pad with white, upscale, and run the best-of OCR pass.
///
/// Returns `Some(String::new())` for cells too small to OCR, `None` when the
/// OCR pipeline yields nothing, otherwise the normalized cell text.
///
/// NOTE(review): assumes `x2 >= x1` and `y2 >= y1` — `(x2 - x1)` would
/// underflow otherwise (panicking in debug builds). Confirm callers always
/// pass ordered edges.
fn extract_raster_cell_text(
    gray: &GrayImage,
    row_idx: usize,
    col_idx: usize,
    x1: u32,
    y1: u32,
    x2: u32,
    y2: u32,
) -> Option<String> {
    // Inset the crop so the grid's border lines are not fed to the OCR engine
    // (inset capped at a quarter of the cell span).
    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
    let crop_left = x1 + inset_x;
    let crop_top = y1 + inset_y;
    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
        return Some(String::new());
    }

    let cropped = gray
        .view(crop_left, crop_top, crop_width, crop_height)
        .to_image();
    // White padding gives the OCR engine margin around the glyphs.
    let bordered = expand_white_border(&cropped, 12);
    let scaled = image::imageops::resize(
        &bordered,
        bordered.width() * OCR_SCALE_FACTOR,
        bordered.height() * OCR_SCALE_FACTOR,
        image::imageops::FilterType::Lanczos3,
    );
    // Tesseract PSM order (6 = uniform block, 7 = single line, 11 = sparse):
    // the header row (row 0) tries block mode first; body cells try
    // single-line mode first.
    let psm_modes: [&str; 3] = if row_idx == 0 {
        ["6", "11", "7"]
    } else {
        ["7", "6", "11"]
    };
    let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
    Some(normalize_raster_cell_text(row_idx, col_idx, raw_text))
}
3019
3020fn expand_white_border(image: &GrayImage, border: u32) -> GrayImage {
3021    let mut expanded = GrayImage::from_pixel(
3022        image.width() + border * 2,
3023        image.height() + border * 2,
3024        Luma([255]),
3025    );
3026    for y in 0..image.height() {
3027        for x in 0..image.width() {
3028            expanded.put_pixel(x + border, y + border, *image.get_pixel(x, y));
3029        }
3030    }
3031    expanded
3032}
3033
/// Run word-level OCR with the configured engine: RapidOCR when selected,
/// otherwise Tesseract with OEM "3" (Tesseract's default engine selection).
fn run_tesseract_tsv_words(image: &GrayImage, psm: &str) -> Option<Vec<OcrWord>> {
    match selected_ocr_engine() {
        OcrEngine::RapidOcr => run_rapidocr_words(image),
        OcrEngine::Tesseract => run_tesseract_tsv_words_with_oem(image, psm, "3"),
    }
}
3040
3041fn run_tesseract_tsv_words_with_oem(
3042    image: &GrayImage,
3043    psm: &str,
3044    oem: &str,
3045) -> Option<Vec<OcrWord>> {
3046    let temp_dir = create_temp_dir(0).ok()?;
3047    let image_path = temp_dir.join("ocr.png");
3048    if image.save(&image_path).is_err() {
3049        let _ = fs::remove_dir_all(&temp_dir);
3050        return None;
3051    }
3052
3053    let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3054    let output = Command::new("tesseract")
3055        .current_dir(&temp_dir)
3056        .arg("ocr.png")
3057        .arg("stdout")
3058        // Tell Tesseract the actual DPI of the scaled image so its character-size
3059        // models are correctly calibrated (avoids ~72 DPI guess).
3060        .arg("--dpi")
3061        .arg(&dpi)
3062        .arg("--oem")
3063        .arg(oem)
3064        .arg("--psm")
3065        .arg(psm)
3066        // Disable word-frequency and system dictionaries: table cells contain
3067        // numeric codes, abbreviations, and domain-specific tokens that the
3068        // dictionary would "correct" into wrong English words.
3069        .arg("-c")
3070        .arg("load_system_dawg=0")
3071        .arg("-c")
3072        .arg("load_freq_dawg=0")
3073        .arg("tsv")
3074        .output()
3075        .ok()?;
3076    let _ = fs::remove_dir_all(&temp_dir);
3077    if !output.status.success() {
3078        return None;
3079    }
3080
3081    let tsv = String::from_utf8_lossy(&output.stdout);
3082    Some(parse_tesseract_tsv(&tsv))
3083}
3084
/// Produce the best text reading for one cell image.
///
/// Tesseract path: first try cross-configuration consensus words; when that
/// yields nothing, fall back to scoring every preprocessing variant × PSM
/// (TSV words, plain-text length, and — when available — RapidOCR candidates)
/// and keep the highest-scoring text.
///
/// NOTE(review): in the fallback, `norm_len` (a character count) and
/// `score_ocr_words` values (a composite heuristic) compete in the same
/// `best` slot despite being on different scales — confirm this mixing is
/// intentional.
fn run_tesseract_cell_text_best(image: &GrayImage, psm_modes: &[&str]) -> Option<String> {
    let mut best: Option<(String, f64)> = None;

    if matches!(selected_ocr_engine(), OcrEngine::Tesseract) {
        // First pass: collect consensus words across Tesseract perspectives.
        let consensus_words = collect_consensus_words(image, psm_modes);
        if !consensus_words.is_empty() {
            let text = words_to_plain_line_text(&consensus_words);
            if !text.is_empty() {
                let score = score_ocr_words(&consensus_words, image.width(), image.height());
                best = Some((text, score));
            }
        }
    }

    // Fallback: standard best-variant approach if no consensus words found
    if best.is_none() {
        for variant in build_ocr_variants(image) {
            for psm in psm_modes {
                let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
                    continue;
                };
                if words.is_empty() {
                    continue;
                }
                let text = words_to_plain_line_text(&words);
                if text.is_empty() {
                    continue;
                }
                let score = score_ocr_words(&words, variant.width(), variant.height());
                // Strict improvement required: first candidate wins score ties.
                match &best {
                    Some((_, best_score)) if *best_score >= score => {}
                    _ => best = Some((text, score)),
                }

                // Plain-text pass as an extra candidate, scored by its
                // normalized character count.
                if let Some(text) = run_tesseract_plain_text_with_variant(&variant, psm) {
                    let norm_len = normalize_text(&text).len() as f64;
                    if norm_len > 0.0 {
                        match &best {
                            Some((_, best_score)) if *best_score >= norm_len => {}
                            _ => best = Some((text, norm_len)),
                        }
                    }
                }
            }

            // Docling-inspired multi-engine path: when RapidOCR is available,
            // treat it as an additional OCR engine candidate rather than a hard
            // replacement. This keeps Tesseract's stronger word-level geometry
            // while allowing a modern detector/recognizer to win on difficult cells.
            if let Some(words) = run_rapidocr_words(&variant) {
                let text = words_to_plain_line_text(&words);
                if !text.is_empty() {
                    let score = score_ocr_words(&words, variant.width(), variant.height());
                    match &best {
                        Some((_, best_score)) if *best_score >= score => {}
                        _ => best = Some((text, score)),
                    }
                }
            }
        }
    }

    best.map(|(text, _)| text)
}
3150
3151fn collect_consensus_words(image: &GrayImage, psm_modes: &[&str]) -> Vec<OcrWord> {
3152    let variants = build_ocr_variants(image);
3153
3154    // Collect words per (PSM, OEM) perspective. A "perspective" is an independent
3155    // Tesseract configuration; preprocessing variants are replicates of the same
3156    // perspective (same segmentation model, same language model).
3157    //
3158    // First-principles rationale:
3159    //   A real word should be detected by the correct PSM regardless of which
3160    //   preprocessed image variant is used. So consensus = "word appears under
3161    //   ≥2 distinct (PSM, OEM) combinations", NOT "≥25% of (variant×PSM×OEM)".
3162    //   The percentage approach breaks as more variants are added: threshold
3163    //   rises and real words get filtered out.
3164
3165    let oems = ["1", "3"]; // OEM 1 = legacy Tesseract; OEM 3 = LSTM neural
3166
3167    // For each (PSM, OEM) pair, keep the best-confidence word seen in any variant.
3168    let mut perspective_best: HashMap<(String, String, String), OcrWord> = HashMap::new();
3169
3170    for variant in &variants {
3171        for psm in psm_modes {
3172            for oem in oems {
3173                let Some(words) = run_tesseract_tsv_words_with_oem(variant, psm, oem) else {
3174                    continue;
3175                };
3176                for word in words {
3177                    let key = (psm.to_string(), oem.to_string(), word.text.to_lowercase());
3178                    perspective_best
3179                        .entry(key)
3180                        .and_modify(|best| {
3181                            if word.confidence > best.confidence {
3182                                *best = word.clone();
3183                            }
3184                        })
3185                        .or_insert(word);
3186                }
3187            }
3188        }
3189    }
3190
3191    // Count distinct (PSM, OEM) perspectives in which each word text appears.
3192    // Threshold: at least 2 independent configurations must agree.
3193    const MIN_PERSPECTIVES: usize = 2;
3194
3195    let mut text_to_perspectives: HashMap<String, HashSet<(String, String)>> = HashMap::new();
3196    for (psm, oem, norm_text) in perspective_best.keys() {
3197        text_to_perspectives
3198            .entry(norm_text.clone())
3199            .or_default()
3200            .insert((psm.clone(), oem.clone()));
3201    }
3202
3203    // Return the best-confidence word for each text that meets the threshold.
3204    let mut consensus: Vec<OcrWord> = text_to_perspectives
3205        .iter()
3206        .filter(|(_, perspectives)| perspectives.len() >= MIN_PERSPECTIVES)
3207        .filter_map(|(norm_text, _)| {
3208            perspective_best
3209                .iter()
3210                .filter(|((_, _, t), _)| t == norm_text)
3211                .max_by(|(_, a), (_, b)| {
3212                    a.confidence
3213                        .partial_cmp(&b.confidence)
3214                        .unwrap_or(std::cmp::Ordering::Equal)
3215                })
3216                .map(|(_, w)| w.clone())
3217        })
3218        .collect();
3219
3220    consensus.sort_by_key(|w| (w.top, w.left));
3221    consensus
3222}
3223
/// Drop OCR words that are spatially implausible: narrow fragments isolated
/// far from their same-line neighbours, and lone single-line words smaller
/// than a credible glyph. All thresholds scale with the median word height,
/// keeping the filter independent of DPI and upscale factor.
fn filter_words_by_spatial_coherence(words: &[OcrWord]) -> Vec<OcrWord> {
    if words.len() <= 1 {
        return words.to_vec();
    }

    // First-principles thresholds derived from the actual character height in this
    // image — fully agnostic to DPI and scale factor. Typography conventions:
    //   • Word spacing within a line ≈ 0.25–0.33 em (em = cap height ≈ word.height)
    //   • A gap larger than 3 em between words on the same Tesseract line is
    //     almost certainly a segmentation error, not a legitimate space.
    //   • A single-word line smaller than 0.4 em wide is glyph noise.
    let median_h: u32 = {
        let mut heights: Vec<u32> = words.iter().map(|w| w.height.max(1)).collect();
        heights.sort_unstable();
        heights[heights.len() / 2]
    };
    // Gap beyond which two adjacent words on the same line are considered disjoint
    let gap_threshold = (median_h * 3).max(8);
    // Width below which a word on its own line looks like a glyph artifact
    let narrow_threshold = (median_h / 2).max(4);
    // Minimum bounding box for a credible isolated single-line word
    let min_iso_width = (median_h * 2 / 5).max(4);
    let min_iso_height = (median_h * 2 / 5).max(3);

    // Split words into lines
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    let mut filtered = Vec::new();

    // For each line, filter out isolated words that are far from neighbors
    for line_words in by_line.values_mut() {
        if line_words.len() <= 1 {
            // Single-word lines: only keep if reasonably sized (not a stray pixel)
            if let Some(word) = line_words.first() {
                if word.width >= min_iso_width && word.height >= min_iso_height {
                    filtered.push((*word).clone());
                }
            }
            continue;
        }

        line_words.sort_by_key(|word| word.left);

        // Check spatial coherence within each word-to-word transition.
        // A word is dropped only when it is BOTH far from its neighbour
        // (gap > gap_threshold) AND narrow (width < narrow_threshold).
        for (i, word) in line_words.iter().enumerate() {
            let is_isolated = if i > 0 {
                // Distance from the previous word's right edge.
                let prev = line_words[i - 1];
                let gap = word
                    .left
                    .saturating_sub(prev.left.saturating_add(prev.width));
                gap > gap_threshold && word.width < narrow_threshold
            } else if i < line_words.len() - 1 {
                // First word on the line: measure against the next word instead.
                let next = line_words[i + 1];
                let gap = next
                    .left
                    .saturating_sub(word.left.saturating_add(word.width));
                gap > gap_threshold && word.width < narrow_threshold
            } else {
                false
            };

            if !is_isolated {
                filtered.push((*word).clone());
            }
        }
    }

    filtered
}
3296
/// Group OCR words into clusters of adjacent words: a word joins the current
/// cluster when its top edge is within half the median word height of the
/// previous word AND the horizontal gap is within `gap_tolerance`; otherwise
/// it starts a new cluster. Clusters are built in (top, left) order.
fn cluster_words_by_proximity(words: &[OcrWord], gap_tolerance: u32) -> Vec<Vec<OcrWord>> {
    if words.is_empty() {
        return Vec::new();
    }

    let mut sorted_words = words.to_vec();
    sorted_words.sort_by_key(|w| (w.top, w.left));

    // Vertical tolerance: two words are on the "same line" when their top edges
    // differ by less than half the median word height. This is typographically
    // correct: legitimate multi-word lines share a common baseline ± leading / 2.
    let median_h: i32 = {
        let mut heights: Vec<u32> = sorted_words.iter().map(|w| w.height.max(1)).collect();
        heights.sort_unstable();
        heights[heights.len() / 2] as i32
    };
    let vertical_tolerance = (median_h / 2).max(2);

    let mut clusters: Vec<Vec<OcrWord>> = Vec::new();
    let mut current_cluster = vec![sorted_words[0].clone()];

    for word in &sorted_words[1..] {
        if let Some(last) = current_cluster.last() {
            let vertical_gap = (word.top as i32 - last.top as i32).abs();
            // Horizontal distance from the previous word's right edge
            // (saturates to 0 for overlapping words).
            let horizontal_gap = word
                .left
                .saturating_sub(last.left.saturating_add(last.width));

            if vertical_gap <= vertical_tolerance && horizontal_gap <= gap_tolerance {
                current_cluster.push(word.clone());
            } else {
                clusters.push(current_cluster);
                current_cluster = vec![word.clone()];
            }
        }
    }

    // `current_cluster` always holds at least one word here; the emptiness
    // check is purely defensive.
    if !current_cluster.is_empty() {
        clusters.push(current_cluster);
    }

    clusters
}
3340
3341fn words_to_plain_line_text(words: &[OcrWord]) -> String {
3342    // Apply spatial coherence filtering to remove isolated artifacts
3343    let filtered_words = filter_words_by_spatial_coherence(words);
3344
3345    if filtered_words.is_empty() {
3346        return String::new();
3347    }
3348
3349    // Cluster words by spatial proximity with adaptive gap tolerance
3350    let avg_word_width =
3351        filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
3352    let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
3353    let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
3354
3355    let mut lines: Vec<String> = Vec::new();
3356    for cluster in clusters {
3357        let mut sorted_cluster = cluster;
3358        sorted_cluster.sort_by_key(|w| w.left);
3359
3360        let line = sorted_cluster
3361            .iter()
3362            .map(|word| word.text.as_str())
3363            .collect::<Vec<_>>()
3364            .join(" ")
3365            .trim()
3366            .to_string();
3367
3368        if !line.is_empty() {
3369            lines.push(line);
3370        }
3371    }
3372
3373    lines.join(" ")
3374}
3375
3376/// Apply common Tesseract OCR character corrections for table content.
3377///
3378/// Only applies corrections that are safe in all contexts — targeting
3379/// purely numeric tokens where digit/letter confusion is certain.
3380fn run_tesseract_tsv_words_best<F>(
3381    image: &GrayImage,
3382    psm_modes: &[&str],
3383    accept: F,
3384) -> Option<Vec<OcrWord>>
3385where
3386    F: Fn(&[OcrWord]) -> bool,
3387{
3388    let variants = build_ocr_variants(image);
3389    let mut best: Option<OcrCandidateScore> = None;
3390
3391    for variant in variants {
3392        for psm in psm_modes {
3393            let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
3394                continue;
3395            };
3396            if !accept(&words) {
3397                continue;
3398            }
3399            let score = score_ocr_words(&words, variant.width(), variant.height());
3400            match &best {
3401                Some(current) if current.score >= score => {}
3402                _ => {
3403                    best = Some(OcrCandidateScore { words, score });
3404                }
3405            }
3406        }
3407    }
3408
3409    best.map(|candidate| candidate.words)
3410}
3411
3412fn score_ocr_words(words: &[OcrWord], width: u32, height: u32) -> f64 {
3413    if words.is_empty() || width == 0 || height == 0 {
3414        return 0.0;
3415    }
3416
3417    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
3418    let mut alpha_words = 0usize;
3419    let mut area_coverage = 0f64;
3420    let mut vertical_spread_top = height;
3421    let mut vertical_spread_bottom = 0u32;
3422    let mut total_confidence = 0f64;
3423
3424    for word in words {
3425        by_line.entry(word.line_key).or_default().push(word);
3426        if word.text.chars().any(|ch| ch.is_alphabetic()) {
3427            alpha_words += 1;
3428        }
3429        area_coverage += f64::from(word.width.saturating_mul(word.height));
3430        vertical_spread_top = vertical_spread_top.min(word.top);
3431        vertical_spread_bottom = vertical_spread_bottom.max(word.top.saturating_add(word.height));
3432        total_confidence += word.confidence;
3433    }
3434
3435    let line_count = by_line.len() as f64;
3436    let alpha_ratio = alpha_words as f64 / words.len() as f64;
3437    let density = (area_coverage / f64::from(width.saturating_mul(height))).clamp(0.0, 1.0);
3438    let spread = if vertical_spread_bottom > vertical_spread_top {
3439        f64::from(vertical_spread_bottom - vertical_spread_top) / f64::from(height)
3440    } else {
3441        0.0
3442    };
3443    let avg_confidence = total_confidence / words.len() as f64;
3444    // Confidence bonus: normalize 0-100 range to 0-1 bonus multiplier
3445    let confidence_bonus = (avg_confidence / 100.0).clamp(0.0, 1.0);
3446
3447    // Horizontal spread bonus: reward words that span the full cell width
3448    let horizontal_spread = if words.is_empty() {
3449        0.0
3450    } else {
3451        let min_left = words.iter().map(|w| w.left).min().unwrap_or(0);
3452        let max_right = words
3453            .iter()
3454            .map(|w| w.left + w.width)
3455            .max()
3456            .unwrap_or(width);
3457        f64::from(max_right.saturating_sub(min_left)) / f64::from(width)
3458    };
3459
3460    words.len() as f64
3461        + line_count * 1.5
3462        + alpha_ratio * 6.0
3463        + density * 25.0
3464        + spread * 3.0
3465        + horizontal_spread * 2.0
3466        + confidence_bonus * 5.0 // High-confidence words get a boost
3467}
3468
3469fn build_ocr_variants(gray: &GrayImage) -> Vec<GrayImage> {
3470    vec![
3471        gray.clone(),
3472        contrast_stretch(gray),
3473        global_otsu_binarize(gray),
3474        local_mean_binarize(gray, LOCAL_BINARIZATION_RADIUS),
3475        // Add morphological cleaning as a variant for denoising
3476        morphological_clean(gray),
3477        // Sharpening (unsharp mask) helps Tesseract detect character boundaries on
3478        // blurry / low-DPI cells that survive from low-resolution source PDFs.
3479        unsharp_mask(gray, 1.5),
3480        // Gamma brightening improves contrast for very light ink cells.
3481        gamma_correct(gray, 0.6),
3482    ]
3483}
3484
3485/// Sharpen a grayscale image using an unsharp mask.
3486/// `amount` controls strength (1.5 = moderate). Uses i32 arithmetic throughout
3487/// to avoid u32 underflow when the 3×3 kernel straddles the x=0 or y=0 boundary.
3488fn unsharp_mask(gray: &GrayImage, amount: f32) -> GrayImage {
3489    let width = gray.width() as i32;
3490    let height = gray.height() as i32;
3491    let mut out = GrayImage::new(gray.width(), gray.height());
3492    for y in 0..height {
3493        for x in 0..width {
3494            let mut sum = 0i32;
3495            let mut count = 0i32;
3496            for dy in -1i32..=1 {
3497                for dx in -1i32..=1 {
3498                    let nx = x + dx;
3499                    let ny = y + dy;
3500                    if nx >= 0 && ny >= 0 && nx < width && ny < height {
3501                        sum += gray.get_pixel(nx as u32, ny as u32).0[0] as i32;
3502                        count += 1;
3503                    }
3504                }
3505            }
3506            let blurred = if count > 0 {
3507                sum / count
3508            } else {
3509                gray.get_pixel(x as u32, y as u32).0[0] as i32
3510            };
3511            let original = gray.get_pixel(x as u32, y as u32).0[0] as i32;
3512            let sharpened = original + ((original - blurred) as f32 * amount) as i32;
3513            out.put_pixel(x as u32, y as u32, Luma([sharpened.clamp(0, 255) as u8]));
3514        }
3515    }
3516    out
3517}
3518
3519/// Apply gamma correction to brighten or darken an image.
3520/// gamma < 1.0 brightens (helps see light ink); gamma > 1.0 darkens.
3521fn gamma_correct(gray: &GrayImage, gamma: f32) -> GrayImage {
3522    let mut out = GrayImage::new(gray.width(), gray.height());
3523    for (x, y, pixel) in gray.enumerate_pixels() {
3524        let v = pixel.0[0] as f32 / 255.0;
3525        let corrected = (v.powf(gamma) * 255.0).round() as u8;
3526        out.put_pixel(x, y, Luma([corrected]));
3527    }
3528    out
3529}
3530
3531fn contrast_stretch(gray: &GrayImage) -> GrayImage {
3532    let mut min_val = u8::MAX;
3533    let mut max_val = u8::MIN;
3534    for pixel in gray.pixels() {
3535        let value = pixel.0[0];
3536        min_val = min_val.min(value);
3537        max_val = max_val.max(value);
3538    }
3539
3540    if max_val <= min_val {
3541        return gray.clone();
3542    }
3543
3544    let in_range = (max_val - min_val) as f64;
3545    let mut out = GrayImage::new(gray.width(), gray.height());
3546    for (x, y, pixel) in gray.enumerate_pixels() {
3547        let value = pixel.0[0];
3548        let normalized = ((value.saturating_sub(min_val)) as f64 / in_range * 255.0).round() as u8;
3549        out.put_pixel(x, y, Luma([normalized]));
3550    }
3551    out
3552}
3553
3554fn global_otsu_binarize(gray: &GrayImage) -> GrayImage {
3555    let threshold = otsu_threshold(gray);
3556    let mut out = GrayImage::new(gray.width(), gray.height());
3557    for (x, y, pixel) in gray.enumerate_pixels() {
3558        let value = if pixel.0[0] <= threshold { 0 } else { 255 };
3559        out.put_pixel(x, y, Luma([value]));
3560    }
3561    out
3562}
3563
3564fn otsu_threshold(gray: &GrayImage) -> u8 {
3565    let mut histogram = [0u64; 256];
3566    for pixel in gray.pixels() {
3567        histogram[pixel.0[0] as usize] += 1;
3568    }
3569
3570    let total = (gray.width() as u64) * (gray.height() as u64);
3571    if total == 0 {
3572        return 127;
3573    }
3574
3575    let sum_total: f64 = histogram
3576        .iter()
3577        .enumerate()
3578        .map(|(idx, count)| idx as f64 * *count as f64)
3579        .sum();
3580
3581    let mut sum_background = 0f64;
3582    let mut weight_background = 0f64;
3583    let mut max_variance = -1f64;
3584    let mut best_threshold = 127u8;
3585
3586    for (idx, count) in histogram.iter().enumerate() {
3587        weight_background += *count as f64;
3588        if weight_background <= 0.0 {
3589            continue;
3590        }
3591
3592        let weight_foreground = total as f64 - weight_background;
3593        if weight_foreground <= 0.0 {
3594            break;
3595        }
3596
3597        sum_background += idx as f64 * *count as f64;
3598        let mean_background = sum_background / weight_background;
3599        let mean_foreground = (sum_total - sum_background) / weight_foreground;
3600        let between_class_variance =
3601            weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
3602
3603        if between_class_variance > max_variance {
3604            max_variance = between_class_variance;
3605            best_threshold = idx as u8;
3606        }
3607    }
3608
3609    best_threshold
3610}
3611
3612fn local_mean_binarize(gray: &GrayImage, radius: u32) -> GrayImage {
3613    if gray.width() == 0 || gray.height() == 0 {
3614        return gray.clone();
3615    }
3616
3617    let width = gray.width() as usize;
3618    let height = gray.height() as usize;
3619    let (integral, stride) = integral_image(gray);
3620    let mut out = GrayImage::new(gray.width(), gray.height());
3621
3622    for y in 0..height {
3623        for x in 0..width {
3624            let x1 = x.saturating_sub(radius as usize);
3625            let y1 = y.saturating_sub(radius as usize);
3626            let x2 = (x + radius as usize).min(width - 1);
3627            let y2 = (y + radius as usize).min(height - 1);
3628
3629            let area = (x2 - x1 + 1) * (y2 - y1 + 1);
3630            let sum = region_sum(&integral, stride, x1, y1, x2, y2);
3631            let local_mean = (sum as f64) / (area as f64);
3632            let offset = if area >= MIN_BINARIZATION_BLOCK_PIXELS {
3633                8.0
3634            } else {
3635                4.0
3636            };
3637            let threshold = (local_mean - offset).clamp(0.0, 255.0);
3638
3639            let pixel_value = gray.get_pixel(x as u32, y as u32).0[0] as f64;
3640            let value = if pixel_value <= threshold { 0 } else { 255 };
3641            out.put_pixel(x as u32, y as u32, Luma([value]));
3642        }
3643    }
3644
3645    out
3646}
3647
3648/// Morphological cleaning via dilation then erosion (closing operation)
3649/// Removes small noise and fills small holes in text
3650fn morphological_clean(gray: &GrayImage) -> GrayImage {
3651    if gray.width() == 0 || gray.height() == 0 {
3652        return gray.clone();
3653    }
3654
3655    // First binarize with otsu
3656    let binary = global_otsu_binarize(gray);
3657
3658    // Close operation: dilate then erode (fills small holes, connects broken text)
3659    let dilated = morphological_dilate(&binary, 2);
3660    morphological_erode(&dilated, 2)
3661}
3662
3663fn morphological_dilate(gray: &GrayImage, iterations: u32) -> GrayImage {
3664    let mut result = gray.clone();
3665    for _ in 0..iterations {
3666        let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3667
3668        for y in 1..gray.height().saturating_sub(1) {
3669            for x in 1..gray.width().saturating_sub(1) {
3670                // Check 3x3 neighborhood
3671                let mut has_black = false;
3672                for dy in 0..3 {
3673                    for dx in 0..3 {
3674                        let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3675                        if px < 128 {
3676                            has_black = true;
3677                            break;
3678                        }
3679                    }
3680                    if has_black {
3681                        break;
3682                    }
3683                }
3684                next.put_pixel(x, y, if has_black { Luma([0]) } else { Luma([255]) });
3685            }
3686        }
3687        result = next;
3688    }
3689    result
3690}
3691
3692fn morphological_erode(gray: &GrayImage, iterations: u32) -> GrayImage {
3693    let mut result = gray.clone();
3694    for _ in 0..iterations {
3695        let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3696
3697        for y in 1..gray.height().saturating_sub(1) {
3698            for x in 1..gray.width().saturating_sub(1) {
3699                // Erode black foreground: any white neighbor breaks the stroke,
3700                // otherwise the pixel remains black.
3701                let mut all_black = true;
3702                for dy in 0..3 {
3703                    for dx in 0..3 {
3704                        let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3705                        if px >= 128 {
3706                            all_black = false;
3707                            break;
3708                        }
3709                    }
3710                    if !all_black {
3711                        break;
3712                    }
3713                }
3714                next.put_pixel(x, y, if all_black { Luma([0]) } else { Luma([255]) });
3715            }
3716        }
3717        result = next;
3718    }
3719    result
3720}
3721
3722fn integral_image(gray: &GrayImage) -> (Vec<u64>, usize) {
3723    let width = gray.width() as usize;
3724    let height = gray.height() as usize;
3725    let stride = width + 1;
3726    let mut integral = vec![0u64; (width + 1) * (height + 1)];
3727
3728    for y in 0..height {
3729        let mut row_sum = 0u64;
3730        for x in 0..width {
3731            row_sum += gray.get_pixel(x as u32, y as u32).0[0] as u64;
3732            let idx = (y + 1) * stride + (x + 1);
3733            integral[idx] = integral[y * stride + (x + 1)] + row_sum;
3734        }
3735    }
3736
3737    (integral, stride)
3738}
3739
/// Sum of pixels in the inclusive rectangle `(x1, y1)..=(x2, y2)` using the
/// summed-area table from `integral_image` (standard four-corner lookup).
fn region_sum(integral: &[u64], stride: usize, x1: usize, y1: usize, x2: usize, y2: usize) -> u64 {
    let top_left = integral[y1 * stride + x1];
    let top_right = integral[y1 * stride + (x2 + 1)];
    let bottom_left = integral[(y2 + 1) * stride + x1];
    let bottom_right = integral[(y2 + 1) * stride + (x2 + 1)];
    bottom_right + top_left - top_right - bottom_left
}
3747
/// OCR `image` and return its text as a single whitespace-joined line.
/// Thin alias for `run_tesseract_plain_text_with_variant`; kept as a separate
/// entry point so call sites read naturally.
fn run_tesseract_plain_text(image: &GrayImage, psm: &str) -> Option<String> {
    run_tesseract_plain_text_with_variant(image, psm)
}
3751
3752fn run_tesseract_plain_text_with_variant(image: &GrayImage, psm: &str) -> Option<String> {
3753    if matches!(selected_ocr_engine(), OcrEngine::RapidOcr) {
3754        return run_rapidocr_words(image).map(|words| words_to_plain_line_text(&words));
3755    }
3756
3757    let temp_dir = create_temp_dir(0).ok()?;
3758    let image_path = temp_dir.join("ocr.png");
3759    if image.save(&image_path).is_err() {
3760        let _ = fs::remove_dir_all(&temp_dir);
3761        return None;
3762    }
3763
3764    let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3765    let output = Command::new("tesseract")
3766        .current_dir(&temp_dir)
3767        .arg("ocr.png")
3768        .arg("stdout")
3769        .arg("--dpi")
3770        .arg(&dpi)
3771        .arg("--oem")
3772        .arg("3")
3773        .arg("--psm")
3774        .arg(psm)
3775        .arg("-c")
3776        .arg("load_system_dawg=0")
3777        .arg("-c")
3778        .arg("load_freq_dawg=0")
3779        .output()
3780        .ok()?;
3781    let _ = fs::remove_dir_all(&temp_dir);
3782    if !output.status.success() {
3783        return None;
3784    }
3785
3786    Some(
3787        String::from_utf8_lossy(&output.stdout)
3788            .replace('\n', " ")
3789            .split_whitespace()
3790            .collect::<Vec<_>>()
3791            .join(" "),
3792    )
3793}
3794
3795fn words_to_text_chunks(
3796    words: &[OcrWord],
3797    image: &ImageChunk,
3798    text_chunks: &[TextChunk],
3799) -> Vec<TextChunk> {
3800    let mut image_size = (0u32, 0u32);
3801    for word in words {
3802        image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
3803        image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
3804    }
3805    if image_size.0 == 0 || image_size.1 == 0 {
3806        return Vec::new();
3807    }
3808
3809    let mut dedupe: HashMap<String, usize> = HashMap::new();
3810    for chunk in text_chunks {
3811        dedupe.insert(normalize_text(&chunk.value), dedupe.len());
3812    }
3813
3814    let mut recovered = Vec::new();
3815    for word in words {
3816        let normalized = normalize_text(&word.text);
3817        if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
3818            continue;
3819        }
3820
3821        let left_ratio = f64::from(word.left) / f64::from(image_size.0);
3822        let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
3823        let top_ratio = f64::from(word.top) / f64::from(image_size.1);
3824        let bottom_ratio =
3825            f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
3826
3827        let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
3828        let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
3829        let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
3830        let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
3831        if right_x <= left_x || top_y <= bottom_y {
3832            continue;
3833        }
3834
3835        recovered.push(TextChunk {
3836            value: word.text.clone(),
3837            bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
3838            font_name: "OCR".to_string(),
3839            font_size: (top_y - bottom_y).max(6.0),
3840            font_weight: 400.0,
3841            italic_angle: 0.0,
3842            font_color: "#000000".to_string(),
3843            contrast_ratio: 21.0,
3844            symbol_ends: Vec::new(),
3845            text_format: TextFormat::Normal,
3846            text_type: TextType::Regular,
3847            pdf_layer: PdfLayer::Content,
3848            ocg_visible: true,
3849            index: None,
3850            page_number: image.bbox.page_number,
3851            level: None,
3852            mcid: None,
3853        });
3854    }
3855
3856    recovered
3857}
3858
3859fn lines_from_ocr_words(
3860    words: &[OcrWord],
3861    image: &ImageChunk,
3862    image_width: u32,
3863    image_height: u32,
3864    text_chunks: &[TextChunk],
3865) -> Vec<TextChunk> {
3866    if image_width == 0 || image_height == 0 {
3867        return Vec::new();
3868    }
3869
3870    let mut dedupe: HashMap<String, usize> = HashMap::new();
3871    for chunk in text_chunks {
3872        dedupe.insert(normalize_text(&chunk.value), dedupe.len());
3873    }
3874
3875    let spatial_lines = build_spatial_ocr_lines(words);
3876    if spatial_lines.is_empty() {
3877        return Vec::new();
3878    }
3879
3880    let blocks = merge_spatial_ocr_lines_into_blocks(&spatial_lines, image_width);
3881    if blocks.is_empty() {
3882        return Vec::new();
3883    }
3884
3885    let mut recovered = Vec::new();
3886    for block in blocks {
3887        let normalized = normalize_text(&block.text);
3888        if normalized.len() >= 8 && dedupe.contains_key(&normalized) {
3889            continue;
3890        }
3891
3892        if block.right <= block.left || block.bottom <= block.top {
3893            continue;
3894        }
3895
3896        let left_x = image.bbox.left_x
3897            + image.bbox.width() * (f64::from(block.left) / f64::from(image_width));
3898        let right_x = image.bbox.left_x
3899            + image.bbox.width() * (f64::from(block.right) / f64::from(image_width));
3900        let top_y = image.bbox.top_y
3901            - image.bbox.height() * (f64::from(block.top) / f64::from(image_height));
3902        let bottom_y = image.bbox.top_y
3903            - image.bbox.height() * (f64::from(block.bottom) / f64::from(image_height));
3904        if right_x <= left_x || top_y <= bottom_y {
3905            continue;
3906        }
3907
3908        recovered.push(TextChunk {
3909            value: block.text,
3910            bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
3911            font_name: "OCR".to_string(),
3912            font_size: (f64::from(block.line_height_sum) / block.line_count.max(1) as f64).max(6.0),
3913            font_weight: 400.0,
3914            italic_angle: 0.0,
3915            font_color: "#000000".to_string(),
3916            contrast_ratio: 21.0,
3917            symbol_ends: Vec::new(),
3918            text_format: TextFormat::Normal,
3919            text_type: TextType::Regular,
3920            pdf_layer: PdfLayer::Content,
3921            ocg_visible: true,
3922            index: None,
3923            page_number: image.bbox.page_number,
3924            level: None,
3925            mcid: None,
3926        });
3927    }
3928
3929    recovered
3930}
3931
/// A horizontal run of OCR words (later merged into multi-line blocks).
/// Pixel coordinates are in raster space with the origin at the top-left.
#[derive(Debug, Clone)]
struct SpatialOcrLine {
    left: u32,   // leftmost pixel of any word in the line
    top: u32,    // topmost pixel of any word in the line
    right: u32,  // rightmost pixel edge (left + width) of any word
    bottom: u32, // bottommost pixel edge (top + height) of any word
    text: String, // words joined with single spaces, reading order
    word_count: usize, // total words merged into this line/block
    line_count: usize, // number of original lines merged (1 for a fresh line)
    line_height_sum: u32, // sum of per-line heights; divide by line_count for the mean
}
3943
3944fn build_spatial_ocr_lines(words: &[OcrWord]) -> Vec<SpatialOcrLine> {
3945    let filtered_words = filter_words_by_spatial_coherence(words);
3946    if filtered_words.is_empty() {
3947        return Vec::new();
3948    }
3949
3950    let avg_word_width =
3951        filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
3952    let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
3953    let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
3954
3955    let mut lines = Vec::new();
3956    for mut cluster in clusters {
3957        cluster.sort_by_key(|word| word.left);
3958        let text = cluster
3959            .iter()
3960            .map(|word| word.text.as_str())
3961            .collect::<Vec<_>>()
3962            .join(" ")
3963            .trim()
3964            .to_string();
3965        if text.is_empty() {
3966            continue;
3967        }
3968
3969        let left = cluster.iter().map(|word| word.left).min().unwrap_or(0);
3970        let right = cluster
3971            .iter()
3972            .map(|word| word.left.saturating_add(word.width))
3973            .max()
3974            .unwrap_or(0);
3975        let top = cluster.iter().map(|word| word.top).min().unwrap_or(0);
3976        let bottom = cluster
3977            .iter()
3978            .map(|word| word.top.saturating_add(word.height))
3979            .max()
3980            .unwrap_or(0);
3981        if right <= left || bottom <= top {
3982            continue;
3983        }
3984
3985        lines.push(SpatialOcrLine {
3986            left,
3987            top,
3988            right,
3989            bottom,
3990            text,
3991            word_count: cluster.len(),
3992            line_count: 1,
3993            line_height_sum: bottom.saturating_sub(top).max(1),
3994        });
3995    }
3996
3997    lines.sort_by_key(|line| (line.top, line.left));
3998    lines
3999}
4000
/// Merge adjacent `SpatialOcrLine`s into paragraph-like blocks.
///
/// Lines are expected sorted top-to-bottom (as produced by
/// `build_spatial_ocr_lines`). Each line is appended to the most recently
/// started block (`rposition`) that it is vertically close to and
/// geometrically aligned with; otherwise it starts a new block. Blocks that
/// end up too short or with too few alphabetic characters are discarded.
fn merge_spatial_ocr_lines_into_blocks(
    lines: &[SpatialOcrLine],
    image_width: u32,
) -> Vec<SpatialOcrLine> {
    if lines.is_empty() {
        return Vec::new();
    }

    // Median line height drives all vertical-gap heuristics below.
    let median_height = {
        let mut heights: Vec<u32> = lines
            .iter()
            .map(|line| line.bottom.saturating_sub(line.top).max(1))
            .collect();
        heights.sort_unstable();
        heights[heights.len() / 2]
    };
    // A line may start up to half a line-height above the block's bottom
    // (OCR boxes jitter)…
    let vertical_tolerance = (median_height / 2).max(3);
    // …and no more than two line-heights below it.
    let max_vertical_gap = median_height.saturating_mul(2).max(8);

    let mut blocks: Vec<SpatialOcrLine> = Vec::new();
    for line in lines {
        // Prefer the most recently started candidate block (rposition).
        let merge_idx = blocks.iter().rposition(|block| {
            let vertical_gap = line.top.saturating_sub(block.bottom);
            if vertical_gap > max_vertical_gap {
                return false;
            }
            // Reject lines that start too far *above* the block's bottom
            // edge (more than the jitter tolerance of overlap).
            if line.top + vertical_tolerance < block.bottom {
                return false;
            }

            spatial_lines_share_block_geometry(block, line, image_width, median_height)
        });

        if let Some(merge_idx) = merge_idx {
            // Grow the block's bounding box and running statistics.
            let block = &mut blocks[merge_idx];
            block.left = block.left.min(line.left);
            block.top = block.top.min(line.top);
            block.right = block.right.max(line.right);
            block.bottom = block.bottom.max(line.bottom);
            block.word_count += line.word_count;
            block.line_count += line.line_count;
            block.line_height_sum = block.line_height_sum.saturating_add(line.line_height_sum);
            // No separator after a hyphenated line break; otherwise join
            // with a space.
            if !block.text.ends_with('-') {
                block.text.push(' ');
            }
            block.text.push_str(&line.text);
            continue;
        }

        blocks.push(line.clone());
    }

    // Final filter: collapse whitespace and drop noise-like blocks. Blocks
    // with several words may be shorter; isolated fragments must be longer
    // (byte lengths).
    blocks
        .into_iter()
        .filter_map(|mut block| {
            block.text = block.text.split_whitespace().collect::<Vec<_>>().join(" ");
            let alphabetic = block.text.chars().filter(|ch| ch.is_alphabetic()).count();
            let min_chars = if block.word_count >= 4 { 10 } else { 16 };
            if block.text.len() < min_chars || alphabetic < 4 {
                return None;
            }
            Some(block)
        })
        .collect()
}
4066
4067fn spatial_lines_share_block_geometry(
4068    upper: &SpatialOcrLine,
4069    lower: &SpatialOcrLine,
4070    image_width: u32,
4071    median_height: u32,
4072) -> bool {
4073    let overlap_left = upper.left.max(lower.left);
4074    let overlap_right = upper.right.min(lower.right);
4075    let overlap = overlap_right.saturating_sub(overlap_left);
4076    let upper_width = upper.right.saturating_sub(upper.left).max(1);
4077    let lower_width = lower.right.saturating_sub(lower.left).max(1);
4078    let min_width = upper_width.min(lower_width);
4079    let max_width = upper_width.max(lower_width);
4080    let overlap_ratio = overlap as f64 / min_width as f64;
4081    let width_ratio = min_width as f64 / max_width as f64;
4082    let max_left_shift = ((f64::from(image_width) * 0.045).round() as u32)
4083        .max(median_height.saturating_mul(2))
4084        .max(8);
4085    let left_shift = upper.left.abs_diff(lower.left);
4086
4087    overlap_ratio >= 0.40
4088        || (overlap_ratio >= 0.15 && left_shift <= max_left_shift && width_ratio >= 0.55)
4089}
4090
/// True when the text contains at least one ASCII digit.
fn is_numeric_like(text: &str) -> bool {
    text.bytes().any(|b| b.is_ascii_digit())
}
4094
/// Canonical form for dedupe comparisons: keep only alphanumeric characters,
/// lowercased (Unicode-aware — one char may lowercase to several).
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars().filter(|ch| ch.is_alphanumeric()) {
        normalized.extend(ch.to_lowercase());
    }
    normalized
}
4101
/// Clean caption text: repair the OCR-mangled "CarolinaBLU" trademark glyph
/// and trim surrounding whitespace. The second replace collapses a doubled
/// ™ produced when the source already contained the mark.
fn normalize_caption_text(text: &str) -> String {
    let repaired = text
        .replace("CarolinaBLUTM", "CarolinaBLU™")
        .replace("CarolinaBLU™™", "CarolinaBLU™");
    repaired.trim().to_string()
}
4108
/// Normalize OCR text recovered from one raster table cell.
///
/// Fixes common OCR confusions (pipe glyphs, em-dashes, fused "AorB" tokens,
/// the chemistry "H,O" misread, and assorted μL misreadings), collapses
/// whitespace, and blanks out body-row cells (`row_idx > 0`) that are almost
/// certainly noise: 1-2 non-numeric characters, or strings made up entirely
/// of the O/o/S/B glyphs produced from empty cells.
fn normalize_raster_cell_text(row_idx: usize, _col_idx: usize, text: String) -> String {
    let mut cell = text
        .replace('|', " ")
        .replace('—', "-")
        .replace("AorB", "A or B")
        .replace("Aor B", "A or B")
        .replace("H,O", "H2O")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ");

    let is_body_row = row_idx > 0;
    let has_digit = cell.chars().any(|ch| ch.is_ascii_digit());
    if is_body_row && !has_digit && cell.len() <= 2 {
        return String::new(); // Too short to be real content.
    }
    if is_body_row && cell.chars().all(|ch| matches!(ch, 'O' | 'o' | 'S' | 'B')) {
        return String::new(); // Circle/blob glyphs only — empty-cell noise.
    }

    // Repair microliter units mangled by OCR; the leading space keeps word
    // interiors untouched.
    cell = cell
        .replace(" ywL", " μL")
        .replace(" yuL", " μL")
        .replace(" yL", " μL")
        .replace(" wL", " μL")
        .replace(" uL", " μL")
        .replace(" pL", " μL");

    cell.trim().to_string()
}
4141
/// Create a unique scratch directory under the system temp dir.
///
/// Uniqueness comes from the process id, the page number, and a nanosecond
/// timestamp. The caller is responsible for removing the directory.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    let name = format!(
        "edgeparse-raster-ocr-{}-{}-{}",
        std::process::id(),
        page_number,
        nanos
    );
    let dir = std::env::temp_dir().join(name);
    fs::create_dir_all(&dir)?;
    Ok(dir)
}
4156
/// Extract the page's visible raster images to PNG files via `pdfimages`.
///
/// Returns `None` when either `pdfimages` invocation fails,
/// `Some(vec![])` when the page lists no plain `image` entries, and
/// otherwise the paths of the extracted PNGs that correspond to `image`
/// rows (smask/stencil rows are listed but their files are not returned).
fn extract_visible_page_image_files(
    input_path: &Path,
    page_number: u32,
    temp_dir: &Path,
) -> Option<Vec<PathBuf>> {
    // First pass: `pdfimages -list` reports each image's type without
    // extracting anything.
    let list_output = Command::new("pdfimages")
        .arg("-f")
        .arg(page_number.to_string())
        .arg("-l")
        .arg(page_number.to_string())
        .arg("-list")
        .arg(input_path)
        .output()
        .ok()?;
    if !list_output.status.success() {
        return None;
    }

    // Positions of the plain `image` rows; other types (smask, stencil) are
    // kept in the index space but not returned.
    let entries = parse_pdfimages_list(&String::from_utf8_lossy(&list_output.stdout));
    let visible_indices: Vec<usize> = entries
        .iter()
        .enumerate()
        .filter_map(|(idx, entry)| (entry.image_type == "image").then_some(idx))
        .collect();
    if visible_indices.is_empty() {
        return Some(Vec::new());
    }

    // Second pass: actually extract every image on the page as PNG files
    // named `img-NNN.png` under `temp_dir`.
    let prefix = temp_dir.join("img");
    let status = Command::new("pdfimages")
        .arg("-f")
        .arg(page_number.to_string())
        .arg("-l")
        .arg(page_number.to_string())
        .arg("-png")
        .arg(input_path)
        .arg(&prefix)
        .status()
        .ok()?;
    if !status.success() {
        return None;
    }

    // Lexicographic sort recovers extraction order from the zero-padded
    // `img-NNN.png` names.
    let mut image_files: Vec<PathBuf> = fs::read_dir(temp_dir)
        .ok()?
        .filter_map(|entry| entry.ok().map(|e| e.path()))
        .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
        .collect();
    image_files.sort();

    // NOTE(review): assumes `-png` writes one file per `-list` row, in row
    // order, so positional indices line up between the two passes — verify
    // against the installed poppler `pdfimages` version.
    let visible_files: Vec<PathBuf> = visible_indices
        .into_iter()
        .filter_map(|idx| image_files.get(idx).cloned())
        .collect();
    Some(visible_files)
}
4213
4214fn parse_pdfimages_list(output: &str) -> Vec<PdfImagesListEntry> {
4215    let mut entries = Vec::new();
4216    let mut in_rows = false;
4217
4218    for line in output.lines() {
4219        let trimmed = line.trim();
4220        if trimmed.is_empty() {
4221            continue;
4222        }
4223        if trimmed.starts_with("---") {
4224            in_rows = true;
4225            continue;
4226        }
4227        if !in_rows {
4228            continue;
4229        }
4230
4231        let mut cols = trimmed.split_whitespace();
4232        let Some(_page) = cols.next() else {
4233            continue;
4234        };
4235        let Some(_num) = cols.next() else {
4236            continue;
4237        };
4238        let Some(image_type) = cols.next() else {
4239            continue;
4240        };
4241
4242        entries.push(PdfImagesListEntry {
4243            image_type: image_type.to_string(),
4244        });
4245    }
4246
4247    entries
4248}
4249
4250#[cfg(test)]
4251mod tests {
4252    use super::*;
4253    use image::GrayImage;
4254
    // Fixture: a 400×400pt image chunk anchored at the page origin of page 1.
    fn image_chunk() -> ImageChunk {
        ImageChunk {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 400.0, 400.0),
            index: Some(1),
            level: None,
        }
    }
4262
    // Fixture: a 40×12px OCR word at `left` on the given Tesseract line key,
    // pinned to the top edge with high confidence.
    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top: 0,
            width: 40,
            height: 12,
            text: text.to_string(),
            confidence: 90.0,
        }
    }
4274
    // Fixture: like `word`, but with explicit position and width (height
    // fixed at 12px).
    fn word_at(line: (u32, u32, u32), left: u32, top: u32, width: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top,
            width,
            height: 12,
            text: text.to_string(),
            confidence: 90.0,
        }
    }
4286
    // Flatten a table cell's tokens into a single space-joined string,
    // dropping empty tokens, for assertion convenience.
    fn test_cell_text(cell: &TableBorderCell) -> String {
        cell.content
            .iter()
            .map(|token| token.base.value.trim())
            .filter(|value| !value.is_empty())
            .collect::<Vec<_>>()
            .join(" ")
    }
4295
    // A grid with repeated column headers and numeric body rows must be
    // classified as a table, not as chart axis labels.
    #[test]
    fn test_table_like_ocr_detects_repeated_columns() {
        let words = vec![
            word((1, 1, 1), 10, "Temperature"),
            word((1, 1, 1), 120, "Viscosity"),
            word((1, 1, 1), 240, "Temperature"),
            word((1, 1, 1), 360, "Viscosity"),
            word((1, 1, 2), 10, "0"),
            word((1, 1, 2), 120, "1.793E-06"),
            word((1, 1, 2), 240, "25"),
            word((1, 1, 2), 360, "8.930E-07"),
            word((1, 1, 3), 10, "1"),
            word((1, 1, 3), 120, "1.732E-06"),
            word((1, 1, 3), 240, "26"),
            word((1, 1, 3), 360, "8.760E-07"),
        ];
        assert!(!looks_like_chart_label_ocr(&words));
        assert!(looks_like_table_ocr(&words));
    }
4315
    // A 4-row, 3-column grid of mostly alphabetic cells (enzyme names) must
    // still be recovered as a structured table with correct cell contents.
    #[test]
    fn test_structured_ocr_table_border_recovers_non_numeric_table() {
        let image = image_chunk();
        let words = vec![
            word_at((1, 1, 1), 10, 10, 80, "Tube"),
            word_at((1, 1, 1), 145, 10, 110, "Enzyme"),
            word_at((1, 1, 1), 305, 10, 70, "DNA"),
            word_at((1, 1, 2), 10, 42, 80, "1"),
            word_at((1, 1, 2), 145, 42, 110, "BamHI"),
            word_at((1, 1, 2), 305, 42, 70, "pUC19"),
            word_at((1, 1, 3), 10, 74, 80, "2"),
            word_at((1, 1, 3), 145, 74, 110, "HindIII"),
            word_at((1, 1, 3), 305, 74, 70, "lambda"),
            word_at((1, 1, 4), 10, 106, 80, "3"),
            word_at((1, 1, 4), 145, 106, 110, "EcoRI"),
            word_at((1, 1, 4), 305, 106, 70, "control"),
        ];

        assert!(!looks_like_chart_label_ocr(&words));
        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
        assert_eq!(table.num_columns, 3);
        assert_eq!(table.num_rows, 4);
        assert_eq!(test_cell_text(&table.rows[0].cells[0]), "Tube");
        assert_eq!(test_cell_text(&table.rows[1].cells[1]), "BamHI");
        assert_eq!(test_cell_text(&table.rows[3].cells[2]), "control");
    }
4342
    // Regression guard: adding a fifth row to the enzyme table must not tip
    // the chart-label heuristic into rejecting it as a table.
    #[test]
    fn test_chart_label_ocr_does_not_reject_five_row_table() {
        let words = vec![
            word_at((1, 1, 1), 10, 10, 80, "Tube"),
            word_at((1, 1, 1), 145, 10, 110, "Enzyme"),
            word_at((1, 1, 1), 305, 10, 70, "DNA"),
            word_at((1, 1, 2), 10, 42, 80, "1"),
            word_at((1, 1, 2), 145, 42, 110, "BamHI"),
            word_at((1, 1, 2), 305, 42, 70, "pUC19"),
            word_at((1, 1, 3), 10, 74, 80, "2"),
            word_at((1, 1, 3), 145, 74, 110, "HindIII"),
            word_at((1, 1, 3), 305, 74, 70, "lambda"),
            word_at((1, 1, 4), 10, 106, 80, "3"),
            word_at((1, 1, 4), 145, 106, 110, "EcoRI"),
            word_at((1, 1, 4), 305, 106, 70, "control"),
            word_at((1, 1, 5), 10, 138, 80, "4"),
            word_at((1, 1, 5), 145, 138, 110, "NotI"),
            word_at((1, 1, 5), 305, 138, 70, "sample"),
        ];

        assert!(!looks_like_chart_label_ocr(&words));
        assert!(looks_like_table_ocr(&words));
    }
4366
    #[test]
    fn test_structured_ocr_table_border_rejects_two_column_prose_layout() {
        // Two columns of short prose phrases align like a table, but a
        // 2-column × 3-row free-text layout is not enough structure: the
        // builder must return None instead of fabricating a table.
        let image = image_chunk();
        let words = vec![
            word_at((1, 1, 1), 10, 10, 90, "Summary"),
            word_at((1, 1, 1), 220, 10, 120, "Detailed findings"),
            word_at((1, 1, 2), 10, 42, 90, "Background"),
            word_at((1, 1, 2), 220, 42, 120, "Additional context"),
            word_at((1, 1, 3), 10, 74, 90, "Notes"),
            word_at((1, 1, 3), 220, 74, 120, "Further explanation"),
        ];

        assert!(build_structured_ocr_table_border(&words, &image).is_none());
    }
4381
    #[test]
    fn test_parse_pdfimages_list_ignores_smask_entries() {
        // Raw `pdfimages -list` output: header line, dashed separator, one
        // "image" row and its soft-mask ("smask") companion row.
        let output = "page   num  type   width height color comp bpc  enc interp  object ID x-ppi y-ppi size ratio\n--------------------------------------------------------------------------------------------\n   1     0 image    1320   358  icc     3   8  image  no        46  0   208   208 63.5K 4.6%\n   1     1 smask    1320   358  gray    1   8  image  no        46  0   208   208  483B 0.1%\n";

        // NOTE(review): despite the test name, the parser retains the smask
        // row — both entries come back, distinguished only by image_type.
        // The "ignoring" presumably happens in a downstream consumer; confirm.
        let entries = parse_pdfimages_list(output);
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].image_type, "image");
        assert_eq!(entries[1].image_type, "smask");
    }
4391
4392    #[test]
4393    fn test_table_like_ocr_rejects_single_line_caption() {
4394        let words = vec![
4395            word((1, 1, 1), 10, "Figure"),
4396            word((1, 1, 1), 90, "7.2"),
4397            word((1, 1, 1), 150, "Viscosity"),
4398            word((1, 1, 1), 260, "of"),
4399            word((1, 1, 1), 300, "Water"),
4400        ];
4401        assert!(!looks_like_table_ocr(&words));
4402    }
4403
4404    #[test]
4405    fn test_normalize_raster_cell_text_fixes_units_and_artifacts() {
4406        assert_eq!(
4407            normalize_raster_cell_text(1, 1, "3 ywL".to_string()),
4408            "3 μL"
4409        );
4410        assert_eq!(normalize_raster_cell_text(1, 4, "OS".to_string()), "");
4411        assert_eq!(normalize_raster_cell_text(0, 6, "H,O".to_string()), "H2O");
4412    }
4413
4414    #[test]
4415    fn test_detect_bordered_raster_grid_finds_strong_lines() {
4416        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
4417        for x in [10, 40, 80, 110] {
4418            for y in 10..71 {
4419                image.put_pixel(x, y, Luma([0]));
4420            }
4421        }
4422        for y in [10, 30, 50, 70] {
4423            for x in 10..111 {
4424                image.put_pixel(x, y, Luma([0]));
4425            }
4426        }
4427
4428        let grid = detect_bordered_raster_grid(&image).expect("grid");
4429        assert_eq!(grid.vertical_lines.len(), 4);
4430        assert_eq!(grid.horizontal_lines.len(), 4);
4431    }
4432
4433    #[test]
4434    fn test_obvious_bar_chart_raster_is_rejected() {
4435        let mut image = GrayImage::from_pixel(320, 200, Luma([255]));
4436        for &(y1, y2) in &[(25, 40), (70, 85), (115, 130), (160, 175)] {
4437            for y in y1..y2 {
4438                for x in 40..280 {
4439                    image.put_pixel(x, y, Luma([80]));
4440                }
4441            }
4442        }
4443
4444        assert!(is_obvious_bar_chart_raster(&image));
4445    }
4446
4447    #[test]
4448    fn test_vertical_bar_chart_raster_is_rejected() {
4449        let mut image = GrayImage::from_pixel(360, 240, Luma([255]));
4450        for &(x1, x2, y1) in &[
4451            (40, 78, 52),
4452            (92, 126, 118),
4453            (140, 170, 146),
4454            (184, 210, 162),
4455        ] {
4456            for x in x1..x2 {
4457                for y in y1..212 {
4458                    image.put_pixel(x, y, Luma([90]));
4459                }
4460            }
4461        }
4462
4463        assert!(is_obvious_bar_chart_raster(&image));
4464    }
4465
4466    #[test]
4467    fn test_light_fill_vertical_bar_chart_raster_is_rejected() {
4468        let mut image = GrayImage::from_pixel(420, 260, Luma([255]));
4469        for x in 24..396 {
4470            image.put_pixel(x, 222, Luma([170]));
4471        }
4472        for &(x1, x2, y1, shade) in &[
4473            (46, 82, 132, 222),
4474            (104, 140, 84, 214),
4475            (162, 198, 62, 206),
4476            (220, 256, 144, 228),
4477        ] {
4478            for x in x1..x2 {
4479                for y in y1..222 {
4480                    image.put_pixel(x, y, Luma([shade]));
4481                }
4482            }
4483        }
4484
4485        assert!(is_obvious_bar_chart_raster(&image));
4486    }
4487
4488    #[test]
4489    fn test_grouped_vertical_bar_chart_raster_is_rejected() {
4490        let mut image = GrayImage::from_pixel(420, 240, Luma([255]));
4491        for x in 28..392 {
4492            image.put_pixel(x, 214, Luma([175]));
4493        }
4494        for &(x1, x2, y1, shade) in &[
4495            (44, 60, 98, 210),
4496            (64, 80, 140, 225),
4497            (108, 124, 116, 214),
4498            (128, 144, 148, 229),
4499            (172, 188, 88, 206),
4500            (192, 208, 128, 222),
4501            (236, 252, 104, 212),
4502            (256, 272, 156, 228),
4503        ] {
4504            for x in x1..x2 {
4505                for y in y1..214 {
4506                    image.put_pixel(x, y, Luma([shade]));
4507                }
4508            }
4509        }
4510
4511        assert!(is_obvious_bar_chart_raster(&image));
4512    }
4513
4514    #[test]
4515    fn test_natural_photograph_raster_is_detected() {
4516        // Create a photo-like image: wide histogram spread across [20, 230] mid-tones
4517        let w = 100u32;
4518        let h = 100u32;
4519        let mut image = GrayImage::new(w, h);
4520        // Fill with a gradient covering the full range — most pixels will be mid-tone
4521        for y in 0..h {
4522            for x in 0..w {
4523                let v = ((x + y) * 255 / (w + h - 2)) as u8;
4524                image.put_pixel(x, y, Luma([v]));
4525            }
4526        }
4527        // Should be classified as photographic (≥30% mid-tone pixels)
4528        assert!(is_natural_photograph_raster(&image));
4529    }
4530
4531    #[test]
4532    fn test_chart_image_is_not_classified_as_photograph() {
4533        // Chart-like image: mostly white with a few dark lines (no mid-tone content)
4534        let mut image = GrayImage::from_pixel(200, 160, Luma([255]));
4535        // A few thin dark lines (table borders or chart axes)
4536        for x in 20..180 {
4537            image.put_pixel(x, 20, Luma([0]));
4538            image.put_pixel(x, 80, Luma([0]));
4539            image.put_pixel(x, 140, Luma([0]));
4540        }
4541        for y in 20..141 {
4542            image.put_pixel(20, y, Luma([0]));
4543            image.put_pixel(180, y, Luma([0]));
4544        }
4545        // Very few mid-tone pixels — should NOT be classified as photograph
4546        assert!(!is_natural_photograph_raster(&image));
4547        assert!(!is_dark_ui_screenshot_raster(&image));
4548    }
4549
4550    #[test]
4551    fn test_bright_natural_photograph_raster_is_detected() {
4552        let mut image = GrayImage::from_pixel(240, 180, Luma([250]));
4553        for y in 24..148 {
4554            for x in 52..156 {
4555                let tone = 72 + (((x - 52) * 11 + (y - 24) * 7) % 132) as u8;
4556                image.put_pixel(x, y, Luma([tone]));
4557            }
4558        }
4559
4560        assert!(is_natural_photograph_raster(&image));
4561    }
4562
    #[test]
    fn test_dark_ui_screenshot_raster_is_detected() {
        // Mostly-dark background (tone 20) with bright rectangular panels —
        // the layout signature of a dark-theme application screenshot.
        let mut image = GrayImage::from_pixel(260, 180, Luma([20]));
        // Bright title/toolbar strip across the top.
        for x in 18..242 {
            for y in 18..34 {
                image.put_pixel(x, y, Luma([210]));
            }
        }
        // Three bright "card" panels side by side, plus one wide strip below.
        for &(x1, y1, x2, y2, shade) in &[
            (26, 58, 84, 108, 198),
            (94, 58, 152, 108, 210),
            (162, 58, 220, 108, 192),
            (26, 118, 220, 134, 224),
        ] {
            for x in x1..x2 {
                for y in y1..y2 {
                    image.put_pixel(x, y, Luma([shade]));
                }
            }
        }

        assert!(is_dark_ui_screenshot_raster(&image));
    }
4586
    #[test]
    fn test_table_like_ocr_rejects_matrix_formula_layout() {
        // Aligned 0/1-style entries under terse "C1/C2/C3" headers look like a
        // mathematical matrix (e.g. a state table inside a formula), not a
        // data table: the matrix detector must fire and the table classifier
        // must decline.
        let words = vec![
            word_at((1, 1, 1), 14, 10, 36, "B23"),
            word_at((1, 1, 1), 160, 10, 22, "C1"),
            word_at((1, 1, 1), 230, 10, 22, "C2"),
            word_at((1, 1, 1), 300, 10, 22, "C3"),
            word_at((1, 1, 2), 20, 44, 24, "0/0"),
            word_at((1, 1, 2), 150, 44, 18, "0"),
            word_at((1, 1, 2), 220, 44, 28, "001"),
            word_at((1, 1, 2), 300, 44, 28, "000"),
            word_at((1, 1, 3), 20, 76, 24, "0/1"),
            word_at((1, 1, 3), 150, 76, 28, "000"),
            word_at((1, 1, 3), 220, 76, 28, "010"),
            word_at((1, 1, 3), 300, 76, 28, "000"),
        ];

        assert!(looks_like_matrix_formula_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
    }
4607
    #[test]
    fn test_table_like_ocr_keeps_small_numeric_table_with_real_headers() {
        // Counterpart to the matrix-formula rejection: a small numeric table
        // whose header row carries meaningful words ("Year", "Q1"…) must not
        // trip the matrix detector and must classify as a table.
        let words = vec![
            word_at((1, 1, 1), 10, 10, 64, "Year"),
            word_at((1, 1, 1), 130, 10, 28, "Q1"),
            word_at((1, 1, 1), 220, 10, 28, "Q2"),
            word_at((1, 1, 1), 310, 10, 28, "Q3"),
            word_at((1, 1, 2), 10, 42, 64, "2022"),
            word_at((1, 1, 2), 130, 42, 24, "10"),
            word_at((1, 1, 2), 220, 42, 24, "25"),
            word_at((1, 1, 2), 310, 42, 24, "30"),
            word_at((1, 1, 3), 10, 74, 64, "2023"),
            word_at((1, 1, 3), 130, 74, 24, "11"),
            word_at((1, 1, 3), 220, 74, 24, "26"),
            word_at((1, 1, 3), 310, 74, 24, "31"),
        ];

        assert!(!looks_like_matrix_formula_ocr(&words));
        assert!(looks_like_table_ocr(&words));
    }
4628
    #[test]
    fn test_matrixish_small_ocr_table_is_rejected_after_build() {
        // Even when the structured builder succeeds on a matrix-like 0/1 grid,
        // the post-build artifact check must flag the result for rejection.
        let image = ImageChunk {
            // Compact 440×120 image — narrower than the shared image_chunk()
            // fixture, matching an inline-formula footprint.
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 120.0),
            index: Some(1),
            level: None,
        };
        // Same matrix-formula word layout as the classifier-level test.
        let words = vec![
            word_at((1, 1, 1), 14, 10, 36, "B23"),
            word_at((1, 1, 1), 160, 10, 22, "C1"),
            word_at((1, 1, 1), 230, 10, 22, "C2"),
            word_at((1, 1, 1), 300, 10, 22, "C3"),
            word_at((1, 1, 2), 20, 44, 24, "0/0"),
            word_at((1, 1, 2), 150, 44, 18, "0"),
            word_at((1, 1, 2), 220, 44, 28, "001"),
            word_at((1, 1, 2), 300, 44, 28, "000"),
            word_at((1, 1, 3), 20, 76, 24, "0/1"),
            word_at((1, 1, 3), 150, 76, 28, "000"),
            word_at((1, 1, 3), 220, 76, 28, "010"),
            word_at((1, 1, 3), 300, 76, 28, "000"),
        ];

        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
        assert!(is_matrixish_ocr_artifact_table(&table));
    }
4654
    #[test]
    fn test_small_numeric_table_with_real_headers_is_not_rejected_after_build() {
        // Counterpart to the matrixish rejection: a small numeric table with
        // meaningful textual headers ("Year", "Q1"…) must survive the
        // post-build artifact check.
        let image = ImageChunk {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 140.0),
            index: Some(1),
            level: None,
        };
        let words = vec![
            word_at((1, 1, 1), 10, 10, 64, "Year"),
            word_at((1, 1, 1), 130, 10, 28, "Q1"),
            word_at((1, 1, 1), 220, 10, 28, "Q2"),
            word_at((1, 1, 1), 310, 10, 28, "Q3"),
            word_at((1, 1, 2), 10, 42, 64, "2022"),
            word_at((1, 1, 2), 130, 42, 24, "10"),
            word_at((1, 1, 2), 220, 42, 24, "25"),
            word_at((1, 1, 2), 310, 42, 24, "30"),
            word_at((1, 1, 3), 10, 74, 64, "2023"),
            word_at((1, 1, 3), 130, 74, 24, "11"),
            word_at((1, 1, 3), 220, 74, 24, "26"),
            word_at((1, 1, 3), 310, 74, 24, "31"),
        ];

        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
        assert!(!is_matrixish_ocr_artifact_table(&table));
    }
4680
4681    #[test]
4682    fn test_bordered_table_raster_is_not_rejected_as_chart() {
4683        let mut image = GrayImage::from_pixel(320, 200, Luma([255]));
4684        for x in [20, 110, 210, 300] {
4685            for y in 20..181 {
4686                image.put_pixel(x, y, Luma([0]));
4687            }
4688        }
4689        for y in [20, 70, 120, 180] {
4690            for x in 20..301 {
4691                image.put_pixel(x, y, Luma([0]));
4692            }
4693        }
4694
4695        assert!(!is_obvious_bar_chart_raster(&image));
4696    }
4697
4698    #[test]
4699    fn test_morphological_erode_preserves_white_background() {
4700        let image = GrayImage::from_fn(9, 9, |x, y| {
4701            if x == 4 || y == 4 {
4702                Luma([0])
4703            } else {
4704                Luma([255])
4705            }
4706        });
4707
4708        let eroded = morphological_erode(&image, 1);
4709
4710        assert_eq!(eroded.get_pixel(0, 0).0[0], 255);
4711        assert_eq!(eroded.get_pixel(8, 8).0[0], 255);
4712        assert_eq!(eroded.get_pixel(4, 4).0[0], 255);
4713    }
4714
4715    #[test]
4716    fn test_dense_prose_image_ocr_detects_infographic_text() {
4717        let mut words = Vec::new();
4718        let mut top = 20;
4719        for line_num in 1..=8 {
4720            for (idx, (left, text)) in [
4721                (20, "Copyright"),
4722                (120, "protects"),
4723                (240, "creative"),
4724                (350, "work"),
4725            ]
4726            .into_iter()
4727            .enumerate()
4728            {
4729                words.push(OcrWord {
4730                    line_key: (1, 1, line_num),
4731                    left,
4732                    top,
4733                    width: 60,
4734                    height: 14,
4735                    confidence: 85.0,
4736                    text: if idx == 0 && line_num % 2 == 0 {
4737                        "Creators".to_string()
4738                    } else {
4739                        text.to_string()
4740                    },
4741                });
4742            }
4743            top += 22;
4744        }
4745
4746        assert!(looks_like_dense_prose_image_ocr(&words));
4747    }
4748
4749    #[test]
4750    fn test_dense_prose_image_ocr_rejects_chart_like_words() {
4751        let words = vec![
4752            word((1, 1, 1), 10, "70.2"),
4753            word((1, 1, 1), 90, "75.6"),
4754            word((1, 1, 1), 170, "92.4"),
4755            word((1, 1, 2), 10, "80.4"),
4756            word((1, 1, 2), 90, "94.2"),
4757            word((1, 1, 2), 170, "95.5"),
4758            word((1, 1, 3), 10, "Company"),
4759            word((1, 1, 3), 90, "A"),
4760            word((1, 1, 3), 170, "B"),
4761            word((1, 1, 4), 10, "Scene"),
4762            word((1, 1, 4), 90, "Document"),
4763            word((1, 1, 5), 10, "65"),
4764            word((1, 1, 5), 90, "70"),
4765            word((1, 1, 5), 170, "75"),
4766            word((1, 1, 6), 10, "80"),
4767            word((1, 1, 6), 90, "85"),
4768            word((1, 1, 6), 170, "90"),
4769            word((1, 1, 7), 10, "95"),
4770            word((1, 1, 7), 90, "100"),
4771        ];
4772
4773        assert!(!looks_like_dense_prose_image_ocr(&words));
4774    }
4775
    #[test]
    fn test_dense_prose_image_ocr_rejects_scattered_chart_labels() {
        // Infographic/pie-chart-style layout: a title line followed by
        // value/percentage pairs and category labels at irregular x-offsets.
        // All three classifiers must agree this is chart labelling — not a
        // table and not dense prose.
        let words = vec![
            // Title line.
            word_at((1, 1, 1), 20, 20, 80, "Participation"),
            word_at((1, 1, 1), 120, 20, 70, "of"),
            word_at((1, 1, 1), 210, 20, 90, "Institutions"),
            // Value + percentage callouts at scattered positions.
            word_at((1, 1, 2), 310, 50, 50, "57"),
            word_at((1, 1, 2), 380, 50, 60, "(24%)"),
            word_at((1, 1, 3), 290, 86, 40, "20"),
            word_at((1, 1, 3), 345, 86, 50, "(8%)"),
            // Category labels.
            word_at((1, 1, 4), 80, 124, 120, "Government"),
            word_at((1, 1, 4), 260, 124, 90, "Other"),
            word_at((1, 1, 4), 360, 124, 60, "State"),
            word_at((1, 1, 5), 70, 160, 80, "Civil"),
            word_at((1, 1, 5), 170, 160, 80, "Society"),
            word_at((1, 1, 5), 280, 160, 110, "Organizations"),
            word_at((1, 1, 6), 300, 194, 50, "31"),
            word_at((1, 1, 6), 365, 194, 60, "(13%)"),
            word_at((1, 1, 7), 35, 228, 120, "Educational"),
            word_at((1, 1, 7), 180, 228, 100, "Institution"),
            word_at((1, 1, 8), 250, 262, 40, "16"),
            word_at((1, 1, 8), 305, 262, 50, "(7%)"),
        ];

        assert!(looks_like_chart_label_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
        assert!(!looks_like_dense_prose_image_ocr(&words));
    }
4804
    #[test]
    fn test_chart_label_ocr_detects_stacked_bar_chart_legend_layout() {
        // Word layout transcribed from a real stacked-bar-chart OCR run:
        // y-axis tick labels down the left edge (350, 300, 250 …), a legend
        // column on the right (x ≈ 400-530), and year labels along the
        // bottom. Tesseract misreads ("ano", "'™", "(<LOMW)", "m=") are kept
        // deliberately so the classifier is exercised on realistic noise.
        let words = vec![
            word_at((1, 1, 1), 10, 15, 22, "ano"),
            word_at((1, 1, 1), 10, 8, 24, "MW."),
            word_at((1, 1, 2), 410, 25, 38, "Waste"),
            word_at((1, 1, 2), 452, 25, 55, "materials"),
            word_at((1, 1, 3), 11, 38, 21, "350"),
            word_at((1, 1, 4), 11, 61, 21, "300"),
            word_at((1, 1, 4), 411, 56, 38, "Biogas"),
            word_at((1, 1, 5), 7, 79, 25, "250"),
            word_at((1, 1, 5), 399, 87, 8, "'™"),
            word_at((1, 1, 5), 411, 87, 75, "Construction"),
            word_at((1, 1, 5), 490, 86, 33, "wood"),
            word_at((1, 1, 5), 527, 87, 35, "waste"),
            word_at((1, 1, 6), 11, 106, 21, "200"),
            word_at((1, 1, 7), 411, 117, 59, "General"),
            word_at((1, 1, 7), 467, 116, 27, "wood"),
            word_at((1, 1, 7), 499, 116, 54, "(10MWs)"),
            word_at((1, 1, 8), 11, 129, 21, "150"),
            word_at((1, 1, 9), 11, 152, 21, "100"),
            word_at((1, 1, 9), 399, 148, 7, "="),
            word_at((1, 1, 9), 411, 135, 46, "General"),
            word_at((1, 1, 9), 464, 135, 27, "wood"),
            word_at((1, 1, 9), 498, 146, 56, "(<LOMW)"),
            word_at((1, 1, 10), 13, 163, 18, "50"),
            word_at((1, 1, 10), 399, 178, 7, "="),
            word_at((1, 1, 10), 411, 176, 73, "Unutilised"),
            word_at((1, 1, 10), 480, 166, 29, "wood"),
            word_at((1, 1, 10), 516, 176, 45, "(2MWs)"),
            word_at((1, 1, 11), 24, 197, 7, "o"),
            word_at((1, 1, 12), 399, 208, 8, "m="),
            word_at((1, 1, 12), 411, 206, 59, "Unutilised"),
            word_at((1, 1, 12), 474, 206, 33, "wood"),
            word_at((1, 1, 12), 512, 206, 48, "(<2MW)"),
            // X-axis year labels along the bottom.
            word_at((1, 1, 13), 51, 217, 32, "12-13"),
            word_at((1, 1, 13), 96, 217, 28, "2014"),
            word_at((1, 1, 13), 139, 217, 28, "2015"),
            word_at((1, 1, 13), 182, 217, 28, "2016"),
            word_at((1, 1, 13), 225, 217, 28, "2017"),
            word_at((1, 1, 13), 268, 217, 28, "2018"),
            word_at((1, 1, 13), 311, 217, 28, "2019"),
            word_at((1, 1, 13), 354, 217, 28, "2020"),
        ];

        assert!(looks_like_chart_label_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
    }
4853
    #[test]
    fn test_build_numeric_table_border_rejects_sparse_chart_layout() {
        // 12 header-like labels but only 3 values per body row, staggered
        // across different columns — a sparse scatter typical of chart axis
        // labelling. Every classifier and the numeric builder must reject it.
        let image = image_chunk();
        let mut words = Vec::new();
        let columns = [20, 55, 90, 125, 160, 195, 230, 265, 300, 335, 370, 405];

        // Full header row across all 12 columns.
        for (idx, left) in columns.iter().enumerate() {
            words.push(word_at((1, 1, 1), *left, 20, 22, &format!("H{}", idx + 1)));
        }
        // Body rows 2-4 each populate only 3 of the 12 columns, shifted one
        // column to the right on each successive row.
        for (idx, left) in [20, 160, 300].into_iter().enumerate() {
            words.push(word_at((1, 1, 2), left, 52, 22, &format!("{}", idx + 1)));
        }
        for (idx, left) in [55, 195, 335].into_iter().enumerate() {
            words.push(word_at((1, 1, 3), left, 84, 22, &format!("{}", idx + 4)));
        }
        for (idx, left) in [90, 230, 370].into_iter().enumerate() {
            words.push(word_at((1, 1, 4), left, 116, 22, &format!("{}", idx + 7)));
        }
        // Final full row — still not enough to outweigh the sparse middle.
        for (idx, left) in columns.iter().enumerate() {
            words.push(word_at((1, 1, 5), *left, 148, 22, &format!("{}", idx + 10)));
        }

        assert!(looks_like_chart_label_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
        assert!(!looks_like_numeric_table_ocr(&words));
        assert!(build_numeric_table_border(&words, &image).is_none());
    }
4881
4882    #[test]
4883    fn test_lines_from_ocr_words_merges_wrapped_lines_into_blocks() {
4884        let words = vec![
4885            word_at((1, 1, 1), 20, 20, 64, "Copyright"),
4886            word_at((1, 1, 1), 100, 20, 56, "protects"),
4887            word_at((1, 1, 2), 20, 38, 52, "creative"),
4888            word_at((1, 1, 2), 84, 38, 36, "work"),
4889            word_at((1, 1, 3), 240, 20, 52, "Public"),
4890            word_at((1, 1, 3), 304, 20, 40, "domain"),
4891            word_at((1, 1, 4), 240, 38, 60, "expires"),
4892            word_at((1, 1, 4), 312, 38, 44, "later"),
4893        ];
4894
4895        let recovered = lines_from_ocr_words(&words, &image_chunk(), 400, 400, &[]);
4896
4897        assert_eq!(recovered.len(), 2);
4898        assert_eq!(recovered[0].value, "Copyright protects creative work");
4899        assert_eq!(recovered[1].value, "Public domain expires later");
4900    }
4901
    #[test]
    fn test_page_raster_ocr_skips_bar_chart_tables() {
        // An empty 2×2 TableBorder laid over a bar-chart raster must stay
        // empty after page-raster enrichment — OCR-ing chart ink into table
        // cells would fabricate data.
        // Light-fill vertical bar chart, same geometry as the detector tests.
        let mut chart = GrayImage::from_pixel(420, 260, Luma([255]));
        for x in 24..396 {
            chart.put_pixel(x, 222, Luma([170]));
        }
        for &(x1, x2, y1, shade) in &[
            (46, 82, 132, 222),
            (104, 140, 84, 214),
            (162, 198, 62, 206),
            (220, 256, 144, 228),
        ] {
            for x in x1..x2 {
                for y in y1..222 {
                    chart.put_pixel(x, y, Luma([shade]));
                }
            }
        }

        let page_bbox = BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0);
        // Hand-built empty 2×2 grid spanning the whole page.
        // NOTE(review): y_coordinates run high→low (260, 130, 0) while
        // x_coordinates run low→high — apparently top-down row order; confirm
        // against TableBorder's convention elsewhere in the codebase.
        let mut table = TableBorder {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0),
            index: None,
            level: None,
            x_coordinates: vec![0.0, 210.0, 420.0],
            x_widths: vec![0.0; 3],
            y_coordinates: vec![260.0, 130.0, 0.0],
            y_widths: vec![0.0; 3],
            rows: vec![
                TableBorderRow {
                    bbox: BoundingBox::new(Some(1), 0.0, 130.0, 420.0, 260.0),
                    index: None,
                    level: None,
                    row_number: 0,
                    cells: vec![
                        TableBorderCell {
                            bbox: BoundingBox::new(Some(1), 0.0, 130.0, 210.0, 260.0),
                            index: None,
                            level: None,
                            row_number: 0,
                            col_number: 0,
                            row_span: 1,
                            col_span: 1,
                            content: Vec::new(),
                            contents: Vec::new(),
                            semantic_type: None,
                        },
                        TableBorderCell {
                            bbox: BoundingBox::new(Some(1), 210.0, 130.0, 420.0, 260.0),
                            index: None,
                            level: None,
                            row_number: 0,
                            col_number: 1,
                            row_span: 1,
                            col_span: 1,
                            content: Vec::new(),
                            contents: Vec::new(),
                            semantic_type: None,
                        },
                    ],
                    semantic_type: None,
                },
                TableBorderRow {
                    bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 130.0),
                    index: None,
                    level: None,
                    row_number: 1,
                    cells: vec![
                        TableBorderCell {
                            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 210.0, 130.0),
                            index: None,
                            level: None,
                            row_number: 1,
                            col_number: 0,
                            row_span: 1,
                            col_span: 1,
                            content: Vec::new(),
                            contents: Vec::new(),
                            semantic_type: None,
                        },
                        TableBorderCell {
                            bbox: BoundingBox::new(Some(1), 210.0, 0.0, 420.0, 130.0),
                            index: None,
                            level: None,
                            row_number: 1,
                            col_number: 1,
                            row_span: 1,
                            col_span: 1,
                            content: Vec::new(),
                            contents: Vec::new(),
                            semantic_type: None,
                        },
                    ],
                    semantic_type: None,
                },
            ],
            num_rows: 2,
            num_columns: 2,
            is_bad_table: false,
            is_table_transformer: true,
            previous_table: None,
            next_table: None,
        };

        enrich_empty_table_from_page_raster(&chart, &page_bbox, &mut table);

        // No cell may have gained content from the chart ink.
        assert!(table
            .rows
            .iter()
            .flat_map(|row| row.cells.iter())
            .all(|cell| cell.content.is_empty()));
    }
5014
    #[test]
    fn test_lines_from_ocr_words_dedupes_against_native_text() {
        // If the PDF already carries the same sentence as native text, OCR
        // recovery must drop its duplicate lines entirely.
        let words = vec![
            word_at((1, 1, 1), 20, 20, 64, "Copyright"),
            word_at((1, 1, 1), 100, 20, 56, "protects"),
            word_at((1, 1, 2), 20, 38, 52, "creative"),
            word_at((1, 1, 2), 84, 38, 36, "work"),
        ];
        // Native chunk with the identical sentence. NOTE(review): its bbox
        // (0,0,10,10) does not coincide with the OCR word positions, so the
        // dedupe appears to be text-based rather than spatial — confirm.
        let native = vec![TextChunk {
            value: "Copyright protects creative work".to_string(),
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 10.0, 10.0),
            font_name: "Native".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: Vec::new(),
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Content,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }];

        let recovered = lines_from_ocr_words(&words, &image_chunk(), 400, 400, &native);

        assert!(recovered.is_empty());
    }
5047}