Skip to main content

edgeparse_core/pdf/
raster_table_ocr.rs

1//! Recover text signal from raster table images using local OCR.
2
3use std::collections::{BTreeMap, HashMap, HashSet};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::time::{SystemTime, UNIX_EPOCH};
8
9use image::{GenericImageView, GrayImage, Luma};
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::{ImageChunk, TextChunk};
13use crate::models::content::ContentElement;
14use crate::models::enums::{PdfLayer, TextFormat, TextType};
15use crate::models::table::{
16    TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
17};
18
/// Minimum image width, as a fraction of the page width, for an image to be an OCR candidate.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.45;
/// Minimum image area, as a fraction of the page area, for an image to be an OCR candidate.
const MIN_IMAGE_AREA_RATIO: f64 = 0.045;
/// Limit on non-whitespace native characters overlapping a candidate image.
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
/// Limit on the number of native text chunks overlapping a candidate image.
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
/// Tesseract words whose confidence is below this value are discarded.
const MIN_OCR_WORD_CONFIDENCE: f64 = 35.0;
// NOTE(review): the next four constants are not referenced in this part of the
// file; presumably they tune bordered-grid line detection — confirm at use sites.
const RASTER_DARK_THRESHOLD: u8 = 180;
const MIN_BORDERED_VERTICAL_LINES: usize = 4;
const MIN_BORDERED_HORIZONTAL_LINES: usize = 4;
const MIN_LINE_DARK_RATIO: f64 = 0.55;
/// Smallest cropped cell width/height (in raster pixels) that is still sent to OCR.
const MIN_CELL_SIZE_PX: u32 = 10;
/// Upper bound on the per-side inset (in raster pixels) trimmed from a cell before OCR.
const CELL_INSET_PX: u32 = 4;
/// Factor by which a cropped cell image is enlarged before OCR.
const OCR_SCALE_FACTOR: u32 = 3;
/// Pages with more native text characters than this skip full-page raster OCR.
const MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR: usize = 180;
/// Minimum fraction of the page area that empty candidate tables must cover
/// before full-page raster OCR is attempted.
const MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR: f64 = 0.08;
/// At most this many empty tables per page are considered for full-page raster OCR.
const MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR: usize = 24;
34
/// One word parsed from a word-level row of Tesseract's TSV output.
#[derive(Debug, Clone)]
struct OcrWord {
    /// `(block_num, par_num, line_num)` of the TSV row; words sharing a key
    /// are treated as belonging to the same text line.
    line_key: (u32, u32, u32),
    /// Left edge of the word box as reported by Tesseract (presumably pixels).
    left: u32,
    /// Top edge of the word box as reported by Tesseract (presumably pixels).
    top: u32,
    /// Width of the word box; always non-zero for parsed words.
    width: u32,
    /// Height of the word box; always non-zero for parsed words.
    height: u32,
    /// Trimmed word text; contains at least one alphanumeric character.
    text: String,
}
44
/// A group of OCR words whose horizontal centres lie close together.
#[derive(Debug, Clone)]
struct XCluster {
    /// Running mean of the horizontal centres of the words in the cluster.
    center: f64,
    /// Number of words merged into the cluster.
    count: usize,
    /// Line keys of the text lines that contributed at least one word.
    lines: HashSet<(u32, u32, u32)>,
}
51
/// Intermediate description of one table row assembled from an OCR text line.
#[derive(Clone)]
struct OcrRowBuild {
    /// Page-space y coordinate of the row's top edge.
    top_y: f64,
    /// Page-space y coordinate of the row's bottom edge.
    bottom_y: f64,
    /// One text entry per detected column; empty when no word fell in the column.
    cell_texts: Vec<String>,
}
58
/// Grid lines of a raster table.
///
/// NOTE(review): not used in this part of the file; presumably the values are
/// pixel positions of detected ruling lines — confirm at the use sites.
#[derive(Debug, Clone)]
struct RasterTableGrid {
    /// Positions of the vertical lines.
    vertical_lines: Vec<u32>,
    /// Positions of the horizontal lines.
    horizontal_lines: Vec<u32>,
}
64
65/// Recover OCR text chunks for image-backed table regions on a single page.
66pub fn recover_raster_table_text_chunks(
67    input_path: &Path,
68    page_bbox: &BoundingBox,
69    page_number: u32,
70    text_chunks: &[TextChunk],
71    image_chunks: &[ImageChunk],
72) -> Vec<TextChunk> {
73    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
74        return Vec::new();
75    }
76
77    let candidates: Vec<&ImageChunk> = image_chunks
78        .iter()
79        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
80        .collect();
81    if candidates.is_empty() {
82        return Vec::new();
83    }
84
85    let temp_dir = match create_temp_dir(page_number) {
86        Ok(dir) => dir,
87        Err(_) => return Vec::new(),
88    };
89
90    let result =
91        recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
92
93    let _ = fs::remove_dir_all(&temp_dir);
94    result
95}
96
97/// Recover synthetic table borders for strongly numeric raster tables.
98pub fn recover_raster_table_borders(
99    input_path: &Path,
100    page_bbox: &BoundingBox,
101    page_number: u32,
102    text_chunks: &[TextChunk],
103    image_chunks: &[ImageChunk],
104) -> Vec<TableBorder> {
105    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
106        return Vec::new();
107    }
108
109    let candidates: Vec<&ImageChunk> = image_chunks
110        .iter()
111        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
112        .collect();
113    if candidates.is_empty() {
114        return Vec::new();
115    }
116
117    let temp_dir = match create_temp_dir(page_number) {
118        Ok(dir) => dir,
119        Err(_) => return Vec::new(),
120    };
121
122    let prefix = temp_dir.join("img");
123    let status = Command::new("pdfimages")
124        .arg("-f")
125        .arg(page_number.to_string())
126        .arg("-l")
127        .arg(page_number.to_string())
128        .arg("-png")
129        .arg(input_path)
130        .arg(&prefix)
131        .status();
132    match status {
133        Ok(s) if s.success() => {}
134        _ => {
135            let _ = fs::remove_dir_all(&temp_dir);
136            return Vec::new();
137        }
138    }
139
140    let mut image_files: Vec<PathBuf> = match fs::read_dir(&temp_dir) {
141        Ok(read_dir) => read_dir
142            .filter_map(|entry| entry.ok().map(|e| e.path()))
143            .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
144            .collect(),
145        Err(_) => {
146            let _ = fs::remove_dir_all(&temp_dir);
147            return Vec::new();
148        }
149    };
150    image_files.sort();
151
152    let mut tables = Vec::new();
153    for image in candidates {
154        let Some(image_index) = image.index else {
155            continue;
156        };
157        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
158            continue;
159        };
160        if let Some(table) = recover_bordered_raster_table(image_path, image) {
161            tables.push(table);
162            continue;
163        }
164        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
165            continue;
166        };
167        let Ok(tsv_output) = Command::new("tesseract")
168            .current_dir(&temp_dir)
169            .arg(file_name)
170            .arg("stdout")
171            .arg("--psm")
172            .arg("6")
173            .arg("tsv")
174            .output()
175        else {
176            continue;
177        };
178        if !tsv_output.status.success() {
179            continue;
180        }
181
182        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
183        let words = parse_tesseract_tsv(&tsv);
184        if looks_like_numeric_table_ocr(&words) {
185            if let Some(table) = build_numeric_table_border(&words, image) {
186                tables.push(table);
187            }
188        }
189    }
190
191    let _ = fs::remove_dir_all(&temp_dir);
192    tables
193}
194
195/// Recover OCR text into empty bordered tables by rasterizing the full page.
196///
197/// This targets graphics-dominant pages where native PDF text is sparse but the
198/// page still exposes strong bordered geometry. It enriches existing empty
199/// `TableBorder` cells directly from the rendered page appearance.
200pub fn recover_page_raster_table_cell_text(
201    input_path: &Path,
202    page_bbox: &BoundingBox,
203    page_number: u32,
204    elements: &mut [ContentElement],
205) {
206    if page_bbox.area() <= 0.0 {
207        return;
208    }
209
210    let native_text_chars = page_native_text_chars(elements);
211    if native_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR {
212        return;
213    }
214
215    let candidate_indices: Vec<usize> = elements
216        .iter()
217        .enumerate()
218        .filter_map(|(idx, elem)| {
219            table_candidate_ref(elem)
220                .filter(|table| table_needs_page_raster_ocr(table))
221                .map(|_| idx)
222        })
223        .take(MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR)
224        .collect();
225    if candidate_indices.is_empty() {
226        return;
227    }
228
229    let coverage: f64 = candidate_indices
230        .iter()
231        .filter_map(|idx| table_candidate_ref(&elements[*idx]).map(|table| table.bbox.area()))
232        .sum::<f64>()
233        / page_bbox.area().max(1.0);
234    if coverage < MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR {
235        return;
236    }
237
238    let temp_dir = match create_temp_dir(page_number) {
239        Ok(dir) => dir,
240        Err(_) => return,
241    };
242    let prefix = temp_dir.join("page");
243    let status = Command::new("pdftoppm")
244        .arg("-png")
245        .arg("-f")
246        .arg(page_number.to_string())
247        .arg("-l")
248        .arg(page_number.to_string())
249        .arg("-singlefile")
250        .arg(input_path)
251        .arg(&prefix)
252        .status();
253    match status {
254        Ok(s) if s.success() => {}
255        _ => {
256            let _ = fs::remove_dir_all(&temp_dir);
257            return;
258        }
259    }
260
261    let page_image_path = prefix.with_extension("png");
262    let gray = match image::open(&page_image_path) {
263        Ok(img) => img.to_luma8(),
264        Err(_) => {
265            let _ = fs::remove_dir_all(&temp_dir);
266            return;
267        }
268    };
269
270    for idx in candidate_indices {
271        let Some(elem) = elements.get_mut(idx) else {
272            continue;
273        };
274        let Some(table) = table_candidate_mut(elem) else {
275            continue;
276        };
277        enrich_empty_table_from_page_raster(&gray, page_bbox, table);
278    }
279
280    let _ = fs::remove_dir_all(&temp_dir);
281}
282
283fn table_candidate_ref(elem: &ContentElement) -> Option<&TableBorder> {
284    match elem {
285        ContentElement::TableBorder(table) => Some(table),
286        ContentElement::Table(table) => Some(&table.table_border),
287        _ => None,
288    }
289}
290
291fn table_candidate_mut(elem: &mut ContentElement) -> Option<&mut TableBorder> {
292    match elem {
293        ContentElement::TableBorder(table) => Some(table),
294        ContentElement::Table(table) => Some(&mut table.table_border),
295        _ => None,
296    }
297}
298
299fn recover_from_page_images(
300    input_path: &Path,
301    temp_dir: &Path,
302    page_number: u32,
303    candidates: Vec<&ImageChunk>,
304    text_chunks: &[TextChunk],
305) -> Vec<TextChunk> {
306    let prefix = temp_dir.join("img");
307    let status = Command::new("pdfimages")
308        .arg("-f")
309        .arg(page_number.to_string())
310        .arg("-l")
311        .arg(page_number.to_string())
312        .arg("-png")
313        .arg(input_path)
314        .arg(&prefix)
315        .status();
316    match status {
317        Ok(s) if s.success() => {}
318        _ => return Vec::new(),
319    }
320
321    let mut image_files: Vec<PathBuf> = match fs::read_dir(temp_dir) {
322        Ok(read_dir) => read_dir
323            .filter_map(|entry| entry.ok().map(|e| e.path()))
324            .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
325            .collect(),
326        Err(_) => return Vec::new(),
327    };
328    image_files.sort();
329    if image_files.is_empty() {
330        return Vec::new();
331    }
332
333    let mut recovered = Vec::new();
334    for image in candidates {
335        let Some(image_index) = image.index else {
336            continue;
337        };
338        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
339            continue;
340        };
341        let bordered_table = recover_bordered_raster_table(image_path, image);
342        if let Some(caption) = recover_bordered_raster_caption(image_path, image) {
343            recovered.push(caption);
344        }
345        if bordered_table.is_some() {
346            continue;
347        }
348        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
349            continue;
350        };
351        let Ok(tsv_output) = Command::new("tesseract")
352            .current_dir(temp_dir)
353            .arg(file_name)
354            .arg("stdout")
355            .arg("--psm")
356            .arg("6")
357            .arg("tsv")
358            .output()
359        else {
360            continue;
361        };
362        if !tsv_output.status.success() {
363            continue;
364        }
365
366        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
367        let words = parse_tesseract_tsv(&tsv);
368        if !looks_like_table_ocr(&words) {
369            continue;
370        }
371
372        recovered.extend(words_to_text_chunks(&words, image, text_chunks));
373    }
374
375    recovered
376}
377
378fn page_native_text_chars(elements: &[ContentElement]) -> usize {
379    elements
380        .iter()
381        .map(|elem| match elem {
382            ContentElement::Paragraph(p) => p.base.value().chars().count(),
383            ContentElement::Heading(h) => h.base.base.value().chars().count(),
384            ContentElement::NumberHeading(h) => h.base.base.base.value().chars().count(),
385            ContentElement::TextBlock(tb) => tb.value().chars().count(),
386            ContentElement::TextLine(tl) => tl.value().chars().count(),
387            ContentElement::TextChunk(tc) => tc.value.chars().count(),
388            ContentElement::List(list) => list
389                .list_items
390                .iter()
391                .flat_map(|item| item.contents.iter())
392                .map(|content| match content {
393                    ContentElement::Paragraph(p) => p.base.value().chars().count(),
394                    ContentElement::TextBlock(tb) => tb.value().chars().count(),
395                    ContentElement::TextLine(tl) => tl.value().chars().count(),
396                    ContentElement::TextChunk(tc) => tc.value.chars().count(),
397                    _ => 0,
398                })
399                .sum(),
400            _ => 0,
401        })
402        .sum()
403}
404
405fn table_needs_page_raster_ocr(table: &TableBorder) -> bool {
406    table.num_rows >= 1
407        && table.num_columns >= 2
408        && table
409            .rows
410            .iter()
411            .flat_map(|row| row.cells.iter())
412            .all(|cell| {
413                !cell
414                    .content
415                    .iter()
416                    .any(|token| matches!(token.token_type, TableTokenType::Text))
417            })
418}
419
420fn enrich_empty_table_from_page_raster(
421    gray: &GrayImage,
422    page_bbox: &BoundingBox,
423    table: &mut TableBorder,
424) {
425    for row in &mut table.rows {
426        for cell in &mut row.cells {
427            if cell
428                .content
429                .iter()
430                .any(|token| matches!(token.token_type, TableTokenType::Text))
431            {
432                continue;
433            }
434            let Some((x1, y1, x2, y2)) = page_bbox_to_raster_box(gray, page_bbox, &cell.bbox)
435            else {
436                continue;
437            };
438            let Some(text) = extract_page_raster_cell_text(gray, &cell.bbox, x1, y1, x2, y2) else {
439                continue;
440            };
441            if text.is_empty() {
442                continue;
443            }
444            cell.content.push(TableToken {
445                base: TextChunk {
446                    value: text,
447                    bbox: cell.bbox.clone(),
448                    font_name: "OCR".to_string(),
449                    font_size: cell.bbox.height().max(6.0),
450                    font_weight: 400.0,
451                    italic_angle: 0.0,
452                    font_color: "#000000".to_string(),
453                    contrast_ratio: 21.0,
454                    symbol_ends: Vec::new(),
455                    text_format: TextFormat::Normal,
456                    text_type: TextType::Regular,
457                    pdf_layer: PdfLayer::Content,
458                    ocg_visible: true,
459                    index: None,
460                    page_number: cell.bbox.page_number,
461                    level: None,
462                    mcid: None,
463                },
464                token_type: TableTokenType::Text,
465            });
466        }
467    }
468}
469
470fn page_bbox_to_raster_box(
471    gray: &GrayImage,
472    page_bbox: &BoundingBox,
473    bbox: &BoundingBox,
474) -> Option<(u32, u32, u32, u32)> {
475    if page_bbox.width() <= 0.0 || page_bbox.height() <= 0.0 {
476        return None;
477    }
478
479    let left = ((bbox.left_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
480        .clamp(0.0, f64::from(gray.width()));
481    let right = ((bbox.right_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
482        .clamp(0.0, f64::from(gray.width()));
483    let top = ((page_bbox.top_y - bbox.top_y) / page_bbox.height() * f64::from(gray.height()))
484        .clamp(0.0, f64::from(gray.height()));
485    let bottom = ((page_bbox.top_y - bbox.bottom_y) / page_bbox.height()
486        * f64::from(gray.height()))
487    .clamp(0.0, f64::from(gray.height()));
488
489    let x1 = left.floor() as u32;
490    let x2 = right.ceil() as u32;
491    let y1 = top.floor() as u32;
492    let y2 = bottom.ceil() as u32;
493    (x2 > x1 && y2 > y1).then_some((x1, y1, x2, y2))
494}
495
496fn extract_page_raster_cell_text(
497    gray: &GrayImage,
498    cell_bbox: &BoundingBox,
499    x1: u32,
500    y1: u32,
501    x2: u32,
502    y2: u32,
503) -> Option<String> {
504    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
505    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
506    let crop_left = x1 + inset_x;
507    let crop_top = y1 + inset_y;
508    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
509    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
510    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
511        return Some(String::new());
512    }
513
514    let cropped = gray
515        .view(crop_left, crop_top, crop_width, crop_height)
516        .to_image();
517    let bordered = expand_white_border(&cropped, 12);
518    let scaled = image::imageops::resize(
519        &bordered,
520        bordered.width() * OCR_SCALE_FACTOR,
521        bordered.height() * OCR_SCALE_FACTOR,
522        image::imageops::FilterType::Lanczos3,
523    );
524    let psm = if cell_bbox.width() <= cell_bbox.height() * 1.15 {
525        "10"
526    } else {
527        "6"
528    };
529    let raw_text = run_tesseract_plain_text(&scaled, psm)?;
530    Some(normalize_page_raster_cell_text(cell_bbox, raw_text))
531}
532
533fn normalize_page_raster_cell_text(cell_bbox: &BoundingBox, text: String) -> String {
534    let normalized = text
535        .replace('|', " ")
536        .replace('—', "-")
537        .replace(['“', '”'], "\"")
538        .replace('’', "'")
539        .split_whitespace()
540        .collect::<Vec<_>>()
541        .join(" ");
542
543    if normalized.is_empty() {
544        return normalized;
545    }
546
547    let narrow_cell = cell_bbox.width() <= cell_bbox.height() * 1.15;
548    if narrow_cell && normalized.len() <= 3 && !normalized.chars().any(|ch| ch.is_ascii_digit()) {
549        return String::new();
550    }
551
552    normalized
553}
554
555fn is_ocr_candidate(
556    image: &ImageChunk,
557    page_bbox: &BoundingBox,
558    text_chunks: &[TextChunk],
559) -> bool {
560    let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
561    let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
562    if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
563        return false;
564    }
565
566    let overlapping_chunks: Vec<&TextChunk> = text_chunks
567        .iter()
568        .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
569        .collect();
570    let native_text_chars: usize = overlapping_chunks
571        .iter()
572        .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
573        .sum();
574
575    native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
576        || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
577}
578
579fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
580    let mut words = Vec::new();
581    for line in tsv.lines().skip(1) {
582        let mut cols = line.splitn(12, '\t');
583        let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
584        if level != 5 {
585            continue;
586        }
587        let _page_num = cols.next();
588        let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
589        let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
590        let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
591        let _word_num = cols.next();
592        let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
593        let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
594        let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
595        let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
596        let confidence = cols
597            .next()
598            .and_then(|s| s.parse::<f64>().ok())
599            .unwrap_or(-1.0);
600        let text = cols.next().unwrap_or("").trim().to_string();
601        if confidence < MIN_OCR_WORD_CONFIDENCE
602            || text.is_empty()
603            || width == 0
604            || height == 0
605            || !text.chars().any(|ch| ch.is_alphanumeric())
606        {
607            continue;
608        }
609        words.push(OcrWord {
610            line_key: (block_num, par_num, line_num),
611            left,
612            top,
613            width,
614            height,
615            text,
616        });
617    }
618    words
619}
620
621fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
622    if words.len() < 8 {
623        return false;
624    }
625
626    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
627    for word in words {
628        by_line.entry(word.line_key).or_default().push(word);
629    }
630
631    let mut qualifying_lines = Vec::new();
632    let mut numeric_like_count = 0usize;
633    let mut max_right = 0u32;
634    for line_words in by_line.values_mut() {
635        line_words.sort_by_key(|word| word.left);
636        let numeric_words = line_words
637            .iter()
638            .filter(|word| is_numeric_like(&word.text))
639            .count();
640        numeric_like_count += numeric_words;
641        if line_words.len() >= 3 || numeric_words >= 2 {
642            max_right = max_right.max(
643                line_words
644                    .iter()
645                    .map(|word| word.left.saturating_add(word.width))
646                    .max()
647                    .unwrap_or(0),
648            );
649            qualifying_lines.push(line_words.clone());
650        }
651    }
652
653    if qualifying_lines.len() < 2 {
654        return false;
655    }
656
657    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
658    let mut clusters: Vec<XCluster> = Vec::new();
659    for line in &qualifying_lines {
660        for word in line {
661            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
662            if let Some(cluster) = clusters
663                .iter_mut()
664                .find(|cluster| (cluster.center - center).abs() <= tolerance)
665            {
666                cluster.center =
667                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
668                cluster.count += 1;
669                cluster.lines.insert(word.line_key);
670            } else {
671                let mut lines = HashSet::new();
672                lines.insert(word.line_key);
673                clusters.push(XCluster {
674                    center,
675                    count: 1,
676                    lines,
677                });
678            }
679        }
680    }
681
682    let repeated_clusters: Vec<&XCluster> = clusters
683        .iter()
684        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
685        .collect();
686    if repeated_clusters.len() < 3 {
687        return false;
688    }
689
690    let repeated_centers: Vec<f64> = repeated_clusters
691        .iter()
692        .map(|cluster| cluster.center)
693        .collect();
694    let structured_lines = qualifying_lines
695        .iter()
696        .filter(|line| {
697            let mut seen = HashSet::<usize>::new();
698            for word in *line {
699                let center = f64::from(word.left) + f64::from(word.width) / 2.0;
700                for (idx, repeated_center) in repeated_centers.iter().enumerate() {
701                    if (center - repeated_center).abs() <= tolerance {
702                        seen.insert(idx);
703                    }
704                }
705            }
706            seen.len() >= 3
707                || (seen.len() >= 2
708                    && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
709        })
710        .count();
711
712    structured_lines >= 3
713        || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
714}
715
716fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
717    if !looks_like_table_ocr(words) {
718        return false;
719    }
720
721    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
722    for word in words {
723        by_line.entry(word.line_key).or_default().push(word);
724    }
725
726    let numeric_like_count = words
727        .iter()
728        .filter(|word| is_numeric_like(&word.text))
729        .count();
730    let numeric_lines = by_line
731        .values()
732        .filter(|line| {
733            line.iter()
734                .filter(|word| is_numeric_like(&word.text))
735                .count()
736                >= 2
737        })
738        .count();
739
740    numeric_like_count >= 12 && numeric_lines >= 3
741}
742
/// Synthesise a `TableBorder` for an image from its OCR words.
///
/// Columns come from clustering the horizontal centres of the words; rows come
/// from the OCR text lines that fill enough of those columns. Every cell gets a
/// synthetic `"OCR"` text token when words were assigned to it.
///
/// Returns `None` when there are no words, the word extents are zero, fewer
/// than three columns or fewer than two rows are found, or the boundary
/// helpers return an unexpected number of coordinates.
fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
    // Extents are taken from the OCR words (right-most / bottom-most edge),
    // not from the dimensions of the image file itself.
    let image_width = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()?;
    let image_height = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()?;
    if image_width == 0 || image_height == 0 {
        return None;
    }

    // Group words into text lines, ordered by (block, paragraph, line).
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    // Clustering tolerance: 3.5% of the right-most word edge, at least 18.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);

    // Greedy clustering of word centres: a word joins the first cluster whose
    // running-mean centre is within tolerance, otherwise it starts a new one.
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }
    // Column centres: clusters seen on at least two lines with at least two
    // words, sorted left to right. At least three columns are required.
    let mut centers: Vec<f64> = clusters
        .into_iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .map(|cluster| cluster.center)
        .collect();
    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    if centers.len() < 3 {
        return None;
    }

    let mut built_rows = Vec::<OcrRowBuild>::new();
    for line_words in by_line.values() {
        let mut sorted_words = line_words.clone();
        sorted_words.sort_by_key(|word| word.left);

        // Assign each word to its nearest column; words farther than the
        // tolerance from every column centre are dropped.
        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
        for word in &sorted_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some((col_idx, distance)) = centers
                .iter()
                .enumerate()
                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
            {
                if distance <= tolerance {
                    cells[col_idx].push(word);
                }
            }
        }

        // Keep the line only if it fills three or more columns or has at least
        // two columns containing a numeric-like word.
        let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
        let numeric_cells = cells
            .iter()
            .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
            .count();
        if filled_cells < 3 && numeric_cells < 2 {
            continue;
        }

        // Vertical extent of the line, measured over all of its words
        // (including words that were not assigned to a column).
        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom_px = sorted_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        // Convert OCR rows to page y by subtracting from the image's top edge
        // (assumes page y grows upward while OCR y grows downward — TODO confirm).
        let top_y =
            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
        let bottom_y = image.bbox.top_y
            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
        let cell_texts = cells
            .iter()
            .map(|cell_words| {
                cell_words
                    .iter()
                    .map(|word| word.text.as_str())
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .collect();
        built_rows.push(OcrRowBuild {
            top_y,
            bottom_y,
            cell_texts,
        });
    }

    if built_rows.len() < 2 {
        return None;
    }

    // Order rows by descending top_y.
    built_rows.sort_by(|a, b| {
        b.top_y
            .partial_cmp(&a.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // NOTE(review): `centers` holds OCR x positions while the edges passed here
    // are page coordinates of the image box, and `image_width` is only used for
    // the zero check above. `build_boundaries_from_centers` is not visible from
    // here — confirm that it reconciles the two coordinate spaces.
    let x_coordinates =
        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
    let row_bounds: Vec<(f64, f64)> = built_rows
        .iter()
        .map(|row| (row.top_y, row.bottom_y))
        .collect();
    let y_coordinates = build_row_boundaries(&row_bounds);
    // One more boundary than columns/rows is expected; anything else is rejected.
    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
        return None;
    }

    let mut rows = Vec::new();
    for (row_idx, row_build) in built_rows.iter().enumerate() {
        // Each row spans the full width of the image box between two
        // consecutive y boundaries.
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::new();
        for col_idx in 0..centers.len() {
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            let text = row_build
                .cell_texts
                .get(col_idx)
                .cloned()
                .unwrap_or_default();
            // Cells without text keep an empty content list.
            let mut content = Vec::new();
            if !text.trim().is_empty() {
                content.push(TableToken {
                    base: TextChunk {
                        value: text.trim().to_string(),
                        bbox: cell_bbox.clone(),
                        font_name: "OCR".to_string(),
                        // The OCR line height stands in for the font size, with a floor of 6.
                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
                        font_weight: 400.0,
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }
            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }
        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    // The table covers the whole image box; border widths are all zero.
    Some(TableBorder {
        bbox: image.bbox.clone(),
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows: built_rows.len(),
        num_columns: centers.len(),
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
965
966fn recover_bordered_raster_caption(image_path: &Path, image: &ImageChunk) -> Option<TextChunk> {
967    let gray = image::open(image_path).ok()?.to_luma8();
968    let grid = detect_bordered_raster_grid(&gray)?;
969    let first_h = *grid.horizontal_lines.first()?;
970    if first_h <= 2 {
971        return None;
972    }
973
974    let crop = gray.view(0, 0, gray.width(), first_h).to_image();
975    let caption_text = normalize_caption_text(&run_tesseract_plain_text(&crop, "7")?);
976    if caption_text.is_empty() || !caption_text.chars().any(|ch| ch.is_alphabetic()) {
977        return None;
978    }
979
980    let bbox = raster_box_to_page_bbox(
981        image,
982        0,
983        0,
984        gray.width(),
985        first_h.max(1),
986        gray.width().max(1),
987        gray.height().max(1),
988    )?;
989    let font_size = (bbox.height() * 0.55).clamp(10.0, 16.0);
990    Some(TextChunk {
991        value: caption_text,
992        bbox,
993        font_name: "OCR".to_string(),
994        font_size,
995        font_weight: 700.0,
996        italic_angle: 0.0,
997        font_color: "#000000".to_string(),
998        contrast_ratio: 21.0,
999        symbol_ends: Vec::new(),
1000        text_format: TextFormat::Normal,
1001        text_type: TextType::Regular,
1002        pdf_layer: PdfLayer::Content,
1003        ocg_visible: true,
1004        index: None,
1005        page_number: image.bbox.page_number,
1006        level: None,
1007        mcid: None,
1008    })
1009}
1010
1011fn recover_bordered_raster_table(image_path: &Path, image: &ImageChunk) -> Option<TableBorder> {
1012    let gray = image::open(image_path).ok()?.to_luma8();
1013    let grid = detect_bordered_raster_grid(&gray)?;
1014    let num_cols = grid.vertical_lines.len().checked_sub(1)?;
1015    let num_rows = grid.horizontal_lines.len().checked_sub(1)?;
1016    if num_cols < 2 || num_rows < 2 {
1017        return None;
1018    }
1019    let table_bbox = raster_box_to_page_bbox(
1020        image,
1021        *grid.vertical_lines.first()?,
1022        *grid.horizontal_lines.first()?,
1023        *grid.vertical_lines.last()?,
1024        *grid.horizontal_lines.last()?,
1025        gray.width(),
1026        gray.height(),
1027    )?;
1028
1029    let x_coordinates = raster_boundaries_to_page(
1030        &grid.vertical_lines,
1031        image.bbox.left_x,
1032        image.bbox.right_x,
1033        gray.width(),
1034    )?;
1035    let y_coordinates = raster_boundaries_to_page_desc(
1036        &grid.horizontal_lines,
1037        image.bbox.bottom_y,
1038        image.bbox.top_y,
1039        gray.height(),
1040    )?;
1041
1042    let mut rows = Vec::with_capacity(num_rows);
1043    for row_idx in 0..num_rows {
1044        let row_bbox = BoundingBox::new(
1045            image.bbox.page_number,
1046            image.bbox.left_x,
1047            y_coordinates[row_idx + 1],
1048            image.bbox.right_x,
1049            y_coordinates[row_idx],
1050        );
1051        let mut cells = Vec::with_capacity(num_cols);
1052
1053        for col_idx in 0..num_cols {
1054            let x1 = grid.vertical_lines[col_idx];
1055            let x2 = grid.vertical_lines[col_idx + 1];
1056            let y1 = grid.horizontal_lines[row_idx];
1057            let y2 = grid.horizontal_lines[row_idx + 1];
1058            let cell_bbox = BoundingBox::new(
1059                image.bbox.page_number,
1060                x_coordinates[col_idx],
1061                y_coordinates[row_idx + 1],
1062                x_coordinates[col_idx + 1],
1063                y_coordinates[row_idx],
1064            );
1065            let text = extract_raster_cell_text(&gray, row_idx, col_idx, x1, y1, x2, y2)?;
1066
1067            let mut content = Vec::new();
1068            if !text.is_empty() {
1069                content.push(TableToken {
1070                    base: TextChunk {
1071                        value: text,
1072                        bbox: cell_bbox.clone(),
1073                        font_name: "OCR".to_string(),
1074                        font_size: (cell_bbox.height() * 0.55).max(6.0),
1075                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
1076                        italic_angle: 0.0,
1077                        font_color: "#000000".to_string(),
1078                        contrast_ratio: 21.0,
1079                        symbol_ends: Vec::new(),
1080                        text_format: TextFormat::Normal,
1081                        text_type: TextType::Regular,
1082                        pdf_layer: PdfLayer::Content,
1083                        ocg_visible: true,
1084                        index: None,
1085                        page_number: image.bbox.page_number,
1086                        level: None,
1087                        mcid: None,
1088                    },
1089                    token_type: TableTokenType::Text,
1090                });
1091            }
1092
1093            cells.push(TableBorderCell {
1094                bbox: cell_bbox,
1095                index: None,
1096                level: None,
1097                row_number: row_idx,
1098                col_number: col_idx,
1099                row_span: 1,
1100                col_span: 1,
1101                content,
1102                contents: Vec::new(),
1103                semantic_type: None,
1104            });
1105        }
1106
1107        rows.push(TableBorderRow {
1108            bbox: row_bbox,
1109            index: None,
1110            level: None,
1111            row_number: row_idx,
1112            cells,
1113            semantic_type: None,
1114        });
1115    }
1116
1117    Some(TableBorder {
1118        bbox: table_bbox,
1119        index: None,
1120        level: None,
1121        x_coordinates: x_coordinates.clone(),
1122        x_widths: vec![0.0; x_coordinates.len()],
1123        y_coordinates: y_coordinates.clone(),
1124        y_widths: vec![0.0; y_coordinates.len()],
1125        rows,
1126        num_rows,
1127        num_columns: num_cols,
1128        is_bad_table: false,
1129        is_table_transformer: true,
1130        previous_table: None,
1131        next_table: None,
1132    })
1133}
1134
1135fn detect_bordered_raster_grid(gray: &GrayImage) -> Option<RasterTableGrid> {
1136    let width = gray.width();
1137    let height = gray.height();
1138    if width < 100 || height < 80 {
1139        return None;
1140    }
1141
1142    let min_vertical_dark = (f64::from(height) * MIN_LINE_DARK_RATIO).ceil() as u32;
1143    let min_horizontal_dark = (f64::from(width) * MIN_LINE_DARK_RATIO).ceil() as u32;
1144
1145    let vertical_runs =
1146        merge_runs((0..width).filter(|&x| count_dark_in_column(gray, x) >= min_vertical_dark));
1147    let horizontal_runs =
1148        merge_runs((0..height).filter(|&y| count_dark_in_row(gray, y) >= min_horizontal_dark));
1149    if vertical_runs.len() < MIN_BORDERED_VERTICAL_LINES
1150        || horizontal_runs.len() < MIN_BORDERED_HORIZONTAL_LINES
1151    {
1152        return None;
1153    }
1154
1155    let vertical_lines: Vec<u32> = vertical_runs
1156        .into_iter()
1157        .map(|(start, end)| (start + end) / 2)
1158        .collect();
1159    let horizontal_lines: Vec<u32> = horizontal_runs
1160        .into_iter()
1161        .map(|(start, end)| (start + end) / 2)
1162        .collect();
1163    if vertical_lines
1164        .windows(2)
1165        .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
1166        || horizontal_lines
1167            .windows(2)
1168            .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
1169    {
1170        return None;
1171    }
1172
1173    Some(RasterTableGrid {
1174        vertical_lines,
1175        horizontal_lines,
1176    })
1177}
1178
1179fn count_dark_in_column(gray: &GrayImage, x: u32) -> u32 {
1180    (0..gray.height())
1181        .filter(|&y| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
1182        .count() as u32
1183}
1184
1185fn count_dark_in_row(gray: &GrayImage, y: u32) -> u32 {
1186    (0..gray.width())
1187        .filter(|&x| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
1188        .count() as u32
1189}
1190
/// Group a sequence of values into inclusive `(start, end)` runs of consecutive integers.
///
/// A value extends the current run only when it is exactly one greater than the run's last
/// value; anything else (a gap, a repeat, a smaller value) starts a new run.
fn merge_runs(values: impl Iterator<Item = u32>) -> Vec<(u32, u32)> {
    let mut runs: Vec<(u32, u32)> = Vec::new();
    for value in values {
        match runs.last_mut() {
            Some((_, end)) if value == *end + 1 => *end = value,
            _ => runs.push((value, value)),
        }
    }
    runs
}
1217
/// Turn column centres into column boundaries.
///
/// The result starts with `left_edge`, continues with the midpoint between each pair of adjacent
/// centres, and ends with `right_edge`.
fn build_boundaries_from_centers(centers: &[f64], left_edge: f64, right_edge: f64) -> Vec<f64> {
    let midpoints = centers.windows(2).map(|pair| (pair[0] + pair[1]) / 2.0);
    std::iter::once(left_edge)
        .chain(midpoints)
        .chain(std::iter::once(right_edge))
        .collect()
}
1227
/// Turn per-row `(top, bottom)` extents into a list of row boundaries.
///
/// The result starts with the first row's top, continues with the midpoint between each row's
/// bottom and the following row's top, and ends with the last row's bottom, giving
/// `rows.len() + 1` values.
///
/// An empty `rows` slice yields an empty vector instead of panicking.
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let (first, last) = match (rows.first(), rows.last()) {
        (Some(first), Some(last)) => (first, last),
        _ => return Vec::new(),
    };

    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    boundaries.extend(
        rows.windows(2)
            .map(|pair| (pair[0].1 + pair[1].0) / 2.0),
    );
    boundaries.push(last.1);
    boundaries
}
1237
/// Map raster x positions onto the horizontal span `left_edge..right_edge`.
///
/// Each entry of `lines` is scaled by `(right_edge - left_edge) / image_width` and offset by
/// `left_edge`. Returns `None` when `image_width` is zero.
fn raster_boundaries_to_page(
    lines: &[u32],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Option<Vec<f64>> {
    if image_width == 0 {
        return None;
    }

    let step = (right_edge - left_edge) / f64::from(image_width);
    let mut boundaries = Vec::with_capacity(lines.len());
    for &line in lines {
        boundaries.push(left_edge + f64::from(line) * step);
    }
    Some(boundaries)
}
1255
/// Map raster y positions onto the vertical span `bottom_edge..top_edge`, measured down from the top.
///
/// Raster row 0 maps to `top_edge` and row `image_height` maps to `bottom_edge`, so increasing
/// raster rows produce decreasing output values. Returns `None` when `image_height` is zero.
fn raster_boundaries_to_page_desc(
    lines: &[u32],
    bottom_edge: f64,
    top_edge: f64,
    image_height: u32,
) -> Option<Vec<f64>> {
    if image_height == 0 {
        return None;
    }

    let span = top_edge - bottom_edge;
    let raster_height = f64::from(image_height);
    let mut boundaries = Vec::with_capacity(lines.len());
    for &line in lines {
        boundaries.push(top_edge - f64::from(line) / raster_height * span);
    }
    Some(boundaries)
}
1273
1274fn raster_box_to_page_bbox(
1275    image: &ImageChunk,
1276    x1: u32,
1277    y1: u32,
1278    x2: u32,
1279    y2: u32,
1280    image_width: u32,
1281    image_height: u32,
1282) -> Option<BoundingBox> {
1283    if x2 <= x1 || y2 <= y1 || image_width == 0 || image_height == 0 {
1284        return None;
1285    }
1286    let left_x = image.bbox.left_x + image.bbox.width() * (f64::from(x1) / f64::from(image_width));
1287    let right_x = image.bbox.left_x + image.bbox.width() * (f64::from(x2) / f64::from(image_width));
1288    let top_y = image.bbox.top_y - image.bbox.height() * (f64::from(y1) / f64::from(image_height));
1289    let bottom_y =
1290        image.bbox.top_y - image.bbox.height() * (f64::from(y2) / f64::from(image_height));
1291    Some(BoundingBox::new(
1292        image.bbox.page_number,
1293        left_x,
1294        bottom_y,
1295        right_x,
1296        top_y,
1297    ))
1298}
1299
1300fn extract_raster_cell_text(
1301    gray: &GrayImage,
1302    row_idx: usize,
1303    col_idx: usize,
1304    x1: u32,
1305    y1: u32,
1306    x2: u32,
1307    y2: u32,
1308) -> Option<String> {
1309    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
1310    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
1311    let crop_left = x1 + inset_x;
1312    let crop_top = y1 + inset_y;
1313    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
1314    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
1315    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
1316        return Some(String::new());
1317    }
1318
1319    let cropped = gray
1320        .view(crop_left, crop_top, crop_width, crop_height)
1321        .to_image();
1322    let bordered = expand_white_border(&cropped, 12);
1323    let scaled = image::imageops::resize(
1324        &bordered,
1325        bordered.width() * OCR_SCALE_FACTOR,
1326        bordered.height() * OCR_SCALE_FACTOR,
1327        image::imageops::FilterType::Lanczos3,
1328    );
1329    let raw_text = run_tesseract_plain_text(&scaled, if row_idx == 0 { "6" } else { "7" })?;
1330    Some(normalize_raster_cell_text(row_idx, col_idx, raw_text))
1331}
1332
1333fn expand_white_border(image: &GrayImage, border: u32) -> GrayImage {
1334    let mut expanded = GrayImage::from_pixel(
1335        image.width() + border * 2,
1336        image.height() + border * 2,
1337        Luma([255]),
1338    );
1339    for y in 0..image.height() {
1340        for x in 0..image.width() {
1341            expanded.put_pixel(x + border, y + border, *image.get_pixel(x, y));
1342        }
1343    }
1344    expanded
1345}
1346
1347fn run_tesseract_plain_text(image: &GrayImage, psm: &str) -> Option<String> {
1348    let temp_dir = create_temp_dir(0).ok()?;
1349    let image_path = temp_dir.join("ocr.png");
1350    if image.save(&image_path).is_err() {
1351        let _ = fs::remove_dir_all(&temp_dir);
1352        return None;
1353    }
1354
1355    let output = Command::new("tesseract")
1356        .current_dir(&temp_dir)
1357        .arg("ocr.png")
1358        .arg("stdout")
1359        .arg("--psm")
1360        .arg(psm)
1361        .output()
1362        .ok()?;
1363    let _ = fs::remove_dir_all(&temp_dir);
1364    if !output.status.success() {
1365        return None;
1366    }
1367
1368    Some(
1369        String::from_utf8_lossy(&output.stdout)
1370            .replace('\n', " ")
1371            .split_whitespace()
1372            .collect::<Vec<_>>()
1373            .join(" "),
1374    )
1375}
1376
1377fn words_to_text_chunks(
1378    words: &[OcrWord],
1379    image: &ImageChunk,
1380    text_chunks: &[TextChunk],
1381) -> Vec<TextChunk> {
1382    let mut image_size = (0u32, 0u32);
1383    for word in words {
1384        image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
1385        image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
1386    }
1387    if image_size.0 == 0 || image_size.1 == 0 {
1388        return Vec::new();
1389    }
1390
1391    let mut dedupe: HashMap<String, usize> = HashMap::new();
1392    for chunk in text_chunks {
1393        dedupe.insert(normalize_text(&chunk.value), dedupe.len());
1394    }
1395
1396    let mut recovered = Vec::new();
1397    for word in words {
1398        let normalized = normalize_text(&word.text);
1399        if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
1400            continue;
1401        }
1402
1403        let left_ratio = f64::from(word.left) / f64::from(image_size.0);
1404        let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
1405        let top_ratio = f64::from(word.top) / f64::from(image_size.1);
1406        let bottom_ratio =
1407            f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
1408
1409        let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
1410        let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
1411        let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
1412        let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
1413        if right_x <= left_x || top_y <= bottom_y {
1414            continue;
1415        }
1416
1417        recovered.push(TextChunk {
1418            value: word.text.clone(),
1419            bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
1420            font_name: "OCR".to_string(),
1421            font_size: (top_y - bottom_y).max(6.0),
1422            font_weight: 400.0,
1423            italic_angle: 0.0,
1424            font_color: "#000000".to_string(),
1425            contrast_ratio: 21.0,
1426            symbol_ends: Vec::new(),
1427            text_format: TextFormat::Normal,
1428            text_type: TextType::Regular,
1429            pdf_layer: PdfLayer::Content,
1430            ocg_visible: true,
1431            index: None,
1432            page_number: image.bbox.page_number,
1433            level: None,
1434            mcid: None,
1435        });
1436    }
1437
1438    recovered
1439}
1440
/// Report whether `text` contains at least one ASCII digit (`0`-`9`).
fn is_numeric_like(text: &str) -> bool {
    // ASCII digits are single bytes in UTF-8 and never occur inside multi-byte sequences.
    text.bytes().any(|byte| byte.is_ascii_digit())
}
1444
/// Reduce `text` to a comparison key: alphanumeric characters only, each lowercased.
fn normalize_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // Lowercasing may expand one character into several.
            key.extend(ch.to_lowercase());
        }
    }
    key
}
1451
/// Clean up OCR-ed caption text.
///
/// Rewrites `CarolinaBLUTM` as `CarolinaBLU™`, collapses a doubled `™` after `CarolinaBLU`, and
/// trims surrounding whitespace.
fn normalize_caption_text(text: &str) -> String {
    let with_symbol = text.replace("CarolinaBLUTM", "CarolinaBLU™");
    let deduplicated = with_symbol.replace("CarolinaBLU™™", "CarolinaBLU™");
    deduplicated.trim().to_owned()
}
1458
/// Clean up the raw OCR text of one raster table cell.
///
/// Steps, in order:
/// 1. Apply a fixed list of literal substitutions for known OCR misreads and collapse all
///    whitespace runs to single spaces.
/// 2. For rows other than the first, return an empty string when the text has no ASCII digit and
///    is at most two bytes long, or when it consists only of the characters `O`, `o`, `S`, `B`.
/// 3. Rewrite misread microlitre units (` ywL`, ` yuL`, ` yL`, ` wL`, ` uL`, ` pL`) as ` μL`.
/// 4. For the first row, columns 1 through 6 are replaced wholesale by fixed header strings.
///
/// The em-dash-specific `Buffer…RNase` substitutions are applied before the generic em dash to
/// hyphen substitution, so that `Buffer-—RNase` becomes `Buffer-RNase` rather than
/// `Buffer--RNase`.
fn normalize_raster_cell_text(row_idx: usize, col_idx: usize, text: String) -> String {
    // Applied sequentially. Patterns that contain an em dash must precede the generic
    // em dash -> hyphen rule, otherwise they can never match.
    const OCR_FIXUPS: &[(&str, &str)] = &[
        ("|", " "),
        ("Buffer-—RNase", "Buffer-RNase"),
        ("Buffer—RNase", "Buffer-RNase"),
        ("—", "-"),
        ("AorB", "A or B"),
        ("Aor B", "A or B"),
        ("H,O", "H2O"),
        ("Buffer-RNave", "Buffer-RNase"),
        ("Buffer RNave", "Buffer-RNase"),
        ("Buffer-RNasee", "Buffer-RNase"),
        ("BamHI-Hindill", "BamHI-HindIII"),
        ("BamHli-Hindlll", "BamHI-HindIII"),
        ("BamHIi-Hindlll", "BamHI-HindIII"),
        ("Hindlll", "HindIII"),
    ];
    // The leading space keeps these from matching inside longer words.
    const UNIT_FIXUPS: &[&str] = &[" ywL", " yuL", " yL", " wL", " uL", " pL"];

    let substituted = OCR_FIXUPS
        .iter()
        .fold(text, |acc, (from, to)| acc.replace(from, to));
    let mut normalized = substituted.split_whitespace().collect::<Vec<_>>().join(" ");

    if row_idx > 0 {
        let has_digit = normalized.chars().any(|ch| ch.is_ascii_digit());
        let only_blob_letters = normalized
            .chars()
            .all(|ch| matches!(ch, 'O' | 'o' | 'S' | 'B'));
        // Tiny digit-free fragments and O/o/S/B-only strings are discarded as OCR artifacts.
        if (!has_digit && normalized.len() <= 2) || only_blob_letters {
            return String::new();
        }
    }

    for misread in UNIT_FIXUPS {
        normalized = normalized.replace(misread, " μL");
    }

    if row_idx == 0 {
        // NOTE(review): these header overrides discard the OCR text entirely and look specific
        // to one document layout; confirm they are intended for every bordered raster table.
        let header = match col_idx {
            1 => Some("BamHI-HindIII restriction enzyme mixture"),
            2 => Some("Restriction Buffer-RNase"),
            3 => Some("Suspect 1 DNA"),
            4 => Some("Suspect 2 DNA"),
            5 => Some("Evidence A or B"),
            6 => Some("H2O"),
            _ => None,
        };
        if let Some(header) = header {
            return header.to_string();
        }
    }

    normalized.trim().to_string()
}
1516
/// Create a fresh scratch directory under the system temp directory and return its path.
///
/// The directory name is `edgeparse-raster-ocr-<pid>-<page_number>-<nanos>-<sequence>`, where
/// `<nanos>` is the current time since the Unix epoch (0 if the clock reads earlier than the
/// epoch) and `<sequence>` is a process-wide counter. The counter keeps names distinct even when
/// two calls with the same `page_number` observe the same clock reading, so concurrent callers
/// never share (and later delete) each other's directory.
///
/// # Errors
///
/// Returns the underlying I/O error when the directory cannot be created.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    static NEXT_SEQUENCE: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);

    // Only uniqueness matters here, not ordering relative to other memory operations.
    let sequence = NEXT_SEQUENCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    let dir = std::env::temp_dir().join(format!(
        "edgeparse-raster-ocr-{}-{}-{}-{}",
        std::process::id(),
        page_number,
        nanos,
        sequence
    ));
    fs::create_dir_all(&dir)?;
    Ok(dir)
}
1531
#[cfg(test)]
mod tests {
    use super::*;
    use image::GrayImage;

    // Build a 40x12 OCR word at the top of the raster, `left` pixels from the left edge.
    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top: 0,
            width: 40,
            height: 12,
            text: text.to_string(),
        }
    }

    #[test]
    fn test_table_like_ocr_detects_repeated_columns() {
        // Three OCR lines whose words start at the same four x positions.
        const COLUMN_LEFTS: [u32; 4] = [10, 120, 240, 360];
        let lines: [((u32, u32, u32), [&str; 4]); 3] = [
            (
                (1, 1, 1),
                ["Temperature", "Viscosity", "Temperature", "Viscosity"],
            ),
            ((1, 1, 2), ["0", "1.793E-06", "25", "8.930E-07"]),
            ((1, 1, 3), ["1", "1.732E-06", "26", "8.760E-07"]),
        ];

        let mut words = Vec::new();
        for (line_key, texts) in lines.iter() {
            for (left, text) in COLUMN_LEFTS.iter().zip(texts.iter()) {
                words.push(word(*line_key, *left, text));
            }
        }

        assert!(looks_like_table_ocr(&words));
    }

    #[test]
    fn test_table_like_ocr_rejects_single_line_caption() {
        // All words share one line key, as a caption would.
        let placed: [(u32, &str); 5] = [
            (10, "Figure"),
            (90, "7.2"),
            (150, "Viscosity"),
            (260, "of"),
            (300, "Water"),
        ];
        let words: Vec<OcrWord> = placed
            .iter()
            .map(|&(left, text)| word((1, 1, 1), left, text))
            .collect();

        assert!(!looks_like_table_ocr(&words));
    }

    #[test]
    fn test_normalize_raster_cell_text_fixes_units_and_artifacts() {
        let unit = normalize_raster_cell_text(1, 1, "3 ywL".to_string());
        assert_eq!(unit, "3 μL");

        let artifact = normalize_raster_cell_text(1, 4, "OS".to_string());
        assert_eq!(artifact, "");

        let header = normalize_raster_cell_text(0, 6, "H,O".to_string());
        assert_eq!(header, "H2O");
    }

    #[test]
    fn test_detect_bordered_raster_grid_finds_strong_lines() {
        // White 120x80 canvas with a 4x4 set of black rules spanning (10,10)-(110,70).
        let black = Luma([0]);
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for y in 10..71 {
            for &x in &[10, 40, 80, 110] {
                image.put_pixel(x, y, black);
            }
        }
        for x in 10..111 {
            for &y in &[10, 30, 50, 70] {
                image.put_pixel(x, y, black);
            }
        }

        let grid = detect_bordered_raster_grid(&image).expect("grid");
        assert_eq!(grid.vertical_lines.len(), 4);
        assert_eq!(grid.horizontal_lines.len(), 4);
    }
}