// edgeparse_core/pdf/raster_table_ocr.rs
1//! Recover text signal from raster table images using local OCR.
2
3use std::collections::{BTreeMap, HashMap, HashSet};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::time::{SystemTime, UNIX_EPOCH};
8
9use crate::models::bbox::BoundingBox;
10use crate::models::chunks::{ImageChunk, TextChunk};
11use crate::models::enums::{PdfLayer, TextFormat, TextType};
12use crate::models::table::{
13    TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
14};
15
/// Minimum fraction of the page width an image must span to be OCR-worthy.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.45;
/// Minimum fraction of the page area an image must cover to be OCR-worthy.
const MIN_IMAGE_AREA_RATIO: f64 = 0.045;
/// Native (non-whitespace) character budget overlapping an image; images with
/// more native text than this AND more chunks than the limit below are skipped.
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
/// Companion chunk-count limit for the native-text skip heuristic above.
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
/// Tesseract per-word confidence (0-100 scale) below which words are dropped.
const MIN_OCR_WORD_CONFIDENCE: f64 = 35.0;
21
/// One OCR-recognized word from tesseract TSV output, in image pixel space.
#[derive(Debug, Clone)]
struct OcrWord {
    /// (block_num, par_num, line_num) identifying the OCR line this word is on.
    line_key: (u32, u32, u32),
    /// Left edge of the word's bounding box, in pixels.
    left: u32,
    /// Top edge of the word's bounding box, in pixels (image origin top-left).
    top: u32,
    /// Box width in pixels.
    width: u32,
    /// Box height in pixels.
    height: u32,
    /// Recognized text, trimmed.
    text: String,
}
31
/// A cluster of word x-centers used to detect repeated column positions.
#[derive(Debug, Clone)]
struct XCluster {
    /// Running mean of the member words' horizontal centers (pixels).
    center: f64,
    /// Number of words folded into this cluster.
    count: usize,
    /// Distinct OCR lines contributing to the cluster.
    lines: HashSet<(u32, u32, u32)>,
}
38
/// Intermediate row built while converting OCR lines into a table border.
#[derive(Clone)]
struct OcrRowBuild {
    /// Row top in page coordinates.
    top_y: f64,
    /// Row bottom in page coordinates.
    bottom_y: f64,
    /// One joined text string per detected column (may be empty).
    cell_texts: Vec<String>,
}
45
46/// Recover OCR text chunks for image-backed table regions on a single page.
47pub fn recover_raster_table_text_chunks(
48    input_path: &Path,
49    page_bbox: &BoundingBox,
50    page_number: u32,
51    text_chunks: &[TextChunk],
52    image_chunks: &[ImageChunk],
53) -> Vec<TextChunk> {
54    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
55        return Vec::new();
56    }
57
58    let candidates: Vec<&ImageChunk> = image_chunks
59        .iter()
60        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
61        .collect();
62    if candidates.is_empty() {
63        return Vec::new();
64    }
65
66    let temp_dir = match create_temp_dir(page_number) {
67        Ok(dir) => dir,
68        Err(_) => return Vec::new(),
69    };
70
71    let result =
72        recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
73
74    let _ = fs::remove_dir_all(&temp_dir);
75    result
76}
77
78/// Recover synthetic table borders for strongly numeric raster tables.
79pub fn recover_raster_table_borders(
80    input_path: &Path,
81    page_bbox: &BoundingBox,
82    page_number: u32,
83    text_chunks: &[TextChunk],
84    image_chunks: &[ImageChunk],
85) -> Vec<TableBorder> {
86    if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
87        return Vec::new();
88    }
89
90    let candidates: Vec<&ImageChunk> = image_chunks
91        .iter()
92        .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
93        .collect();
94    if candidates.is_empty() {
95        return Vec::new();
96    }
97
98    let temp_dir = match create_temp_dir(page_number) {
99        Ok(dir) => dir,
100        Err(_) => return Vec::new(),
101    };
102
103    let prefix = temp_dir.join("img");
104    let status = Command::new("pdfimages")
105        .arg("-f")
106        .arg(page_number.to_string())
107        .arg("-l")
108        .arg(page_number.to_string())
109        .arg("-png")
110        .arg(input_path)
111        .arg(&prefix)
112        .status();
113    match status {
114        Ok(s) if s.success() => {}
115        _ => {
116            let _ = fs::remove_dir_all(&temp_dir);
117            return Vec::new();
118        }
119    }
120
121    let mut image_files: Vec<PathBuf> = match fs::read_dir(&temp_dir) {
122        Ok(read_dir) => read_dir
123            .filter_map(|entry| entry.ok().map(|e| e.path()))
124            .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
125            .collect(),
126        Err(_) => {
127            let _ = fs::remove_dir_all(&temp_dir);
128            return Vec::new();
129        }
130    };
131    image_files.sort();
132
133    let mut tables = Vec::new();
134    for image in candidates {
135        let Some(image_index) = image.index else {
136            continue;
137        };
138        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
139            continue;
140        };
141        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
142            continue;
143        };
144        let Ok(tsv_output) = Command::new("tesseract")
145            .current_dir(&temp_dir)
146            .arg(file_name)
147            .arg("stdout")
148            .arg("--psm")
149            .arg("6")
150            .arg("tsv")
151            .output()
152        else {
153            continue;
154        };
155        if !tsv_output.status.success() {
156            continue;
157        }
158
159        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
160        let words = parse_tesseract_tsv(&tsv);
161        if looks_like_numeric_table_ocr(&words) {
162            if let Some(table) = build_numeric_table_border(&words, image) {
163                tables.push(table);
164            }
165        }
166    }
167
168    let _ = fs::remove_dir_all(&temp_dir);
169    tables
170}
171
/// Extract this page's images with `pdfimages`, OCR each candidate with
/// `tesseract`, and return recovered text chunks for tabular-looking output.
///
/// Any external-tool failure degrades to an empty result for that step.
fn recover_from_page_images(
    input_path: &Path,
    temp_dir: &Path,
    page_number: u32,
    candidates: Vec<&ImageChunk>,
    text_chunks: &[TextChunk],
) -> Vec<TextChunk> {
    // Extract only this page's embedded images as PNGs into temp_dir.
    let prefix = temp_dir.join("img");
    let status = Command::new("pdfimages")
        .arg("-f")
        .arg(page_number.to_string())
        .arg("-l")
        .arg(page_number.to_string())
        .arg("-png")
        .arg(input_path)
        .arg(&prefix)
        .status();
    match status {
        Ok(s) if s.success() => {}
        _ => return Vec::new(),
    }

    // Collect the extracted PNGs, sorted so ordering matches pdfimages' numbering.
    let mut image_files: Vec<PathBuf> = match fs::read_dir(temp_dir) {
        Ok(read_dir) => read_dir
            .filter_map(|entry| entry.ok().map(|e| e.path()))
            .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
            .collect(),
        Err(_) => return Vec::new(),
    };
    image_files.sort();
    if image_files.is_empty() {
        return Vec::new();
    }

    let mut recovered = Vec::new();
    for image in candidates {
        let Some(image_index) = image.index else {
            continue;
        };
        // NOTE(review): assumes image.index is 1-based and aligned with the
        // sorted pdfimages output files — confirm against the index producer.
        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
            continue;
        };
        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
            continue;
        };
        // PSM 6 ("assume a uniform block of text") in TSV mode exposes per-word
        // geometry and confidence for downstream parsing.
        let Ok(tsv_output) = Command::new("tesseract")
            .current_dir(temp_dir)
            .arg(file_name)
            .arg("stdout")
            .arg("--psm")
            .arg("6")
            .arg("tsv")
            .output()
        else {
            continue;
        };
        if !tsv_output.status.success() {
            continue;
        }

        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
        let words = parse_tesseract_tsv(&tsv);
        // Only keep OCR output that structurally resembles a table.
        if !looks_like_table_ocr(&words) {
            continue;
        }

        recovered.extend(words_to_text_chunks(&words, image, text_chunks));
    }

    recovered
}
243
/// Decide whether an image is large enough — and sparse enough in native text —
/// to be worth OCRing as a potential raster table.
fn is_ocr_candidate(
    image: &ImageChunk,
    page_bbox: &BoundingBox,
    text_chunks: &[TextChunk],
) -> bool {
    // Require the image to span a substantial portion of the page; the
    // .max(1.0) guards against division by a degenerate page size.
    let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
    let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
    if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
        return false;
    }

    // Native text chunks that lie mostly (>= 70%) inside the image's bbox.
    let overlapping_chunks: Vec<&TextChunk> = text_chunks
        .iter()
        .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
        .collect();
    let native_text_chars: usize = overlapping_chunks
        .iter()
        .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
        .sum();

    // Candidate unless the image already carries lots of native text.
    // NOTE(review): with `||`, an image is skipped only when BOTH limits are
    // exceeded; if the intent was to skip when either is exceeded, this should
    // be `&&` — confirm.
    native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
        || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
}
267
268fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
269    let mut words = Vec::new();
270    for line in tsv.lines().skip(1) {
271        let mut cols = line.splitn(12, '\t');
272        let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
273        if level != 5 {
274            continue;
275        }
276        let _page_num = cols.next();
277        let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
278        let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
279        let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
280        let _word_num = cols.next();
281        let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
282        let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
283        let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
284        let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
285        let confidence = cols
286            .next()
287            .and_then(|s| s.parse::<f64>().ok())
288            .unwrap_or(-1.0);
289        let text = cols.next().unwrap_or("").trim().to_string();
290        if confidence < MIN_OCR_WORD_CONFIDENCE
291            || text.is_empty()
292            || width == 0
293            || height == 0
294            || !text.chars().any(|ch| ch.is_alphanumeric())
295        {
296            continue;
297        }
298        words.push(OcrWord {
299            line_key: (block_num, par_num, line_num),
300            left,
301            top,
302            width,
303            height,
304            text,
305        });
306    }
307    words
308}
309
/// Heuristic: does this set of OCR words look like a table?
///
/// The test looks for (a) multiple lines with several words or multiple
/// numeric words, and (b) horizontal word centers that repeat across lines,
/// i.e. column alignment. All geometry is in image pixels.
fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
    // A table needs a minimum amount of recognized content.
    if words.len() < 8 {
        return false;
    }

    // Group words by their OCR line key.
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // A line "qualifies" as row-like when it has >= 3 words or >= 2 numeric words.
    let mut qualifying_lines = Vec::new();
    let mut numeric_like_count = 0usize;
    let mut max_right = 0u32;
    for line_words in by_line.values_mut() {
        line_words.sort_by_key(|word| word.left);
        let numeric_words = line_words
            .iter()
            .filter(|word| is_numeric_like(&word.text))
            .count();
        numeric_like_count += numeric_words;
        if line_words.len() >= 3 || numeric_words >= 2 {
            // Track the rightmost extent to scale the clustering tolerance.
            max_right = max_right.max(
                line_words
                    .iter()
                    .map(|word| word.left.saturating_add(word.width))
                    .max()
                    .unwrap_or(0),
            );
            qualifying_lines.push(line_words.clone());
        }
    }

    if qualifying_lines.len() < 2 {
        return false;
    }

    // Tolerance is 3.5% of the content width, with an 18px floor.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
    // Greedy 1-D clustering of word x-centers; each cluster tracks a running
    // mean center and the set of lines it appears on.
    let mut clusters: Vec<XCluster> = Vec::new();
    for line in &qualifying_lines {
        for word in line {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // Columns are clusters seen on at least two distinct lines.
    let repeated_clusters: Vec<&XCluster> = clusters
        .iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .collect();
    if repeated_clusters.len() < 3 {
        return false;
    }

    let repeated_centers: Vec<f64> = repeated_clusters
        .iter()
        .map(|cluster| cluster.center)
        .collect();
    // A line is "structured" when its words hit >= 3 distinct column centers,
    // or >= 2 centers with at least two numeric words.
    let structured_lines = qualifying_lines
        .iter()
        .filter(|line| {
            let mut seen = HashSet::<usize>::new();
            for word in *line {
                let center = f64::from(word.left) + f64::from(word.width) / 2.0;
                for (idx, repeated_center) in repeated_centers.iter().enumerate() {
                    if (center - repeated_center).abs() <= tolerance {
                        seen.insert(idx);
                    }
                }
            }
            seen.len() >= 3
                || (seen.len() >= 2
                    && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
        })
        .count();

    // Accept with 3+ structured lines, or 2 structured lines backed by a
    // strong numeric signal and 4+ repeated columns.
    structured_lines >= 3
        || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
}
404
405fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
406    if !looks_like_table_ocr(words) {
407        return false;
408    }
409
410    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
411    for word in words {
412        by_line.entry(word.line_key).or_default().push(word);
413    }
414
415    let numeric_like_count = words
416        .iter()
417        .filter(|word| is_numeric_like(&word.text))
418        .count();
419    let numeric_lines = by_line
420        .values()
421        .filter(|line| {
422            line.iter()
423                .filter(|word| is_numeric_like(&word.text))
424                .count()
425                >= 2
426        })
427        .count();
428
429    numeric_like_count >= 12 && numeric_lines >= 3
430}
431
/// Build a synthetic [`TableBorder`] from OCR words over a raster image.
///
/// Columns are derived by clustering word x-centers; rows come from OCR lines
/// with enough filled/numeric cells. Pixel geometry is mapped proportionally
/// into the image's page-space bbox. Returns `None` when no stable table
/// structure can be derived.
///
/// NOTE(review): the `top_y - height * ratio` mapping implies page coordinates
/// with a bottom-left origin (y grows upward) — confirm against BoundingBox.
fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
    // Effective raster extents, taken from the OCR word boxes themselves.
    let image_width = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()?;
    let image_height = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()?;
    if image_width == 0 || image_height == 0 {
        return None;
    }

    // Group words by OCR line.
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // Same tolerance scaling as looks_like_table_ocr: 3.5% of width, 18px floor.
    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);

    // Greedy 1-D clustering of word x-centers (running-mean cluster centers).
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }
    // Keep only clusters seen on 2+ lines: these become the column centers.
    let mut centers: Vec<f64> = clusters
        .into_iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .map(|cluster| cluster.center)
        .collect();
    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    if centers.len() < 3 {
        return None;
    }

    // Assign each line's words to the nearest column center and keep lines
    // that fill enough cells (or enough numeric cells) to look like a row.
    let mut built_rows = Vec::<OcrRowBuild>::new();
    for line_words in by_line.values() {
        let mut sorted_words = line_words.clone();
        sorted_words.sort_by_key(|word| word.left);

        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
        for word in &sorted_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some((col_idx, distance)) = centers
                .iter()
                .enumerate()
                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
            {
                // Words too far from every column center are dropped.
                if distance <= tolerance {
                    cells[col_idx].push(word);
                }
            }
        }

        let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
        let numeric_cells = cells
            .iter()
            .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
            .count();
        if filled_cells < 3 && numeric_cells < 2 {
            continue;
        }

        // Map the row's pixel extent into page coordinates proportionally.
        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom_px = sorted_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        let top_y =
            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
        let bottom_y = image.bbox.top_y
            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
        let cell_texts = cells
            .iter()
            .map(|cell_words| {
                cell_words
                    .iter()
                    .map(|word| word.text.as_str())
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .collect();
        built_rows.push(OcrRowBuild {
            top_y,
            bottom_y,
            cell_texts,
        });
    }

    if built_rows.len() < 2 {
        return None;
    }

    // Sort rows top-to-bottom (descending top_y).
    built_rows.sort_by(|a, b| {
        b.top_y
            .partial_cmp(&a.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let x_coordinates =
        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
    let row_bounds: Vec<(f64, f64)> = built_rows
        .iter()
        .map(|row| (row.top_y, row.bottom_y))
        .collect();
    let y_coordinates = build_row_boundaries(&row_bounds);
    // Boundary counts must be exactly one more than column/row counts.
    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
        return None;
    }

    // Materialize rows and cells with synthetic OCR text chunks.
    let mut rows = Vec::new();
    for (row_idx, row_build) in built_rows.iter().enumerate() {
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::new();
        for col_idx in 0..centers.len() {
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            let text = row_build
                .cell_texts
                .get(col_idx)
                .cloned()
                .unwrap_or_default();
            let mut content = Vec::new();
            if !text.trim().is_empty() {
                // Synthetic chunk: "OCR" font name marks provenance; the font
                // size approximates the row height (floored at 6pt).
                content.push(TableToken {
                    base: TextChunk {
                        value: text.trim().to_string(),
                        bbox: cell_bbox.clone(),
                        font_name: "OCR".to_string(),
                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
                        font_weight: 400.0,
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }
            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }
        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    Some(TableBorder {
        bbox: image.bbox.clone(),
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows: built_rows.len(),
        num_columns: centers.len(),
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
654
/// Convert sorted column centers into column boundary x-coordinates.
///
/// Interior boundaries sit at the midpoint between adjacent centers; the
/// result is bracketed by `left_edge` and `right_edge`, giving
/// `centers.len() + 1` entries for non-empty input.
fn build_boundaries_from_centers(centers: &[f64], left_edge: f64, right_edge: f64) -> Vec<f64> {
    let midpoints = centers.windows(2).map(|pair| (pair[0] + pair[1]) / 2.0);
    std::iter::once(left_edge)
        .chain(midpoints)
        .chain(std::iter::once(right_edge))
        .collect()
}
664
/// Convert per-row `(top_y, bottom_y)` bounds into row boundary y-coordinates.
///
/// Interior boundaries are placed halfway between one row's bottom and the
/// next row's top; the result is bracketed by the first row's top and the last
/// row's bottom. Returns an empty `Vec` for empty input instead of panicking
/// (the original indexed `rows[0]` unconditionally).
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let Some((first, last)) = rows.first().zip(rows.last()) else {
        return Vec::new();
    };
    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    for pair in rows.windows(2) {
        // Midpoint between the upper row's bottom and the lower row's top.
        boundaries.push((pair[0].1 + pair[1].0) / 2.0);
    }
    boundaries.push(last.1);
    boundaries
}
674
675fn words_to_text_chunks(
676    words: &[OcrWord],
677    image: &ImageChunk,
678    text_chunks: &[TextChunk],
679) -> Vec<TextChunk> {
680    let mut image_size = (0u32, 0u32);
681    for word in words {
682        image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
683        image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
684    }
685    if image_size.0 == 0 || image_size.1 == 0 {
686        return Vec::new();
687    }
688
689    let mut dedupe: HashMap<String, usize> = HashMap::new();
690    for chunk in text_chunks {
691        dedupe.insert(normalize_text(&chunk.value), dedupe.len());
692    }
693
694    let mut recovered = Vec::new();
695    for word in words {
696        let normalized = normalize_text(&word.text);
697        if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
698            continue;
699        }
700
701        let left_ratio = f64::from(word.left) / f64::from(image_size.0);
702        let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
703        let top_ratio = f64::from(word.top) / f64::from(image_size.1);
704        let bottom_ratio =
705            f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
706
707        let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
708        let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
709        let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
710        let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
711        if right_x <= left_x || top_y <= bottom_y {
712            continue;
713        }
714
715        recovered.push(TextChunk {
716            value: word.text.clone(),
717            bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
718            font_name: "OCR".to_string(),
719            font_size: (top_y - bottom_y).max(6.0),
720            font_weight: 400.0,
721            italic_angle: 0.0,
722            font_color: "#000000".to_string(),
723            contrast_ratio: 21.0,
724            symbol_ends: Vec::new(),
725            text_format: TextFormat::Normal,
726            text_type: TextType::Regular,
727            pdf_layer: PdfLayer::Content,
728            ocg_visible: true,
729            index: None,
730            page_number: image.bbox.page_number,
731            level: None,
732            mcid: None,
733        });
734    }
735
736    recovered
737}
738
/// True when the text contains at least one ASCII digit.
fn is_numeric_like(text: &str) -> bool {
    text.bytes().any(|b| b.is_ascii_digit())
}
742
/// Lowercased, alphanumeric-only form of `text`, used for duplicate detection.
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
749
/// Create a scratch directory under the system temp dir, uniquified by
/// process id, page number, and a nanosecond timestamp.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_nanos())
        .unwrap_or(0);
    let dir_name = format!(
        "edgeparse-raster-ocr-{}-{}-{}",
        std::process::id(),
        page_number,
        nanos
    );
    let dir = std::env::temp_dir().join(dir_name);
    fs::create_dir_all(&dir)?;
    Ok(dir)
}
764
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an `OcrWord` at `left` on the given (block, par, line) key with a
    /// fixed 40x12px box and top = 0.
    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top: 0,
            width: 40,
            height: 12,
            text: text.to_string(),
        }
    }

    /// Four columns repeated across three lines with numeric cells should be
    /// recognized as table-like.
    #[test]
    fn test_table_like_ocr_detects_repeated_columns() {
        let words = vec![
            word((1, 1, 1), 10, "Temperature"),
            word((1, 1, 1), 120, "Viscosity"),
            word((1, 1, 1), 240, "Temperature"),
            word((1, 1, 1), 360, "Viscosity"),
            word((1, 1, 2), 10, "0"),
            word((1, 1, 2), 120, "1.793E-06"),
            word((1, 1, 2), 240, "25"),
            word((1, 1, 2), 360, "8.930E-07"),
            word((1, 1, 3), 10, "1"),
            word((1, 1, 3), 120, "1.732E-06"),
            word((1, 1, 3), 240, "26"),
            word((1, 1, 3), 360, "8.760E-07"),
        ];
        assert!(looks_like_table_ocr(&words));
    }

    /// A single caption line has no repeated column structure and must be
    /// rejected.
    #[test]
    fn test_table_like_ocr_rejects_single_line_caption() {
        let words = vec![
            word((1, 1, 1), 10, "Figure"),
            word((1, 1, 1), 90, "7.2"),
            word((1, 1, 1), 150, "Viscosity"),
            word((1, 1, 1), 260, "of"),
            word((1, 1, 1), 300, "Water"),
        ];
        assert!(!looks_like_table_ocr(&words));
    }
}