1use std::collections::{BTreeMap, HashMap, HashSet};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::time::{SystemTime, UNIX_EPOCH};
8
9use image::{GenericImageView, GrayImage, Luma};
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::{ImageChunk, TextChunk};
13use crate::models::content::ContentElement;
14use crate::models::enums::{PdfLayer, TextFormat, TextType};
15use crate::models::table::{
16 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
17};
18
/// Minimum image-width / page-width ratio for an image to be an OCR candidate.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.45;
/// Minimum image-area / page-area ratio for an image to be an OCR candidate.
const MIN_IMAGE_AREA_RATIO: f64 = 0.045;
/// Limit on non-whitespace native-text characters overlapping an OCR candidate image.
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
/// Limit on the number of native text chunks overlapping an OCR candidate image.
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
/// Tesseract TSV words with a confidence below this value are discarded.
const MIN_OCR_WORD_CONFIDENCE: f64 = 35.0;
/// Grayscale pixels strictly below this value count as "dark" when scanning for grid lines.
const RASTER_DARK_THRESHOLD: u8 = 180;
/// Minimum number of detected vertical line runs for a raster to count as a bordered table.
const MIN_BORDERED_VERTICAL_LINES: usize = 4;
/// Minimum number of detected horizontal line runs for a raster to count as a bordered table.
const MIN_BORDERED_HORIZONTAL_LINES: usize = 4;
/// Fraction of a pixel column/row that must be dark for it to count as part of a grid line.
const MIN_LINE_DARK_RATIO: f64 = 0.55;
/// Minimum spacing between adjacent grid lines, and minimum cell-crop size, in pixels.
const MIN_CELL_SIZE_PX: u32 = 10;
/// Upper bound, in pixels, on the inset trimmed from each side of a cell crop before OCR.
const CELL_INSET_PX: u32 = 4;
/// Integer factor by which a cell crop is upscaled before it is handed to OCR.
const OCR_SCALE_FACTOR: u32 = 3;
/// Pages with more native-text characters than this skip page-raster OCR entirely.
const MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR: usize = 180;
/// Minimum share of the page area that text-less tables must cover to trigger page-raster OCR.
const MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR: f64 = 0.08;
/// Maximum number of text-less tables considered per page for page-raster OCR.
const MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR: usize = 24;
34
/// One word-level row parsed from Tesseract TSV output.
#[derive(Debug, Clone)]
struct OcrWord {
    /// `(block_num, par_num, line_num)` from the TSV row; groups words into OCR lines.
    line_key: (u32, u32, u32),
    /// TSV `left` column: left edge of the word box (raster coordinates).
    left: u32,
    /// TSV `top` column: top edge of the word box (raster coordinates).
    top: u32,
    /// TSV `width` column; always non-zero for parsed words.
    width: u32,
    /// TSV `height` column; always non-zero for parsed words.
    height: u32,
    /// Trimmed recognised text; contains at least one alphanumeric character.
    text: String,
}
44
/// A group of OCR words whose horizontal centres lie close together (a column candidate).
#[derive(Debug, Clone)]
struct XCluster {
    /// Running mean of the horizontal centres of the words assigned to this cluster.
    center: f64,
    /// Number of words assigned to this cluster.
    count: usize,
    /// Distinct OCR line keys that contributed at least one word.
    lines: HashSet<(u32, u32, u32)>,
}
51
/// Intermediate representation of one table row reconstructed from OCR words.
///
/// Derives `Debug` like the other helper structs in this file so rows can be
/// inspected while diagnosing OCR table reconstruction.
#[derive(Debug, Clone)]
struct OcrRowBuild {
    /// Top edge of the row in page coordinates.
    top_y: f64,
    /// Bottom edge of the row in page coordinates.
    bottom_y: f64,
    /// Text of each column cell, in column order; empty string for an empty cell.
    cell_texts: Vec<String>,
}
58
/// Grid lines detected in a raster image of a bordered table.
#[derive(Debug, Clone)]
struct RasterTableGrid {
    /// X pixel positions of vertical grid lines (midpoint of each dark column run), ascending.
    vertical_lines: Vec<u32>,
    /// Y pixel positions of horizontal grid lines (midpoint of each dark row run), ascending.
    horizontal_lines: Vec<u32>,
}
64
65pub fn recover_raster_table_text_chunks(
67 input_path: &Path,
68 page_bbox: &BoundingBox,
69 page_number: u32,
70 text_chunks: &[TextChunk],
71 image_chunks: &[ImageChunk],
72) -> Vec<TextChunk> {
73 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
74 return Vec::new();
75 }
76
77 let candidates: Vec<&ImageChunk> = image_chunks
78 .iter()
79 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
80 .collect();
81 if candidates.is_empty() {
82 return Vec::new();
83 }
84
85 let temp_dir = match create_temp_dir(page_number) {
86 Ok(dir) => dir,
87 Err(_) => return Vec::new(),
88 };
89
90 let result =
91 recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
92
93 let _ = fs::remove_dir_all(&temp_dir);
94 result
95}
96
97pub fn recover_raster_table_borders(
99 input_path: &Path,
100 page_bbox: &BoundingBox,
101 page_number: u32,
102 text_chunks: &[TextChunk],
103 image_chunks: &[ImageChunk],
104) -> Vec<TableBorder> {
105 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
106 return Vec::new();
107 }
108
109 let candidates: Vec<&ImageChunk> = image_chunks
110 .iter()
111 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
112 .collect();
113 if candidates.is_empty() {
114 return Vec::new();
115 }
116
117 let temp_dir = match create_temp_dir(page_number) {
118 Ok(dir) => dir,
119 Err(_) => return Vec::new(),
120 };
121
122 let prefix = temp_dir.join("img");
123 let status = Command::new("pdfimages")
124 .arg("-f")
125 .arg(page_number.to_string())
126 .arg("-l")
127 .arg(page_number.to_string())
128 .arg("-png")
129 .arg(input_path)
130 .arg(&prefix)
131 .status();
132 match status {
133 Ok(s) if s.success() => {}
134 _ => {
135 let _ = fs::remove_dir_all(&temp_dir);
136 return Vec::new();
137 }
138 }
139
140 let mut image_files: Vec<PathBuf> = match fs::read_dir(&temp_dir) {
141 Ok(read_dir) => read_dir
142 .filter_map(|entry| entry.ok().map(|e| e.path()))
143 .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
144 .collect(),
145 Err(_) => {
146 let _ = fs::remove_dir_all(&temp_dir);
147 return Vec::new();
148 }
149 };
150 image_files.sort();
151
152 let mut tables = Vec::new();
153 for image in candidates {
154 let Some(image_index) = image.index else {
155 continue;
156 };
157 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
158 continue;
159 };
160 if let Some(table) = recover_bordered_raster_table(image_path, image) {
161 tables.push(table);
162 continue;
163 }
164 let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
165 continue;
166 };
167 let Ok(tsv_output) = Command::new("tesseract")
168 .current_dir(&temp_dir)
169 .arg(file_name)
170 .arg("stdout")
171 .arg("--psm")
172 .arg("6")
173 .arg("tsv")
174 .output()
175 else {
176 continue;
177 };
178 if !tsv_output.status.success() {
179 continue;
180 }
181
182 let tsv = String::from_utf8_lossy(&tsv_output.stdout);
183 let words = parse_tesseract_tsv(&tsv);
184 if looks_like_numeric_table_ocr(&words) {
185 if let Some(table) = build_numeric_table_border(&words, image) {
186 tables.push(table);
187 }
188 }
189 }
190
191 let _ = fs::remove_dir_all(&temp_dir);
192 tables
193}
194
195pub fn recover_page_raster_table_cell_text(
201 input_path: &Path,
202 page_bbox: &BoundingBox,
203 page_number: u32,
204 elements: &mut [ContentElement],
205) {
206 if page_bbox.area() <= 0.0 {
207 return;
208 }
209
210 let native_text_chars = page_native_text_chars(elements);
211 if native_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR {
212 return;
213 }
214
215 let candidate_indices: Vec<usize> = elements
216 .iter()
217 .enumerate()
218 .filter_map(|(idx, elem)| {
219 table_candidate_ref(elem)
220 .filter(|table| table_needs_page_raster_ocr(table))
221 .map(|_| idx)
222 })
223 .take(MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR)
224 .collect();
225 if candidate_indices.is_empty() {
226 return;
227 }
228
229 let coverage: f64 = candidate_indices
230 .iter()
231 .filter_map(|idx| table_candidate_ref(&elements[*idx]).map(|table| table.bbox.area()))
232 .sum::<f64>()
233 / page_bbox.area().max(1.0);
234 if coverage < MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR {
235 return;
236 }
237
238 let temp_dir = match create_temp_dir(page_number) {
239 Ok(dir) => dir,
240 Err(_) => return,
241 };
242 let prefix = temp_dir.join("page");
243 let status = Command::new("pdftoppm")
244 .arg("-png")
245 .arg("-f")
246 .arg(page_number.to_string())
247 .arg("-l")
248 .arg(page_number.to_string())
249 .arg("-singlefile")
250 .arg(input_path)
251 .arg(&prefix)
252 .status();
253 match status {
254 Ok(s) if s.success() => {}
255 _ => {
256 let _ = fs::remove_dir_all(&temp_dir);
257 return;
258 }
259 }
260
261 let page_image_path = prefix.with_extension("png");
262 let gray = match image::open(&page_image_path) {
263 Ok(img) => img.to_luma8(),
264 Err(_) => {
265 let _ = fs::remove_dir_all(&temp_dir);
266 return;
267 }
268 };
269
270 for idx in candidate_indices {
271 let Some(elem) = elements.get_mut(idx) else {
272 continue;
273 };
274 let Some(table) = table_candidate_mut(elem) else {
275 continue;
276 };
277 enrich_empty_table_from_page_raster(&gray, page_bbox, table);
278 }
279
280 let _ = fs::remove_dir_all(&temp_dir);
281}
282
283fn table_candidate_ref(elem: &ContentElement) -> Option<&TableBorder> {
284 match elem {
285 ContentElement::TableBorder(table) => Some(table),
286 ContentElement::Table(table) => Some(&table.table_border),
287 _ => None,
288 }
289}
290
291fn table_candidate_mut(elem: &mut ContentElement) -> Option<&mut TableBorder> {
292 match elem {
293 ContentElement::TableBorder(table) => Some(table),
294 ContentElement::Table(table) => Some(&mut table.table_border),
295 _ => None,
296 }
297}
298
299fn recover_from_page_images(
300 input_path: &Path,
301 temp_dir: &Path,
302 page_number: u32,
303 candidates: Vec<&ImageChunk>,
304 text_chunks: &[TextChunk],
305) -> Vec<TextChunk> {
306 let prefix = temp_dir.join("img");
307 let status = Command::new("pdfimages")
308 .arg("-f")
309 .arg(page_number.to_string())
310 .arg("-l")
311 .arg(page_number.to_string())
312 .arg("-png")
313 .arg(input_path)
314 .arg(&prefix)
315 .status();
316 match status {
317 Ok(s) if s.success() => {}
318 _ => return Vec::new(),
319 }
320
321 let mut image_files: Vec<PathBuf> = match fs::read_dir(temp_dir) {
322 Ok(read_dir) => read_dir
323 .filter_map(|entry| entry.ok().map(|e| e.path()))
324 .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
325 .collect(),
326 Err(_) => return Vec::new(),
327 };
328 image_files.sort();
329 if image_files.is_empty() {
330 return Vec::new();
331 }
332
333 let mut recovered = Vec::new();
334 for image in candidates {
335 let Some(image_index) = image.index else {
336 continue;
337 };
338 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
339 continue;
340 };
341 let bordered_table = recover_bordered_raster_table(image_path, image);
342 if let Some(caption) = recover_bordered_raster_caption(image_path, image) {
343 recovered.push(caption);
344 }
345 if bordered_table.is_some() {
346 continue;
347 }
348 let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
349 continue;
350 };
351 let Ok(tsv_output) = Command::new("tesseract")
352 .current_dir(temp_dir)
353 .arg(file_name)
354 .arg("stdout")
355 .arg("--psm")
356 .arg("6")
357 .arg("tsv")
358 .output()
359 else {
360 continue;
361 };
362 if !tsv_output.status.success() {
363 continue;
364 }
365
366 let tsv = String::from_utf8_lossy(&tsv_output.stdout);
367 let words = parse_tesseract_tsv(&tsv);
368 if !looks_like_table_ocr(&words) {
369 continue;
370 }
371
372 recovered.extend(words_to_text_chunks(&words, image, text_chunks));
373 }
374
375 recovered
376}
377
378fn page_native_text_chars(elements: &[ContentElement]) -> usize {
379 elements
380 .iter()
381 .map(|elem| match elem {
382 ContentElement::Paragraph(p) => p.base.value().chars().count(),
383 ContentElement::Heading(h) => h.base.base.value().chars().count(),
384 ContentElement::NumberHeading(h) => h.base.base.base.value().chars().count(),
385 ContentElement::TextBlock(tb) => tb.value().chars().count(),
386 ContentElement::TextLine(tl) => tl.value().chars().count(),
387 ContentElement::TextChunk(tc) => tc.value.chars().count(),
388 ContentElement::List(list) => list
389 .list_items
390 .iter()
391 .flat_map(|item| item.contents.iter())
392 .map(|content| match content {
393 ContentElement::Paragraph(p) => p.base.value().chars().count(),
394 ContentElement::TextBlock(tb) => tb.value().chars().count(),
395 ContentElement::TextLine(tl) => tl.value().chars().count(),
396 ContentElement::TextChunk(tc) => tc.value.chars().count(),
397 _ => 0,
398 })
399 .sum(),
400 _ => 0,
401 })
402 .sum()
403}
404
405fn table_needs_page_raster_ocr(table: &TableBorder) -> bool {
406 table.num_rows >= 1
407 && table.num_columns >= 2
408 && table
409 .rows
410 .iter()
411 .flat_map(|row| row.cells.iter())
412 .all(|cell| {
413 !cell
414 .content
415 .iter()
416 .any(|token| matches!(token.token_type, TableTokenType::Text))
417 })
418}
419
420fn enrich_empty_table_from_page_raster(
421 gray: &GrayImage,
422 page_bbox: &BoundingBox,
423 table: &mut TableBorder,
424) {
425 for row in &mut table.rows {
426 for cell in &mut row.cells {
427 if cell
428 .content
429 .iter()
430 .any(|token| matches!(token.token_type, TableTokenType::Text))
431 {
432 continue;
433 }
434 let Some((x1, y1, x2, y2)) = page_bbox_to_raster_box(gray, page_bbox, &cell.bbox)
435 else {
436 continue;
437 };
438 let Some(text) = extract_page_raster_cell_text(gray, &cell.bbox, x1, y1, x2, y2) else {
439 continue;
440 };
441 if text.is_empty() {
442 continue;
443 }
444 cell.content.push(TableToken {
445 base: TextChunk {
446 value: text,
447 bbox: cell.bbox.clone(),
448 font_name: "OCR".to_string(),
449 font_size: cell.bbox.height().max(6.0),
450 font_weight: 400.0,
451 italic_angle: 0.0,
452 font_color: "#000000".to_string(),
453 contrast_ratio: 21.0,
454 symbol_ends: Vec::new(),
455 text_format: TextFormat::Normal,
456 text_type: TextType::Regular,
457 pdf_layer: PdfLayer::Content,
458 ocg_visible: true,
459 index: None,
460 page_number: cell.bbox.page_number,
461 level: None,
462 mcid: None,
463 },
464 token_type: TableTokenType::Text,
465 });
466 }
467 }
468}
469
470fn page_bbox_to_raster_box(
471 gray: &GrayImage,
472 page_bbox: &BoundingBox,
473 bbox: &BoundingBox,
474) -> Option<(u32, u32, u32, u32)> {
475 if page_bbox.width() <= 0.0 || page_bbox.height() <= 0.0 {
476 return None;
477 }
478
479 let left = ((bbox.left_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
480 .clamp(0.0, f64::from(gray.width()));
481 let right = ((bbox.right_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
482 .clamp(0.0, f64::from(gray.width()));
483 let top = ((page_bbox.top_y - bbox.top_y) / page_bbox.height() * f64::from(gray.height()))
484 .clamp(0.0, f64::from(gray.height()));
485 let bottom = ((page_bbox.top_y - bbox.bottom_y) / page_bbox.height()
486 * f64::from(gray.height()))
487 .clamp(0.0, f64::from(gray.height()));
488
489 let x1 = left.floor() as u32;
490 let x2 = right.ceil() as u32;
491 let y1 = top.floor() as u32;
492 let y2 = bottom.ceil() as u32;
493 (x2 > x1 && y2 > y1).then_some((x1, y1, x2, y2))
494}
495
496fn extract_page_raster_cell_text(
497 gray: &GrayImage,
498 cell_bbox: &BoundingBox,
499 x1: u32,
500 y1: u32,
501 x2: u32,
502 y2: u32,
503) -> Option<String> {
504 let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
505 let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
506 let crop_left = x1 + inset_x;
507 let crop_top = y1 + inset_y;
508 let crop_width = x2.saturating_sub(x1 + inset_x * 2);
509 let crop_height = y2.saturating_sub(y1 + inset_y * 2);
510 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
511 return Some(String::new());
512 }
513
514 let cropped = gray
515 .view(crop_left, crop_top, crop_width, crop_height)
516 .to_image();
517 let bordered = expand_white_border(&cropped, 12);
518 let scaled = image::imageops::resize(
519 &bordered,
520 bordered.width() * OCR_SCALE_FACTOR,
521 bordered.height() * OCR_SCALE_FACTOR,
522 image::imageops::FilterType::Lanczos3,
523 );
524 let psm = if cell_bbox.width() <= cell_bbox.height() * 1.15 {
525 "10"
526 } else {
527 "6"
528 };
529 let raw_text = run_tesseract_plain_text(&scaled, psm)?;
530 Some(normalize_page_raster_cell_text(cell_bbox, raw_text))
531}
532
533fn normalize_page_raster_cell_text(cell_bbox: &BoundingBox, text: String) -> String {
534 let normalized = text
535 .replace('|', " ")
536 .replace('—', "-")
537 .replace(['“', '”'], "\"")
538 .replace('’', "'")
539 .split_whitespace()
540 .collect::<Vec<_>>()
541 .join(" ");
542
543 if normalized.is_empty() {
544 return normalized;
545 }
546
547 let narrow_cell = cell_bbox.width() <= cell_bbox.height() * 1.15;
548 if narrow_cell && normalized.len() <= 3 && !normalized.chars().any(|ch| ch.is_ascii_digit()) {
549 return String::new();
550 }
551
552 normalized
553}
554
555fn is_ocr_candidate(
556 image: &ImageChunk,
557 page_bbox: &BoundingBox,
558 text_chunks: &[TextChunk],
559) -> bool {
560 let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
561 let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
562 if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
563 return false;
564 }
565
566 let overlapping_chunks: Vec<&TextChunk> = text_chunks
567 .iter()
568 .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
569 .collect();
570 let native_text_chars: usize = overlapping_chunks
571 .iter()
572 .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
573 .sum();
574
575 native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
576 || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
577}
578
579fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
580 let mut words = Vec::new();
581 for line in tsv.lines().skip(1) {
582 let mut cols = line.splitn(12, '\t');
583 let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
584 if level != 5 {
585 continue;
586 }
587 let _page_num = cols.next();
588 let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
589 let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
590 let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
591 let _word_num = cols.next();
592 let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
593 let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
594 let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
595 let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
596 let confidence = cols
597 .next()
598 .and_then(|s| s.parse::<f64>().ok())
599 .unwrap_or(-1.0);
600 let text = cols.next().unwrap_or("").trim().to_string();
601 if confidence < MIN_OCR_WORD_CONFIDENCE
602 || text.is_empty()
603 || width == 0
604 || height == 0
605 || !text.chars().any(|ch| ch.is_alphanumeric())
606 {
607 continue;
608 }
609 words.push(OcrWord {
610 line_key: (block_num, par_num, line_num),
611 left,
612 top,
613 width,
614 height,
615 text,
616 });
617 }
618 words
619}
620
621fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
622 if words.len() < 8 {
623 return false;
624 }
625
626 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
627 for word in words {
628 by_line.entry(word.line_key).or_default().push(word);
629 }
630
631 let mut qualifying_lines = Vec::new();
632 let mut numeric_like_count = 0usize;
633 let mut max_right = 0u32;
634 for line_words in by_line.values_mut() {
635 line_words.sort_by_key(|word| word.left);
636 let numeric_words = line_words
637 .iter()
638 .filter(|word| is_numeric_like(&word.text))
639 .count();
640 numeric_like_count += numeric_words;
641 if line_words.len() >= 3 || numeric_words >= 2 {
642 max_right = max_right.max(
643 line_words
644 .iter()
645 .map(|word| word.left.saturating_add(word.width))
646 .max()
647 .unwrap_or(0),
648 );
649 qualifying_lines.push(line_words.clone());
650 }
651 }
652
653 if qualifying_lines.len() < 2 {
654 return false;
655 }
656
657 let tolerance = (f64::from(max_right) * 0.035).max(18.0);
658 let mut clusters: Vec<XCluster> = Vec::new();
659 for line in &qualifying_lines {
660 for word in line {
661 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
662 if let Some(cluster) = clusters
663 .iter_mut()
664 .find(|cluster| (cluster.center - center).abs() <= tolerance)
665 {
666 cluster.center =
667 (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
668 cluster.count += 1;
669 cluster.lines.insert(word.line_key);
670 } else {
671 let mut lines = HashSet::new();
672 lines.insert(word.line_key);
673 clusters.push(XCluster {
674 center,
675 count: 1,
676 lines,
677 });
678 }
679 }
680 }
681
682 let repeated_clusters: Vec<&XCluster> = clusters
683 .iter()
684 .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
685 .collect();
686 if repeated_clusters.len() < 3 {
687 return false;
688 }
689
690 let repeated_centers: Vec<f64> = repeated_clusters
691 .iter()
692 .map(|cluster| cluster.center)
693 .collect();
694 let structured_lines = qualifying_lines
695 .iter()
696 .filter(|line| {
697 let mut seen = HashSet::<usize>::new();
698 for word in *line {
699 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
700 for (idx, repeated_center) in repeated_centers.iter().enumerate() {
701 if (center - repeated_center).abs() <= tolerance {
702 seen.insert(idx);
703 }
704 }
705 }
706 seen.len() >= 3
707 || (seen.len() >= 2
708 && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
709 })
710 .count();
711
712 structured_lines >= 3
713 || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
714}
715
716fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
717 if !looks_like_table_ocr(words) {
718 return false;
719 }
720
721 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
722 for word in words {
723 by_line.entry(word.line_key).or_default().push(word);
724 }
725
726 let numeric_like_count = words
727 .iter()
728 .filter(|word| is_numeric_like(&word.text))
729 .count();
730 let numeric_lines = by_line
731 .values()
732 .filter(|line| {
733 line.iter()
734 .filter(|word| is_numeric_like(&word.text))
735 .count()
736 >= 2
737 })
738 .count();
739
740 numeric_like_count >= 12 && numeric_lines >= 3
741}
742
743fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
744 let image_width = words
745 .iter()
746 .map(|word| word.left.saturating_add(word.width))
747 .max()?;
748 let image_height = words
749 .iter()
750 .map(|word| word.top.saturating_add(word.height))
751 .max()?;
752 if image_width == 0 || image_height == 0 {
753 return None;
754 }
755
756 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
757 for word in words {
758 by_line.entry(word.line_key).or_default().push(word);
759 }
760
761 let max_right = words
762 .iter()
763 .map(|word| word.left.saturating_add(word.width))
764 .max()
765 .unwrap_or(0);
766 let tolerance = (f64::from(max_right) * 0.035).max(18.0);
767
768 let mut clusters: Vec<XCluster> = Vec::new();
769 for line_words in by_line.values() {
770 for word in line_words {
771 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
772 if let Some(cluster) = clusters
773 .iter_mut()
774 .find(|cluster| (cluster.center - center).abs() <= tolerance)
775 {
776 cluster.center =
777 (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
778 cluster.count += 1;
779 cluster.lines.insert(word.line_key);
780 } else {
781 let mut lines = HashSet::new();
782 lines.insert(word.line_key);
783 clusters.push(XCluster {
784 center,
785 count: 1,
786 lines,
787 });
788 }
789 }
790 }
791 let mut centers: Vec<f64> = clusters
792 .into_iter()
793 .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
794 .map(|cluster| cluster.center)
795 .collect();
796 centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
797 if centers.len() < 3 {
798 return None;
799 }
800
801 let mut built_rows = Vec::<OcrRowBuild>::new();
802 for line_words in by_line.values() {
803 let mut sorted_words = line_words.clone();
804 sorted_words.sort_by_key(|word| word.left);
805
806 let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
807 for word in &sorted_words {
808 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
809 if let Some((col_idx, distance)) = centers
810 .iter()
811 .enumerate()
812 .map(|(idx, col_center)| (idx, (center - col_center).abs()))
813 .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
814 {
815 if distance <= tolerance {
816 cells[col_idx].push(word);
817 }
818 }
819 }
820
821 let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
822 let numeric_cells = cells
823 .iter()
824 .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
825 .count();
826 if filled_cells < 3 && numeric_cells < 2 {
827 continue;
828 }
829
830 let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
831 let bottom_px = sorted_words
832 .iter()
833 .map(|word| word.top.saturating_add(word.height))
834 .max()
835 .unwrap_or(0);
836 let top_y =
837 image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
838 let bottom_y = image.bbox.top_y
839 - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
840 let cell_texts = cells
841 .iter()
842 .map(|cell_words| {
843 cell_words
844 .iter()
845 .map(|word| word.text.as_str())
846 .collect::<Vec<_>>()
847 .join(" ")
848 })
849 .collect();
850 built_rows.push(OcrRowBuild {
851 top_y,
852 bottom_y,
853 cell_texts,
854 });
855 }
856
857 if built_rows.len() < 2 {
858 return None;
859 }
860
861 built_rows.sort_by(|a, b| {
862 b.top_y
863 .partial_cmp(&a.top_y)
864 .unwrap_or(std::cmp::Ordering::Equal)
865 });
866 let x_coordinates =
867 build_boundaries_from_centers(¢ers, image.bbox.left_x, image.bbox.right_x);
868 let row_bounds: Vec<(f64, f64)> = built_rows
869 .iter()
870 .map(|row| (row.top_y, row.bottom_y))
871 .collect();
872 let y_coordinates = build_row_boundaries(&row_bounds);
873 if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
874 return None;
875 }
876
877 let mut rows = Vec::new();
878 for (row_idx, row_build) in built_rows.iter().enumerate() {
879 let row_bbox = BoundingBox::new(
880 image.bbox.page_number,
881 image.bbox.left_x,
882 y_coordinates[row_idx + 1],
883 image.bbox.right_x,
884 y_coordinates[row_idx],
885 );
886 let mut cells = Vec::new();
887 for col_idx in 0..centers.len() {
888 let cell_bbox = BoundingBox::new(
889 image.bbox.page_number,
890 x_coordinates[col_idx],
891 y_coordinates[row_idx + 1],
892 x_coordinates[col_idx + 1],
893 y_coordinates[row_idx],
894 );
895 let text = row_build
896 .cell_texts
897 .get(col_idx)
898 .cloned()
899 .unwrap_or_default();
900 let mut content = Vec::new();
901 if !text.trim().is_empty() {
902 content.push(TableToken {
903 base: TextChunk {
904 value: text.trim().to_string(),
905 bbox: cell_bbox.clone(),
906 font_name: "OCR".to_string(),
907 font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
908 font_weight: 400.0,
909 italic_angle: 0.0,
910 font_color: "#000000".to_string(),
911 contrast_ratio: 21.0,
912 symbol_ends: Vec::new(),
913 text_format: TextFormat::Normal,
914 text_type: TextType::Regular,
915 pdf_layer: PdfLayer::Content,
916 ocg_visible: true,
917 index: None,
918 page_number: image.bbox.page_number,
919 level: None,
920 mcid: None,
921 },
922 token_type: TableTokenType::Text,
923 });
924 }
925 cells.push(TableBorderCell {
926 bbox: cell_bbox,
927 index: None,
928 level: None,
929 row_number: row_idx,
930 col_number: col_idx,
931 row_span: 1,
932 col_span: 1,
933 content,
934 contents: Vec::new(),
935 semantic_type: None,
936 });
937 }
938 rows.push(TableBorderRow {
939 bbox: row_bbox,
940 index: None,
941 level: None,
942 row_number: row_idx,
943 cells,
944 semantic_type: None,
945 });
946 }
947
948 Some(TableBorder {
949 bbox: image.bbox.clone(),
950 index: None,
951 level: None,
952 x_coordinates: x_coordinates.clone(),
953 x_widths: vec![0.0; x_coordinates.len()],
954 y_coordinates: y_coordinates.clone(),
955 y_widths: vec![0.0; y_coordinates.len()],
956 rows,
957 num_rows: built_rows.len(),
958 num_columns: centers.len(),
959 is_bad_table: false,
960 is_table_transformer: true,
961 previous_table: None,
962 next_table: None,
963 })
964}
965
966fn recover_bordered_raster_caption(image_path: &Path, image: &ImageChunk) -> Option<TextChunk> {
967 let gray = image::open(image_path).ok()?.to_luma8();
968 let grid = detect_bordered_raster_grid(&gray)?;
969 let first_h = *grid.horizontal_lines.first()?;
970 if first_h <= 2 {
971 return None;
972 }
973
974 let crop = gray.view(0, 0, gray.width(), first_h).to_image();
975 let caption_text = normalize_caption_text(&run_tesseract_plain_text(&crop, "7")?);
976 if caption_text.is_empty() || !caption_text.chars().any(|ch| ch.is_alphabetic()) {
977 return None;
978 }
979
980 let bbox = raster_box_to_page_bbox(
981 image,
982 0,
983 0,
984 gray.width(),
985 first_h.max(1),
986 gray.width().max(1),
987 gray.height().max(1),
988 )?;
989 let font_size = (bbox.height() * 0.55).clamp(10.0, 16.0);
990 Some(TextChunk {
991 value: caption_text,
992 bbox,
993 font_name: "OCR".to_string(),
994 font_size,
995 font_weight: 700.0,
996 italic_angle: 0.0,
997 font_color: "#000000".to_string(),
998 contrast_ratio: 21.0,
999 symbol_ends: Vec::new(),
1000 text_format: TextFormat::Normal,
1001 text_type: TextType::Regular,
1002 pdf_layer: PdfLayer::Content,
1003 ocg_visible: true,
1004 index: None,
1005 page_number: image.bbox.page_number,
1006 level: None,
1007 mcid: None,
1008 })
1009}
1010
1011fn recover_bordered_raster_table(image_path: &Path, image: &ImageChunk) -> Option<TableBorder> {
1012 let gray = image::open(image_path).ok()?.to_luma8();
1013 let grid = detect_bordered_raster_grid(&gray)?;
1014 let num_cols = grid.vertical_lines.len().checked_sub(1)?;
1015 let num_rows = grid.horizontal_lines.len().checked_sub(1)?;
1016 if num_cols < 2 || num_rows < 2 {
1017 return None;
1018 }
1019 let table_bbox = raster_box_to_page_bbox(
1020 image,
1021 *grid.vertical_lines.first()?,
1022 *grid.horizontal_lines.first()?,
1023 *grid.vertical_lines.last()?,
1024 *grid.horizontal_lines.last()?,
1025 gray.width(),
1026 gray.height(),
1027 )?;
1028
1029 let x_coordinates = raster_boundaries_to_page(
1030 &grid.vertical_lines,
1031 image.bbox.left_x,
1032 image.bbox.right_x,
1033 gray.width(),
1034 )?;
1035 let y_coordinates = raster_boundaries_to_page_desc(
1036 &grid.horizontal_lines,
1037 image.bbox.bottom_y,
1038 image.bbox.top_y,
1039 gray.height(),
1040 )?;
1041
1042 let mut rows = Vec::with_capacity(num_rows);
1043 for row_idx in 0..num_rows {
1044 let row_bbox = BoundingBox::new(
1045 image.bbox.page_number,
1046 image.bbox.left_x,
1047 y_coordinates[row_idx + 1],
1048 image.bbox.right_x,
1049 y_coordinates[row_idx],
1050 );
1051 let mut cells = Vec::with_capacity(num_cols);
1052
1053 for col_idx in 0..num_cols {
1054 let x1 = grid.vertical_lines[col_idx];
1055 let x2 = grid.vertical_lines[col_idx + 1];
1056 let y1 = grid.horizontal_lines[row_idx];
1057 let y2 = grid.horizontal_lines[row_idx + 1];
1058 let cell_bbox = BoundingBox::new(
1059 image.bbox.page_number,
1060 x_coordinates[col_idx],
1061 y_coordinates[row_idx + 1],
1062 x_coordinates[col_idx + 1],
1063 y_coordinates[row_idx],
1064 );
1065 let text = extract_raster_cell_text(&gray, row_idx, col_idx, x1, y1, x2, y2)?;
1066
1067 let mut content = Vec::new();
1068 if !text.is_empty() {
1069 content.push(TableToken {
1070 base: TextChunk {
1071 value: text,
1072 bbox: cell_bbox.clone(),
1073 font_name: "OCR".to_string(),
1074 font_size: (cell_bbox.height() * 0.55).max(6.0),
1075 font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
1076 italic_angle: 0.0,
1077 font_color: "#000000".to_string(),
1078 contrast_ratio: 21.0,
1079 symbol_ends: Vec::new(),
1080 text_format: TextFormat::Normal,
1081 text_type: TextType::Regular,
1082 pdf_layer: PdfLayer::Content,
1083 ocg_visible: true,
1084 index: None,
1085 page_number: image.bbox.page_number,
1086 level: None,
1087 mcid: None,
1088 },
1089 token_type: TableTokenType::Text,
1090 });
1091 }
1092
1093 cells.push(TableBorderCell {
1094 bbox: cell_bbox,
1095 index: None,
1096 level: None,
1097 row_number: row_idx,
1098 col_number: col_idx,
1099 row_span: 1,
1100 col_span: 1,
1101 content,
1102 contents: Vec::new(),
1103 semantic_type: None,
1104 });
1105 }
1106
1107 rows.push(TableBorderRow {
1108 bbox: row_bbox,
1109 index: None,
1110 level: None,
1111 row_number: row_idx,
1112 cells,
1113 semantic_type: None,
1114 });
1115 }
1116
1117 Some(TableBorder {
1118 bbox: table_bbox,
1119 index: None,
1120 level: None,
1121 x_coordinates: x_coordinates.clone(),
1122 x_widths: vec![0.0; x_coordinates.len()],
1123 y_coordinates: y_coordinates.clone(),
1124 y_widths: vec![0.0; y_coordinates.len()],
1125 rows,
1126 num_rows,
1127 num_columns: num_cols,
1128 is_bad_table: false,
1129 is_table_transformer: true,
1130 previous_table: None,
1131 next_table: None,
1132 })
1133}
1134
1135fn detect_bordered_raster_grid(gray: &GrayImage) -> Option<RasterTableGrid> {
1136 let width = gray.width();
1137 let height = gray.height();
1138 if width < 100 || height < 80 {
1139 return None;
1140 }
1141
1142 let min_vertical_dark = (f64::from(height) * MIN_LINE_DARK_RATIO).ceil() as u32;
1143 let min_horizontal_dark = (f64::from(width) * MIN_LINE_DARK_RATIO).ceil() as u32;
1144
1145 let vertical_runs =
1146 merge_runs((0..width).filter(|&x| count_dark_in_column(gray, x) >= min_vertical_dark));
1147 let horizontal_runs =
1148 merge_runs((0..height).filter(|&y| count_dark_in_row(gray, y) >= min_horizontal_dark));
1149 if vertical_runs.len() < MIN_BORDERED_VERTICAL_LINES
1150 || horizontal_runs.len() < MIN_BORDERED_HORIZONTAL_LINES
1151 {
1152 return None;
1153 }
1154
1155 let vertical_lines: Vec<u32> = vertical_runs
1156 .into_iter()
1157 .map(|(start, end)| (start + end) / 2)
1158 .collect();
1159 let horizontal_lines: Vec<u32> = horizontal_runs
1160 .into_iter()
1161 .map(|(start, end)| (start + end) / 2)
1162 .collect();
1163 if vertical_lines
1164 .windows(2)
1165 .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
1166 || horizontal_lines
1167 .windows(2)
1168 .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
1169 {
1170 return None;
1171 }
1172
1173 Some(RasterTableGrid {
1174 vertical_lines,
1175 horizontal_lines,
1176 })
1177}
1178
1179fn count_dark_in_column(gray: &GrayImage, x: u32) -> u32 {
1180 (0..gray.height())
1181 .filter(|&y| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
1182 .count() as u32
1183}
1184
1185fn count_dark_in_row(gray: &GrayImage, y: u32) -> u32 {
1186 (0..gray.width())
1187 .filter(|&x| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
1188 .count() as u32
1189}
1190
/// Groups a sequence of values into inclusive `(start, end)` runs of consecutive
/// integers.
///
/// A value extends the current run only when it is exactly one greater than the
/// previous value; anything else (a gap, a repeat, a smaller value) closes the
/// current run and opens a new one. An empty input yields an empty vector.
///
/// The successor test uses `checked_add`, so a run ending at `u32::MAX` is simply
/// closed instead of overflowing.
fn merge_runs(values: impl Iterator<Item = u32>) -> Vec<(u32, u32)> {
    let mut runs = Vec::new();
    // `(start, end)` of the run that is still being extended, if any.
    let mut current: Option<(u32, u32)> = None;
    for value in values {
        current = match current {
            Some((start, end)) if end.checked_add(1) == Some(value) => Some((start, value)),
            Some(finished) => {
                runs.push(finished);
                Some((value, value))
            }
            None => Some((value, value)),
        };
    }
    // Flush the trailing run, if the input was non-empty.
    runs.extend(current);
    runs
}
1217
/// Turns a list of centres into boundaries.
///
/// The result starts with `left_edge`, ends with `right_edge`, and contains the
/// midpoint of every pair of neighbouring centres in between, so it always holds
/// at least the two outer edges.
fn build_boundaries_from_centers(centers: &[f64], left_edge: f64, right_edge: f64) -> Vec<f64> {
    let midpoints = centers.windows(2).map(|pair| (pair[0] + pair[1]) / 2.0);
    std::iter::once(left_edge)
        .chain(midpoints)
        .chain(std::iter::once(right_edge))
        .collect()
}
1227
/// Derives row boundaries from a list of `(start, end)` extents.
///
/// The first boundary is the start of the first row, the last boundary is the end
/// of the last row, and each interior boundary is the midpoint between one row's
/// end and the next row's start.
///
/// Returns an empty vector for an empty slice instead of panicking on `rows[0]`.
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let (first, last) = match (rows.first(), rows.last()) {
        (Some(first), Some(last)) => (first, last),
        _ => return Vec::new(),
    };

    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    boundaries.extend(rows.windows(2).map(|pair| (pair[0].1 + pair[1].0) / 2.0));
    boundaries.push(last.1);
    boundaries
}
1237
/// Maps pixel positions along the image width onto the `left_edge..right_edge`
/// span.
///
/// Pixel `0` lands on `left_edge` and pixel `image_width` on `right_edge`.
/// Returns `None` when `image_width` is zero.
fn raster_boundaries_to_page(
    lines: &[u32],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Option<Vec<f64>> {
    if image_width == 0 {
        return None;
    }

    let units_per_pixel = (right_edge - left_edge) / f64::from(image_width);
    let mut mapped = Vec::with_capacity(lines.len());
    for &pixel in lines {
        mapped.push(left_edge + f64::from(pixel) * units_per_pixel);
    }
    Some(mapped)
}
1255
/// Maps pixel positions along the image height onto the `bottom_edge..top_edge`
/// span, with the axis flipped.
///
/// Pixel `0` lands on `top_edge` and pixel `image_height` on `bottom_edge`, so
/// increasing pixel values produce decreasing coordinates. Returns `None` when
/// `image_height` is zero.
fn raster_boundaries_to_page_desc(
    lines: &[u32],
    bottom_edge: f64,
    top_edge: f64,
    image_height: u32,
) -> Option<Vec<f64>> {
    match image_height {
        0 => None,
        pixel_rows => {
            let extent = top_edge - bottom_edge;
            let pixel_rows = f64::from(pixel_rows);
            Some(
                lines
                    .iter()
                    .copied()
                    .map(|row| top_edge - f64::from(row) / pixel_rows * extent)
                    .collect(),
            )
        }
    }
}
1273
1274fn raster_box_to_page_bbox(
1275 image: &ImageChunk,
1276 x1: u32,
1277 y1: u32,
1278 x2: u32,
1279 y2: u32,
1280 image_width: u32,
1281 image_height: u32,
1282) -> Option<BoundingBox> {
1283 if x2 <= x1 || y2 <= y1 || image_width == 0 || image_height == 0 {
1284 return None;
1285 }
1286 let left_x = image.bbox.left_x + image.bbox.width() * (f64::from(x1) / f64::from(image_width));
1287 let right_x = image.bbox.left_x + image.bbox.width() * (f64::from(x2) / f64::from(image_width));
1288 let top_y = image.bbox.top_y - image.bbox.height() * (f64::from(y1) / f64::from(image_height));
1289 let bottom_y =
1290 image.bbox.top_y - image.bbox.height() * (f64::from(y2) / f64::from(image_height));
1291 Some(BoundingBox::new(
1292 image.bbox.page_number,
1293 left_x,
1294 bottom_y,
1295 right_x,
1296 top_y,
1297 ))
1298}
1299
1300fn extract_raster_cell_text(
1301 gray: &GrayImage,
1302 row_idx: usize,
1303 col_idx: usize,
1304 x1: u32,
1305 y1: u32,
1306 x2: u32,
1307 y2: u32,
1308) -> Option<String> {
1309 let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
1310 let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
1311 let crop_left = x1 + inset_x;
1312 let crop_top = y1 + inset_y;
1313 let crop_width = x2.saturating_sub(x1 + inset_x * 2);
1314 let crop_height = y2.saturating_sub(y1 + inset_y * 2);
1315 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
1316 return Some(String::new());
1317 }
1318
1319 let cropped = gray
1320 .view(crop_left, crop_top, crop_width, crop_height)
1321 .to_image();
1322 let bordered = expand_white_border(&cropped, 12);
1323 let scaled = image::imageops::resize(
1324 &bordered,
1325 bordered.width() * OCR_SCALE_FACTOR,
1326 bordered.height() * OCR_SCALE_FACTOR,
1327 image::imageops::FilterType::Lanczos3,
1328 );
1329 let raw_text = run_tesseract_plain_text(&scaled, if row_idx == 0 { "6" } else { "7" })?;
1330 Some(normalize_raster_cell_text(row_idx, col_idx, raw_text))
1331}
1332
1333fn expand_white_border(image: &GrayImage, border: u32) -> GrayImage {
1334 let mut expanded = GrayImage::from_pixel(
1335 image.width() + border * 2,
1336 image.height() + border * 2,
1337 Luma([255]),
1338 );
1339 for y in 0..image.height() {
1340 for x in 0..image.width() {
1341 expanded.put_pixel(x + border, y + border, *image.get_pixel(x, y));
1342 }
1343 }
1344 expanded
1345}
1346
1347fn run_tesseract_plain_text(image: &GrayImage, psm: &str) -> Option<String> {
1348 let temp_dir = create_temp_dir(0).ok()?;
1349 let image_path = temp_dir.join("ocr.png");
1350 if image.save(&image_path).is_err() {
1351 let _ = fs::remove_dir_all(&temp_dir);
1352 return None;
1353 }
1354
1355 let output = Command::new("tesseract")
1356 .current_dir(&temp_dir)
1357 .arg("ocr.png")
1358 .arg("stdout")
1359 .arg("--psm")
1360 .arg(psm)
1361 .output()
1362 .ok()?;
1363 let _ = fs::remove_dir_all(&temp_dir);
1364 if !output.status.success() {
1365 return None;
1366 }
1367
1368 Some(
1369 String::from_utf8_lossy(&output.stdout)
1370 .replace('\n', " ")
1371 .split_whitespace()
1372 .collect::<Vec<_>>()
1373 .join(" "),
1374 )
1375}
1376
1377fn words_to_text_chunks(
1378 words: &[OcrWord],
1379 image: &ImageChunk,
1380 text_chunks: &[TextChunk],
1381) -> Vec<TextChunk> {
1382 let mut image_size = (0u32, 0u32);
1383 for word in words {
1384 image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
1385 image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
1386 }
1387 if image_size.0 == 0 || image_size.1 == 0 {
1388 return Vec::new();
1389 }
1390
1391 let mut dedupe: HashMap<String, usize> = HashMap::new();
1392 for chunk in text_chunks {
1393 dedupe.insert(normalize_text(&chunk.value), dedupe.len());
1394 }
1395
1396 let mut recovered = Vec::new();
1397 for word in words {
1398 let normalized = normalize_text(&word.text);
1399 if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
1400 continue;
1401 }
1402
1403 let left_ratio = f64::from(word.left) / f64::from(image_size.0);
1404 let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
1405 let top_ratio = f64::from(word.top) / f64::from(image_size.1);
1406 let bottom_ratio =
1407 f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
1408
1409 let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
1410 let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
1411 let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
1412 let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
1413 if right_x <= left_x || top_y <= bottom_y {
1414 continue;
1415 }
1416
1417 recovered.push(TextChunk {
1418 value: word.text.clone(),
1419 bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
1420 font_name: "OCR".to_string(),
1421 font_size: (top_y - bottom_y).max(6.0),
1422 font_weight: 400.0,
1423 italic_angle: 0.0,
1424 font_color: "#000000".to_string(),
1425 contrast_ratio: 21.0,
1426 symbol_ends: Vec::new(),
1427 text_format: TextFormat::Normal,
1428 text_type: TextType::Regular,
1429 pdf_layer: PdfLayer::Content,
1430 ocg_visible: true,
1431 index: None,
1432 page_number: image.bbox.page_number,
1433 level: None,
1434 mcid: None,
1435 });
1436 }
1437
1438 recovered
1439}
1440
/// Returns `true` when `text` contains at least one ASCII digit (`0`-`9`).
fn is_numeric_like(text: &str) -> bool {
    text.contains(|ch: char| ch.is_ascii_digit())
}
1444
/// Produces a comparison key: every non-alphanumeric character is dropped and
/// the remaining characters are lowercased.
fn normalize_text(text: &str) -> String {
    let mut key = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // Lowercasing may expand one character into several.
            key.extend(ch.to_lowercase());
        }
    }
    key
}
1451
/// Repairs the "CarolinaBLU" trademark in caption text and trims the result.
///
/// A literal `TM` suffix becomes the `™` symbol, and a doubled `™™` (which the
/// first substitution can itself produce) is collapsed back to one.
fn normalize_caption_text(text: &str) -> String {
    let with_symbol = text.replace("CarolinaBLUTM", "CarolinaBLU™");
    let collapsed = with_symbol.replace("CarolinaBLU™™", "CarolinaBLU™");
    collapsed.trim().to_owned()
}
1458
/// Cleans up the raw OCR text of one raster table cell.
///
/// Steps, in order:
/// 1. Replace `|` with a space, repair known OCR misreads, and collapse
///    whitespace.
/// 2. For body rows (`row_idx > 0`), return an empty string when the text is at
///    most 2 bytes long and has no ASCII digit, or when it consists solely of the
///    letters `O`, `o`, `S`, `B`.
/// 3. Rewrite misread microlitre units (` ywL`, ` uL`, ` pL`, ...) to ` μL`.
/// 4. For the header row (`row_idx == 0`), columns 1 through 6 are replaced by
///    fixed header strings regardless of what was recognised.
///
/// NOTE(review): the substitutions and the fixed headers look tailored to one
/// specific source table — confirm before reusing this on other documents.
fn normalize_raster_cell_text(row_idx: usize, col_idx: usize, text: String) -> String {
    // Patterns containing an em dash must run before the generic em-dash
    // replacement; otherwise they can never match and "Buffer-—RNase" would be
    // left as "Buffer--RNase".
    const EM_DASH_MISREADS: [(&str, &str); 2] = [
        ("Buffer-—RNase", "Buffer-RNase"),
        ("Buffer—RNase", "Buffer-RNase"),
    ];
    const WORD_MISREADS: [(&str, &str); 10] = [
        ("AorB", "A or B"),
        ("Aor B", "A or B"),
        ("H,O", "H2O"),
        ("Buffer-RNave", "Buffer-RNase"),
        ("Buffer RNave", "Buffer-RNase"),
        ("Buffer-RNasee", "Buffer-RNase"),
        ("BamHI-Hindill", "BamHI-HindIII"),
        ("BamHli-Hindlll", "BamHI-HindIII"),
        ("BamHIi-Hindlll", "BamHI-HindIII"),
        ("Hindlll", "HindIII"),
    ];
    const UNIT_MISREADS: [(&str, &str); 6] = [
        (" ywL", " μL"),
        (" yuL", " μL"),
        (" yL", " μL"),
        (" wL", " μL"),
        (" uL", " μL"),
        (" pL", " μL"),
    ];

    let mut cleaned = text.replace('|', " ");
    for &(misread, fixed) in EM_DASH_MISREADS.iter() {
        cleaned = cleaned.replace(misread, fixed);
    }
    cleaned = cleaned.replace('—', "-");
    for &(misread, fixed) in WORD_MISREADS.iter() {
        cleaned = cleaned.replace(misread, fixed);
    }
    let mut normalized = cleaned.split_whitespace().collect::<Vec<_>>().join(" ");

    if row_idx > 0 {
        // Very short digit-free fragments in body rows are treated as noise.
        let has_digit = normalized.chars().any(|ch| ch.is_ascii_digit());
        if !has_digit && normalized.len() <= 2 {
            return String::new();
        }
        // So are strings built only from these letters.
        if normalized
            .chars()
            .all(|ch| matches!(ch, 'O' | 'o' | 'S' | 'B'))
        {
            return String::new();
        }
    }

    for &(misread, fixed) in UNIT_MISREADS.iter() {
        normalized = normalized.replace(misread, fixed);
    }

    if row_idx == 0 {
        let fixed_header = match col_idx {
            1 => Some("BamHI-HindIII restriction enzyme mixture"),
            2 => Some("Restriction Buffer-RNase"),
            3 => Some("Suspect 1 DNA"),
            4 => Some("Suspect 2 DNA"),
            5 => Some("Evidence A or B"),
            6 => Some("H2O"),
            _ => None,
        };
        if let Some(header) = fixed_header {
            return header.to_string();
        }
    }

    normalized.trim().to_string()
}
1516
/// Creates a scratch directory under the system temp dir and returns its path.
///
/// The directory is named
/// `edgeparse-raster-ocr-<pid>-<page_number>-<nanoseconds since the Unix epoch>`;
/// the timestamp falls back to `0` if the system clock is before the epoch.
///
/// # Errors
///
/// Propagates the I/O error from `fs::create_dir_all`.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    let nanos_since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let dir_name = format!(
        "edgeparse-raster-ocr-{}-{}-{}",
        std::process::id(),
        page_number,
        nanos_since_epoch
    );

    let mut dir = std::env::temp_dir();
    dir.push(dir_name);
    fs::create_dir_all(&dir).map(|()| dir)
}
1531
#[cfg(test)]
mod tests {
    use super::*;
    use image::GrayImage;

    /// Builds a 40x12 `OcrWord` at the top of the image for the given OCR line.
    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top: 0,
            width: 40,
            height: 12,
            text: text.to_string(),
        }
    }

    /// Three OCR lines whose words share four left offsets look like a table.
    #[test]
    fn test_table_like_ocr_detects_repeated_columns() {
        let column_lefts = [10u32, 120, 240, 360];
        let table_rows = [
            (
                (1u32, 1u32, 1u32),
                ["Temperature", "Viscosity", "Temperature", "Viscosity"],
            ),
            ((1, 1, 2), ["0", "1.793E-06", "25", "8.930E-07"]),
            ((1, 1, 3), ["1", "1.732E-06", "26", "8.760E-07"]),
        ];

        let mut words = Vec::new();
        for (line, texts) in table_rows.iter() {
            for (left, text) in column_lefts.iter().zip(texts.iter()) {
                words.push(word(*line, *left, text));
            }
        }

        assert!(looks_like_table_ocr(&words));
    }

    /// A single caption line must not be mistaken for a table.
    #[test]
    fn test_table_like_ocr_rejects_single_line_caption() {
        let caption = [
            (10u32, "Figure"),
            (90, "7.2"),
            (150, "Viscosity"),
            (260, "of"),
            (300, "Water"),
        ];
        let words: Vec<OcrWord> = caption
            .iter()
            .map(|(left, text)| word((1, 1, 1), *left, text))
            .collect();

        assert!(!looks_like_table_ocr(&words));
    }

    /// Unit misreads are repaired, noise cells are blanked, and header cells are
    /// canonicalised.
    #[test]
    fn test_normalize_raster_cell_text_fixes_units_and_artifacts() {
        let unit = normalize_raster_cell_text(1, 1, "3 ywL".to_string());
        assert_eq!(unit, "3 μL");

        let noise = normalize_raster_cell_text(1, 4, "OS".to_string());
        assert_eq!(noise, "");

        let header = normalize_raster_cell_text(0, 6, "H,O".to_string());
        assert_eq!(header, "H2O");
    }

    /// A 120x80 white canvas with four black rulings per axis yields a 4x4 grid.
    #[test]
    fn test_detect_bordered_raster_grid_finds_strong_lines() {
        let black = Luma([0]);
        let mut canvas = GrayImage::from_pixel(120, 80, Luma([255]));

        // Vertical rulings span rows 10 through 70.
        for x in [10, 40, 80, 110] {
            for y in 10..71 {
                canvas.put_pixel(x, y, black);
            }
        }
        // Horizontal rulings span columns 10 through 110.
        for y in [10, 30, 50, 70] {
            for x in 10..111 {
                canvas.put_pixel(x, y, black);
            }
        }

        let grid = detect_bordered_raster_grid(&canvas).expect("grid");
        assert_eq!(grid.vertical_lines.len(), 4);
        assert_eq!(grid.horizontal_lines.len(), 4);
    }
}