1use std::collections::{BTreeMap, HashMap, HashSet};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::time::{SystemTime, UNIX_EPOCH};
8
9use crate::models::bbox::BoundingBox;
10use crate::models::chunks::{ImageChunk, TextChunk};
11use crate::models::enums::{PdfLayer, TextFormat, TextType};
12use crate::models::table::{
13 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
14};
15
/// An image must span at least this fraction of the page width to be OCR'd.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.45;
/// An image must cover at least this fraction of the page area to be OCR'd.
const MIN_IMAGE_AREA_RATIO: f64 = 0.045;
/// Cap on non-whitespace native-text characters overlapping the image; see
/// `is_ocr_candidate` — staying under EITHER this cap or the chunk cap below
/// keeps the image eligible for OCR.
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
/// Companion cap on the number of overlapping native text chunks.
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
/// Tesseract per-word confidence below which OCR words are discarded
/// (tesseract reports confidence on a 0-100 scale; -1 for non-word rows).
const MIN_OCR_WORD_CONFIDENCE: f64 = 35.0;
21
/// A single word recognized by tesseract, in image pixel coordinates.
#[derive(Debug, Clone)]
struct OcrWord {
    /// (block, paragraph, line) numbers from the tesseract TSV; words that
    /// share this key belong to the same OCR text line.
    line_key: (u32, u32, u32),
    /// Left edge of the word box, in pixels.
    left: u32,
    /// Top edge of the word box, in pixels.
    top: u32,
    /// Word box width, in pixels.
    width: u32,
    /// Word box height, in pixels.
    height: u32,
    /// The recognized text for this word.
    text: String,
}
31
/// A cluster of word horizontal centers, used to detect column positions
/// that repeat across OCR lines (a strong signal of tabular layout).
#[derive(Debug, Clone)]
struct XCluster {
    /// Running mean of member words' x-centers, in pixels.
    center: f64,
    /// Number of words merged into this cluster.
    count: usize,
    /// Distinct OCR lines that contributed at least one word.
    lines: HashSet<(u32, u32, u32)>,
}
38
/// Intermediate table row assembled from one OCR line, already mapped into
/// page coordinates (y grows upward here: `top_y` > `bottom_y`, as produced
/// by the `image.bbox.top_y - height * ratio` mapping in the builder).
#[derive(Clone)]
struct OcrRowBuild {
    /// Top of the row in page space.
    top_y: f64,
    /// Bottom of the row in page space.
    bottom_y: f64,
    /// One joined text string per detected column (possibly empty).
    cell_texts: Vec<String>,
}
45
46pub fn recover_raster_table_text_chunks(
48 input_path: &Path,
49 page_bbox: &BoundingBox,
50 page_number: u32,
51 text_chunks: &[TextChunk],
52 image_chunks: &[ImageChunk],
53) -> Vec<TextChunk> {
54 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
55 return Vec::new();
56 }
57
58 let candidates: Vec<&ImageChunk> = image_chunks
59 .iter()
60 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
61 .collect();
62 if candidates.is_empty() {
63 return Vec::new();
64 }
65
66 let temp_dir = match create_temp_dir(page_number) {
67 Ok(dir) => dir,
68 Err(_) => return Vec::new(),
69 };
70
71 let result =
72 recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
73
74 let _ = fs::remove_dir_all(&temp_dir);
75 result
76}
77
78pub fn recover_raster_table_borders(
80 input_path: &Path,
81 page_bbox: &BoundingBox,
82 page_number: u32,
83 text_chunks: &[TextChunk],
84 image_chunks: &[ImageChunk],
85) -> Vec<TableBorder> {
86 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
87 return Vec::new();
88 }
89
90 let candidates: Vec<&ImageChunk> = image_chunks
91 .iter()
92 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
93 .collect();
94 if candidates.is_empty() {
95 return Vec::new();
96 }
97
98 let temp_dir = match create_temp_dir(page_number) {
99 Ok(dir) => dir,
100 Err(_) => return Vec::new(),
101 };
102
103 let prefix = temp_dir.join("img");
104 let status = Command::new("pdfimages")
105 .arg("-f")
106 .arg(page_number.to_string())
107 .arg("-l")
108 .arg(page_number.to_string())
109 .arg("-png")
110 .arg(input_path)
111 .arg(&prefix)
112 .status();
113 match status {
114 Ok(s) if s.success() => {}
115 _ => {
116 let _ = fs::remove_dir_all(&temp_dir);
117 return Vec::new();
118 }
119 }
120
121 let mut image_files: Vec<PathBuf> = match fs::read_dir(&temp_dir) {
122 Ok(read_dir) => read_dir
123 .filter_map(|entry| entry.ok().map(|e| e.path()))
124 .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
125 .collect(),
126 Err(_) => {
127 let _ = fs::remove_dir_all(&temp_dir);
128 return Vec::new();
129 }
130 };
131 image_files.sort();
132
133 let mut tables = Vec::new();
134 for image in candidates {
135 let Some(image_index) = image.index else {
136 continue;
137 };
138 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
139 continue;
140 };
141 let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
142 continue;
143 };
144 let Ok(tsv_output) = Command::new("tesseract")
145 .current_dir(&temp_dir)
146 .arg(file_name)
147 .arg("stdout")
148 .arg("--psm")
149 .arg("6")
150 .arg("tsv")
151 .output()
152 else {
153 continue;
154 };
155 if !tsv_output.status.success() {
156 continue;
157 }
158
159 let tsv = String::from_utf8_lossy(&tsv_output.stdout);
160 let words = parse_tesseract_tsv(&tsv);
161 if looks_like_numeric_table_ocr(&words) {
162 if let Some(table) = build_numeric_table_border(&words, image) {
163 tables.push(table);
164 }
165 }
166 }
167
168 let _ = fs::remove_dir_all(&temp_dir);
169 tables
170}
171
/// Extracts this page's embedded images with `pdfimages`, OCRs each candidate
/// image with `tesseract` (TSV output), and converts table-like OCR words
/// back into page-space `TextChunk`s.
///
/// Best-effort: any tooling failure makes the whole call return empty.
fn recover_from_page_images(
    input_path: &Path,
    temp_dir: &Path,
    page_number: u32,
    candidates: Vec<&ImageChunk>,
    text_chunks: &[TextChunk],
) -> Vec<TextChunk> {
    // Dump only this page's images as PNGs with the "img" prefix.
    let prefix = temp_dir.join("img");
    let status = Command::new("pdfimages")
        .arg("-f")
        .arg(page_number.to_string())
        .arg("-l")
        .arg(page_number.to_string())
        .arg("-png")
        .arg(input_path)
        .arg(&prefix)
        .status();
    match status {
        Ok(s) if s.success() => {}
        _ => return Vec::new(),
    }

    // Collect the PNGs pdfimages produced; lexicographic sort restores the
    // extraction order so the 1-based `ImageChunk::index` addresses them.
    let mut image_files: Vec<PathBuf> = match fs::read_dir(temp_dir) {
        Ok(read_dir) => read_dir
            .filter_map(|entry| entry.ok().map(|e| e.path()))
            .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
            .collect(),
        Err(_) => return Vec::new(),
    };
    image_files.sort();
    if image_files.is_empty() {
        return Vec::new();
    }

    let mut recovered = Vec::new();
    for image in candidates {
        // Images without an extraction index cannot be matched to a file.
        let Some(image_index) = image.index else {
            continue;
        };
        let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
            continue;
        };
        let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
            continue;
        };
        // --psm 6: assume a single uniform block of text, which suits tables.
        let Ok(tsv_output) = Command::new("tesseract")
            .current_dir(temp_dir)
            .arg(file_name)
            .arg("stdout")
            .arg("--psm")
            .arg("6")
            .arg("tsv")
            .output()
        else {
            continue;
        };
        if !tsv_output.status.success() {
            continue;
        }

        let tsv = String::from_utf8_lossy(&tsv_output.stdout);
        let words = parse_tesseract_tsv(&tsv);
        // Only emit text when the OCR layout actually looks tabular;
        // otherwise we would sprinkle OCR noise over ordinary figures.
        if !looks_like_table_ocr(&words) {
            continue;
        }

        recovered.extend(words_to_text_chunks(&words, image, text_chunks));
    }

    recovered
}
243
244fn is_ocr_candidate(
245 image: &ImageChunk,
246 page_bbox: &BoundingBox,
247 text_chunks: &[TextChunk],
248) -> bool {
249 let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
250 let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
251 if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
252 return false;
253 }
254
255 let overlapping_chunks: Vec<&TextChunk> = text_chunks
256 .iter()
257 .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
258 .collect();
259 let native_text_chars: usize = overlapping_chunks
260 .iter()
261 .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
262 .sum();
263
264 native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
265 || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
266}
267
268fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
269 let mut words = Vec::new();
270 for line in tsv.lines().skip(1) {
271 let mut cols = line.splitn(12, '\t');
272 let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
273 if level != 5 {
274 continue;
275 }
276 let _page_num = cols.next();
277 let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
278 let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
279 let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
280 let _word_num = cols.next();
281 let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
282 let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
283 let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
284 let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
285 let confidence = cols
286 .next()
287 .and_then(|s| s.parse::<f64>().ok())
288 .unwrap_or(-1.0);
289 let text = cols.next().unwrap_or("").trim().to_string();
290 if confidence < MIN_OCR_WORD_CONFIDENCE
291 || text.is_empty()
292 || width == 0
293 || height == 0
294 || !text.chars().any(|ch| ch.is_alphanumeric())
295 {
296 continue;
297 }
298 words.push(OcrWord {
299 line_key: (block_num, par_num, line_num),
300 left,
301 top,
302 width,
303 height,
304 text,
305 });
306 }
307 words
308}
309
/// Heuristic: does this set of OCR words look like a table?
///
/// Requires multiple lines whose word x-centers align on at least three
/// column positions repeated across lines, with extra leniency for
/// numeric-heavy content.
fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
    if words.len() < 8 {
        return false;
    }

    // Group words into their OCR lines (block, paragraph, line).
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // Keep lines that plausibly hold table cells: 3+ words, or 2+ numeric
    // words. Track the overall right extent for the tolerance below.
    let mut qualifying_lines = Vec::new();
    let mut numeric_like_count = 0usize;
    let mut max_right = 0u32;
    for line_words in by_line.values_mut() {
        line_words.sort_by_key(|word| word.left);
        let numeric_words = line_words
            .iter()
            .filter(|word| is_numeric_like(&word.text))
            .count();
        numeric_like_count += numeric_words;
        if line_words.len() >= 3 || numeric_words >= 2 {
            max_right = max_right.max(
                line_words
                    .iter()
                    .map(|word| word.left.saturating_add(word.width))
                    .max()
                    .unwrap_or(0),
            );
            qualifying_lines.push(line_words.clone());
        }
    }

    if qualifying_lines.len() < 2 {
        return false;
    }

    // Cluster word x-centers. Tolerance scales with the text extent (3.5%)
    // so wide and narrow scans behave comparably, floored at 18 px.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
    let mut clusters: Vec<XCluster> = Vec::new();
    for line in &qualifying_lines {
        for word in line {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                // Fold the new center into the cluster's running mean.
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // A column position only counts if it repeats across 2+ distinct lines;
    // we need at least three such columns for tabular structure.
    let repeated_clusters: Vec<&XCluster> = clusters
        .iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .collect();
    if repeated_clusters.len() < 3 {
        return false;
    }

    // Count lines whose words land on enough repeated columns to read as
    // table rows.
    let repeated_centers: Vec<f64> = repeated_clusters
        .iter()
        .map(|cluster| cluster.center)
        .collect();
    let structured_lines = qualifying_lines
        .iter()
        .filter(|line| {
            let mut seen = HashSet::<usize>::new();
            for word in *line {
                let center = f64::from(word.left) + f64::from(word.width) / 2.0;
                for (idx, repeated_center) in repeated_centers.iter().enumerate() {
                    if (center - repeated_center).abs() <= tolerance {
                        seen.insert(idx);
                    }
                }
            }
            // A row needs 3 aligned columns, or 2 aligned columns plus 2
            // numeric-looking words.
            seen.len() >= 3
                || (seen.len() >= 2
                    && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
        })
        .count();

    // Final verdict: 3+ structured rows, or 2 rows backed by heavy numeric
    // content spread over 4+ repeated columns.
    structured_lines >= 3
        || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
}
404
405fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
406 if !looks_like_table_ocr(words) {
407 return false;
408 }
409
410 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
411 for word in words {
412 by_line.entry(word.line_key).or_default().push(word);
413 }
414
415 let numeric_like_count = words
416 .iter()
417 .filter(|word| is_numeric_like(&word.text))
418 .count();
419 let numeric_lines = by_line
420 .values()
421 .filter(|line| {
422 line.iter()
423 .filter(|word| is_numeric_like(&word.text))
424 .count()
425 >= 2
426 })
427 .count();
428
429 numeric_like_count >= 12 && numeric_lines >= 3
430}
431
/// Builds a `TableBorder` (grid lines + cells + OCR cell text) from
/// table-like OCR words, mapping pixel geometry into the page-space bbox of
/// `image`.
///
/// Returns `None` when the words do not yield at least three repeated column
/// positions and two usable rows.
fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
    // Approximate the raster's pixel extent from the word boxes themselves
    // (the bitmap's true size is not available here).
    let image_width = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()?;
    let image_height = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()?;
    if image_width == 0 || image_height == 0 {
        return None;
    }

    // Group words into their OCR lines (block, paragraph, line).
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // Same tolerance rule as looks_like_table_ocr: 3.5% of the text's right
    // extent, floored at 18 px.
    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);

    // Cluster word x-centers to discover candidate column positions.
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                // Fold the new center into the cluster's running mean.
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }
    // Keep only columns repeated across 2+ lines; need at least three,
    // ordered left to right.
    let mut centers: Vec<f64> = clusters
        .into_iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .map(|cluster| cluster.center)
        .collect();
    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    if centers.len() < 3 {
        return None;
    }

    // Turn each OCR line into a row: assign every word to its nearest column
    // (within tolerance) and keep lines that fill enough cells.
    let mut built_rows = Vec::<OcrRowBuild>::new();
    for line_words in by_line.values() {
        let mut sorted_words = line_words.clone();
        sorted_words.sort_by_key(|word| word.left);

        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
        for word in &sorted_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some((col_idx, distance)) = centers
                .iter()
                .enumerate()
                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
            {
                // Words too far from any column are dropped, not forced in.
                if distance <= tolerance {
                    cells[col_idx].push(word);
                }
            }
        }

        // Skip sparse lines: need 3 filled cells or 2 numeric cells.
        let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
        let numeric_cells = cells
            .iter()
            .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
            .count();
        if filled_cells < 3 && numeric_cells < 2 {
            continue;
        }

        // Map the row's pixel extent into page space: pixel y grows downward
        // from the image's top edge, so both edges are subtracted from top_y.
        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom_px = sorted_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        let top_y =
            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
        let bottom_y = image.bbox.top_y
            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
        // Join each cell's words with single spaces.
        let cell_texts = cells
            .iter()
            .map(|cell_words| {
                cell_words
                    .iter()
                    .map(|word| word.text.as_str())
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .collect();
        built_rows.push(OcrRowBuild {
            top_y,
            bottom_y,
            cell_texts,
        });
    }

    if built_rows.len() < 2 {
        return None;
    }

    // Order rows top-to-bottom (descending top_y in page space).
    built_rows.sort_by(|a, b| {
        b.top_y
            .partial_cmp(&a.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Grid lines: column boundaries at midpoints between column centers; row
    // boundaries at midpoints between adjacent rows.
    let x_coordinates =
        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
    let row_bounds: Vec<(f64, f64)> = built_rows
        .iter()
        .map(|row| (row.top_y, row.bottom_y))
        .collect();
    let y_coordinates = build_row_boundaries(&row_bounds);
    // Defensive: both boundary builders should yield exactly n + 1 lines.
    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
        return None;
    }

    // Materialize rows and cells; each non-empty cell carries one synthetic
    // OCR text token.
    let mut rows = Vec::new();
    for (row_idx, row_build) in built_rows.iter().enumerate() {
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::new();
        for col_idx in 0..centers.len() {
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            let text = row_build
                .cell_texts
                .get(col_idx)
                .cloned()
                .unwrap_or_default();
            let mut content = Vec::new();
            if !text.trim().is_empty() {
                content.push(TableToken {
                    base: TextChunk {
                        value: text.trim().to_string(),
                        bbox: cell_bbox.clone(),
                        // Synthetic styling: provenance is flagged via the
                        // font name; size is approximated from row height.
                        font_name: "OCR".to_string(),
                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
                        font_weight: 400.0,
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }
            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }
        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    Some(TableBorder {
        bbox: image.bbox.clone(),
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        // No visible rules were measured from the raster; zero widths mark
        // the grid lines as synthetic.
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows: built_rows.len(),
        num_columns: centers.len(),
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
654
/// Converts column centers into column boundary x-positions: the outer edges
/// plus the midpoint between each adjacent pair of centers. Always returns
/// `centers.len() + 1` values for a non-empty input (2 values when empty).
fn build_boundaries_from_centers(centers: &[f64], left_edge: f64, right_edge: f64) -> Vec<f64> {
    let midpoints = centers.windows(2).map(|pair| (pair[0] + pair[1]) / 2.0);
    std::iter::once(left_edge)
        .chain(midpoints)
        .chain(std::iter::once(right_edge))
        .collect()
}
664
/// Converts row (top, bottom) extents into horizontal boundary y-positions:
/// the first row's top, the midpoint of each adjacent gap, and the last
/// row's bottom — `rows.len() + 1` values.
///
/// Returns an empty vector for an empty slice (the previous version indexed
/// `rows[0]` unconditionally and panicked on empty input).
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let Some((first, last)) = rows.first().zip(rows.last()) else {
        return Vec::new();
    };
    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    for pair in rows.windows(2) {
        // Boundary between two rows: halfway between the upper row's bottom
        // and the lower row's top.
        boundaries.push((pair[0].1 + pair[1].0) / 2.0);
    }
    boundaries.push(last.1);
    boundaries
}
674
675fn words_to_text_chunks(
676 words: &[OcrWord],
677 image: &ImageChunk,
678 text_chunks: &[TextChunk],
679) -> Vec<TextChunk> {
680 let mut image_size = (0u32, 0u32);
681 for word in words {
682 image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
683 image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
684 }
685 if image_size.0 == 0 || image_size.1 == 0 {
686 return Vec::new();
687 }
688
689 let mut dedupe: HashMap<String, usize> = HashMap::new();
690 for chunk in text_chunks {
691 dedupe.insert(normalize_text(&chunk.value), dedupe.len());
692 }
693
694 let mut recovered = Vec::new();
695 for word in words {
696 let normalized = normalize_text(&word.text);
697 if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
698 continue;
699 }
700
701 let left_ratio = f64::from(word.left) / f64::from(image_size.0);
702 let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
703 let top_ratio = f64::from(word.top) / f64::from(image_size.1);
704 let bottom_ratio =
705 f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
706
707 let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
708 let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
709 let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
710 let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
711 if right_x <= left_x || top_y <= bottom_y {
712 continue;
713 }
714
715 recovered.push(TextChunk {
716 value: word.text.clone(),
717 bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
718 font_name: "OCR".to_string(),
719 font_size: (top_y - bottom_y).max(6.0),
720 font_weight: 400.0,
721 italic_angle: 0.0,
722 font_color: "#000000".to_string(),
723 contrast_ratio: 21.0,
724 symbol_ends: Vec::new(),
725 text_format: TextFormat::Normal,
726 text_type: TextType::Regular,
727 pdf_layer: PdfLayer::Content,
728 ocg_visible: true,
729 index: None,
730 page_number: image.bbox.page_number,
731 level: None,
732 mcid: None,
733 });
734 }
735
736 recovered
737}
738
/// True when the text carries at least one ASCII digit (values like
/// "1.79E-06", "25%", or "Q3" all qualify).
fn is_numeric_like(text: &str) -> bool {
    text.bytes().any(|b| b.is_ascii_digit())
}
742
/// Canonical form for duplicate detection: keeps only alphanumeric
/// characters, lowercased (Unicode-aware, so a char may expand to several).
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
749
/// Creates a scratch directory under the system temp dir, named from the
/// process id, page number, and a nanosecond timestamp so concurrent runs
/// do not collide.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    // Clock-before-epoch failures degrade to 0 rather than erroring out.
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_nanos())
        .unwrap_or_default();
    let dir_name = format!(
        "edgeparse-raster-ocr-{}-{}-{}",
        std::process::id(),
        page_number,
        nanos
    );
    let dir = std::env::temp_dir().join(dir_name);
    fs::create_dir_all(&dir)?;
    Ok(dir)
}
764
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fixed-size (40x12 px) OCR word at `left` on the given line.
    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top: 0,
            width: 40,
            height: 12,
            text: text.to_string(),
        }
    }

    /// Three lines sharing four x-positions must classify as a table.
    #[test]
    fn test_table_like_ocr_detects_repeated_columns() {
        let words = vec![
            word((1, 1, 1), 10, "Temperature"),
            word((1, 1, 1), 120, "Viscosity"),
            word((1, 1, 1), 240, "Temperature"),
            word((1, 1, 1), 360, "Viscosity"),
            word((1, 1, 2), 10, "0"),
            word((1, 1, 2), 120, "1.793E-06"),
            word((1, 1, 2), 240, "25"),
            word((1, 1, 2), 360, "8.930E-07"),
            word((1, 1, 3), 10, "1"),
            word((1, 1, 3), 120, "1.732E-06"),
            word((1, 1, 3), 240, "26"),
            word((1, 1, 3), 360, "8.760E-07"),
        ];
        assert!(looks_like_table_ocr(&words));
    }

    /// A one-line figure caption has no repeated columns across lines and
    /// must be rejected (also falls under the 8-word minimum).
    #[test]
    fn test_table_like_ocr_rejects_single_line_caption() {
        let words = vec![
            word((1, 1, 1), 10, "Figure"),
            word((1, 1, 1), 90, "7.2"),
            word((1, 1, 1), 150, "Viscosity"),
            word((1, 1, 1), 260, "of"),
            word((1, 1, 1), 300, "Water"),
        ];
        assert!(!looks_like_table_ocr(&words));
    }
}