1use std::collections::{BTreeMap, HashMap, HashSet};
4use std::env;
5use std::fs;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8use std::sync::OnceLock;
9use std::time::{SystemTime, UNIX_EPOCH};
10
11use image::{GenericImageView, GrayImage, Luma};
12use serde::Deserialize;
13
14use crate::models::bbox::BoundingBox;
15use crate::models::chunks::{ImageChunk, TextChunk};
16use crate::models::content::ContentElement;
17use crate::models::enums::{PdfLayer, TextFormat, TextType};
18use crate::models::table::{
19 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
20};
21
// ---------------------------------------------------------------------------
// Tuning thresholds for raster OCR recovery.
//
// Constants without an individual comment are consumed by helpers elsewhere in
// this module; NOTE(review): confirm their exact meaning at the use site
// before changing them.
// ---------------------------------------------------------------------------

/// Minimum image-width / page-width ratio for an image to be an OCR candidate.
const MIN_IMAGE_WIDTH_RATIO: f64 = 0.40;
/// Minimum image-area / page-area ratio for an image to be an OCR candidate.
const MIN_IMAGE_AREA_RATIO: f64 = 0.035;
/// Upper bound on non-whitespace native characters overlapping a candidate image.
const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
/// Upper bound on native text chunks overlapping a candidate image.
const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
/// Tesseract TSV words with a confidence below this value are discarded.
const MIN_OCR_WORD_CONFIDENCE: f64 = 6.0;
/// Tesseract TSV words with a confidence above this value are discarded.
const MAX_OCR_WORD_CONFIDENCE: f64 = 101.0;
const RASTER_DARK_THRESHOLD: u8 = 180;
const RASTER_CHART_INK_THRESHOLD: u8 = 240;
const MIN_BORDERED_VERTICAL_LINES: usize = 3;
const MIN_BORDERED_HORIZONTAL_LINES: usize = 3;
const MIN_LINE_DARK_RATIO: f64 = 0.28;
/// Crops narrower or shorter than this many pixels are not sent to OCR.
const MIN_CELL_SIZE_PX: u32 = 10;
/// Pixels trimmed from each side of a cell crop (and doubled as table-crop padding).
const CELL_INSET_PX: u32 = 5;
/// White margin, in pixels, added around a table crop before OCR.
const TABLE_RASTER_OCR_BORDER_PX: u32 = 14;
/// DPI value passed to Tesseract via `--dpi` for unscaled images.
const PDFTOPPM_DPI: u32 = 150;
/// Integer upscaling factor applied to crops before they are handed to OCR.
const OCR_SCALE_FACTOR: u32 = 2;
/// DPI of an image after upscaling by `OCR_SCALE_FACTOR`.
const TESSERACT_EFFECTIVE_DPI: u32 = PDFTOPPM_DPI * OCR_SCALE_FACTOR;
/// Minimum image-width / page-width ratio for a "dominant" image.
const MIN_DOMINANT_IMAGE_WIDTH_RATIO: f64 = 0.65;
/// Minimum image-area / page-area ratio for a "dominant" image.
const MIN_DOMINANT_IMAGE_AREA_RATIO: f64 = 0.40;
/// Upper bound on non-whitespace native characters overlapping a dominant image.
const MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE: usize = 80;
const MIN_DOMINANT_IMAGE_OCR_WORDS: usize = 18;
const MIN_DOMINANT_IMAGE_TEXT_LINES: usize = 6;
const MIN_DENSE_PROSE_BLOCK_LINES: usize = 3;
const MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO: f64 = 0.32;
const MIN_TRUE_GRID_LINE_CONTINUITY: f64 = 0.60;
/// A table is skipped for page-raster OCR when both the page and the table
/// region hold more native characters than this.
const MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR: usize = 180;
/// Minimum fraction of the page area the candidate tables must cover before
/// the page is rasterised.
const MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR: f64 = 0.08;
/// Maximum number of tables per page considered for page-raster OCR.
const MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR: usize = 24;
const LOCAL_BINARIZATION_RADIUS: u32 = 14;
const MIN_BINARIZATION_BLOCK_PIXELS: usize = 81;
/// Tables whose share of text-bearing cells is below this ratio need raster OCR.
const MIN_RASTER_TABLE_TEXT_CELL_RATIO: f64 = 0.05;
const MIN_RASTER_TABLE_ROWS_WITH_TEXT: usize = 1;
const MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO: f64 = 0.40;
const MIN_BORDERED_CELL_DARK_RATIO: f64 = 0.03;
const MIN_BORDERED_INKED_CELL_RATIO: f64 = 0.18;
const MIN_BORDERED_ROWS_WITH_INK: usize = 2;
const MAX_BORDERED_TABLE_PER_CELL_FALLBACK_CELLS: usize = 24;
const MIN_BRIGHT_PHOTO_MID_TONE_RATIO: f64 = 0.24;
const MIN_BRIGHT_PHOTO_HISTOGRAM_BINS: usize = 8;
const MIN_BRIGHT_PHOTO_ENTROPY: f64 = 1.6;
74
/// One recognised word together with its box in OCR-image pixel coordinates.
#[derive(Debug, Clone)]
struct OcrWord {
    /// Identifies the text line the word belongs to: `(block, paragraph, line)`
    /// for Tesseract TSV rows, `(0, line index, 0)` for RapidOCR lines.
    line_key: (u32, u32, u32),
    /// Left edge of the word box, in pixels.
    left: u32,
    /// Top edge of the word box, in pixels.
    top: u32,
    /// Width of the word box, in pixels.
    width: u32,
    /// Height of the word box, in pixels.
    height: u32,
    /// Recognised text of the word.
    text: String,
    /// Engine-reported confidence (the Tesseract `conf` column, or the
    /// RapidOCR line score copied to every word of that line).
    confidence: f64,
}
85
/// A cluster of x positions with per-line membership.
/// NOTE(review): presumably used for column detection in OCR tables — the
/// consumer is outside this part of the file; confirm there.
#[derive(Debug, Clone)]
struct XCluster {
    /// Representative x coordinate of the cluster.
    center: f64,
    /// Number of samples merged into the cluster.
    count: usize,
    /// Line keys (same shape as `OcrWord::line_key`) that contributed a sample.
    lines: HashSet<(u32, u32, u32)>,
}
92
/// Intermediate row record: a vertical extent plus one text per cell.
/// NOTE(review): the builder that fills this is outside this part of the
/// file; coordinate space and orientation should be confirmed there.
#[derive(Clone)]
struct OcrRowBuild {
    /// Upper y bound of the row.
    top_y: f64,
    /// Lower y bound of the row.
    bottom_y: f64,
    /// Text collected for each cell of the row, in column order (presumably).
    cell_texts: Vec<String>,
}
99
/// A table cell without a text token, located both in the table
/// (`row_idx`/`cell_idx`) and on the rendered page raster (pixel box).
#[derive(Debug, Clone)]
struct EmptyCellRaster {
    /// Index of the cell's row in `TableBorder::rows`.
    row_idx: usize,
    /// Index of the cell within its row's `cells`.
    cell_idx: usize,
    /// Left pixel column of the cell (inclusive).
    x1: u32,
    /// Top pixel row of the cell (inclusive).
    y1: u32,
    /// Right pixel column of the cell (exclusive when testing word centres).
    x2: u32,
    /// Bottom pixel row of the cell (exclusive when testing word centres).
    y2: u32,
}
109
/// Positions of the ruling lines of a table found in a raster image.
/// NOTE(review): presumably pixel coordinates, sorted ascending — the producer
/// is outside this part of the file; confirm there.
#[derive(Debug, Clone)]
struct RasterTableGrid {
    /// Positions of vertical lines (x coordinates).
    vertical_lines: Vec<u32>,
    /// Positions of horizontal lines (y coordinates).
    horizontal_lines: Vec<u32>,
}
115
/// OCR output paired with a numeric score.
/// NOTE(review): presumably used to pick the best of several OCR passes — the
/// scoring code is outside this part of the file; confirm there.
#[derive(Debug, Clone)]
struct OcrCandidateScore {
    /// Words produced by one OCR pass.
    words: Vec<OcrWord>,
    /// Score assigned to that pass.
    score: f64,
}
121
/// One entry of an image listing.
/// NOTE(review): the name suggests a row of `pdfimages -list` output — the
/// parser is outside this part of the file; confirm there.
#[derive(Debug, Clone)]
struct PdfImagesListEntry {
    /// Value of the entry's type column.
    image_type: String,
}
126
/// OCR backend selected for this process (see `selected_ocr_engine`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OcrEngine {
    /// The `tesseract` command-line tool; the default backend.
    Tesseract,
    /// RapidOCR, driven through a Python interpreter.
    RapidOcr,
}
132
/// One text line as emitted (JSON) by the `RAPIDOCR_RUNNER` script.
#[derive(Debug, Deserialize)]
struct RapidOcrLine {
    /// Minimum x of the detected box, in pixels.
    left: u32,
    /// Minimum y of the detected box, in pixels.
    top: u32,
    /// Box width in pixels (the script emits at least 1).
    width: u32,
    /// Box height in pixels (the script emits at least 1).
    height: u32,
    /// Recognised text of the whole line.
    text: String,
    /// Recognition score reported by RapidOCR for the line.
    confidence: f64,
}
142
/// Process-wide cache of the engine choice; filled on first use by
/// `selected_ocr_engine`.
static OCR_ENGINE: OnceLock<OcrEngine> = OnceLock::new();
/// Process-wide cache of the Python command that can `import rapidocr`
/// (`None` when no candidate interpreter worked); filled on first use by
/// `rapidocr_python_command`.
static RAPIDOCR_PYTHON: OnceLock<Option<String>> = OnceLock::new();
145
/// Python program passed to the interpreter with `-c`.
///
/// It runs RapidOCR on the image path given as the first argument and prints a
/// JSON array of `{left, top, width, height, text, confidence}` objects (one
/// per non-blank text line) on stdout, or `[]` when the engine returns `None`.
const RAPIDOCR_RUNNER: &str = r#"
import json, sys
from rapidocr import RapidOCR

engine = RapidOCR()
result = engine(sys.argv[1], use_det=True, use_cls=True, use_rec=True)

if result is None:
    print('[]')
    raise SystemExit(0)

boxes = getattr(result, 'boxes', []) or []
txts = getattr(result, 'txts', []) or []
scores = getattr(result, 'scores', []) or []
out = []
for box, text, score in zip(boxes, txts, scores):
    if not text or not str(text).strip():
        continue
    xs = [pt[0] for pt in box]
    ys = [pt[1] for pt in box]
    out.append({
        'left': int(min(xs)),
        'top': int(min(ys)),
        'width': max(1, int(max(xs) - min(xs))),
        'height': max(1, int(max(ys) - min(ys))),
        'text': str(text),
        'confidence': float(score),
    })
print(json.dumps(out, ensure_ascii=False))
"#;
176
177fn selected_ocr_engine() -> OcrEngine {
178 *OCR_ENGINE.get_or_init(|| match env::var("EDGEPARSE_OCR_ENGINE") {
179 Ok(value) => match value.to_ascii_lowercase().as_str() {
180 "rapidocr" if rapidocr_python_command().is_some() => OcrEngine::RapidOcr,
181 "rapidocr" => OcrEngine::Tesseract,
182 _ => OcrEngine::Tesseract,
183 },
184 Err(_) => OcrEngine::Tesseract,
185 })
186}
187
188fn rapidocr_python_command() -> Option<&'static str> {
189 RAPIDOCR_PYTHON
190 .get_or_init(|| {
191 let preferred = env::var("EDGEPARSE_OCR_PYTHON").ok();
192 let mut candidates = Vec::new();
193 if let Some(cmd) = preferred {
194 candidates.push(cmd);
195 }
196 candidates.push("python3".to_string());
197 candidates.push("python".to_string());
198
199 for candidate in candidates {
200 let ok = Command::new(&candidate)
201 .arg("-c")
202 .arg("import rapidocr")
203 .output()
204 .ok()
205 .is_some_and(|out| out.status.success());
206 if ok {
207 return Some(candidate);
208 }
209 }
210 None
211 })
212 .as_deref()
213}
214
215fn rapidocr_lines_to_words(lines: Vec<RapidOcrLine>) -> Vec<OcrWord> {
216 let mut words = Vec::new();
217
218 for (line_idx, line) in lines.into_iter().enumerate() {
219 let tokens: Vec<&str> = line.text.split_whitespace().collect();
220 if tokens.is_empty() {
221 continue;
222 }
223
224 let total_chars: u32 = tokens
225 .iter()
226 .map(|token| token.chars().count() as u32)
227 .sum();
228 if total_chars == 0 {
229 continue;
230 }
231
232 let mut cursor = line.left;
233 let mut remaining_width = line.width.max(tokens.len() as u32);
234 let mut remaining_chars = total_chars;
235
236 for (token_idx, token) in tokens.iter().enumerate() {
237 let token_chars = token.chars().count() as u32;
238 let width = if token_idx == tokens.len() - 1 || remaining_chars <= token_chars {
239 remaining_width.max(1)
240 } else {
241 let proportional = ((remaining_width as f64) * (token_chars as f64)
242 / (remaining_chars as f64))
243 .round() as u32;
244 proportional.max(1).min(remaining_width)
245 };
246
247 words.push(OcrWord {
248 line_key: (0, line_idx as u32, 0),
249 left: cursor,
250 top: line.top,
251 width,
252 height: line.height.max(1),
253 text: (*token).to_string(),
254 confidence: line.confidence,
255 });
256
257 cursor = cursor.saturating_add(width);
258 remaining_width = remaining_width.saturating_sub(width);
259 remaining_chars = remaining_chars.saturating_sub(token_chars);
260 }
261 }
262
263 words
264}
265
266fn run_rapidocr_words(image: &GrayImage) -> Option<Vec<OcrWord>> {
267 let python = rapidocr_python_command()?;
268 let temp_dir = create_temp_dir(0).ok()?;
269 let image_path = temp_dir.join("ocr.png");
270 if image.save(&image_path).is_err() {
271 let _ = fs::remove_dir_all(&temp_dir);
272 return None;
273 }
274
275 let output = Command::new(python)
276 .current_dir(&temp_dir)
277 .arg("-c")
278 .arg(RAPIDOCR_RUNNER)
279 .arg("ocr.png")
280 .output()
281 .ok()?;
282 let _ = fs::remove_dir_all(&temp_dir);
283 if !output.status.success() {
284 return None;
285 }
286
287 let json = String::from_utf8_lossy(&output.stdout);
288 let lines: Vec<RapidOcrLine> = serde_json::from_str(&json).ok()?;
289 let words = rapidocr_lines_to_words(lines);
290 (!words.is_empty()).then_some(words)
291}
292
293pub fn recover_raster_table_text_chunks(
295 input_path: &Path,
296 page_bbox: &BoundingBox,
297 page_number: u32,
298 text_chunks: &[TextChunk],
299 image_chunks: &[ImageChunk],
300) -> Vec<TextChunk> {
301 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
302 return Vec::new();
303 }
304
305 let candidates: Vec<&ImageChunk> = image_chunks
306 .iter()
307 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
308 .collect();
309 if candidates.is_empty() {
310 return Vec::new();
311 }
312
313 let temp_dir = match create_temp_dir(page_number) {
314 Ok(dir) => dir,
315 Err(_) => return Vec::new(),
316 };
317
318 let result =
319 recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
320
321 let _ = fs::remove_dir_all(&temp_dir);
322 result
323}
324
325pub fn recover_dominant_image_text_chunks(
332 input_path: &Path,
333 page_bbox: &BoundingBox,
334 page_number: u32,
335 text_chunks: &[TextChunk],
336 image_chunks: &[ImageChunk],
337) -> Vec<TextChunk> {
338 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
339 return Vec::new();
340 }
341
342 let candidates: Vec<&ImageChunk> = image_chunks
343 .iter()
344 .filter(|image| is_dominant_image_text_candidate(image, page_bbox, text_chunks))
345 .collect();
346 if candidates.is_empty() {
347 return Vec::new();
348 }
349
350 let temp_dir = match create_temp_dir(page_number) {
351 Ok(dir) => dir,
352 Err(_) => return Vec::new(),
353 };
354
355 let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
356 Some(files) => files,
357 None => {
358 let _ = fs::remove_dir_all(&temp_dir);
359 return Vec::new();
360 }
361 };
362
363 let mut recovered = Vec::new();
364 for image in candidates {
365 let Some(image_index) = image.index else {
366 continue;
367 };
368 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
369 continue;
370 };
371 let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
372 continue;
373 };
374 if recover_bordered_raster_table_from_gray(&gray, image).is_some()
375 || is_obvious_bar_chart_raster(&gray)
376 || is_natural_photograph_raster(&gray)
377 || is_dark_ui_screenshot_raster(&gray)
378 {
379 continue;
380 }
381
382 let Some(words) = run_tesseract_tsv_words_best(&gray, &["11", "6"], |candidate| {
383 looks_like_dense_prose_image_ocr(candidate)
384 }) else {
385 continue;
386 };
387
388 recovered.extend(lines_from_ocr_words(
389 &words,
390 image,
391 gray.width(),
392 gray.height(),
393 text_chunks,
394 ));
395 }
396
397 let _ = fs::remove_dir_all(&temp_dir);
398 recovered
399}
400
401pub fn recover_raster_table_borders(
403 input_path: &Path,
404 page_bbox: &BoundingBox,
405 page_number: u32,
406 text_chunks: &[TextChunk],
407 image_chunks: &[ImageChunk],
408) -> Vec<TableBorder> {
409 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
410 return Vec::new();
411 }
412
413 let candidates: Vec<&ImageChunk> = image_chunks
414 .iter()
415 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
416 .collect();
417 if candidates.is_empty() {
418 return Vec::new();
419 }
420
421 let temp_dir = match create_temp_dir(page_number) {
422 Ok(dir) => dir,
423 Err(_) => return Vec::new(),
424 };
425
426 let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
427 Some(files) => files,
428 None => {
429 let _ = fs::remove_dir_all(&temp_dir);
430 return Vec::new();
431 }
432 };
433
434 let mut tables = Vec::new();
435 for image in candidates {
436 let Some(image_index) = image.index else {
437 continue;
438 };
439 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
440 continue;
441 };
442 let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
443 continue;
444 };
445 if is_obvious_bar_chart_raster(&gray)
446 || is_natural_photograph_raster(&gray)
447 || is_dark_ui_screenshot_raster(&gray)
448 {
449 continue;
450 }
451 if let Some(table) = recover_bordered_raster_table_from_gray(&gray, image) {
452 let chart_words = run_tesseract_tsv_words_best(&gray, &["6", "11"], |_| true);
453 if chart_words
454 .as_deref()
455 .is_some_and(looks_like_chart_label_ocr)
456 {
457 continue;
458 }
459 tables.push(table);
460 continue;
461 }
462 let Some(words) = run_tesseract_tsv_words_best(&gray, &["6", "11"], |candidate| {
463 looks_like_table_ocr(candidate)
464 }) else {
465 continue;
466 };
467
468 if looks_like_numeric_table_ocr(&words) {
469 if let Some(table) = build_numeric_table_border(&words, image) {
470 if is_matrixish_ocr_artifact_table(&table) {
471 continue;
472 }
473 tables.push(table);
474 continue;
475 }
476 }
477
478 if let Some(table) = build_structured_ocr_table_border(&words, image) {
479 if is_matrixish_ocr_artifact_table(&table) {
480 continue;
481 }
482 tables.push(table);
483 }
484 }
485
486 let _ = fs::remove_dir_all(&temp_dir);
487 tables
488}
489
490pub fn recover_page_raster_table_cell_text(
496 input_path: &Path,
497 page_bbox: &BoundingBox,
498 page_number: u32,
499 elements: &mut [ContentElement],
500) {
501 if page_bbox.area() <= 0.0 {
502 return;
503 }
504
505 let native_text_chars = page_native_text_chars(elements);
506
507 let candidate_indices: Vec<usize> = elements
508 .iter()
509 .enumerate()
510 .filter_map(|(idx, elem)| {
511 let table = table_candidate_ref(elem)?;
512 let local_text_chars = native_text_chars_in_region(elements, &table.bbox);
513 if !table_needs_page_raster_ocr(table) {
514 return None;
515 }
516 if native_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR
517 && local_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR
518 {
519 return None;
520 }
521 Some(idx)
522 })
523 .take(MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR)
524 .collect();
525 if candidate_indices.is_empty() {
526 return;
527 }
528
529 let coverage: f64 = candidate_indices
530 .iter()
531 .filter_map(|idx| table_candidate_ref(&elements[*idx]).map(|table| table.bbox.area()))
532 .sum::<f64>()
533 / page_bbox.area().max(1.0);
534 if coverage < MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR {
535 return;
536 }
537
538 let temp_dir = match create_temp_dir(page_number) {
539 Ok(dir) => dir,
540 Err(_) => return,
541 };
542 let prefix = temp_dir.join("page");
543 let status = Command::new("pdftoppm")
544 .arg("-png")
545 .arg("-f")
546 .arg(page_number.to_string())
547 .arg("-l")
548 .arg(page_number.to_string())
549 .arg("-singlefile")
550 .arg(input_path)
551 .arg(&prefix)
552 .status();
553 match status {
554 Ok(s) if s.success() => {}
555 _ => {
556 let _ = fs::remove_dir_all(&temp_dir);
557 return;
558 }
559 }
560
561 let page_image_path = prefix.with_extension("png");
562 let gray = match image::open(&page_image_path) {
563 Ok(img) => img.to_luma8(),
564 Err(_) => {
565 let _ = fs::remove_dir_all(&temp_dir);
566 return;
567 }
568 };
569
570 for idx in candidate_indices {
571 let Some(elem) = elements.get_mut(idx) else {
572 continue;
573 };
574 let Some(table) = table_candidate_mut(elem) else {
575 continue;
576 };
577 enrich_empty_table_from_page_raster(&gray, page_bbox, table);
578 }
579
580 let _ = fs::remove_dir_all(&temp_dir);
581}
582
583fn table_candidate_ref(elem: &ContentElement) -> Option<&TableBorder> {
584 match elem {
585 ContentElement::TableBorder(table) => Some(table),
586 ContentElement::Table(table) => Some(&table.table_border),
587 _ => None,
588 }
589}
590
591fn table_candidate_mut(elem: &mut ContentElement) -> Option<&mut TableBorder> {
592 match elem {
593 ContentElement::TableBorder(table) => Some(table),
594 ContentElement::Table(table) => Some(&mut table.table_border),
595 _ => None,
596 }
597}
598
599fn page_native_text_chars(elements: &[ContentElement]) -> usize {
600 native_text_chars_in_region(elements, &BoundingBox::new(None, f64::MIN, f64::MIN, f64::MAX, f64::MAX))
601}
602
603fn native_text_chars_in_region(elements: &[ContentElement], region: &BoundingBox) -> usize {
604 elements
605 .iter()
606 .filter(|elem| region.overlaps(elem.bbox()))
607 .map(|elem| match elem {
608 ContentElement::Paragraph(p) => p.base.value().chars().count(),
609 ContentElement::Heading(h) => h.base.base.value().chars().count(),
610 ContentElement::NumberHeading(h) => h.base.base.base.value().chars().count(),
611 ContentElement::TextBlock(tb) => tb.value().chars().count(),
612 ContentElement::TextLine(tl) => tl.value().chars().count(),
613 ContentElement::TextChunk(tc) => tc.value.chars().count(),
614 ContentElement::List(list) => list
615 .list_items
616 .iter()
617 .flat_map(|item| item.contents.iter())
618 .map(|content| match content {
619 ContentElement::Paragraph(p) => p.base.value().chars().count(),
620 ContentElement::TextBlock(tb) => tb.value().chars().count(),
621 ContentElement::TextLine(tl) => tl.value().chars().count(),
622 ContentElement::TextChunk(tc) => tc.value.chars().count(),
623 _ => 0,
624 })
625 .sum(),
626 _ => 0,
627 })
628 .sum()
629}
630
631fn recover_from_page_images(
632 input_path: &Path,
633 temp_dir: &Path,
634 page_number: u32,
635 candidates: Vec<&ImageChunk>,
636 text_chunks: &[TextChunk],
637) -> Vec<TextChunk> {
638 let image_files = match extract_visible_page_image_files(input_path, page_number, temp_dir) {
639 Some(files) => files,
640 None => return Vec::new(),
641 };
642 if image_files.is_empty() {
643 return Vec::new();
644 }
645
646 let mut recovered = Vec::new();
647 for image in candidates {
648 let Some(image_index) = image.index else {
649 continue;
650 };
651 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
652 continue;
653 };
654 let bordered_table = recover_bordered_raster_table(image_path, image);
655 if let Some(caption) = recover_bordered_raster_caption(image_path, image) {
656 recovered.push(caption);
657 }
658 if bordered_table.is_some() {
659 continue;
660 }
661 let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
662 continue;
663 };
664 let native_dpi = PDFTOPPM_DPI.to_string();
668 let Ok(tsv_output) = Command::new("tesseract")
669 .current_dir(temp_dir)
670 .arg(file_name)
671 .arg("stdout")
672 .arg("--dpi")
673 .arg(&native_dpi)
674 .arg("--psm")
675 .arg("6")
676 .arg("-c")
677 .arg("load_system_dawg=0")
678 .arg("-c")
679 .arg("load_freq_dawg=0")
680 .arg("tsv")
681 .output()
682 else {
683 continue;
684 };
685 if !tsv_output.status.success() {
686 continue;
687 }
688
689 let tsv = String::from_utf8_lossy(&tsv_output.stdout);
690 let words = parse_tesseract_tsv(&tsv);
691 if !looks_like_table_ocr(&words) {
692 continue;
693 }
694
695 recovered.extend(words_to_text_chunks(&words, image, text_chunks));
696 }
697
698 recovered
699}
700
701fn table_needs_page_raster_ocr(table: &TableBorder) -> bool {
702 if table.num_rows < 1 || table.num_columns < 2 {
703 return false;
704 }
705
706 let total_cells = table.rows.iter().map(|row| row.cells.len()).sum::<usize>();
707 if total_cells == 0 {
708 return false;
709 }
710
711 let text_cells = table_text_cell_count(table);
712 let text_cell_ratio = text_cells as f64 / total_cells as f64;
713 text_cells == 0 || text_cell_ratio < MIN_RASTER_TABLE_TEXT_CELL_RATIO
714}
715
716fn table_text_cell_count(table: &TableBorder) -> usize {
717 table
718 .rows
719 .iter()
720 .flat_map(|row| row.cells.iter())
721 .filter(|cell| cell_has_substantive_text(cell))
722 .count()
723}
724
725fn cell_has_substantive_text(cell: &TableBorderCell) -> bool {
726 let has_token_text = cell.content.iter().any(|token| {
727 matches!(token.token_type, TableTokenType::Text)
728 && token.base.value.chars().any(|ch| ch.is_alphanumeric())
729 });
730 if has_token_text {
731 return true;
732 }
733
734 cell.contents.iter().any(|elem| match elem {
735 ContentElement::Paragraph(p) => p.base.value().chars().any(|ch| ch.is_alphanumeric()),
736 ContentElement::Heading(h) => h.base.base.value().chars().any(|ch| ch.is_alphanumeric()),
737 ContentElement::NumberHeading(h) => h
738 .base
739 .base
740 .base
741 .value()
742 .chars()
743 .any(|ch| ch.is_alphanumeric()),
744 ContentElement::TextBlock(tb) => tb.value().chars().any(|ch| ch.is_alphanumeric()),
745 ContentElement::TextLine(tl) => tl.value().chars().any(|ch| ch.is_alphanumeric()),
746 ContentElement::TextChunk(tc) => tc.value.chars().any(|ch| ch.is_alphanumeric()),
747 _ => false,
748 })
749}
750
/// Fills the text-less cells of `table` using OCR on the rendered page `gray`.
///
/// Strategy: crop the table region (with padding) from the page raster, run a
/// single Tesseract pass over it, and distribute the recognised words to the
/// empty cells by the position of each word's centre. When the table region
/// cannot be cropped, or the pass yields nothing, each empty cell is OCR'd on
/// its own via `fill_cells_with_per_cell_ocr`. Crops classified as a bar
/// chart, photograph or dark UI screenshot, and OCR output that looks like
/// chart labels, leave the table untouched.
///
/// * `gray` - grayscale rendering of the whole page.
/// * `page_bbox` - page box used to map page coordinates to raster pixels.
/// * `table` - table whose cells are filled in place.
fn enrich_empty_table_from_page_raster(
    gray: &GrayImage,
    page_bbox: &BoundingBox,
    table: &mut TableBorder,
) {
    // Collect every cell without a text token, with its pixel box on the raster.
    let mut empty_cells: Vec<EmptyCellRaster> = Vec::new();
    for (row_idx, row) in table.rows.iter().enumerate() {
        for (cell_idx, cell) in row.cells.iter().enumerate() {
            if cell
                .content
                .iter()
                .any(|token| matches!(token.token_type, TableTokenType::Text))
            {
                continue;
            }
            // Cells that map to an empty pixel box are ignored.
            let Some((x1, y1, x2, y2)) = page_bbox_to_raster_box(gray, page_bbox, &cell.bbox)
            else {
                continue;
            };
            empty_cells.push(EmptyCellRaster {
                row_idx,
                cell_idx,
                x1,
                y1,
                x2,
                y2,
            });
        }
    }
    if empty_cells.is_empty() {
        return;
    }

    // Without a usable table box the whole-table pass is impossible.
    let Some((tx1, ty1, tx2, ty2)) = page_bbox_to_raster_box(gray, page_bbox, &table.bbox) else {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    };

    // Pad the table crop a little, staying inside the image.
    let pad = CELL_INSET_PX * 2;
    let crop_left = tx1.saturating_sub(pad);
    let crop_top = ty1.saturating_sub(pad);
    let crop_right = (tx2 + pad).min(gray.width());
    let crop_bottom = (ty2 + pad).min(gray.height());
    if crop_right <= crop_left || crop_bottom <= crop_top {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    }

    let crop_width = crop_right - crop_left;
    let crop_height = crop_bottom - crop_top;
    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    }

    let cropped = gray
        .view(crop_left, crop_top, crop_width, crop_height)
        .to_image();
    // Non-table imagery is left alone entirely (no per-cell fallback either).
    let is_bar_chart = is_obvious_bar_chart_raster(&cropped);
    let is_photo = is_natural_photograph_raster(&cropped);
    let is_ui = is_dark_ui_screenshot_raster(&cropped);
    if is_bar_chart || is_photo || is_ui {
        return;
    }
    // Add a white margin and upscale before OCR.
    let bordered = expand_white_border(&cropped, TABLE_RASTER_OCR_BORDER_PX);
    let scaled = image::imageops::resize(
        &bordered,
        bordered.width() * OCR_SCALE_FACTOR,
        bordered.height() * OCR_SCALE_FACTOR,
        image::imageops::FilterType::Lanczos3,
    );

    let Some(words) = run_tesseract_tsv_words(&scaled, "6") else {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    };
    if words.is_empty() {
        fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
        return;
    }
    let chart_like = looks_like_chart_label_ocr(&words);
    if chart_like {
        return;
    }

    // One bucket per empty cell; entries are (y, x, text) in page-raster pixels.
    let mut buckets: Vec<Vec<(u32, u32, String)>> = vec![Vec::new(); empty_cells.len()];
    let scale = f64::from(OCR_SCALE_FACTOR);
    let border = f64::from(TABLE_RASTER_OCR_BORDER_PX);

    for word in &words {
        // Word centre in the scaled, bordered OCR image.
        let cx_scaled = f64::from(word.left) + f64::from(word.width) / 2.0;
        let cy_scaled = f64::from(word.top) + f64::from(word.height) / 2.0;

        // Undo the upscaling and the white margin to get crop coordinates.
        let cx_crop = cx_scaled / scale - border;
        let cy_crop = cy_scaled / scale - border;
        if cx_crop < 0.0 || cy_crop < 0.0 {
            continue;
        }

        // Shift by the crop origin to get page-raster coordinates.
        let cx_page = match u32::try_from(cx_crop.round() as i64) {
            Ok(v) => crop_left.saturating_add(v),
            Err(_) => continue,
        };
        let cy_page = match u32::try_from(cy_crop.round() as i64) {
            Ok(v) => crop_top.saturating_add(v),
            Err(_) => continue,
        };

        // A word goes to the first empty cell containing its centre.
        for (idx, cell) in empty_cells.iter().enumerate() {
            if cx_page >= cell.x1 && cx_page < cell.x2 && cy_page >= cell.y1 && cy_page < cell.y2 {
                buckets[idx].push((cy_page, cx_page, word.text.clone()));
                break;
            }
        }
    }

    for (idx, cell) in empty_cells.iter().enumerate() {
        let Some(row) = table.rows.get_mut(cell.row_idx) else {
            continue;
        };
        let Some(target) = row.cells.get_mut(cell.cell_idx) else {
            continue;
        };
        if target
            .content
            .iter()
            .any(|token| matches!(token.token_type, TableTokenType::Text))
        {
            continue;
        }
        let mut parts = std::mem::take(&mut buckets[idx]);
        if parts.is_empty() {
            continue;
        }
        // Order words top-to-bottom, then left-to-right, by centre position.
        parts.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
        let raw = parts
            .into_iter()
            .map(|(_, _, t)| t)
            .collect::<Vec<_>>()
            .join(" ");
        let text = normalize_page_raster_cell_text(&target.bbox, raw);
        if text.is_empty() {
            continue;
        }
        // Synthesise a text token that spans the whole cell.
        target.content.push(TableToken {
            base: TextChunk {
                value: text,
                bbox: target.bbox.clone(),
                font_name: "OCR".to_string(),
                font_size: target.bbox.height().max(6.0),
                font_weight: 400.0,
                italic_angle: 0.0,
                font_color: "#000000".to_string(),
                contrast_ratio: 21.0,
                symbol_ends: Vec::new(),
                text_format: TextFormat::Normal,
                text_type: TextType::Regular,
                pdf_layer: PdfLayer::Content,
                ocg_visible: true,
                index: None,
                page_number: target.bbox.page_number,
                level: None,
                mcid: None,
            },
            token_type: TableTokenType::Text,
        });
    }
}
922
923fn fill_cells_with_per_cell_ocr(
924 gray: &GrayImage,
925 table: &mut TableBorder,
926 empty_cells: &[EmptyCellRaster],
927) {
928 for cell in empty_cells {
929 let Some(row) = table.rows.get_mut(cell.row_idx) else {
930 continue;
931 };
932 let Some(target) = row.cells.get_mut(cell.cell_idx) else {
933 continue;
934 };
935 if target
936 .content
937 .iter()
938 .any(|token| matches!(token.token_type, TableTokenType::Text))
939 {
940 continue;
941 }
942 let Some(text) =
943 extract_page_raster_cell_text(gray, &target.bbox, cell.x1, cell.y1, cell.x2, cell.y2)
944 else {
945 continue;
946 };
947 if text.is_empty() {
948 continue;
949 }
950 target.content.push(TableToken {
951 base: TextChunk {
952 value: text,
953 bbox: target.bbox.clone(),
954 font_name: "OCR".to_string(),
955 font_size: target.bbox.height().max(6.0),
956 font_weight: 400.0,
957 italic_angle: 0.0,
958 font_color: "#000000".to_string(),
959 contrast_ratio: 21.0,
960 symbol_ends: Vec::new(),
961 text_format: TextFormat::Normal,
962 text_type: TextType::Regular,
963 pdf_layer: PdfLayer::Content,
964 ocg_visible: true,
965 index: None,
966 page_number: target.bbox.page_number,
967 level: None,
968 mcid: None,
969 },
970 token_type: TableTokenType::Text,
971 });
972 }
973}
974
975fn page_bbox_to_raster_box(
976 gray: &GrayImage,
977 page_bbox: &BoundingBox,
978 bbox: &BoundingBox,
979) -> Option<(u32, u32, u32, u32)> {
980 if page_bbox.width() <= 0.0 || page_bbox.height() <= 0.0 {
981 return None;
982 }
983
984 let left = ((bbox.left_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
985 .clamp(0.0, f64::from(gray.width()));
986 let right = ((bbox.right_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
987 .clamp(0.0, f64::from(gray.width()));
988 let top = ((page_bbox.top_y - bbox.top_y) / page_bbox.height() * f64::from(gray.height()))
989 .clamp(0.0, f64::from(gray.height()));
990 let bottom = ((page_bbox.top_y - bbox.bottom_y) / page_bbox.height()
991 * f64::from(gray.height()))
992 .clamp(0.0, f64::from(gray.height()));
993
994 let x1 = left.floor() as u32;
995 let x2 = right.ceil() as u32;
996 let y1 = top.floor() as u32;
997 let y2 = bottom.ceil() as u32;
998 (x2 > x1 && y2 > y1).then_some((x1, y1, x2, y2))
999}
1000
1001fn extract_page_raster_cell_text(
1002 gray: &GrayImage,
1003 cell_bbox: &BoundingBox,
1004 x1: u32,
1005 y1: u32,
1006 x2: u32,
1007 y2: u32,
1008) -> Option<String> {
1009 let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
1010 let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
1011 let crop_left = x1 + inset_x;
1012 let crop_top = y1 + inset_y;
1013 let crop_width = x2.saturating_sub(x1 + inset_x * 2);
1014 let crop_height = y2.saturating_sub(y1 + inset_y * 2);
1015 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
1016 return Some(String::new());
1017 }
1018
1019 let cropped = gray
1020 .view(crop_left, crop_top, crop_width, crop_height)
1021 .to_image();
1022 let bordered = expand_white_border(&cropped, 12);
1023 let scaled = image::imageops::resize(
1024 &bordered,
1025 bordered.width() * OCR_SCALE_FACTOR,
1026 bordered.height() * OCR_SCALE_FACTOR,
1027 image::imageops::FilterType::Lanczos3,
1028 );
1029
1030 let aspect_ratio = cell_bbox.width() / cell_bbox.height();
1032 let is_vertical = aspect_ratio < 0.8;
1033
1034 let psm_modes: [&str; 5] = if is_vertical {
1044 ["7", "8", "6", "11", "13"]
1045 } else {
1046 ["6", "7", "8", "11", "13"]
1047 };
1048
1049 let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
1050 Some(normalize_page_raster_cell_text(cell_bbox, raw_text))
1051}
1052
1053fn normalize_page_raster_cell_text(cell_bbox: &BoundingBox, text: String) -> String {
1054 let normalized = text
1055 .replace('|', " ")
1056 .replace('—', "-")
1057 .replace(['“', '”'], "\"")
1058 .replace('’', "'")
1059 .split_whitespace()
1060 .collect::<Vec<_>>()
1061 .join(" ");
1062
1063 if normalized.is_empty() {
1064 return normalized;
1065 }
1066
1067 let narrow_cell = cell_bbox.width() <= cell_bbox.height() * 1.15;
1068 if narrow_cell && normalized.len() <= 3 && !normalized.chars().any(|ch| ch.is_ascii_digit()) {
1069 return String::new();
1070 }
1071
1072 normalized
1073}
1074
1075fn is_ocr_candidate(
1076 image: &ImageChunk,
1077 page_bbox: &BoundingBox,
1078 text_chunks: &[TextChunk],
1079) -> bool {
1080 let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1081 let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1082 if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
1083 return false;
1084 }
1085
1086 let overlapping_chunks: Vec<&TextChunk> = text_chunks
1087 .iter()
1088 .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1089 .collect();
1090 let native_text_chars: usize = overlapping_chunks
1091 .iter()
1092 .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1093 .sum();
1094
1095 native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
1096 || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
1097}
1098
1099fn is_dominant_image_text_candidate(
1100 image: &ImageChunk,
1101 page_bbox: &BoundingBox,
1102 text_chunks: &[TextChunk],
1103) -> bool {
1104 let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1105 let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1106 if width_ratio < MIN_DOMINANT_IMAGE_WIDTH_RATIO || area_ratio < MIN_DOMINANT_IMAGE_AREA_RATIO {
1107 return false;
1108 }
1109
1110 let native_text_chars: usize = text_chunks
1111 .iter()
1112 .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1113 .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1114 .sum();
1115
1116 native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE
1117}
1118
1119fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
1120 let mut words = Vec::new();
1121 for line in tsv.lines().skip(1) {
1122 let mut cols = line.splitn(12, '\t');
1123 let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1124 if level != 5 {
1125 continue;
1126 }
1127 let _page_num = cols.next();
1128 let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1129 let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1130 let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1131 let _word_num = cols.next();
1132 let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1133 let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1134 let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1135 let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1136 let confidence = cols
1137 .next()
1138 .and_then(|s| s.parse::<f64>().ok())
1139 .unwrap_or(-1.0);
1140 let text = cols.next().unwrap_or("").trim().to_string();
1141 if !(MIN_OCR_WORD_CONFIDENCE..=MAX_OCR_WORD_CONFIDENCE).contains(&confidence)
1142 || text.is_empty()
1143 || width == 0
1144 || height == 0
1145 || !text.chars().any(|ch| ch.is_alphanumeric())
1146 {
1147 continue;
1148 }
1149 words.push(OcrWord {
1150 line_key: (block_num, par_num, line_num),
1151 left,
1152 top,
1153 width,
1154 height,
1155 text,
1156 confidence,
1157 });
1158 }
1159 words
1160}
1161
1162fn looks_like_chart_label_ocr(words: &[OcrWord]) -> bool {
1163 if words.len() < 8 {
1164 return false;
1165 }
1166
1167 let min_left = words.iter().map(|word| word.left).min().unwrap_or(0);
1168 let min_top = words.iter().map(|word| word.top).min().unwrap_or(0);
1169 let max_right = words
1170 .iter()
1171 .map(|word| word.left.saturating_add(word.width))
1172 .max()
1173 .unwrap_or(0);
1174 let max_bottom = words
1175 .iter()
1176 .map(|word| word.top.saturating_add(word.height))
1177 .max()
1178 .unwrap_or(0);
1179 let image_width = max_right.saturating_sub(min_left);
1180 let image_height = max_bottom.saturating_sub(min_top);
1181 if image_width < 160 || image_height < 120 {
1182 return false;
1183 }
1184
1185 let width_f = f64::from(image_width);
1186 let height_f = f64::from(image_height);
1187 let outer_x = width_f * 0.18;
1188 let outer_y = height_f * 0.18;
1189 let inner_left = width_f * 0.22;
1190 let inner_right = width_f * 0.78;
1191 let inner_top = height_f * 0.22;
1192 let inner_bottom = height_f * 0.78;
1193
1194 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1195 let mut outer_words = 0usize;
1196 let mut inner_words = 0usize;
1197
1198 for word in words {
1199 by_line.entry(word.line_key).or_default().push(word);
1200
1201 let center_x = f64::from(word.left.saturating_sub(min_left)) + f64::from(word.width) / 2.0;
1202 let center_y = f64::from(word.top.saturating_sub(min_top)) + f64::from(word.height) / 2.0;
1203
1204 if center_x <= outer_x
1205 || center_x >= width_f - outer_x
1206 || center_y <= outer_y
1207 || center_y >= height_f - outer_y
1208 {
1209 outer_words += 1;
1210 }
1211
1212 if center_x >= inner_left
1213 && center_x <= inner_right
1214 && center_y >= inner_top
1215 && center_y <= inner_bottom
1216 {
1217 inner_words += 1;
1218 }
1219 }
1220
1221 if by_line.len() < 5 {
1222 return false;
1223 }
1224
1225 let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1226 let mut clusters: Vec<XCluster> = Vec::new();
1227 for line_words in by_line.values() {
1228 for word in line_words {
1229 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1230 if let Some(cluster) = clusters
1231 .iter_mut()
1232 .find(|cluster| (cluster.center - center).abs() <= tolerance)
1233 {
1234 cluster.center =
1235 (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1236 cluster.count += 1;
1237 cluster.lines.insert(word.line_key);
1238 } else {
1239 let mut lines = HashSet::new();
1240 lines.insert(word.line_key);
1241 clusters.push(XCluster {
1242 center,
1243 count: 1,
1244 lines,
1245 });
1246 }
1247 }
1248 }
1249
1250 let stable_centers: Vec<f64> = clusters
1251 .iter()
1252 .filter(|cluster| cluster.lines.len() >= 4 && cluster.count >= 4)
1253 .map(|cluster| cluster.center)
1254 .collect();
1255 let mut sorted_stable_centers = stable_centers.clone();
1256 sorted_stable_centers
1257 .sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1258 let max_stable_gap = sorted_stable_centers
1259 .windows(2)
1260 .map(|pair| pair[1] - pair[0])
1261 .fold(0.0, f64::max);
1262 let spans_full_table_width = stable_centers.len() >= 3
1263 && stable_centers
1264 .iter()
1265 .any(|center| *center - f64::from(min_left) <= width_f * 0.25)
1266 && stable_centers
1267 .iter()
1268 .any(|center| *center - f64::from(min_left) >= width_f * 0.75)
1269 && stable_centers.iter().any(|center| {
1270 let rel = *center - f64::from(min_left);
1271 rel >= inner_left && rel <= inner_right
1272 })
1273 && max_stable_gap <= width_f * 0.45;
1274 if spans_full_table_width {
1275 let table_like_lines = by_line
1276 .values()
1277 .filter(|line_words| {
1278 let mut seen = HashSet::<usize>::new();
1279 for word in *line_words {
1280 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1281 for (idx, stable_center) in stable_centers.iter().enumerate() {
1282 if (center - stable_center).abs() <= tolerance {
1283 seen.insert(idx);
1284 }
1285 }
1286 }
1287 seen.len() >= 3
1288 })
1289 .count();
1290 if table_like_lines >= 4 {
1291 return false;
1292 }
1293 }
1294
1295 let mut short_lines = 0usize;
1296 let mut peripheral_label_lines = 0usize;
1297 let mut wide_sentence_lines = 0usize;
1298 let mut axisish_numeric_lines = 0usize;
1299
1300 for line_words in by_line.values() {
1301 let line_left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
1302 let line_top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
1303 let line_right = line_words
1304 .iter()
1305 .map(|word| word.left.saturating_add(word.width))
1306 .max()
1307 .unwrap_or(0);
1308 let line_bottom = line_words
1309 .iter()
1310 .map(|word| word.top.saturating_add(word.height))
1311 .max()
1312 .unwrap_or(0);
1313 if line_right <= line_left || line_bottom <= line_top {
1314 continue;
1315 }
1316
1317 let word_count = line_words.len();
1318 let numeric_in_line = line_words
1319 .iter()
1320 .filter(|word| is_numeric_like(&word.text))
1321 .count();
1322 let line_width_ratio =
1323 f64::from(line_right.saturating_sub(line_left)) / f64::from(image_width.max(1));
1324 let touches_outer_band = f64::from(line_left.saturating_sub(min_left)) <= outer_x
1325 || f64::from(line_right.saturating_sub(min_left)) >= width_f - outer_x
1326 || f64::from(line_top.saturating_sub(min_top)) <= outer_y
1327 || f64::from(line_bottom.saturating_sub(min_top)) >= height_f - outer_y;
1328
1329 if word_count <= 3 {
1330 short_lines += 1;
1331 }
1332 if touches_outer_band && word_count <= 4 {
1333 peripheral_label_lines += 1;
1334 }
1335 if touches_outer_band && word_count <= 3 && numeric_in_line > 0 {
1336 axisish_numeric_lines += 1;
1337 }
1338 if word_count >= 4 && line_width_ratio >= 0.45 && numeric_in_line == 0 {
1339 wide_sentence_lines += 1;
1340 }
1341 }
1342
1343 let total_lines = by_line.len();
1344 let outer_dominant = outer_words * 10 >= words.len() * 5;
1345 let inner_sparse = inner_words * 10 <= words.len() * 5;
1346 let label_dominant = peripheral_label_lines * 10 >= total_lines * 6;
1347 let short_line_dominant = short_lines * 10 >= total_lines * 6;
1348 let axis_signal = axisish_numeric_lines >= 2;
1349
1350 outer_dominant
1351 && inner_sparse
1352 && label_dominant
1353 && short_line_dominant
1354 && axis_signal
1355 && wide_sentence_lines <= 2
1356}
1357
1358fn looks_like_matrix_formula_ocr(words: &[OcrWord]) -> bool {
1359 if words.len() < 6 {
1360 return false;
1361 }
1362
1363 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1364 for word in words {
1365 by_line.entry(word.line_key).or_default().push(word);
1366 }
1367
1368 if by_line.len() < 2 || by_line.len() > 4 {
1369 return false;
1370 }
1371
1372 let substantive_words = words
1373 .iter()
1374 .filter(|word| is_substantive_table_word(&word.text))
1375 .count();
1376 let short_formulaish_words = words
1377 .iter()
1378 .filter(|word| is_short_formulaish_word(&word.text))
1379 .count();
1380 let slash_words = words.iter().filter(|word| word.text.contains('/')).count();
1381 let equation_label_words = words
1382 .iter()
1383 .filter(|word| looks_like_equation_label_word(&word.text))
1384 .count();
1385 let dense_lines = by_line.values().filter(|line| line.len() >= 3).count();
1386 let short_lines = by_line
1387 .values()
1388 .filter(|line| line.iter().all(|word| is_short_formulaish_word(&word.text)))
1389 .count();
1390
1391 substantive_words == 0
1392 && dense_lines >= 2
1393 && short_lines * 10 >= by_line.len() * 7
1394 && short_formulaish_words * 10 >= words.len() * 7
1395 && (slash_words > 0 || equation_label_words >= 2)
1396}
1397
/// Reports whether `text` carries enough content to count as a real table word.
///
/// Only alphanumeric characters are considered (lower-cased first). A word is
/// substantive when any of these holds:
/// * it has at least four alphabetic characters,
/// * it is purely numeric with at least two ASCII digits, one of which is
///   not `0` or `1`,
/// * it has at least five alphanumeric characters, two or more of them alphabetic.
///
/// Lengths are measured in characters, not UTF-8 bytes, so accented or other
/// non-ASCII letters are not over-counted.
fn is_substantive_table_word(text: &str) -> bool {
    let mut char_count = 0usize;
    let mut alpha_count = 0usize;
    let mut digit_count = 0usize;
    let mut has_non_binary_digit = false;

    for ch in text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
    {
        char_count += 1;
        if ch.is_alphabetic() {
            alpha_count += 1;
        }
        if ch.is_ascii_digit() {
            digit_count += 1;
            // Strings made only of 0/1 are treated as matrix noise, not data.
            has_non_binary_digit |= !matches!(ch, '0' | '1');
        }
    }

    if char_count == 0 {
        return false;
    }

    alpha_count >= 4
        || (digit_count >= 2 && alpha_count == 0 && has_non_binary_digit)
        || (char_count >= 5 && alpha_count >= 2)
}
1418
/// Reports whether `text` is short enough to look like a formula or matrix token.
///
/// Counts the alphanumeric characters of `text` (after lower-casing). The word
/// is short when it has at most three of them, or at most four when `text`
/// also contains a `/`. A token with no alphanumeric characters counts as short.
///
/// The length is measured in characters rather than UTF-8 bytes, so non-ASCII
/// letters are not counted several times.
fn is_short_formulaish_word(text: &str) -> bool {
    let char_count = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase)
        .count();

    char_count <= 3 || (char_count <= 4 && text.contains('/'))
}
1431
/// Reports whether `text` looks like an equation label such as `A1` or `(B12)`.
///
/// Leading and trailing non-alphanumeric characters are stripped first. What
/// remains must be one ASCII uppercase letter followed by one to three ASCII
/// digits and nothing else.
fn looks_like_equation_label_word(text: &str) -> bool {
    let core = text.trim_matches(|ch: char| !ch.is_alphanumeric());
    let mut rest = core.chars();

    match rest.next() {
        Some(first) if first.is_ascii_uppercase() => {}
        _ => return false,
    }

    // Everything after the leading letter must be a run of 1..=3 ASCII digits.
    let digits = rest.as_str();
    (1..=3).contains(&digits.len()) && digits.bytes().all(|byte| byte.is_ascii_digit())
}
1445
1446fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
1447 if words.len() < 8 {
1448 return false;
1449 }
1450
1451 if looks_like_chart_label_ocr(words) {
1452 return false;
1453 }
1454
1455 if looks_like_matrix_formula_ocr(words) {
1456 return false;
1457 }
1458
1459 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1460 for word in words {
1461 by_line.entry(word.line_key).or_default().push(word);
1462 }
1463
1464 let mut qualifying_lines = Vec::new();
1465 let mut numeric_like_count = 0usize;
1466 let mut max_right = 0u32;
1467 for line_words in by_line.values_mut() {
1468 line_words.sort_by_key(|word| word.left);
1469 let numeric_words = line_words
1470 .iter()
1471 .filter(|word| is_numeric_like(&word.text))
1472 .count();
1473 numeric_like_count += numeric_words;
1474 if line_words.len() >= 3 || numeric_words >= 2 {
1475 max_right = max_right.max(
1476 line_words
1477 .iter()
1478 .map(|word| word.left.saturating_add(word.width))
1479 .max()
1480 .unwrap_or(0),
1481 );
1482 qualifying_lines.push(line_words.clone());
1483 }
1484 }
1485
1486 if qualifying_lines.len() < 2 {
1487 return false;
1488 }
1489
1490 let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1491 let mut clusters: Vec<XCluster> = Vec::new();
1492 for line in &qualifying_lines {
1493 for word in line {
1494 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1495 if let Some(cluster) = clusters
1496 .iter_mut()
1497 .find(|cluster| (cluster.center - center).abs() <= tolerance)
1498 {
1499 cluster.center =
1500 (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1501 cluster.count += 1;
1502 cluster.lines.insert(word.line_key);
1503 } else {
1504 let mut lines = HashSet::new();
1505 lines.insert(word.line_key);
1506 clusters.push(XCluster {
1507 center,
1508 count: 1,
1509 lines,
1510 });
1511 }
1512 }
1513 }
1514
1515 let repeated_clusters: Vec<&XCluster> = clusters
1516 .iter()
1517 .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
1518 .collect();
1519 if repeated_clusters.len() < 3 {
1520 return false;
1521 }
1522
1523 let repeated_centers: Vec<f64> = repeated_clusters
1524 .iter()
1525 .map(|cluster| cluster.center)
1526 .collect();
1527 let structured_lines = qualifying_lines
1528 .iter()
1529 .filter(|line| {
1530 let mut seen = HashSet::<usize>::new();
1531 for word in *line {
1532 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1533 for (idx, repeated_center) in repeated_centers.iter().enumerate() {
1534 if (center - repeated_center).abs() <= tolerance {
1535 seen.insert(idx);
1536 }
1537 }
1538 }
1539 seen.len() >= 3
1540 || (seen.len() >= 2
1541 && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
1542 })
1543 .count();
1544
1545 let alphabetic_words = words
1546 .iter()
1547 .filter(|word| word.text.chars().any(|ch| ch.is_alphabetic()))
1548 .count();
1549
1550 if numeric_like_count == 0
1554 && alphabetic_words * 10 >= words.len() * 9
1555 && repeated_clusters.len() <= 4
1556 {
1557 return false;
1558 }
1559
1560 structured_lines >= 3
1561 || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
1562}
1563
1564fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
1565 if !looks_like_table_ocr(words) {
1566 return false;
1567 }
1568
1569 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1570 for word in words {
1571 by_line.entry(word.line_key).or_default().push(word);
1572 }
1573
1574 let numeric_like_count = words
1575 .iter()
1576 .filter(|word| is_numeric_like(&word.text))
1577 .count();
1578 let numeric_lines = by_line
1579 .values()
1580 .filter(|line| {
1581 line.iter()
1582 .filter(|word| is_numeric_like(&word.text))
1583 .count()
1584 >= 2
1585 })
1586 .count();
1587
1588 numeric_like_count >= 12 && numeric_lines >= 3
1589}
1590
1591fn looks_like_dense_prose_image_ocr(words: &[OcrWord]) -> bool {
1592 if words.len() < MIN_DOMINANT_IMAGE_OCR_WORDS || looks_like_table_ocr(words) {
1593 return false;
1594 }
1595
1596 if looks_like_chart_label_ocr(words) {
1597 return false;
1598 }
1599
1600 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1601 let mut alphabetic_words = 0usize;
1602 let mut numeric_like_words = 0usize;
1603 for word in words {
1604 by_line.entry(word.line_key).or_default().push(word);
1605 if word.text.chars().any(|ch| ch.is_alphabetic()) {
1606 alphabetic_words += 1;
1607 }
1608 if is_numeric_like(&word.text) {
1609 numeric_like_words += 1;
1610 }
1611 }
1612
1613 if by_line.len() < MIN_DOMINANT_IMAGE_TEXT_LINES || alphabetic_words * 3 < words.len() * 2 {
1614 return false;
1615 }
1616 if numeric_like_words * 4 > words.len() {
1617 return false;
1618 }
1619
1620 let multiword_lines = by_line
1621 .values()
1622 .filter(|line| line.iter().filter(|word| word.text.len() >= 2).count() >= 3)
1623 .count();
1624 multiword_lines >= 4 && has_dense_prose_block_geometry(words)
1625}
1626
1627fn has_dense_prose_block_geometry(words: &[OcrWord]) -> bool {
1628 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1629 for word in words {
1630 by_line.entry(word.line_key).or_default().push(word);
1631 }
1632
1633 let mut spatial_lines = Vec::new();
1634 for line_words in by_line.values() {
1635 if line_words.len() < 3 {
1636 continue;
1637 }
1638
1639 let left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
1640 let right = line_words
1641 .iter()
1642 .map(|word| word.left.saturating_add(word.width))
1643 .max()
1644 .unwrap_or(0);
1645 let top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
1646 let bottom = line_words
1647 .iter()
1648 .map(|word| word.top.saturating_add(word.height))
1649 .max()
1650 .unwrap_or(0);
1651
1652 if right <= left || bottom <= top {
1653 continue;
1654 }
1655
1656 spatial_lines.push(SpatialOcrLine {
1657 left,
1658 top,
1659 right,
1660 bottom,
1661 text: String::new(),
1662 word_count: line_words.len(),
1663 line_count: 1,
1664 line_height_sum: bottom.saturating_sub(top).max(1),
1665 });
1666 }
1667
1668 spatial_lines.sort_by_key(|line| (line.top, line.left));
1669 if spatial_lines.len() < MIN_DENSE_PROSE_BLOCK_LINES {
1670 return false;
1671 }
1672
1673 let image_width = spatial_lines
1674 .iter()
1675 .map(|line| line.right)
1676 .max()
1677 .unwrap_or(0);
1678 if image_width == 0 {
1679 return false;
1680 }
1681
1682 let median_height = {
1683 let mut heights: Vec<u32> = spatial_lines
1684 .iter()
1685 .map(|line| line.bottom.saturating_sub(line.top).max(1))
1686 .collect();
1687 heights.sort_unstable();
1688 heights[heights.len() / 2]
1689 };
1690
1691 let mut best_line_count = 1usize;
1692 let mut best_left = spatial_lines[0].left;
1693 let mut best_right = spatial_lines[0].right;
1694 let mut current_line_count = 1usize;
1695 let mut current_left = spatial_lines[0].left;
1696 let mut current_right = spatial_lines[0].right;
1697
1698 for pair in spatial_lines.windows(2) {
1699 let prev = &pair[0];
1700 let curr = &pair[1];
1701 if spatial_lines_share_block_geometry(prev, curr, image_width, median_height) {
1702 current_line_count += 1;
1703 current_left = current_left.min(curr.left);
1704 current_right = current_right.max(curr.right);
1705 } else {
1706 if current_line_count > best_line_count {
1707 best_line_count = current_line_count;
1708 best_left = current_left;
1709 best_right = current_right;
1710 }
1711 current_line_count = 1;
1712 current_left = curr.left;
1713 current_right = curr.right;
1714 }
1715 }
1716
1717 if current_line_count > best_line_count {
1718 best_line_count = current_line_count;
1719 best_left = current_left;
1720 best_right = current_right;
1721 }
1722
1723 let block_width_ratio =
1724 f64::from(best_right.saturating_sub(best_left)) / f64::from(image_width);
1725 best_line_count >= MIN_DENSE_PROSE_BLOCK_LINES
1726 && block_width_ratio >= MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO
1727}
1728
1729fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
1730 let image_width = words
1731 .iter()
1732 .map(|word| word.left.saturating_add(word.width))
1733 .max()?;
1734 let image_height = words
1735 .iter()
1736 .map(|word| word.top.saturating_add(word.height))
1737 .max()?;
1738 if image_width == 0 || image_height == 0 {
1739 return None;
1740 }
1741
1742 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1743 for word in words {
1744 by_line.entry(word.line_key).or_default().push(word);
1745 }
1746
1747 let max_right = words
1748 .iter()
1749 .map(|word| word.left.saturating_add(word.width))
1750 .max()
1751 .unwrap_or(0);
1752 let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1753
1754 let mut clusters: Vec<XCluster> = Vec::new();
1755 for line_words in by_line.values() {
1756 for word in line_words {
1757 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1758 if let Some(cluster) = clusters
1759 .iter_mut()
1760 .find(|cluster| (cluster.center - center).abs() <= tolerance)
1761 {
1762 cluster.center =
1763 (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
1764 cluster.count += 1;
1765 cluster.lines.insert(word.line_key);
1766 } else {
1767 let mut lines = HashSet::new();
1768 lines.insert(word.line_key);
1769 clusters.push(XCluster {
1770 center,
1771 count: 1,
1772 lines,
1773 });
1774 }
1775 }
1776 }
1777 let mut centers: Vec<f64> = clusters
1778 .into_iter()
1779 .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
1780 .map(|cluster| cluster.center)
1781 .collect();
1782 centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1783 if centers.len() < 3 {
1784 return None;
1785 }
1786
1787 let mut built_rows = Vec::<OcrRowBuild>::new();
1788 let mut row_fill_counts = Vec::<usize>::new();
1789 for line_words in by_line.values() {
1790 let mut sorted_words = line_words.clone();
1791 sorted_words.sort_by_key(|word| word.left);
1792
1793 let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
1794 for word in &sorted_words {
1795 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1796 if let Some((col_idx, distance)) = centers
1797 .iter()
1798 .enumerate()
1799 .map(|(idx, col_center)| (idx, (center - col_center).abs()))
1800 .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
1801 {
1802 if distance <= tolerance {
1803 cells[col_idx].push(word);
1804 }
1805 }
1806 }
1807
1808 let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
1809 let numeric_cells = cells
1810 .iter()
1811 .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
1812 .count();
1813 if filled_cells < 3 && numeric_cells < 2 {
1814 continue;
1815 }
1816 row_fill_counts.push(filled_cells);
1817
1818 let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
1819 let bottom_px = sorted_words
1820 .iter()
1821 .map(|word| word.top.saturating_add(word.height))
1822 .max()
1823 .unwrap_or(0);
1824 let top_y =
1825 image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
1826 let bottom_y = image.bbox.top_y
1827 - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
1828 let cell_texts = cells
1829 .iter()
1830 .map(|cell_words| {
1831 cell_words
1832 .iter()
1833 .map(|word| word.text.as_str())
1834 .collect::<Vec<_>>()
1835 .join(" ")
1836 })
1837 .collect();
1838 built_rows.push(OcrRowBuild {
1839 top_y,
1840 bottom_y,
1841 cell_texts,
1842 });
1843 }
1844
1845 if built_rows.len() < 2 {
1846 return None;
1847 }
1848 if row_fill_counts.is_empty() {
1849 return None;
1850 }
1851
1852 let mut sorted_fill_counts = row_fill_counts.clone();
1853 sorted_fill_counts.sort_unstable();
1854 let median_fill_ratio =
1855 sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
1856 if median_fill_ratio < MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO {
1857 return None;
1858 }
1859
1860 built_rows.sort_by(|a, b| {
1861 b.top_y
1862 .partial_cmp(&a.top_y)
1863 .unwrap_or(std::cmp::Ordering::Equal)
1864 });
1865 let x_coordinates = build_boundaries_from_centers(
1866 ¢ers,
1867 image.bbox.left_x,
1868 image.bbox.right_x,
1869 image_width,
1870 );
1871 let row_bounds: Vec<(f64, f64)> = built_rows
1872 .iter()
1873 .map(|row| (row.top_y, row.bottom_y))
1874 .collect();
1875 let y_coordinates = build_row_boundaries(&row_bounds);
1876 if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
1877 return None;
1878 }
1879
1880 let mut rows = Vec::new();
1881 for (row_idx, row_build) in built_rows.iter().enumerate() {
1882 let row_bbox = BoundingBox::new(
1883 image.bbox.page_number,
1884 image.bbox.left_x,
1885 y_coordinates[row_idx + 1],
1886 image.bbox.right_x,
1887 y_coordinates[row_idx],
1888 );
1889 let mut cells = Vec::new();
1890 for col_idx in 0..centers.len() {
1891 let cell_bbox = BoundingBox::new(
1892 image.bbox.page_number,
1893 x_coordinates[col_idx],
1894 y_coordinates[row_idx + 1],
1895 x_coordinates[col_idx + 1],
1896 y_coordinates[row_idx],
1897 );
1898 let text = row_build
1899 .cell_texts
1900 .get(col_idx)
1901 .cloned()
1902 .unwrap_or_default();
1903 let mut content = Vec::new();
1904 if !text.trim().is_empty() {
1905 content.push(TableToken {
1906 base: TextChunk {
1907 value: text.trim().to_string(),
1908 bbox: cell_bbox.clone(),
1909 font_name: "OCR".to_string(),
1910 font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
1911 font_weight: 400.0,
1912 italic_angle: 0.0,
1913 font_color: "#000000".to_string(),
1914 contrast_ratio: 21.0,
1915 symbol_ends: Vec::new(),
1916 text_format: TextFormat::Normal,
1917 text_type: TextType::Regular,
1918 pdf_layer: PdfLayer::Content,
1919 ocg_visible: true,
1920 index: None,
1921 page_number: image.bbox.page_number,
1922 level: None,
1923 mcid: None,
1924 },
1925 token_type: TableTokenType::Text,
1926 });
1927 }
1928 cells.push(TableBorderCell {
1929 bbox: cell_bbox,
1930 index: None,
1931 level: None,
1932 row_number: row_idx,
1933 col_number: col_idx,
1934 row_span: 1,
1935 col_span: 1,
1936 content,
1937 contents: Vec::new(),
1938 semantic_type: None,
1939 });
1940 }
1941 rows.push(TableBorderRow {
1942 bbox: row_bbox,
1943 index: None,
1944 level: None,
1945 row_number: row_idx,
1946 cells,
1947 semantic_type: None,
1948 });
1949 }
1950
1951 Some(TableBorder {
1952 bbox: image.bbox.clone(),
1953 index: None,
1954 level: None,
1955 x_coordinates: x_coordinates.clone(),
1956 x_widths: vec![0.0; x_coordinates.len()],
1957 y_coordinates: y_coordinates.clone(),
1958 y_widths: vec![0.0; y_coordinates.len()],
1959 rows,
1960 num_rows: built_rows.len(),
1961 num_columns: centers.len(),
1962 is_bad_table: false,
1963 is_table_transformer: true,
1964 previous_table: None,
1965 next_table: None,
1966 })
1967}
1968
1969fn build_structured_ocr_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
1970 let image_width = words
1971 .iter()
1972 .map(|word| word.left.saturating_add(word.width))
1973 .max()?;
1974 let image_height = words
1975 .iter()
1976 .map(|word| word.top.saturating_add(word.height))
1977 .max()?;
1978 if image_width == 0 || image_height == 0 {
1979 return None;
1980 }
1981
1982 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1983 for word in words {
1984 by_line.entry(word.line_key).or_default().push(word);
1985 }
1986
1987 let max_right = words
1988 .iter()
1989 .map(|word| word.left.saturating_add(word.width))
1990 .max()
1991 .unwrap_or(0);
1992 let tolerance = (f64::from(max_right) * 0.035).max(18.0);
1993
1994 let mut clusters: Vec<XCluster> = Vec::new();
1995 for line_words in by_line.values() {
1996 for word in line_words {
1997 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
1998 if let Some(cluster) = clusters
1999 .iter_mut()
2000 .find(|cluster| (cluster.center - center).abs() <= tolerance)
2001 {
2002 cluster.center =
2003 (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
2004 cluster.count += 1;
2005 cluster.lines.insert(word.line_key);
2006 } else {
2007 let mut lines = HashSet::new();
2008 lines.insert(word.line_key);
2009 clusters.push(XCluster {
2010 center,
2011 count: 1,
2012 lines,
2013 });
2014 }
2015 }
2016 }
2017
2018 let mut centers: Vec<f64> = clusters
2019 .into_iter()
2020 .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
2021 .map(|cluster| cluster.center)
2022 .collect();
2023 centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
2024 if centers.len() < 3 {
2025 return None;
2026 }
2027
2028 let mut built_rows = Vec::<OcrRowBuild>::new();
2029 let mut row_fill_counts = Vec::<usize>::new();
2030 let mut occupied_columns = vec![0usize; centers.len()];
2031
2032 for line_words in by_line.values() {
2033 let mut sorted_words = line_words.clone();
2034 sorted_words.sort_by_key(|word| word.left);
2035
2036 let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
2037 for word in &sorted_words {
2038 let center = f64::from(word.left) + f64::from(word.width) / 2.0;
2039 if let Some((col_idx, distance)) = centers
2040 .iter()
2041 .enumerate()
2042 .map(|(idx, col_center)| (idx, (center - col_center).abs()))
2043 .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
2044 {
2045 if distance <= tolerance {
2046 cells[col_idx].push(word);
2047 }
2048 }
2049 }
2050
2051 let filled_indices: Vec<usize> = cells
2052 .iter()
2053 .enumerate()
2054 .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
2055 .collect();
2056 if filled_indices.len() < 2 {
2057 continue;
2058 }
2059
2060 let span = filled_indices.last().unwrap_or(&0) - filled_indices.first().unwrap_or(&0) + 1;
2061 if filled_indices.len() < 3 && span < 3 {
2062 continue;
2063 }
2064
2065 row_fill_counts.push(filled_indices.len());
2066 for idx in &filled_indices {
2067 if let Some(count) = occupied_columns.get_mut(*idx) {
2068 *count += 1;
2069 }
2070 }
2071
2072 let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
2073 let bottom_px = sorted_words
2074 .iter()
2075 .map(|word| word.top.saturating_add(word.height))
2076 .max()
2077 .unwrap_or(0);
2078 let top_y =
2079 image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
2080 let bottom_y = image.bbox.top_y
2081 - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
2082 let cell_texts = cells
2083 .iter()
2084 .map(|cell_words| {
2085 let mut sorted_cell_words = cell_words.clone();
2086 sorted_cell_words.sort_by_key(|word| word.left);
2087 sorted_cell_words
2088 .iter()
2089 .map(|word| word.text.as_str())
2090 .collect::<Vec<_>>()
2091 .join(" ")
2092 })
2093 .collect();
2094 built_rows.push(OcrRowBuild {
2095 top_y,
2096 bottom_y,
2097 cell_texts,
2098 });
2099 }
2100
2101 if built_rows.len() < 3 || row_fill_counts.is_empty() {
2102 return None;
2103 }
2104
2105 let repeated_columns = occupied_columns.iter().filter(|count| **count >= 2).count();
2106 if repeated_columns < 3 {
2107 return None;
2108 }
2109
2110 let mut sorted_fill_counts = row_fill_counts.clone();
2111 sorted_fill_counts.sort_unstable();
2112 let median_fill_ratio =
2113 sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
2114 if median_fill_ratio < 0.5 {
2115 return None;
2116 }
2117
2118 built_rows.sort_by(|a, b| {
2119 b.top_y
2120 .partial_cmp(&a.top_y)
2121 .unwrap_or(std::cmp::Ordering::Equal)
2122 });
2123 let x_coordinates = build_boundaries_from_centers(
2124 ¢ers,
2125 image.bbox.left_x,
2126 image.bbox.right_x,
2127 image_width,
2128 );
2129 let row_bounds: Vec<(f64, f64)> = built_rows
2130 .iter()
2131 .map(|row| (row.top_y, row.bottom_y))
2132 .collect();
2133 let y_coordinates = build_row_boundaries(&row_bounds);
2134 if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
2135 return None;
2136 }
2137
2138 let mut rows = Vec::new();
2139 for (row_idx, row_build) in built_rows.iter().enumerate() {
2140 let row_bbox = BoundingBox::new(
2141 image.bbox.page_number,
2142 image.bbox.left_x,
2143 y_coordinates[row_idx + 1],
2144 image.bbox.right_x,
2145 y_coordinates[row_idx],
2146 );
2147 let mut cells = Vec::new();
2148 for col_idx in 0..centers.len() {
2149 let cell_bbox = BoundingBox::new(
2150 image.bbox.page_number,
2151 x_coordinates[col_idx],
2152 y_coordinates[row_idx + 1],
2153 x_coordinates[col_idx + 1],
2154 y_coordinates[row_idx],
2155 );
2156 let text = row_build
2157 .cell_texts
2158 .get(col_idx)
2159 .cloned()
2160 .unwrap_or_default();
2161 let mut content = Vec::new();
2162 if !text.trim().is_empty() {
2163 content.push(TableToken {
2164 base: TextChunk {
2165 value: text.trim().to_string(),
2166 bbox: cell_bbox.clone(),
2167 font_name: "OCR".to_string(),
2168 font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
2169 font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
2170 italic_angle: 0.0,
2171 font_color: "#000000".to_string(),
2172 contrast_ratio: 21.0,
2173 symbol_ends: Vec::new(),
2174 text_format: TextFormat::Normal,
2175 text_type: TextType::Regular,
2176 pdf_layer: PdfLayer::Content,
2177 ocg_visible: true,
2178 index: None,
2179 page_number: image.bbox.page_number,
2180 level: None,
2181 mcid: None,
2182 },
2183 token_type: TableTokenType::Text,
2184 });
2185 }
2186 cells.push(TableBorderCell {
2187 bbox: cell_bbox,
2188 index: None,
2189 level: None,
2190 row_number: row_idx,
2191 col_number: col_idx,
2192 row_span: 1,
2193 col_span: 1,
2194 content,
2195 contents: Vec::new(),
2196 semantic_type: None,
2197 });
2198 }
2199 rows.push(TableBorderRow {
2200 bbox: row_bbox,
2201 index: None,
2202 level: None,
2203 row_number: row_idx,
2204 cells,
2205 semantic_type: None,
2206 });
2207 }
2208
2209 Some(TableBorder {
2210 bbox: image.bbox.clone(),
2211 index: None,
2212 level: None,
2213 x_coordinates: x_coordinates.clone(),
2214 x_widths: vec![0.0; x_coordinates.len()],
2215 y_coordinates: y_coordinates.clone(),
2216 y_widths: vec![0.0; y_coordinates.len()],
2217 rows,
2218 num_rows: built_rows.len(),
2219 num_columns: centers.len(),
2220 is_bad_table: false,
2221 is_table_transformer: true,
2222 previous_table: None,
2223 next_table: None,
2224 })
2225}
2226
2227fn is_matrixish_ocr_artifact_table(table: &TableBorder) -> bool {
2228 if !table.is_table_transformer
2229 || table.num_rows < 2
2230 || table.num_rows > 4
2231 || table.num_columns < 3
2232 || table.bbox.height() > table.bbox.width() * 0.55
2233 {
2234 return false;
2235 }
2236
2237 let texts: Vec<String> = table
2238 .rows
2239 .iter()
2240 .flat_map(|row| row.cells.iter())
2241 .map(table_cell_text)
2242 .filter(|text| !text.is_empty())
2243 .collect();
2244 if texts.len() < 6 {
2245 return false;
2246 }
2247
2248 let substantive_cells = texts
2249 .iter()
2250 .filter(|text| is_substantive_ocr_cell_text(text))
2251 .count();
2252 let short_cells = texts
2253 .iter()
2254 .filter(|text| is_short_ocr_cell_text(text))
2255 .count();
2256 let ambiguous_cells = texts
2257 .iter()
2258 .filter(|text| is_ambiguous_matrix_cell_text(text))
2259 .count();
2260
2261 substantive_cells == 0
2262 && short_cells * 10 >= texts.len() * 8
2263 && ambiguous_cells * 10 >= texts.len() * 5
2264}
2265
2266fn table_cell_text(cell: &TableBorderCell) -> String {
2267 cell.content
2268 .iter()
2269 .map(|token| token.base.value.trim())
2270 .filter(|value| !value.is_empty())
2271 .collect::<Vec<_>>()
2272 .join(" ")
2273}
2274
2275fn is_substantive_ocr_cell_text(text: &str) -> bool {
2276 text.split_whitespace().any(is_substantive_table_word)
2277}
2278
/// Returns `true` when `text` contains between one and four alphanumeric
/// characters (punctuation and whitespace are ignored).
///
/// The count is taken in Unicode scalar values rather than UTF-8 bytes, so
/// short non-ASCII cells such as `"été"` are classified the same way as their
/// ASCII counterparts.
fn is_short_ocr_cell_text(text: &str) -> bool {
    // Looking at five characters is enough to decide "more than four".
    let alphanumeric_chars = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .take(5)
        .count();
    (1..=4).contains(&alphanumeric_chars)
}
2287
/// Returns `true` when `text` looks like a glyph-level OCR ambiguity.
///
/// A cell is ambiguous when it contains any slash, backslash, `=`, `|`,
/// bracket, brace or parenthesis, or when its lowercased alphanumeric
/// characters number between one and four and are all drawn from
/// `0 1 o d q i l`.
fn is_ambiguous_matrix_cell_text(text: &str) -> bool {
    const STRUCTURAL_MARKS: [char; 10] = ['/', '\\', '=', '|', '[', ']', '{', '}', '(', ')'];
    if text.chars().any(|ch| STRUCTURAL_MARKS.contains(&ch)) {
        return true;
    }

    let mut glyphs = 0usize;
    let lowered = text
        .chars()
        .filter(|ch| ch.is_alphanumeric())
        .flat_map(char::to_lowercase);
    for ch in lowered {
        if !matches!(ch, '0' | '1' | 'o' | 'd' | 'q' | 'i' | 'l') {
            return false;
        }
        glyphs += 1;
    }
    // Every accepted glyph is ASCII, so this count equals the byte length.
    (1..=4).contains(&glyphs)
}
2304
2305fn recover_bordered_raster_caption(image_path: &Path, image: &ImageChunk) -> Option<TextChunk> {
2306 let gray = image::open(image_path).ok()?.to_luma8();
2307 recover_bordered_raster_caption_from_gray(&gray, image)
2308}
2309
2310fn recover_bordered_raster_caption_from_gray(
2311 gray: &GrayImage,
2312 image: &ImageChunk,
2313) -> Option<TextChunk> {
2314 let grid = detect_bordered_raster_grid(gray)?;
2315 let first_h = *grid.horizontal_lines.first()?;
2316 if first_h <= 2 {
2317 return None;
2318 }
2319
2320 let crop = gray.view(0, 0, gray.width(), first_h).to_image();
2321 let caption_text = normalize_caption_text(&run_tesseract_plain_text(&crop, "7")?);
2322 if caption_text.is_empty() || !caption_text.chars().any(|ch| ch.is_alphabetic()) {
2323 return None;
2324 }
2325
2326 let bbox = raster_box_to_page_bbox(
2327 image,
2328 0,
2329 0,
2330 gray.width(),
2331 first_h.max(1),
2332 gray.width().max(1),
2333 gray.height().max(1),
2334 )?;
2335 let font_size = (bbox.height() * 0.55).clamp(10.0, 16.0);
2336 Some(TextChunk {
2337 value: caption_text,
2338 bbox,
2339 font_name: "OCR".to_string(),
2340 font_size,
2341 font_weight: 700.0,
2342 italic_angle: 0.0,
2343 font_color: "#000000".to_string(),
2344 contrast_ratio: 21.0,
2345 symbol_ends: Vec::new(),
2346 text_format: TextFormat::Normal,
2347 text_type: TextType::Regular,
2348 pdf_layer: PdfLayer::Content,
2349 ocg_visible: true,
2350 index: None,
2351 page_number: image.bbox.page_number,
2352 level: None,
2353 mcid: None,
2354 })
2355}
2356
2357fn recover_bordered_raster_table(image_path: &Path, image: &ImageChunk) -> Option<TableBorder> {
2358 let gray = image::open(image_path).ok()?.to_luma8();
2359 recover_bordered_raster_table_from_gray(&gray, image)
2360}
2361
2362fn recover_bordered_raster_table_from_gray(
2363 gray: &GrayImage,
2364 image: &ImageChunk,
2365) -> Option<TableBorder> {
2366 let grid = detect_bordered_raster_grid(gray)?;
2367 let num_cols = grid.vertical_lines.len().checked_sub(1)?;
2368 let num_rows = grid.horizontal_lines.len().checked_sub(1)?;
2369 if num_cols < 2 || num_rows < 2 {
2370 return None;
2371 }
2372 let table_bbox = raster_box_to_page_bbox(
2373 image,
2374 *grid.vertical_lines.first()?,
2375 *grid.horizontal_lines.first()?,
2376 *grid.vertical_lines.last()?,
2377 *grid.horizontal_lines.last()?,
2378 gray.width(),
2379 gray.height(),
2380 )?;
2381
2382 let x_coordinates = raster_boundaries_to_page(
2383 &grid.vertical_lines,
2384 image.bbox.left_x,
2385 image.bbox.right_x,
2386 gray.width(),
2387 )?;
2388 let y_coordinates = raster_boundaries_to_page_desc(
2389 &grid.horizontal_lines,
2390 image.bbox.bottom_y,
2391 image.bbox.top_y,
2392 gray.height(),
2393 )?;
2394
2395 if !bordered_grid_has_cell_ink(gray, &grid) {
2396 return None;
2397 }
2398
2399 let mut rows = Vec::with_capacity(num_rows);
2400 let mut non_empty_cells = 0usize;
2401 let mut rows_with_text = 0usize;
2402 let mut total_cells = 0usize;
2403 let mut whole_table_buckets =
2404 collect_bordered_table_ocr_buckets(gray, &grid, num_rows, num_cols)
2405 .unwrap_or_else(|| vec![Vec::new(); num_rows * num_cols]);
2406 let allow_per_cell_fallback =
2407 num_rows.saturating_mul(num_cols) <= MAX_BORDERED_TABLE_PER_CELL_FALLBACK_CELLS;
2408 for row_idx in 0..num_rows {
2409 let row_bbox = BoundingBox::new(
2410 image.bbox.page_number,
2411 image.bbox.left_x,
2412 y_coordinates[row_idx + 1],
2413 image.bbox.right_x,
2414 y_coordinates[row_idx],
2415 );
2416 let mut cells = Vec::with_capacity(num_cols);
2417 let mut row_has_text = false;
2418
2419 for col_idx in 0..num_cols {
2420 let x1 = grid.vertical_lines[col_idx];
2421 let x2 = grid.vertical_lines[col_idx + 1];
2422 let y1 = grid.horizontal_lines[row_idx];
2423 let y2 = grid.horizontal_lines[row_idx + 1];
2424 let cell_bbox = BoundingBox::new(
2425 image.bbox.page_number,
2426 x_coordinates[col_idx],
2427 y_coordinates[row_idx + 1],
2428 x_coordinates[col_idx + 1],
2429 y_coordinates[row_idx],
2430 );
2431 let bucket_idx = row_idx * num_cols + col_idx;
2432 let text = if let Some(parts) = whole_table_buckets.get_mut(bucket_idx) {
2433 if parts.is_empty() {
2434 String::new()
2435 } else {
2436 parts.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
2437 let raw = parts
2438 .iter()
2439 .map(|(_, _, text)| text.as_str())
2440 .collect::<Vec<_>>()
2441 .join(" ");
2442 normalize_raster_cell_text(row_idx, col_idx, raw)
2443 }
2444 } else {
2445 String::new()
2446 };
2447 let text = if text.is_empty() && allow_per_cell_fallback {
2448 extract_raster_cell_text(gray, row_idx, col_idx, x1, y1, x2, y2).unwrap_or_default()
2449 } else {
2450 text
2451 };
2452 total_cells += 1;
2453
2454 let mut content = Vec::new();
2455 if !text.is_empty() {
2456 row_has_text = true;
2457 non_empty_cells += 1;
2458 content.push(TableToken {
2459 base: TextChunk {
2460 value: text,
2461 bbox: cell_bbox.clone(),
2462 font_name: "OCR".to_string(),
2463 font_size: (cell_bbox.height() * 0.55).max(6.0),
2464 font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
2465 italic_angle: 0.0,
2466 font_color: "#000000".to_string(),
2467 contrast_ratio: 21.0,
2468 symbol_ends: Vec::new(),
2469 text_format: TextFormat::Normal,
2470 text_type: TextType::Regular,
2471 pdf_layer: PdfLayer::Content,
2472 ocg_visible: true,
2473 index: None,
2474 page_number: image.bbox.page_number,
2475 level: None,
2476 mcid: None,
2477 },
2478 token_type: TableTokenType::Text,
2479 });
2480 }
2481
2482 cells.push(TableBorderCell {
2483 bbox: cell_bbox,
2484 index: None,
2485 level: None,
2486 row_number: row_idx,
2487 col_number: col_idx,
2488 row_span: 1,
2489 col_span: 1,
2490 content,
2491 contents: Vec::new(),
2492 semantic_type: None,
2493 });
2494 }
2495
2496 if row_has_text {
2497 rows_with_text += 1;
2498 }
2499
2500 rows.push(TableBorderRow {
2501 bbox: row_bbox,
2502 index: None,
2503 level: None,
2504 row_number: row_idx,
2505 cells,
2506 semantic_type: None,
2507 });
2508 }
2509
2510 if total_cells == 0 {
2511 return None;
2512 }
2513 let text_cell_ratio = non_empty_cells as f64 / total_cells as f64;
2514 if text_cell_ratio < MIN_RASTER_TABLE_TEXT_CELL_RATIO
2515 || rows_with_text < MIN_RASTER_TABLE_ROWS_WITH_TEXT
2516 {
2517 return None;
2518 }
2519
2520 Some(TableBorder {
2521 bbox: table_bbox,
2522 index: None,
2523 level: None,
2524 x_coordinates: x_coordinates.clone(),
2525 x_widths: vec![0.0; x_coordinates.len()],
2526 y_coordinates: y_coordinates.clone(),
2527 y_widths: vec![0.0; y_coordinates.len()],
2528 rows,
2529 num_rows,
2530 num_columns: num_cols,
2531 is_bad_table: false,
2532 is_table_transformer: true,
2533 previous_table: None,
2534 next_table: None,
2535 })
2536}
2537
2538fn collect_bordered_table_ocr_buckets(
2539 gray: &GrayImage,
2540 grid: &RasterTableGrid,
2541 num_rows: usize,
2542 num_cols: usize,
2543) -> Option<Vec<Vec<(u32, u32, String)>>> {
2544 if num_rows == 0 || num_cols == 0 {
2545 return None;
2546 }
2547
2548 let bordered = expand_white_border(gray, TABLE_RASTER_OCR_BORDER_PX);
2549 let scaled = image::imageops::resize(
2550 &bordered,
2551 bordered.width() * OCR_SCALE_FACTOR,
2552 bordered.height() * OCR_SCALE_FACTOR,
2553 image::imageops::FilterType::Lanczos3,
2554 );
2555 let words = run_tesseract_tsv_words_best(&scaled, &["6", "11"], |_| true)?;
2556 if words.is_empty() || looks_like_chart_label_ocr(&words) {
2557 return None;
2558 }
2559
2560 let mut buckets = vec![Vec::new(); num_rows * num_cols];
2561 let scale = f64::from(OCR_SCALE_FACTOR);
2562 let border = f64::from(TABLE_RASTER_OCR_BORDER_PX);
2563
2564 for word in words {
2565 let cx_scaled = f64::from(word.left) + f64::from(word.width) / 2.0;
2566 let cy_scaled = f64::from(word.top) + f64::from(word.height) / 2.0;
2567
2568 let cx = cx_scaled / scale - border;
2569 let cy = cy_scaled / scale - border;
2570 if cx < 0.0 || cy < 0.0 {
2571 continue;
2572 }
2573
2574 let cx = match u32::try_from(cx.round() as i64) {
2575 Ok(value) => value,
2576 Err(_) => continue,
2577 };
2578 let cy = match u32::try_from(cy.round() as i64) {
2579 Ok(value) => value,
2580 Err(_) => continue,
2581 };
2582
2583 let col_idx = grid
2584 .vertical_lines
2585 .windows(2)
2586 .position(|span| cx >= span[0] && cx < span[1]);
2587 let row_idx = grid
2588 .horizontal_lines
2589 .windows(2)
2590 .position(|span| cy >= span[0] && cy < span[1]);
2591 let (Some(row_idx), Some(col_idx)) = (row_idx, col_idx) else {
2592 continue;
2593 };
2594
2595 buckets[row_idx * num_cols + col_idx].push((cy, cx, word.text));
2596 }
2597
2598 Some(buckets)
2599}
2600
2601fn is_obvious_bar_chart_raster(gray: &GrayImage) -> bool {
2602 let width = gray.width();
2603 let height = gray.height();
2604 if width < 160 || height < 120 {
2605 return false;
2606 }
2607
2608 let min_ink_pixels = (f64::from(width) * 0.35).ceil() as u32;
2609 let min_run_height = (height / 80).max(6);
2610 let wide_ink_row_runs = merge_runs(
2611 (0..height)
2612 .filter(|&y| count_ink_in_row(gray, y, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels),
2613 );
2614 let thick_runs = wide_ink_row_runs
2615 .into_iter()
2616 .filter(|(start, end)| end.saturating_sub(*start) + 1 >= min_run_height)
2617 .count();
2618
2619 thick_runs >= 3 || is_obvious_vertical_bar_chart_raster(gray)
2620}
2621
2622fn is_obvious_vertical_bar_chart_raster(gray: &GrayImage) -> bool {
2623 let width = gray.width();
2624 let height = gray.height();
2625 if width < 160 || height < 120 {
2626 return false;
2627 }
2628
2629 let min_ink_pixels = (f64::from(height) * 0.08).ceil() as u32;
2630 let min_bar_width = (width / 28).max(10);
2631 let min_bar_height = (height / 8).max(16);
2632 let max_baseline_delta = (height / 14).max(8);
2633 let min_fill_ratio = 0.10;
2634
2635 let candidate_runs =
2636 merge_runs((0..width).filter(|&x| {
2637 count_ink_in_column(gray, x, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels
2638 }));
2639 let mut baselines = Vec::new();
2640 let mut has_dominant_bar = false;
2641 let mut qualifying_bars = 0usize;
2642
2643 for (start, end) in candidate_runs {
2644 let run_width = end.saturating_sub(start) + 1;
2645 if run_width < min_bar_width {
2646 continue;
2647 }
2648
2649 let mut top = height;
2650 let mut bottom = 0u32;
2651 let mut ink_pixels = 0usize;
2652 for x in start..=end {
2653 for y in 0..height {
2654 if gray.get_pixel(x, y).0[0] < RASTER_CHART_INK_THRESHOLD {
2655 top = top.min(y);
2656 bottom = bottom.max(y);
2657 ink_pixels += 1;
2658 }
2659 }
2660 }
2661
2662 if top >= height || bottom <= top {
2663 continue;
2664 }
2665
2666 let run_height = bottom.saturating_sub(top) + 1;
2667 if run_height < min_bar_height {
2668 continue;
2669 }
2670
2671 let bbox_area = run_width as usize * run_height as usize;
2672 if bbox_area == 0 {
2673 continue;
2674 }
2675
2676 let fill_ratio = ink_pixels as f64 / bbox_area as f64;
2677 if fill_ratio < min_fill_ratio {
2678 continue;
2679 }
2680
2681 qualifying_bars += 1;
2682 if run_width >= min_bar_width.saturating_mul(2) {
2683 has_dominant_bar = true;
2684 }
2685 baselines.push(bottom);
2686 }
2687
2688 if baselines.len() < 2 {
2689 return false;
2690 }
2691
2692 baselines.sort_unstable();
2693 let median_baseline = baselines[baselines.len() / 2];
2694 let aligned_baselines = baselines
2695 .iter()
2696 .filter(|baseline| baseline.abs_diff(median_baseline) <= max_baseline_delta)
2697 .count();
2698
2699 aligned_baselines >= 2 && (has_dominant_bar || (qualifying_bars >= 4 && aligned_baselines >= 4))
2700}
2701
2702fn is_natural_photograph_raster(gray: &GrayImage) -> bool {
2715 let total = (gray.width() * gray.height()) as usize;
2716 if total < 400 {
2717 return false;
2718 }
2719
2720 let mut histogram = [0usize; 256];
2721 for pixel in gray.pixels() {
2722 histogram[pixel[0] as usize] += 1;
2723 }
2724
2725 let mid_tone_count: usize = histogram[40..=215].iter().sum();
2726 if mid_tone_count * 10 >= total * 3 {
2727 return true;
2728 }
2729
2730 let mut coarse_histogram = [0usize; 16];
2731 for (value, count) in histogram.iter().enumerate() {
2732 coarse_histogram[value / 16] += count;
2733 }
2734
2735 let occupied_bins = coarse_histogram
2736 .iter()
2737 .filter(|count| **count as f64 >= total as f64 * 0.01)
2738 .count();
2739 let entropy = coarse_histogram.iter().fold(0.0, |acc, count| {
2740 if *count == 0 {
2741 return acc;
2742 }
2743 let probability = *count as f64 / total as f64;
2744 acc - probability * probability.log2()
2745 });
2746
2747 mid_tone_count as f64 / total as f64 >= MIN_BRIGHT_PHOTO_MID_TONE_RATIO
2748 && occupied_bins >= MIN_BRIGHT_PHOTO_HISTOGRAM_BINS
2749 && entropy >= MIN_BRIGHT_PHOTO_ENTROPY
2750}
2751
2752fn is_dark_ui_screenshot_raster(gray: &GrayImage) -> bool {
2755 let total = (gray.width() * gray.height()) as usize;
2756 if total < 400 {
2757 return false;
2758 }
2759
2760 let very_dark_count = gray.pixels().filter(|p| p[0] <= 39).count();
2761 let non_extreme_count = gray.pixels().filter(|p| p[0] >= 15 && p[0] <= 240).count();
2762 let bright_detail_count = gray.pixels().filter(|p| p[0] >= 180 && p[0] <= 245).count();
2763
2764 very_dark_count * 20 >= total * 13
2765 && non_extreme_count * 2 >= total
2766 && bright_detail_count * 20 >= total
2767}
2768
2769fn bordered_grid_has_cell_ink(gray: &GrayImage, grid: &RasterTableGrid) -> bool {
2770 let num_cols = match grid.vertical_lines.len().checked_sub(1) {
2771 Some(value) => value,
2772 None => return false,
2773 };
2774 let num_rows = match grid.horizontal_lines.len().checked_sub(1) {
2775 Some(value) => value,
2776 None => return false,
2777 };
2778 if num_cols == 0 || num_rows == 0 {
2779 return false;
2780 }
2781
2782 let mut total_cells = 0usize;
2783 let mut inked_cells = 0usize;
2784 let mut rows_with_ink = 0usize;
2785
2786 for row_idx in 0..num_rows {
2787 let mut row_has_ink = false;
2788 for col_idx in 0..num_cols {
2789 total_cells += 1;
2790 let x1 = grid.vertical_lines[col_idx];
2791 let x2 = grid.vertical_lines[col_idx + 1];
2792 let y1 = grid.horizontal_lines[row_idx];
2793 let y2 = grid.horizontal_lines[row_idx + 1];
2794
2795 let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
2796 let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
2797 let crop_left = x1 + inset_x;
2798 let crop_top = y1 + inset_y;
2799 let crop_width = x2.saturating_sub(x1 + inset_x * 2);
2800 let crop_height = y2.saturating_sub(y1 + inset_y * 2);
2801 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
2802 continue;
2803 }
2804
2805 let dark_pixels = (crop_top..crop_top + crop_height)
2806 .flat_map(|y| (crop_left..crop_left + crop_width).map(move |x| (x, y)))
2807 .filter(|&(x, y)| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
2808 .count();
2809 let area = (crop_width as usize) * (crop_height as usize);
2810 if area == 0 {
2811 continue;
2812 }
2813
2814 let dark_ratio = dark_pixels as f64 / area as f64;
2815 if dark_ratio >= MIN_BORDERED_CELL_DARK_RATIO {
2816 inked_cells += 1;
2817 row_has_ink = true;
2818 }
2819 }
2820 if row_has_ink {
2821 rows_with_ink += 1;
2822 }
2823 }
2824
2825 if total_cells == 0 {
2826 return false;
2827 }
2828
2829 (inked_cells as f64 / total_cells as f64) >= MIN_BORDERED_INKED_CELL_RATIO
2830 && rows_with_ink >= MIN_BORDERED_ROWS_WITH_INK
2831}
2832
2833fn detect_bordered_raster_grid(gray: &GrayImage) -> Option<RasterTableGrid> {
2834 let mut best_grid: Option<(RasterTableGrid, f64)> = None;
2835 for variant in build_ocr_variants(gray) {
2836 let Some((grid, score)) = detect_bordered_raster_grid_single(&variant) else {
2837 continue;
2838 };
2839 match &best_grid {
2840 Some((_, best_score)) if *best_score >= score => {}
2841 _ => best_grid = Some((grid, score)),
2842 }
2843 }
2844 best_grid.map(|(grid, _)| grid)
2845}
2846
2847fn detect_bordered_raster_grid_single(gray: &GrayImage) -> Option<(RasterTableGrid, f64)> {
2848 let width = gray.width();
2849 let height = gray.height();
2850 if width < 100 || height < 80 {
2851 return None;
2852 }
2853
2854 let min_vertical_dark = (f64::from(height) * MIN_LINE_DARK_RATIO).ceil() as u32;
2855 let min_horizontal_dark = (f64::from(width) * MIN_LINE_DARK_RATIO).ceil() as u32;
2856
2857 let vertical_runs =
2858 merge_runs((0..width).filter(|&x| count_dark_in_column(gray, x) >= min_vertical_dark));
2859 let horizontal_runs =
2860 merge_runs((0..height).filter(|&y| count_dark_in_row(gray, y) >= min_horizontal_dark));
2861 if vertical_runs.len() < MIN_BORDERED_VERTICAL_LINES
2862 || horizontal_runs.len() < MIN_BORDERED_HORIZONTAL_LINES
2863 {
2864 return None;
2865 }
2866
2867 let mut vertical_lines: Vec<u32> = vertical_runs
2868 .into_iter()
2869 .map(|(start, end)| (start + end) / 2)
2870 .collect();
2871 let mut horizontal_lines: Vec<u32> = horizontal_runs
2872 .into_iter()
2873 .map(|(start, end)| (start + end) / 2)
2874 .collect();
2875
2876 let (&rough_min_x, &rough_max_x) = vertical_lines.first().zip(vertical_lines.last())?;
2877 let (&rough_min_y, &rough_max_y) = horizontal_lines.first().zip(horizontal_lines.last())?;
2878 if rough_max_x <= rough_min_x || rough_max_y <= rough_min_y {
2879 return None;
2880 }
2881
2882 vertical_lines.retain(|&x| {
2883 dark_ratio_in_column(gray, x, rough_min_y, rough_max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY
2884 });
2885 horizontal_lines.retain(|&y| {
2886 dark_ratio_in_row(gray, y, rough_min_x, rough_max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY
2887 });
2888 if vertical_lines.len() < MIN_BORDERED_VERTICAL_LINES
2889 || horizontal_lines.len() < MIN_BORDERED_HORIZONTAL_LINES
2890 {
2891 return None;
2892 }
2893
2894 if vertical_lines
2895 .windows(2)
2896 .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
2897 || horizontal_lines
2898 .windows(2)
2899 .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
2900 {
2901 return None;
2902 }
2903 if !grid_lines_are_continuous(&vertical_lines, &horizontal_lines, gray) {
2904 return None;
2905 }
2906
2907 let continuity = grid_continuity_score(&vertical_lines, &horizontal_lines, gray);
2908 let line_score = vertical_lines.len() as f64 + horizontal_lines.len() as f64;
2909 let score = continuity * 100.0 + line_score;
2910
2911 Some((
2912 RasterTableGrid {
2913 vertical_lines,
2914 horizontal_lines,
2915 },
2916 score,
2917 ))
2918}
2919
2920fn grid_lines_are_continuous(
2921 vertical_lines: &[u32],
2922 horizontal_lines: &[u32],
2923 gray: &GrayImage,
2924) -> bool {
2925 let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2926 return false;
2927 };
2928 let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2929 return false;
2930 };
2931 if max_x <= min_x || max_y <= min_y {
2932 return false;
2933 }
2934
2935 vertical_lines
2936 .iter()
2937 .all(|&x| dark_ratio_in_column(gray, x, min_y, max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2938 && horizontal_lines
2939 .iter()
2940 .all(|&y| dark_ratio_in_row(gray, y, min_x, max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2941}
2942
2943fn grid_continuity_score(
2944 vertical_lines: &[u32],
2945 horizontal_lines: &[u32],
2946 gray: &GrayImage,
2947) -> f64 {
2948 let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2949 return 0.0;
2950 };
2951 let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2952 return 0.0;
2953 };
2954 if max_x <= min_x || max_y <= min_y {
2955 return 0.0;
2956 }
2957
2958 let mut samples = 0usize;
2959 let mut sum = 0.0;
2960 for &x in vertical_lines {
2961 sum += dark_ratio_in_column(gray, x, min_y, max_y);
2962 samples += 1;
2963 }
2964 for &y in horizontal_lines {
2965 sum += dark_ratio_in_row(gray, y, min_x, max_x);
2966 samples += 1;
2967 }
2968 if samples == 0 {
2969 0.0
2970 } else {
2971 sum / samples as f64
2972 }
2973}
2974
2975fn count_dark_in_column(gray: &GrayImage, x: u32) -> u32 {
2976 count_ink_in_column(gray, x, RASTER_DARK_THRESHOLD)
2977}
2978
2979fn count_ink_in_column(gray: &GrayImage, x: u32, threshold: u8) -> u32 {
2980 (0..gray.height())
2981 .filter(|&y| gray.get_pixel(x, y).0[0] < threshold)
2982 .count() as u32
2983}
2984
2985fn count_dark_in_row(gray: &GrayImage, y: u32) -> u32 {
2986 count_ink_in_row(gray, y, RASTER_DARK_THRESHOLD)
2987}
2988
2989fn count_ink_in_row(gray: &GrayImage, y: u32, threshold: u8) -> u32 {
2990 (0..gray.width())
2991 .filter(|&x| gray.get_pixel(x, y).0[0] < threshold)
2992 .count() as u32
2993}
2994
2995fn dark_ratio_in_column(gray: &GrayImage, x: u32, y1: u32, y2: u32) -> f64 {
2996 if y2 <= y1 || x >= gray.width() {
2997 return 0.0;
2998 }
2999 let dark = (y1..=y2)
3000 .filter(|&y| y < gray.height() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
3001 .count();
3002 dark as f64 / f64::from(y2 - y1 + 1)
3003}
3004
3005fn dark_ratio_in_row(gray: &GrayImage, y: u32, x1: u32, x2: u32) -> f64 {
3006 if x2 <= x1 || y >= gray.height() {
3007 return 0.0;
3008 }
3009 let dark = (x1..=x2)
3010 .filter(|&x| x < gray.width() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
3011 .count();
3012 dark as f64 / f64::from(x2 - x1 + 1)
3013}
3014
/// Groups a sequence of values into inclusive `(start, end)` runs of
/// consecutive integers.
///
/// A value extends the current run only when it is exactly one greater than
/// the run's last value; any other value (including a repeat) closes the run
/// and starts a new one. An empty input yields an empty vector.
///
/// The successor is computed with `checked_add`, so a run ending at
/// `u32::MAX` is closed cleanly instead of overflowing.
fn merge_runs(values: impl Iterator<Item = u32>) -> Vec<(u32, u32)> {
    let mut runs = Vec::new();
    let mut current: Option<(u32, u32)> = None;

    for value in values {
        current = match current {
            Some((start, end)) if end.checked_add(1) == Some(value) => Some((start, value)),
            Some(finished) => {
                runs.push(finished);
                Some((value, value))
            }
            None => Some((value, value)),
        };
    }

    // Flush the run still open when the input ends, if any.
    runs.extend(current);
    runs
}
3041
/// Derives column boundaries in page coordinates from column centers given in
/// raster pixels.
///
/// The result starts with `left_edge` and ends with `right_edge`; between
/// them sits one boundary per adjacent pair of centers, placed at the pair's
/// midpoint (clamped to the raster width), scaled onto the page and clamped
/// so the sequence never decreases or passes `right_edge`.
///
/// With fewer than two centers, a zero `image_width`, or
/// `right_edge <= left_edge`, only `[left_edge, max(right_edge, left_edge)]`
/// is returned.
fn build_boundaries_from_centers(
    centers: &[f64],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Vec<f64> {
    let degenerate = centers.len() < 2 || image_width == 0 || right_edge <= left_edge;
    if degenerate {
        return vec![left_edge, right_edge.max(left_edge)];
    }

    let raster_width = f64::from(image_width);
    let page_width = right_edge - left_edge;

    let mut boundaries = Vec::with_capacity(centers.len() + 1);
    boundaries.push(left_edge);

    let mut floor = left_edge;
    boundaries.extend(centers.windows(2).map(|pair| {
        let midpoint_px = ((pair[0] + pair[1]) / 2.0).clamp(0.0, raster_width);
        let on_page = left_edge + midpoint_px / raster_width * page_width;
        floor = on_page.clamp(floor, right_edge);
        floor
    }));

    boundaries.push(right_edge);
    boundaries
}
3068
/// Builds row boundaries from per-row `(first, second)` extents.
///
/// The result holds the first component of the first row, then the midpoint
/// between each row's second component and the next row's first component,
/// then the second component of the last row, giving `rows.len() + 1` values.
/// (The components are presumably the top and bottom y of each row; confirm
/// against the caller.)
///
/// An empty `rows` slice yields an empty vector instead of panicking.
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let (Some(first), Some(last)) = (rows.first(), rows.last()) else {
        return Vec::new();
    };

    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    boundaries.extend(
        rows.windows(2)
            .map(|pair| (pair[0].1 + pair[1].0) / 2.0),
    );
    boundaries.push(last.1);
    boundaries
}
3078
/// Maps raster x positions to page coordinates by linear scaling: pixel 0
/// maps to `left_edge` and pixel `image_width` maps to `right_edge`.
///
/// Returns `None` when `image_width` is zero.
fn raster_boundaries_to_page(
    lines: &[u32],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Option<Vec<f64>> {
    if image_width == 0 {
        return None;
    }

    let page_units_per_px = (right_edge - left_edge) / f64::from(image_width);
    let mut mapped = Vec::with_capacity(lines.len());
    for &line in lines {
        mapped.push(left_edge + f64::from(line) * page_units_per_px);
    }
    Some(mapped)
}
3096
/// Maps raster y positions to page coordinates with the axis flipped: pixel
/// row 0 maps to `top_edge` and pixel row `image_height` maps to
/// `bottom_edge`.
///
/// Returns `None` when `image_height` is zero.
fn raster_boundaries_to_page_desc(
    lines: &[u32],
    bottom_edge: f64,
    top_edge: f64,
    image_height: u32,
) -> Option<Vec<f64>> {
    if image_height == 0 {
        return None;
    }

    let raster_height = f64::from(image_height);
    let page_height = top_edge - bottom_edge;
    let mut mapped = Vec::with_capacity(lines.len());
    for &line in lines {
        mapped.push(top_edge - f64::from(line) / raster_height * page_height);
    }
    Some(mapped)
}
3114
3115fn raster_box_to_page_bbox(
3116 image: &ImageChunk,
3117 x1: u32,
3118 y1: u32,
3119 x2: u32,
3120 y2: u32,
3121 image_width: u32,
3122 image_height: u32,
3123) -> Option<BoundingBox> {
3124 if x2 <= x1 || y2 <= y1 || image_width == 0 || image_height == 0 {
3125 return None;
3126 }
3127 let left_x = image.bbox.left_x + image.bbox.width() * (f64::from(x1) / f64::from(image_width));
3128 let right_x = image.bbox.left_x + image.bbox.width() * (f64::from(x2) / f64::from(image_width));
3129 let top_y = image.bbox.top_y - image.bbox.height() * (f64::from(y1) / f64::from(image_height));
3130 let bottom_y =
3131 image.bbox.top_y - image.bbox.height() * (f64::from(y2) / f64::from(image_height));
3132 Some(BoundingBox::new(
3133 image.bbox.page_number,
3134 left_x,
3135 bottom_y,
3136 right_x,
3137 top_y,
3138 ))
3139}
3140
3141fn extract_raster_cell_text(
3142 gray: &GrayImage,
3143 row_idx: usize,
3144 col_idx: usize,
3145 x1: u32,
3146 y1: u32,
3147 x2: u32,
3148 y2: u32,
3149) -> Option<String> {
3150 let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
3151 let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
3152 let crop_left = x1 + inset_x;
3153 let crop_top = y1 + inset_y;
3154 let crop_width = x2.saturating_sub(x1 + inset_x * 2);
3155 let crop_height = y2.saturating_sub(y1 + inset_y * 2);
3156 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
3157 return Some(String::new());
3158 }
3159
3160 let cropped = gray
3161 .view(crop_left, crop_top, crop_width, crop_height)
3162 .to_image();
3163 let bordered = expand_white_border(&cropped, 12);
3164 let scaled = image::imageops::resize(
3165 &bordered,
3166 bordered.width() * OCR_SCALE_FACTOR,
3167 bordered.height() * OCR_SCALE_FACTOR,
3168 image::imageops::FilterType::Lanczos3,
3169 );
3170 let psm_modes: [&str; 3] = if row_idx == 0 {
3171 ["6", "11", "7"]
3172 } else {
3173 ["7", "6", "11"]
3174 };
3175 let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
3176 Some(normalize_raster_cell_text(row_idx, col_idx, raw_text))
3177}
3178
3179fn expand_white_border(image: &GrayImage, border: u32) -> GrayImage {
3180 let mut expanded = GrayImage::from_pixel(
3181 image.width() + border * 2,
3182 image.height() + border * 2,
3183 Luma([255]),
3184 );
3185 for y in 0..image.height() {
3186 for x in 0..image.width() {
3187 expanded.put_pixel(x + border, y + border, *image.get_pixel(x, y));
3188 }
3189 }
3190 expanded
3191}
3192
3193fn run_tesseract_tsv_words(image: &GrayImage, psm: &str) -> Option<Vec<OcrWord>> {
3194 match selected_ocr_engine() {
3195 OcrEngine::RapidOcr => run_rapidocr_words(image),
3196 OcrEngine::Tesseract => run_tesseract_tsv_words_with_oem(image, psm, "3"),
3197 }
3198}
3199
3200fn run_tesseract_tsv_words_with_oem(
3201 image: &GrayImage,
3202 psm: &str,
3203 oem: &str,
3204) -> Option<Vec<OcrWord>> {
3205 let temp_dir = create_temp_dir(0).ok()?;
3206 let image_path = temp_dir.join("ocr.png");
3207 if image.save(&image_path).is_err() {
3208 let _ = fs::remove_dir_all(&temp_dir);
3209 return None;
3210 }
3211
3212 let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3213 let output = Command::new("tesseract")
3214 .current_dir(&temp_dir)
3215 .arg("ocr.png")
3216 .arg("stdout")
3217 .arg("--dpi")
3220 .arg(&dpi)
3221 .arg("--oem")
3222 .arg(oem)
3223 .arg("--psm")
3224 .arg(psm)
3225 .arg("-c")
3229 .arg("load_system_dawg=0")
3230 .arg("-c")
3231 .arg("load_freq_dawg=0")
3232 .arg("tsv")
3233 .output()
3234 .ok()?;
3235 let _ = fs::remove_dir_all(&temp_dir);
3236 if !output.status.success() {
3237 return None;
3238 }
3239
3240 let tsv = String::from_utf8_lossy(&output.stdout);
3241 Some(parse_tesseract_tsv(&tsv))
3242}
3243
3244fn run_tesseract_cell_text_best(image: &GrayImage, psm_modes: &[&str]) -> Option<String> {
3245 let mut best: Option<(String, f64)> = None;
3246
3247 if matches!(selected_ocr_engine(), OcrEngine::Tesseract) {
3248 let consensus_words = collect_consensus_words(image, psm_modes);
3250 if !consensus_words.is_empty() {
3251 let text = words_to_plain_line_text(&consensus_words);
3252 if !text.is_empty() {
3253 let score = score_ocr_words(&consensus_words, image.width(), image.height());
3254 best = Some((text, score));
3255 }
3256 }
3257 }
3258
3259 if best.is_none() {
3261 for variant in build_ocr_variants(image) {
3262 for psm in psm_modes {
3263 let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
3264 continue;
3265 };
3266 if words.is_empty() {
3267 continue;
3268 }
3269 let text = words_to_plain_line_text(&words);
3270 if text.is_empty() {
3271 continue;
3272 }
3273 let score = score_ocr_words(&words, variant.width(), variant.height());
3274 match &best {
3275 Some((_, best_score)) if *best_score >= score => {}
3276 _ => best = Some((text, score)),
3277 }
3278
3279 if let Some(text) = run_tesseract_plain_text_with_variant(&variant, psm) {
3280 let norm_len = normalize_text(&text).len() as f64;
3281 if norm_len > 0.0 {
3282 match &best {
3283 Some((_, best_score)) if *best_score >= norm_len => {}
3284 _ => best = Some((text, norm_len)),
3285 }
3286 }
3287 }
3288 }
3289
3290 if let Some(words) = run_rapidocr_words(&variant) {
3295 let text = words_to_plain_line_text(&words);
3296 if !text.is_empty() {
3297 let score = score_ocr_words(&words, variant.width(), variant.height());
3298 match &best {
3299 Some((_, best_score)) if *best_score >= score => {}
3300 _ => best = Some((text, score)),
3301 }
3302 }
3303 }
3304 }
3305 }
3306
3307 best.map(|(text, _)| text)
3308}
3309
3310fn collect_consensus_words(image: &GrayImage, psm_modes: &[&str]) -> Vec<OcrWord> {
3311 let variants = build_ocr_variants(image);
3312
3313 let oems = ["1", "3"]; let mut perspective_best: HashMap<(String, String, String), OcrWord> = HashMap::new();
3328
3329 for variant in &variants {
3330 for psm in psm_modes {
3331 for oem in oems {
3332 let Some(words) = run_tesseract_tsv_words_with_oem(variant, psm, oem) else {
3333 continue;
3334 };
3335 for word in words {
3336 let key = (psm.to_string(), oem.to_string(), word.text.to_lowercase());
3337 perspective_best
3338 .entry(key)
3339 .and_modify(|best| {
3340 if word.confidence > best.confidence {
3341 *best = word.clone();
3342 }
3343 })
3344 .or_insert(word);
3345 }
3346 }
3347 }
3348 }
3349
3350 const MIN_PERSPECTIVES: usize = 2;
3353
3354 let mut text_to_perspectives: HashMap<String, HashSet<(String, String)>> = HashMap::new();
3355 for (psm, oem, norm_text) in perspective_best.keys() {
3356 text_to_perspectives
3357 .entry(norm_text.clone())
3358 .or_default()
3359 .insert((psm.clone(), oem.clone()));
3360 }
3361
3362 let mut consensus: Vec<OcrWord> = text_to_perspectives
3364 .iter()
3365 .filter(|(_, perspectives)| perspectives.len() >= MIN_PERSPECTIVES)
3366 .filter_map(|(norm_text, _)| {
3367 perspective_best
3368 .iter()
3369 .filter(|((_, _, t), _)| t == norm_text)
3370 .max_by(|(_, a), (_, b)| {
3371 a.confidence
3372 .partial_cmp(&b.confidence)
3373 .unwrap_or(std::cmp::Ordering::Equal)
3374 })
3375 .map(|(_, w)| w.clone())
3376 })
3377 .collect();
3378
3379 consensus.sort_by_key(|w| (w.top, w.left));
3380 consensus
3381}
3382
3383fn filter_words_by_spatial_coherence(words: &[OcrWord]) -> Vec<OcrWord> {
3384 if words.len() <= 1 {
3385 return words.to_vec();
3386 }
3387
3388 let median_h: u32 = {
3395 let mut heights: Vec<u32> = words.iter().map(|w| w.height.max(1)).collect();
3396 heights.sort_unstable();
3397 heights[heights.len() / 2]
3398 };
3399 let gap_threshold = (median_h * 3).max(8);
3401 let narrow_threshold = (median_h / 2).max(4);
3403 let min_iso_width = (median_h * 2 / 5).max(4);
3405 let min_iso_height = (median_h * 2 / 5).max(3);
3406
3407 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
3409 for word in words {
3410 by_line.entry(word.line_key).or_default().push(word);
3411 }
3412
3413 let mut filtered = Vec::new();
3414
3415 for line_words in by_line.values_mut() {
3417 if line_words.len() <= 1 {
3418 if let Some(word) = line_words.first() {
3420 if word.width >= min_iso_width && word.height >= min_iso_height {
3421 filtered.push((*word).clone());
3422 }
3423 }
3424 continue;
3425 }
3426
3427 line_words.sort_by_key(|word| word.left);
3428
3429 for (i, word) in line_words.iter().enumerate() {
3431 let is_isolated = if i > 0 {
3432 let prev = line_words[i - 1];
3433 let gap = word
3434 .left
3435 .saturating_sub(prev.left.saturating_add(prev.width));
3436 gap > gap_threshold && word.width < narrow_threshold
3437 } else if i < line_words.len() - 1 {
3438 let next = line_words[i + 1];
3439 let gap = next
3440 .left
3441 .saturating_sub(word.left.saturating_add(word.width));
3442 gap > gap_threshold && word.width < narrow_threshold
3443 } else {
3444 false
3445 };
3446
3447 if !is_isolated {
3448 filtered.push((*word).clone());
3449 }
3450 }
3451 }
3452
3453 filtered
3454}
3455
3456fn cluster_words_by_proximity(words: &[OcrWord], gap_tolerance: u32) -> Vec<Vec<OcrWord>> {
3457 if words.is_empty() {
3458 return Vec::new();
3459 }
3460
3461 let mut sorted_words = words.to_vec();
3462 sorted_words.sort_by_key(|w| (w.top, w.left));
3463
3464 let median_h: i32 = {
3468 let mut heights: Vec<u32> = sorted_words.iter().map(|w| w.height.max(1)).collect();
3469 heights.sort_unstable();
3470 heights[heights.len() / 2] as i32
3471 };
3472 let vertical_tolerance = (median_h / 2).max(2);
3473
3474 let mut clusters: Vec<Vec<OcrWord>> = Vec::new();
3475 let mut current_cluster = vec![sorted_words[0].clone()];
3476
3477 for word in &sorted_words[1..] {
3478 if let Some(last) = current_cluster.last() {
3479 let vertical_gap = (word.top as i32 - last.top as i32).abs();
3480 let horizontal_gap = word
3481 .left
3482 .saturating_sub(last.left.saturating_add(last.width));
3483
3484 if vertical_gap <= vertical_tolerance && horizontal_gap <= gap_tolerance {
3485 current_cluster.push(word.clone());
3486 } else {
3487 clusters.push(current_cluster);
3488 current_cluster = vec![word.clone()];
3489 }
3490 }
3491 }
3492
3493 if !current_cluster.is_empty() {
3494 clusters.push(current_cluster);
3495 }
3496
3497 clusters
3498}
3499
3500fn words_to_plain_line_text(words: &[OcrWord]) -> String {
3501 let filtered_words = filter_words_by_spatial_coherence(words);
3503
3504 if filtered_words.is_empty() {
3505 return String::new();
3506 }
3507
3508 let avg_word_width =
3510 filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
3511 let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
3512 let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
3513
3514 let mut lines: Vec<String> = Vec::new();
3515 for cluster in clusters {
3516 let mut sorted_cluster = cluster;
3517 sorted_cluster.sort_by_key(|w| w.left);
3518
3519 let line = sorted_cluster
3520 .iter()
3521 .map(|word| word.text.as_str())
3522 .collect::<Vec<_>>()
3523 .join(" ")
3524 .trim()
3525 .to_string();
3526
3527 if !line.is_empty() {
3528 lines.push(line);
3529 }
3530 }
3531
3532 lines.join(" ")
3533}
3534
3535fn run_tesseract_tsv_words_best<F>(
3540 image: &GrayImage,
3541 psm_modes: &[&str],
3542 accept: F,
3543) -> Option<Vec<OcrWord>>
3544where
3545 F: Fn(&[OcrWord]) -> bool,
3546{
3547 let variants = build_ocr_variants(image);
3548 let mut best: Option<OcrCandidateScore> = None;
3549
3550 for variant in variants {
3551 for psm in psm_modes {
3552 let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
3553 continue;
3554 };
3555 if !accept(&words) {
3556 continue;
3557 }
3558 let score = score_ocr_words(&words, variant.width(), variant.height());
3559 match &best {
3560 Some(current) if current.score >= score => {}
3561 _ => {
3562 best = Some(OcrCandidateScore { words, score });
3563 }
3564 }
3565 }
3566 }
3567
3568 best.map(|candidate| candidate.words)
3569}
3570
3571fn score_ocr_words(words: &[OcrWord], width: u32, height: u32) -> f64 {
3572 if words.is_empty() || width == 0 || height == 0 {
3573 return 0.0;
3574 }
3575
3576 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
3577 let mut alpha_words = 0usize;
3578 let mut area_coverage = 0f64;
3579 let mut vertical_spread_top = height;
3580 let mut vertical_spread_bottom = 0u32;
3581 let mut total_confidence = 0f64;
3582
3583 for word in words {
3584 by_line.entry(word.line_key).or_default().push(word);
3585 if word.text.chars().any(|ch| ch.is_alphabetic()) {
3586 alpha_words += 1;
3587 }
3588 area_coverage += f64::from(word.width.saturating_mul(word.height));
3589 vertical_spread_top = vertical_spread_top.min(word.top);
3590 vertical_spread_bottom = vertical_spread_bottom.max(word.top.saturating_add(word.height));
3591 total_confidence += word.confidence;
3592 }
3593
3594 let line_count = by_line.len() as f64;
3595 let alpha_ratio = alpha_words as f64 / words.len() as f64;
3596 let density = (area_coverage / f64::from(width.saturating_mul(height))).clamp(0.0, 1.0);
3597 let spread = if vertical_spread_bottom > vertical_spread_top {
3598 f64::from(vertical_spread_bottom - vertical_spread_top) / f64::from(height)
3599 } else {
3600 0.0
3601 };
3602 let avg_confidence = total_confidence / words.len() as f64;
3603 let confidence_bonus = (avg_confidence / 100.0).clamp(0.0, 1.0);
3605
3606 let horizontal_spread = if words.is_empty() {
3608 0.0
3609 } else {
3610 let min_left = words.iter().map(|w| w.left).min().unwrap_or(0);
3611 let max_right = words
3612 .iter()
3613 .map(|w| w.left + w.width)
3614 .max()
3615 .unwrap_or(width);
3616 f64::from(max_right.saturating_sub(min_left)) / f64::from(width)
3617 };
3618
3619 words.len() as f64
3620 + line_count * 1.5
3621 + alpha_ratio * 6.0
3622 + density * 25.0
3623 + spread * 3.0
3624 + horizontal_spread * 2.0
3625 + confidence_bonus * 5.0 }
3627
3628fn build_ocr_variants(gray: &GrayImage) -> Vec<GrayImage> {
3629 vec![
3630 gray.clone(),
3631 contrast_stretch(gray),
3632 global_otsu_binarize(gray),
3633 local_mean_binarize(gray, LOCAL_BINARIZATION_RADIUS),
3634 morphological_clean(gray),
3636 unsharp_mask(gray, 1.5),
3639 gamma_correct(gray, 0.6),
3641 ]
3642}
3643
3644fn unsharp_mask(gray: &GrayImage, amount: f32) -> GrayImage {
3648 let width = gray.width() as i32;
3649 let height = gray.height() as i32;
3650 let mut out = GrayImage::new(gray.width(), gray.height());
3651 for y in 0..height {
3652 for x in 0..width {
3653 let mut sum = 0i32;
3654 let mut count = 0i32;
3655 for dy in -1i32..=1 {
3656 for dx in -1i32..=1 {
3657 let nx = x + dx;
3658 let ny = y + dy;
3659 if nx >= 0 && ny >= 0 && nx < width && ny < height {
3660 sum += gray.get_pixel(nx as u32, ny as u32).0[0] as i32;
3661 count += 1;
3662 }
3663 }
3664 }
3665 let blurred = if count > 0 {
3666 sum / count
3667 } else {
3668 gray.get_pixel(x as u32, y as u32).0[0] as i32
3669 };
3670 let original = gray.get_pixel(x as u32, y as u32).0[0] as i32;
3671 let sharpened = original + ((original - blurred) as f32 * amount) as i32;
3672 out.put_pixel(x as u32, y as u32, Luma([sharpened.clamp(0, 255) as u8]));
3673 }
3674 }
3675 out
3676}
3677
3678fn gamma_correct(gray: &GrayImage, gamma: f32) -> GrayImage {
3681 let mut out = GrayImage::new(gray.width(), gray.height());
3682 for (x, y, pixel) in gray.enumerate_pixels() {
3683 let v = pixel.0[0] as f32 / 255.0;
3684 let corrected = (v.powf(gamma) * 255.0).round() as u8;
3685 out.put_pixel(x, y, Luma([corrected]));
3686 }
3687 out
3688}
3689
3690fn contrast_stretch(gray: &GrayImage) -> GrayImage {
3691 let mut min_val = u8::MAX;
3692 let mut max_val = u8::MIN;
3693 for pixel in gray.pixels() {
3694 let value = pixel.0[0];
3695 min_val = min_val.min(value);
3696 max_val = max_val.max(value);
3697 }
3698
3699 if max_val <= min_val {
3700 return gray.clone();
3701 }
3702
3703 let in_range = (max_val - min_val) as f64;
3704 let mut out = GrayImage::new(gray.width(), gray.height());
3705 for (x, y, pixel) in gray.enumerate_pixels() {
3706 let value = pixel.0[0];
3707 let normalized = ((value.saturating_sub(min_val)) as f64 / in_range * 255.0).round() as u8;
3708 out.put_pixel(x, y, Luma([normalized]));
3709 }
3710 out
3711}
3712
3713fn global_otsu_binarize(gray: &GrayImage) -> GrayImage {
3714 let threshold = otsu_threshold(gray);
3715 let mut out = GrayImage::new(gray.width(), gray.height());
3716 for (x, y, pixel) in gray.enumerate_pixels() {
3717 let value = if pixel.0[0] <= threshold { 0 } else { 255 };
3718 out.put_pixel(x, y, Luma([value]));
3719 }
3720 out
3721}
3722
3723fn otsu_threshold(gray: &GrayImage) -> u8 {
3724 let mut histogram = [0u64; 256];
3725 for pixel in gray.pixels() {
3726 histogram[pixel.0[0] as usize] += 1;
3727 }
3728
3729 let total = (gray.width() as u64) * (gray.height() as u64);
3730 if total == 0 {
3731 return 127;
3732 }
3733
3734 let sum_total: f64 = histogram
3735 .iter()
3736 .enumerate()
3737 .map(|(idx, count)| idx as f64 * *count as f64)
3738 .sum();
3739
3740 let mut sum_background = 0f64;
3741 let mut weight_background = 0f64;
3742 let mut max_variance = -1f64;
3743 let mut best_threshold = 127u8;
3744
3745 for (idx, count) in histogram.iter().enumerate() {
3746 weight_background += *count as f64;
3747 if weight_background <= 0.0 {
3748 continue;
3749 }
3750
3751 let weight_foreground = total as f64 - weight_background;
3752 if weight_foreground <= 0.0 {
3753 break;
3754 }
3755
3756 sum_background += idx as f64 * *count as f64;
3757 let mean_background = sum_background / weight_background;
3758 let mean_foreground = (sum_total - sum_background) / weight_foreground;
3759 let between_class_variance =
3760 weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
3761
3762 if between_class_variance > max_variance {
3763 max_variance = between_class_variance;
3764 best_threshold = idx as u8;
3765 }
3766 }
3767
3768 best_threshold
3769}
3770
3771fn local_mean_binarize(gray: &GrayImage, radius: u32) -> GrayImage {
3772 if gray.width() == 0 || gray.height() == 0 {
3773 return gray.clone();
3774 }
3775
3776 let width = gray.width() as usize;
3777 let height = gray.height() as usize;
3778 let (integral, stride) = integral_image(gray);
3779 let mut out = GrayImage::new(gray.width(), gray.height());
3780
3781 for y in 0..height {
3782 for x in 0..width {
3783 let x1 = x.saturating_sub(radius as usize);
3784 let y1 = y.saturating_sub(radius as usize);
3785 let x2 = (x + radius as usize).min(width - 1);
3786 let y2 = (y + radius as usize).min(height - 1);
3787
3788 let area = (x2 - x1 + 1) * (y2 - y1 + 1);
3789 let sum = region_sum(&integral, stride, x1, y1, x2, y2);
3790 let local_mean = (sum as f64) / (area as f64);
3791 let offset = if area >= MIN_BINARIZATION_BLOCK_PIXELS {
3792 8.0
3793 } else {
3794 4.0
3795 };
3796 let threshold = (local_mean - offset).clamp(0.0, 255.0);
3797
3798 let pixel_value = gray.get_pixel(x as u32, y as u32).0[0] as f64;
3799 let value = if pixel_value <= threshold { 0 } else { 255 };
3800 out.put_pixel(x as u32, y as u32, Luma([value]));
3801 }
3802 }
3803
3804 out
3805}
3806
3807fn morphological_clean(gray: &GrayImage) -> GrayImage {
3810 if gray.width() == 0 || gray.height() == 0 {
3811 return gray.clone();
3812 }
3813
3814 let binary = global_otsu_binarize(gray);
3816
3817 let dilated = morphological_dilate(&binary, 2);
3819 morphological_erode(&dilated, 2)
3820}
3821
3822fn morphological_dilate(gray: &GrayImage, iterations: u32) -> GrayImage {
3823 let mut result = gray.clone();
3824 for _ in 0..iterations {
3825 let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3826
3827 for y in 1..gray.height().saturating_sub(1) {
3828 for x in 1..gray.width().saturating_sub(1) {
3829 let mut has_black = false;
3831 for dy in 0..3 {
3832 for dx in 0..3 {
3833 let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3834 if px < 128 {
3835 has_black = true;
3836 break;
3837 }
3838 }
3839 if has_black {
3840 break;
3841 }
3842 }
3843 next.put_pixel(x, y, if has_black { Luma([0]) } else { Luma([255]) });
3844 }
3845 }
3846 result = next;
3847 }
3848 result
3849}
3850
3851fn morphological_erode(gray: &GrayImage, iterations: u32) -> GrayImage {
3852 let mut result = gray.clone();
3853 for _ in 0..iterations {
3854 let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3855
3856 for y in 1..gray.height().saturating_sub(1) {
3857 for x in 1..gray.width().saturating_sub(1) {
3858 let mut all_black = true;
3861 for dy in 0..3 {
3862 for dx in 0..3 {
3863 let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3864 if px >= 128 {
3865 all_black = false;
3866 break;
3867 }
3868 }
3869 if !all_black {
3870 break;
3871 }
3872 }
3873 next.put_pixel(x, y, if all_black { Luma([0]) } else { Luma([255]) });
3874 }
3875 }
3876 result = next;
3877 }
3878 result
3879}
3880
3881fn integral_image(gray: &GrayImage) -> (Vec<u64>, usize) {
3882 let width = gray.width() as usize;
3883 let height = gray.height() as usize;
3884 let stride = width + 1;
3885 let mut integral = vec![0u64; (width + 1) * (height + 1)];
3886
3887 for y in 0..height {
3888 let mut row_sum = 0u64;
3889 for x in 0..width {
3890 row_sum += gray.get_pixel(x as u32, y as u32).0[0] as u64;
3891 let idx = (y + 1) * stride + (x + 1);
3892 integral[idx] = integral[y * stride + (x + 1)] + row_sum;
3893 }
3894 }
3895
3896 (integral, stride)
3897}
3898
/// Sums the pixels of the inclusive rectangle `(x1, y1)..=(x2, y2)` using a
/// summed-area table laid out as produced by `integral_image` (`stride`
/// entries per row, with a leading zero row and column).
fn region_sum(integral: &[u64], stride: usize, x1: usize, y1: usize, x2: usize, y2: usize) -> u64 {
    let at = |col: usize, row: usize| integral[row * stride + col];
    let (past_right, past_bottom) = (x2 + 1, y2 + 1);

    at(past_right, past_bottom) + at(x1, y1) - at(past_right, y1) - at(x1, past_bottom)
}
3906
3907fn run_tesseract_plain_text(image: &GrayImage, psm: &str) -> Option<String> {
3908 run_tesseract_plain_text_with_variant(image, psm)
3909}
3910
3911fn run_tesseract_plain_text_with_variant(image: &GrayImage, psm: &str) -> Option<String> {
3912 if matches!(selected_ocr_engine(), OcrEngine::RapidOcr) {
3913 return run_rapidocr_words(image).map(|words| words_to_plain_line_text(&words));
3914 }
3915
3916 let temp_dir = create_temp_dir(0).ok()?;
3917 let image_path = temp_dir.join("ocr.png");
3918 if image.save(&image_path).is_err() {
3919 let _ = fs::remove_dir_all(&temp_dir);
3920 return None;
3921 }
3922
3923 let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3924 let output = Command::new("tesseract")
3925 .current_dir(&temp_dir)
3926 .arg("ocr.png")
3927 .arg("stdout")
3928 .arg("--dpi")
3929 .arg(&dpi)
3930 .arg("--oem")
3931 .arg("3")
3932 .arg("--psm")
3933 .arg(psm)
3934 .arg("-c")
3935 .arg("load_system_dawg=0")
3936 .arg("-c")
3937 .arg("load_freq_dawg=0")
3938 .output()
3939 .ok()?;
3940 let _ = fs::remove_dir_all(&temp_dir);
3941 if !output.status.success() {
3942 return None;
3943 }
3944
3945 Some(
3946 String::from_utf8_lossy(&output.stdout)
3947 .replace('\n', " ")
3948 .split_whitespace()
3949 .collect::<Vec<_>>()
3950 .join(" "),
3951 )
3952}
3953
3954fn words_to_text_chunks(
3955 words: &[OcrWord],
3956 image: &ImageChunk,
3957 text_chunks: &[TextChunk],
3958) -> Vec<TextChunk> {
3959 let mut image_size = (0u32, 0u32);
3960 for word in words {
3961 image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
3962 image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
3963 }
3964 if image_size.0 == 0 || image_size.1 == 0 {
3965 return Vec::new();
3966 }
3967
3968 let mut dedupe: HashMap<String, usize> = HashMap::new();
3969 for chunk in text_chunks {
3970 dedupe.insert(normalize_text(&chunk.value), dedupe.len());
3971 }
3972
3973 let mut recovered = Vec::new();
3974 for word in words {
3975 let normalized = normalize_text(&word.text);
3976 if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
3977 continue;
3978 }
3979
3980 let left_ratio = f64::from(word.left) / f64::from(image_size.0);
3981 let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
3982 let top_ratio = f64::from(word.top) / f64::from(image_size.1);
3983 let bottom_ratio =
3984 f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
3985
3986 let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
3987 let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
3988 let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
3989 let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
3990 if right_x <= left_x || top_y <= bottom_y {
3991 continue;
3992 }
3993
3994 recovered.push(TextChunk {
3995 value: word.text.clone(),
3996 bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
3997 font_name: "OCR".to_string(),
3998 font_size: (top_y - bottom_y).max(6.0),
3999 font_weight: 400.0,
4000 italic_angle: 0.0,
4001 font_color: "#000000".to_string(),
4002 contrast_ratio: 21.0,
4003 symbol_ends: Vec::new(),
4004 text_format: TextFormat::Normal,
4005 text_type: TextType::Regular,
4006 pdf_layer: PdfLayer::Content,
4007 ocg_visible: true,
4008 index: None,
4009 page_number: image.bbox.page_number,
4010 level: None,
4011 mcid: None,
4012 });
4013 }
4014
4015 recovered
4016}
4017
4018fn lines_from_ocr_words(
4019 words: &[OcrWord],
4020 image: &ImageChunk,
4021 image_width: u32,
4022 image_height: u32,
4023 text_chunks: &[TextChunk],
4024) -> Vec<TextChunk> {
4025 if image_width == 0 || image_height == 0 {
4026 return Vec::new();
4027 }
4028
4029 let mut dedupe: HashMap<String, usize> = HashMap::new();
4030 for chunk in text_chunks {
4031 dedupe.insert(normalize_text(&chunk.value), dedupe.len());
4032 }
4033
4034 let spatial_lines = build_spatial_ocr_lines(words);
4035 if spatial_lines.is_empty() {
4036 return Vec::new();
4037 }
4038
4039 let blocks = merge_spatial_ocr_lines_into_blocks(&spatial_lines, image_width);
4040 if blocks.is_empty() {
4041 return Vec::new();
4042 }
4043
4044 let mut recovered = Vec::new();
4045 for block in blocks {
4046 let normalized = normalize_text(&block.text);
4047 if normalized.len() >= 8 && dedupe.contains_key(&normalized) {
4048 continue;
4049 }
4050
4051 if block.right <= block.left || block.bottom <= block.top {
4052 continue;
4053 }
4054
4055 let left_x = image.bbox.left_x
4056 + image.bbox.width() * (f64::from(block.left) / f64::from(image_width));
4057 let right_x = image.bbox.left_x
4058 + image.bbox.width() * (f64::from(block.right) / f64::from(image_width));
4059 let top_y = image.bbox.top_y
4060 - image.bbox.height() * (f64::from(block.top) / f64::from(image_height));
4061 let bottom_y = image.bbox.top_y
4062 - image.bbox.height() * (f64::from(block.bottom) / f64::from(image_height));
4063 if right_x <= left_x || top_y <= bottom_y {
4064 continue;
4065 }
4066
4067 recovered.push(TextChunk {
4068 value: block.text,
4069 bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
4070 font_name: "OCR".to_string(),
4071 font_size: (f64::from(block.line_height_sum) / block.line_count.max(1) as f64).max(6.0),
4072 font_weight: 400.0,
4073 italic_angle: 0.0,
4074 font_color: "#000000".to_string(),
4075 contrast_ratio: 21.0,
4076 symbol_ends: Vec::new(),
4077 text_format: TextFormat::Normal,
4078 text_type: TextType::Regular,
4079 pdf_layer: PdfLayer::Content,
4080 ocg_visible: true,
4081 index: None,
4082 page_number: image.bbox.page_number,
4083 level: None,
4084 mcid: None,
4085 });
4086 }
4087
4088 recovered
4089}
4090
/// A run of OCR text with its bounding box, used both for single lines and
/// for blocks assembled from several lines.
///
/// Coordinates are in the coordinate space of the OCR words the line was
/// built from; `right`/`bottom` are the far edges (`left + width`,
/// `top + height`).
#[derive(Debug, Clone)]
struct SpatialOcrLine {
    /// Smallest `left` of the contained words.
    left: u32,
    /// Smallest `top` of the contained words.
    top: u32,
    /// Largest right edge of the contained words.
    right: u32,
    /// Largest bottom edge of the contained words.
    bottom: u32,
    /// Words joined with single spaces.
    text: String,
    /// Number of OCR words contributing to `text`.
    word_count: usize,
    /// Number of lines merged into this entry (1 for a freshly built line).
    line_count: usize,
    /// Sum of the heights of the merged lines.
    line_height_sum: u32,
}
4102
4103fn build_spatial_ocr_lines(words: &[OcrWord]) -> Vec<SpatialOcrLine> {
4104 let filtered_words = filter_words_by_spatial_coherence(words);
4105 if filtered_words.is_empty() {
4106 return Vec::new();
4107 }
4108
4109 let avg_word_width =
4110 filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
4111 let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
4112 let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
4113
4114 let mut lines = Vec::new();
4115 for mut cluster in clusters {
4116 cluster.sort_by_key(|word| word.left);
4117 let text = cluster
4118 .iter()
4119 .map(|word| word.text.as_str())
4120 .collect::<Vec<_>>()
4121 .join(" ")
4122 .trim()
4123 .to_string();
4124 if text.is_empty() {
4125 continue;
4126 }
4127
4128 let left = cluster.iter().map(|word| word.left).min().unwrap_or(0);
4129 let right = cluster
4130 .iter()
4131 .map(|word| word.left.saturating_add(word.width))
4132 .max()
4133 .unwrap_or(0);
4134 let top = cluster.iter().map(|word| word.top).min().unwrap_or(0);
4135 let bottom = cluster
4136 .iter()
4137 .map(|word| word.top.saturating_add(word.height))
4138 .max()
4139 .unwrap_or(0);
4140 if right <= left || bottom <= top {
4141 continue;
4142 }
4143
4144 lines.push(SpatialOcrLine {
4145 left,
4146 top,
4147 right,
4148 bottom,
4149 text,
4150 word_count: cluster.len(),
4151 line_count: 1,
4152 line_height_sum: bottom.saturating_sub(top).max(1),
4153 });
4154 }
4155
4156 lines.sort_by_key(|line| (line.top, line.left));
4157 lines
4158}
4159
4160fn merge_spatial_ocr_lines_into_blocks(
4161 lines: &[SpatialOcrLine],
4162 image_width: u32,
4163) -> Vec<SpatialOcrLine> {
4164 if lines.is_empty() {
4165 return Vec::new();
4166 }
4167
4168 let median_height = {
4169 let mut heights: Vec<u32> = lines
4170 .iter()
4171 .map(|line| line.bottom.saturating_sub(line.top).max(1))
4172 .collect();
4173 heights.sort_unstable();
4174 heights[heights.len() / 2]
4175 };
4176 let vertical_tolerance = (median_height / 2).max(3);
4177 let max_vertical_gap = median_height.saturating_mul(2).max(8);
4178
4179 let mut blocks: Vec<SpatialOcrLine> = Vec::new();
4180 for line in lines {
4181 let merge_idx = blocks.iter().rposition(|block| {
4182 let vertical_gap = line.top.saturating_sub(block.bottom);
4183 if vertical_gap > max_vertical_gap {
4184 return false;
4185 }
4186 if line.top + vertical_tolerance < block.bottom {
4187 return false;
4188 }
4189
4190 spatial_lines_share_block_geometry(block, line, image_width, median_height)
4191 });
4192
4193 if let Some(merge_idx) = merge_idx {
4194 let block = &mut blocks[merge_idx];
4195 block.left = block.left.min(line.left);
4196 block.top = block.top.min(line.top);
4197 block.right = block.right.max(line.right);
4198 block.bottom = block.bottom.max(line.bottom);
4199 block.word_count += line.word_count;
4200 block.line_count += line.line_count;
4201 block.line_height_sum = block.line_height_sum.saturating_add(line.line_height_sum);
4202 if !block.text.ends_with('-') {
4203 block.text.push(' ');
4204 }
4205 block.text.push_str(&line.text);
4206 continue;
4207 }
4208
4209 blocks.push(line.clone());
4210 }
4211
4212 blocks
4213 .into_iter()
4214 .filter_map(|mut block| {
4215 block.text = block.text.split_whitespace().collect::<Vec<_>>().join(" ");
4216 let alphabetic = block.text.chars().filter(|ch| ch.is_alphabetic()).count();
4217 let min_chars = if block.word_count >= 4 { 10 } else { 16 };
4218 if block.text.len() < min_chars || alphabetic < 4 {
4219 return None;
4220 }
4221 Some(block)
4222 })
4223 .collect()
4224}
4225
4226fn spatial_lines_share_block_geometry(
4227 upper: &SpatialOcrLine,
4228 lower: &SpatialOcrLine,
4229 image_width: u32,
4230 median_height: u32,
4231) -> bool {
4232 let overlap_left = upper.left.max(lower.left);
4233 let overlap_right = upper.right.min(lower.right);
4234 let overlap = overlap_right.saturating_sub(overlap_left);
4235 let upper_width = upper.right.saturating_sub(upper.left).max(1);
4236 let lower_width = lower.right.saturating_sub(lower.left).max(1);
4237 let min_width = upper_width.min(lower_width);
4238 let max_width = upper_width.max(lower_width);
4239 let overlap_ratio = overlap as f64 / min_width as f64;
4240 let width_ratio = min_width as f64 / max_width as f64;
4241 let max_left_shift = ((f64::from(image_width) * 0.045).round() as u32)
4242 .max(median_height.saturating_mul(2))
4243 .max(8);
4244 let left_shift = upper.left.abs_diff(lower.left);
4245
4246 overlap_ratio >= 0.40
4247 || (overlap_ratio >= 0.15 && left_shift <= max_left_shift && width_ratio >= 0.55)
4248}
4249
/// Returns `true` when `text` contains at least one ASCII digit (`0`–`9`).
fn is_numeric_like(text: &str) -> bool {
    // ASCII digits are single bytes and never occur inside a multi-byte
    // UTF-8 sequence, so scanning bytes is equivalent to scanning chars.
    text.bytes().any(|byte| byte.is_ascii_digit())
}
4253
/// Reduces `text` to a comparison key: every non-alphanumeric character is
/// removed and the remaining characters are lower-cased.
fn normalize_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            key.extend(ch.to_lowercase());
        }
    }
    key
}
4260
/// Repairs the trademark sign in the product name "CarolinaBLU™" — a literal
/// `TM` suffix becomes `™` and a doubled `™` is collapsed — and trims
/// surrounding whitespace.
fn normalize_caption_text(text: &str) -> String {
    let with_symbol = text.replace("CarolinaBLUTM", "CarolinaBLU™");
    let deduplicated = with_symbol.replace("CarolinaBLU™™", "CarolinaBLU™");
    deduplicated.trim().to_string()
}
4267
/// Cleans the OCR text of a single raster-table cell.
///
/// Steps, in order:
/// 1. replace `|` with a space, the em dash with `-`, and repair a few known
///    misreads (`AorB` / `Aor B` → `A or B`, `H,O` → `H2O`);
/// 2. collapse whitespace;
/// 3. outside the first row, blank the cell when it is at most two bytes long
///    without any digit, or when it consists solely of the letters
///    `O`, `o`, `S`, `B`;
/// 4. rewrite misread microlitre units (` ywL`, ` yuL`, ` yL`, ` wL`, ` uL`,
///    ` pL`) to ` μL` and trim.
///
/// `_col_idx` is accepted but not used.
fn normalize_raster_cell_text(row_idx: usize, _col_idx: usize, text: String) -> String {
    const GLYPH_FIXES: [(&str, &str); 5] = [
        ("|", " "),
        ("—", "-"),
        ("AorB", "A or B"),
        ("Aor B", "A or B"),
        ("H,O", "H2O"),
    ];
    const UNIT_MISREADS: [&str; 6] = [" ywL", " yuL", " yL", " wL", " uL", " pL"];

    let mut cleaned = text;
    for (from, to) in GLYPH_FIXES {
        cleaned = cleaned.replace(from, to);
    }
    let mut cleaned = cleaned.split_whitespace().collect::<Vec<_>>().join(" ");

    if row_idx > 0 {
        let has_digit = cleaned.chars().any(|ch| ch.is_ascii_digit());
        if !has_digit && cleaned.len() <= 2 {
            return String::new();
        }
        let only_noise_letters = cleaned
            .chars()
            .all(|ch| matches!(ch, 'O' | 'o' | 'S' | 'B'));
        if only_noise_letters {
            return String::new();
        }
    }

    for misread in UNIT_MISREADS {
        cleaned = cleaned.replace(misread, " μL");
    }

    cleaned.trim().to_string()
}
4300
/// Creates a fresh scratch directory below the system temp directory and
/// returns its path.
///
/// The directory name combines the process id, `page_number`, the current
/// time in nanoseconds since the Unix epoch (0 if the clock is before the
/// epoch) and a process-wide sequence number. The sequence number guarantees
/// that two calls from the same process never receive the same directory,
/// even when they happen within one clock tick on different threads.
///
/// # Errors
///
/// Returns the I/O error reported when the directory cannot be created.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    // Monotonic per-process counter; only uniqueness matters, not ordering
    // relative to other memory operations.
    static SEQUENCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let unique = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    let sequence = SEQUENCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

    let dir = std::env::temp_dir().join(format!(
        "edgeparse-raster-ocr-{}-{}-{}-{}",
        std::process::id(),
        page_number,
        unique,
        sequence
    ));
    fs::create_dir_all(&dir)?;
    Ok(dir)
}
4315
4316fn extract_visible_page_image_files(
4317 input_path: &Path,
4318 page_number: u32,
4319 temp_dir: &Path,
4320) -> Option<Vec<PathBuf>> {
4321 let list_output = Command::new("pdfimages")
4322 .arg("-f")
4323 .arg(page_number.to_string())
4324 .arg("-l")
4325 .arg(page_number.to_string())
4326 .arg("-list")
4327 .arg(input_path)
4328 .output()
4329 .ok()?;
4330 if !list_output.status.success() {
4331 return None;
4332 }
4333
4334 let entries = parse_pdfimages_list(&String::from_utf8_lossy(&list_output.stdout));
4335 let visible_indices: Vec<usize> = entries
4336 .iter()
4337 .enumerate()
4338 .filter_map(|(idx, entry)| (entry.image_type == "image").then_some(idx))
4339 .collect();
4340 if visible_indices.is_empty() {
4341 return Some(Vec::new());
4342 }
4343
4344 let prefix = temp_dir.join("img");
4345 let status = Command::new("pdfimages")
4346 .arg("-f")
4347 .arg(page_number.to_string())
4348 .arg("-l")
4349 .arg(page_number.to_string())
4350 .arg("-png")
4351 .arg(input_path)
4352 .arg(&prefix)
4353 .status()
4354 .ok()?;
4355 if !status.success() {
4356 return None;
4357 }
4358
4359 let mut image_files: Vec<PathBuf> = fs::read_dir(temp_dir)
4360 .ok()?
4361 .filter_map(|entry| entry.ok().map(|e| e.path()))
4362 .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
4363 .collect();
4364 image_files.sort();
4365
4366 let visible_files: Vec<PathBuf> = visible_indices
4367 .into_iter()
4368 .filter_map(|idx| image_files.get(idx).cloned())
4369 .collect();
4370 Some(visible_files)
4371}
4372
4373fn parse_pdfimages_list(output: &str) -> Vec<PdfImagesListEntry> {
4374 let mut entries = Vec::new();
4375 let mut in_rows = false;
4376
4377 for line in output.lines() {
4378 let trimmed = line.trim();
4379 if trimmed.is_empty() {
4380 continue;
4381 }
4382 if trimmed.starts_with("---") {
4383 in_rows = true;
4384 continue;
4385 }
4386 if !in_rows {
4387 continue;
4388 }
4389
4390 let mut cols = trimmed.split_whitespace();
4391 let Some(_page) = cols.next() else {
4392 continue;
4393 };
4394 let Some(_num) = cols.next() else {
4395 continue;
4396 };
4397 let Some(image_type) = cols.next() else {
4398 continue;
4399 };
4400
4401 entries.push(PdfImagesListEntry {
4402 image_type: image_type.to_string(),
4403 });
4404 }
4405
4406 entries
4407}
4408
4409#[cfg(test)]
4410mod tests {
4411 use super::*;
4412 use image::GrayImage;
4413 use crate::models::enums::{PdfLayer, TextFormat, TextType};
4414
4415 fn image_chunk() -> ImageChunk {
4416 ImageChunk {
4417 bbox: BoundingBox::new(Some(1), 0.0, 0.0, 400.0, 400.0),
4418 index: Some(1),
4419 level: None,
4420 }
4421 }
4422
4423 fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
4424 OcrWord {
4425 line_key: line,
4426 left,
4427 top: 0,
4428 width: 40,
4429 height: 12,
4430 text: text.to_string(),
4431 confidence: 90.0,
4432 }
4433 }
4434
4435 fn word_at(line: (u32, u32, u32), left: u32, top: u32, width: u32, text: &str) -> OcrWord {
4436 OcrWord {
4437 line_key: line,
4438 left,
4439 top,
4440 width,
4441 height: 12,
4442 text: text.to_string(),
4443 confidence: 90.0,
4444 }
4445 }
4446
4447 fn text_chunk(value: &str, bbox: BoundingBox) -> TextChunk {
4448 TextChunk {
4449 value: value.to_string(),
4450 bbox,
4451 font_name: "Helvetica".to_string(),
4452 font_size: 12.0,
4453 font_weight: 400.0,
4454 italic_angle: 0.0,
4455 font_color: "#000000".to_string(),
4456 contrast_ratio: 21.0,
4457 symbol_ends: Vec::new(),
4458 text_format: TextFormat::Normal,
4459 text_type: TextType::Regular,
4460 pdf_layer: PdfLayer::Main,
4461 ocg_visible: true,
4462 index: None,
4463 page_number: Some(1),
4464 level: None,
4465 mcid: None,
4466 }
4467 }
4468
4469 fn test_cell_text(cell: &TableBorderCell) -> String {
4470 cell.content
4471 .iter()
4472 .map(|token| token.base.value.trim())
4473 .filter(|value| !value.is_empty())
4474 .collect::<Vec<_>>()
4475 .join(" ")
4476 }
4477
4478 #[test]
4479 fn test_table_like_ocr_detects_repeated_columns() {
4480 let words = vec![
4481 word((1, 1, 1), 10, "Temperature"),
4482 word((1, 1, 1), 120, "Viscosity"),
4483 word((1, 1, 1), 240, "Temperature"),
4484 word((1, 1, 1), 360, "Viscosity"),
4485 word((1, 1, 2), 10, "0"),
4486 word((1, 1, 2), 120, "1.793E-06"),
4487 word((1, 1, 2), 240, "25"),
4488 word((1, 1, 2), 360, "8.930E-07"),
4489 word((1, 1, 3), 10, "1"),
4490 word((1, 1, 3), 120, "1.732E-06"),
4491 word((1, 1, 3), 240, "26"),
4492 word((1, 1, 3), 360, "8.760E-07"),
4493 ];
4494 assert!(!looks_like_chart_label_ocr(&words));
4495 assert!(looks_like_table_ocr(&words));
4496 }
4497
/// A 4-row, 3-column grid of mostly non-numeric words must be rebuilt as a
/// structured table with the expected cell texts.
#[test]
fn test_structured_ocr_table_border_recovers_non_numeric_table() {
    let image = image_chunk();

    // (left, width) for each column; every row reuses the same geometry.
    let columns: [(u32, u32); 3] = [(10, 80), (145, 110), (305, 70)];
    // (line number, top, texts left to right).
    let rows: [(u32, u32, [&str; 3]); 4] = [
        (1, 10, ["Tube", "Enzyme", "DNA"]),
        (2, 42, ["1", "BamHI", "pUC19"]),
        (3, 74, ["2", "HindIII", "lambda"]),
        (4, 106, ["3", "EcoRI", "control"]),
    ];

    let mut words = Vec::with_capacity(12);
    for (line, top, texts) in rows {
        for ((left, width), text) in columns.into_iter().zip(texts) {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    assert!(!looks_like_chart_label_ocr(&words));

    let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
    assert_eq!(table.num_columns, 3);
    assert_eq!(table.num_rows, 4);

    let header_cell = &table.rows[0].cells[0];
    let enzyme_cell = &table.rows[1].cells[1];
    let last_cell = &table.rows[3].cells[2];
    assert_eq!(test_cell_text(header_cell), "Tube");
    assert_eq!(test_cell_text(enzyme_cell), "BamHI");
    assert_eq!(test_cell_text(last_cell), "control");
}
4524
/// With an image whose bbox does not start at the origin, the recovered table's
/// x coordinates must be non-decreasing and lie inside the image's horizontal
/// extent.
#[test]
fn test_structured_ocr_table_border_scales_column_boundaries_to_page_bbox() {
    let image = ImageChunk {
        bbox: BoundingBox::new(Some(1), 56.6929, 163.6519, 555.3071, 442.0069),
        index: Some(1),
        level: None,
    };

    let lefts: [u32; 4] = [10, 255, 520, 760];
    // (line number, top, [(width, text); 4]) — widths differ between header and data rows.
    let rows: [(u32, u32, [(u32, &str); 4]); 3] = [
        (
            1,
            10,
            [(110, "TempC"), (150, "KinViscA"), (110, "TempC"), (170, "KinViscB")],
        ),
        (
            2,
            44,
            [(24, "0"), (130, "1.793E-06"), (28, "25"), (130, "8.930E-07")],
        ),
        (
            3,
            78,
            [(24, "1"), (130, "1.732E-06"), (28, "26"), (130, "8.760E-07")],
        ),
    ];

    let mut words = Vec::with_capacity(12);
    for (line, top, cells) in rows {
        for (left, (width, text)) in lefts.into_iter().zip(cells) {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    let table = build_structured_ocr_table_border(&words, &image).expect("structured table");

    assert_eq!(table.num_columns, 4);
    assert_eq!(table.num_rows, 3);
    assert_eq!(test_cell_text(&table.rows[1].cells[1]), "1.793E-06");

    let sorted = table
        .x_coordinates
        .windows(2)
        .all(|pair| pair[0] <= pair[1]);
    assert!(sorted);

    let within_image = table
        .x_coordinates
        .iter()
        .all(|x| image.bbox.left_x <= *x && *x <= image.bbox.right_x);
    assert!(within_image);
}
4558
/// A five-row, three-column word grid must still be seen as a table and not
/// mistaken for chart labels.
#[test]
fn test_chart_label_ocr_does_not_reject_five_row_table() {
    // (left, width) per column.
    let columns: [(u32, u32); 3] = [(10, 80), (145, 110), (305, 70)];
    // (line number, top, texts left to right).
    let rows: [(u32, u32, [&str; 3]); 5] = [
        (1, 10, ["Tube", "Enzyme", "DNA"]),
        (2, 42, ["1", "BamHI", "pUC19"]),
        (3, 74, ["2", "HindIII", "lambda"]),
        (4, 106, ["3", "EcoRI", "control"]),
        (5, 138, ["4", "NotI", "sample"]),
    ];

    let mut words = Vec::with_capacity(15);
    for (line, top, texts) in rows {
        for ((left, width), text) in columns.into_iter().zip(texts) {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    assert!(!looks_like_chart_label_ocr(&words));
    assert!(looks_like_table_ocr(&words));
}
4582
/// Three lines with only two text columns must not produce a structured table.
#[test]
fn test_structured_ocr_table_border_rejects_two_column_prose_layout() {
    let image = image_chunk();

    // (line number, top, left-column text, right-column text).
    let rows: [(u32, u32, &str, &str); 3] = [
        (1, 10, "Summary", "Detailed findings"),
        (2, 42, "Background", "Additional context"),
        (3, 74, "Notes", "Further explanation"),
    ];

    let mut words = Vec::with_capacity(6);
    for (line, top, left_text, right_text) in rows {
        words.push(word_at((1, 1, line), 10, top, 90, left_text));
        words.push(word_at((1, 1, line), 220, top, 120, right_text));
    }

    let built = build_structured_ocr_table_border(&words, &image);
    assert!(built.is_none());
}
4597
/// Parses a two-row `pdfimages -list` style listing (one `image`, one `smask`).
///
/// NOTE(review): despite the test name, the assertions pin that the parser
/// returns both rows with their types preserved; any smask filtering
/// presumably happens in a caller — confirm.
#[test]
fn test_parse_pdfimages_list_ignores_smask_entries() {
    let listing = "page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio\n--------------------------------------------------------------------------------------------\n 1 0 image 1320 358 icc 3 8 image no 46 0 208 208 63.5K 4.6%\n 1 1 smask 1320 358 gray 1 8 image no 46 0 208 208 483B 0.1%\n";

    let parsed = parse_pdfimages_list(listing);
    assert_eq!(parsed.len(), 2);

    let (first, second) = (&parsed[0], &parsed[1]);
    assert_eq!(first.image_type, "image");
    assert_eq!(second.image_type, "smask");
}
4607
/// A single OCR line of caption words must not be classified as a table.
#[test]
fn test_table_like_ocr_rejects_single_line_caption() {
    let caption = [
        (10, "Figure"),
        (90, "7.2"),
        (150, "Viscosity"),
        (260, "of"),
        (300, "Water"),
    ];

    let mut words = Vec::new();
    for (left, text) in caption {
        words.push(word((1, 1, 1), left, text));
    }

    assert!(!looks_like_table_ocr(&words));
}
4619
/// Table-driven check of `normalize_raster_cell_text` for three raw cell
/// strings: a misread unit, a string expected to normalise to empty, and a
/// misread formula.
#[test]
fn test_normalize_raster_cell_text_fixes_units_and_artifacts() {
    // (first argument, second argument, raw OCR text, expected normalised text).
    let cases = [
        (1, 1, "3 ywL", "3 μL"),
        (1, 4, "OS", ""),
        (0, 6, "H,O", "H2O"),
    ];

    for (first, second, raw, expected) in cases {
        assert_eq!(
            normalize_raster_cell_text(first, second, raw.to_string()),
            expected
        );
    }
}
4629
/// A white 120x80 raster with four black vertical and four black horizontal
/// 1px lines must yield a grid reporting four lines in each direction.
#[test]
fn test_detect_bordered_raster_grid_finds_strong_lines() {
    let vertical_xs = [10u32, 40, 80, 110];
    let horizontal_ys = [10u32, 30, 50, 70];

    let image = GrayImage::from_fn(120, 80, |x, y| {
        // Vertical strokes span y = 10..=70; horizontal strokes span x = 10..=110.
        let on_vertical = vertical_xs.contains(&x) && (10..=70).contains(&y);
        let on_horizontal = horizontal_ys.contains(&y) && (10..=110).contains(&x);
        if on_vertical || on_horizontal {
            Luma([0])
        } else {
            Luma([255])
        }
    });

    let grid = detect_bordered_raster_grid(&image).expect("grid");
    assert_eq!(grid.vertical_lines.len(), 4);
    assert_eq!(grid.horizontal_lines.len(), 4);
}
4648
/// Four wide grey horizontal bands on a white 320x200 raster must be flagged
/// as an obvious bar chart.
#[test]
fn test_obvious_bar_chart_raster_is_rejected() {
    // Half-open [top, bottom) row ranges of the bands.
    let bands = [(25u32, 40u32), (70, 85), (115, 130), (160, 175)];

    let image = GrayImage::from_fn(320, 200, |x, y| {
        let in_band = (40..280).contains(&x)
            && bands.iter().any(|&(top, bottom)| (top..bottom).contains(&y));
        if in_band {
            Luma([80])
        } else {
            Luma([255])
        }
    });

    assert!(is_obvious_bar_chart_raster(&image));
}
4662
/// Four grey vertical bars of different heights, all ending at row 212, must
/// be flagged as an obvious bar chart.
#[test]
fn test_vertical_bar_chart_raster_is_rejected() {
    // (first column, one-past-last column, first row) of each bar.
    let bars = [
        (40u32, 78u32, 52u32),
        (92, 126, 118),
        (140, 170, 146),
        (184, 210, 162),
    ];

    let image = GrayImage::from_fn(360, 240, |x, y| {
        let in_bar = bars
            .iter()
            .any(|&(x1, x2, y1)| (x1..x2).contains(&x) && (y1..212).contains(&y));
        if in_bar {
            Luma([90])
        } else {
            Luma([255])
        }
    });

    assert!(is_obvious_bar_chart_raster(&image));
}
4681
/// Lightly shaded vertical bars standing on a grey baseline at row 222 must
/// still be flagged as an obvious bar chart.
#[test]
fn test_light_fill_vertical_bar_chart_raster_is_rejected() {
    // (first column, one-past-last column, first row, fill shade) of each bar.
    let bars: [(u32, u32, u32, u8); 4] = [
        (46, 82, 132, 222),
        (104, 140, 84, 214),
        (162, 198, 62, 206),
        (220, 256, 144, 228),
    ];

    let image = GrayImage::from_fn(420, 260, |x, y| {
        // Bars stop one row above the baseline, so the two never overlap.
        let bar_shade = bars
            .iter()
            .find(|&&(x1, x2, y1, _)| (x1..x2).contains(&x) && (y1..222).contains(&y))
            .map(|&(_, _, _, shade)| shade);
        match bar_shade {
            Some(shade) => Luma([shade]),
            None if y == 222 && (24..396).contains(&x) => Luma([170]),
            None => Luma([255]),
        }
    });

    assert!(is_obvious_bar_chart_raster(&image));
}
4703
/// Four pairs of lightly shaded vertical bars above a grey baseline at row 214
/// must be flagged as an obvious bar chart.
#[test]
fn test_grouped_vertical_bar_chart_raster_is_rejected() {
    const BASELINE_Y: u32 = 214;
    // (first column, one-past-last column, first row, fill shade) of each bar.
    let bars: [(u32, u32, u32, u8); 8] = [
        (44, 60, 98, 210),
        (64, 80, 140, 225),
        (108, 124, 116, 214),
        (128, 144, 148, 229),
        (172, 188, 88, 206),
        (192, 208, 128, 222),
        (236, 252, 104, 212),
        (256, 272, 156, 228),
    ];

    let mut canvas = GrayImage::from_pixel(420, 240, Luma([255]));
    for col in 28..392 {
        canvas.put_pixel(col, BASELINE_Y, Luma([175]));
    }
    for (x1, x2, y1, shade) in bars {
        // Fill row by row; bars end one row above the baseline.
        for row in y1..BASELINE_Y {
            for col in x1..x2 {
                canvas.put_pixel(col, row, Luma([shade]));
            }
        }
    }

    assert!(is_obvious_bar_chart_raster(&canvas));
}
4729
/// A 100x100 diagonal gradient running from 0 to 255 must be classified as a
/// natural photograph.
#[test]
fn test_natural_photograph_raster_is_detected() {
    let (w, h) = (100u32, 100u32);
    // Largest value of x + y, used to scale the gradient to the full 0..=255 range.
    let span = w + h - 2;
    let image = GrayImage::from_fn(w, h, |x, y| Luma([((x + y) * 255 / span) as u8]));

    assert!(is_natural_photograph_raster(&image));
}
4746
/// A white raster with a few thin black frame lines must be classified as
/// neither a natural photograph nor a dark UI screenshot.
#[test]
fn test_chart_image_is_not_classified_as_photograph() {
    let image = GrayImage::from_fn(200, 160, |x, y| {
        // Three horizontal strokes over x = 20..180 and two vertical strokes over y = 20..=140.
        let on_horizontal = [20u32, 80, 140].contains(&y) && (20..180).contains(&x);
        let on_vertical = [20u32, 180].contains(&x) && (20..=140).contains(&y);
        if on_horizontal || on_vertical {
            Luma([0])
        } else {
            Luma([255])
        }
    });

    assert!(!is_natural_photograph_raster(&image));
    assert!(!is_dark_ui_screenshot_raster(&image));
}
4765
/// A near-white 240x180 raster containing one rectangle of varied mid-tones
/// must be classified as a natural photograph.
#[test]
fn test_bright_natural_photograph_raster_is_detected() {
    let image = GrayImage::from_fn(240, 180, |x, y| {
        if (52..156).contains(&x) && (24..148).contains(&y) {
            // Tones cycle through 72..=203 across the rectangle.
            let offset = ((x - 52) * 11 + (y - 24) * 7) % 132;
            Luma([72 + offset as u8])
        } else {
            Luma([250])
        }
    });

    assert!(is_natural_photograph_raster(&image));
}
4778
/// A dark 260x180 raster with several light rectangular panels must be
/// classified as a dark UI screenshot.
#[test]
fn test_dark_ui_screenshot_raster_is_detected() {
    // (x1, y1, x2, y2, shade) with half-open extents; the first entry is the top strip.
    let panels: [(u32, u32, u32, u32, u8); 5] = [
        (18, 18, 242, 34, 210),
        (26, 58, 84, 108, 198),
        (94, 58, 152, 108, 210),
        (162, 58, 220, 108, 192),
        (26, 118, 220, 134, 224),
    ];

    let mut screen = GrayImage::from_pixel(260, 180, Luma([20]));
    for (x1, y1, x2, y2, shade) in panels {
        for row in y1..y2 {
            for col in x1..x2 {
                screen.put_pixel(col, row, Luma([shade]));
            }
        }
    }

    assert!(is_dark_ui_screenshot_raster(&screen));
}
4802
/// A small grid of short symbolic labels and bit-like tokens must be detected
/// as a matrix/formula layout and therefore not as a table.
#[test]
fn test_table_like_ocr_rejects_matrix_formula_layout() {
    // (line number, top, [(left, width, text); 4]).
    let rows: [(u32, u32, [(u32, u32, &str); 4]); 3] = [
        (
            1,
            10,
            [(14, 36, "B23"), (160, 22, "C1"), (230, 22, "C2"), (300, 22, "C3")],
        ),
        (
            2,
            44,
            [(20, 24, "0/0"), (150, 18, "0"), (220, 28, "001"), (300, 28, "000")],
        ),
        (
            3,
            76,
            [(20, 24, "0/1"), (150, 28, "000"), (220, 28, "010"), (300, 28, "000")],
        ),
    ];

    let mut words = Vec::with_capacity(12);
    for (line, top, cells) in rows {
        for (left, width, text) in cells {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    assert!(looks_like_matrix_formula_ocr(&words));
    assert!(!looks_like_table_ocr(&words));
}
4823
/// A small numeric grid under word-like headers (`Year`, `Q1`..`Q3`) must stay
/// a table and must not be flagged as a matrix/formula layout.
#[test]
fn test_table_like_ocr_keeps_small_numeric_table_with_real_headers() {
    let lefts: [u32; 4] = [10, 130, 220, 310];
    // (line number, top, widths, texts) — both arrays run left to right.
    let rows: [(u32, u32, [u32; 4], [&str; 4]); 3] = [
        (1, 10, [64, 28, 28, 28], ["Year", "Q1", "Q2", "Q3"]),
        (2, 42, [64, 24, 24, 24], ["2022", "10", "25", "30"]),
        (3, 74, [64, 24, 24, 24], ["2023", "11", "26", "31"]),
    ];

    let mut words = Vec::with_capacity(12);
    for (line, top, widths, texts) in rows {
        for ((left, width), text) in lefts.into_iter().zip(widths).zip(texts) {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    assert!(!looks_like_matrix_formula_ocr(&words));
    assert!(looks_like_table_ocr(&words));
}
4844
/// The matrix-like word grid still builds a structured table, but that table
/// must then be flagged as a matrix-ish OCR artifact.
#[test]
fn test_matrixish_small_ocr_table_is_rejected_after_build() {
    let image = ImageChunk {
        bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 120.0),
        index: Some(1),
        level: None,
    };

    // (line number, top, [(left, width, text); 4]).
    let rows: [(u32, u32, [(u32, u32, &str); 4]); 3] = [
        (
            1,
            10,
            [(14, 36, "B23"), (160, 22, "C1"), (230, 22, "C2"), (300, 22, "C3")],
        ),
        (
            2,
            44,
            [(20, 24, "0/0"), (150, 18, "0"), (220, 28, "001"), (300, 28, "000")],
        ),
        (
            3,
            76,
            [(20, 24, "0/1"), (150, 28, "000"), (220, 28, "010"), (300, 28, "000")],
        ),
    ];

    let mut words = Vec::with_capacity(12);
    for (line, top, cells) in rows {
        for (left, width, text) in cells {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
    assert!(is_matrixish_ocr_artifact_table(&table));
}
4870
/// The `Year`/`Q1`..`Q3` numeric grid must build a structured table that is
/// not flagged as a matrix-ish OCR artifact.
#[test]
fn test_small_numeric_table_with_real_headers_is_not_rejected_after_build() {
    let image = ImageChunk {
        bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 140.0),
        index: Some(1),
        level: None,
    };

    let lefts: [u32; 4] = [10, 130, 220, 310];
    // (line number, top, widths, texts) — both arrays run left to right.
    let rows: [(u32, u32, [u32; 4], [&str; 4]); 3] = [
        (1, 10, [64, 28, 28, 28], ["Year", "Q1", "Q2", "Q3"]),
        (2, 42, [64, 24, 24, 24], ["2022", "10", "25", "30"]),
        (3, 74, [64, 24, 24, 24], ["2023", "11", "26", "31"]),
    ];

    let mut words = Vec::with_capacity(12);
    for (line, top, widths, texts) in rows {
        for ((left, width), text) in lefts.into_iter().zip(widths).zip(texts) {
            words.push(word_at((1, 1, line), left, top, width, text));
        }
    }

    let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
    assert!(!is_matrixish_ocr_artifact_table(&table));
}
4896
/// A white raster showing only a 3x3 black-ruled grid must not be flagged as
/// an obvious bar chart.
#[test]
fn test_bordered_table_raster_is_not_rejected_as_chart() {
    let rule_xs = [20u32, 110, 210, 300];
    let rule_ys = [20u32, 70, 120, 180];

    let image = GrayImage::from_fn(320, 200, |x, y| {
        // Vertical rules span y = 20..=180; horizontal rules span x = 20..=300.
        let on_vertical = rule_xs.contains(&x) && (20..=180).contains(&y);
        let on_horizontal = rule_ys.contains(&y) && (20..=300).contains(&x);
        if on_vertical || on_horizontal {
            Luma([0])
        } else {
            Luma([255])
        }
    });

    assert!(!is_obvious_bar_chart_raster(&image));
}
4913
/// Eroding a 9x9 white image that carries a 1px black cross with radius 1 must
/// leave both sampled corners and the cross centre white.
#[test]
fn test_morphological_erode_preserves_white_background() {
    let mut cross = GrayImage::from_pixel(9, 9, Luma([255]));
    for i in 0..9 {
        cross.put_pixel(4, i, Luma([0]));
        cross.put_pixel(i, 4, Luma([0]));
    }

    let eroded = morphological_erode(&cross, 1);

    for (x, y) in [(0, 0), (8, 8), (4, 4)] {
        assert_eq!(eroded.get_pixel(x, y).0[0], 255);
    }
}
4930
/// Eight evenly spaced four-word lines must be classified as dense prose.
///
/// The first word is "Creators" on even lines and "Copyright" on odd lines.
#[test]
fn test_dense_prose_image_ocr_detects_infographic_text() {
    let columns: [(u32, &str); 4] = [
        (20, "Copyright"),
        (120, "protects"),
        (240, "creative"),
        (350, "work"),
    ];

    let mut words = Vec::new();
    for line_num in 1u32..=8 {
        // Lines start at top = 20 and advance by 22 each.
        let top = 20 + (line_num - 1) * 22;
        for (idx, (left, default_text)) in columns.into_iter().enumerate() {
            let chosen = if idx == 0 && line_num % 2 == 0 {
                "Creators"
            } else {
                default_text
            };
            words.push(OcrWord {
                line_key: (1, 1, line_num),
                left,
                top,
                width: 60,
                height: 14,
                confidence: 85.0,
                text: chosen.to_string(),
            });
        }
    }

    assert!(looks_like_dense_prose_image_ocr(&words));
}
4964
/// Seven short lines dominated by numeric tick labels and a few legend words
/// must not be classified as dense prose.
#[test]
fn test_dense_prose_image_ocr_rejects_chart_like_words() {
    let lefts = [10, 90, 170];
    let mut words = Vec::new();

    // Places `texts` on OCR line `line`, one per left offset, left to right.
    let mut push_line = |line, texts: &[&str]| {
        for (left, text) in lefts.iter().zip(texts) {
            words.push(word((1, 1, line), *left, text));
        }
    };

    push_line(1, &["70.2", "75.6", "92.4"]);
    push_line(2, &["80.4", "94.2", "95.5"]);
    push_line(3, &["Company", "A", "B"]);
    push_line(4, &["Scene", "Document"]);
    push_line(5, &["65", "70", "75"]);
    push_line(6, &["80", "85", "90"]);
    push_line(7, &["95", "100"]);

    assert!(!looks_like_dense_prose_image_ocr(&words));
}
4991
/// Irregularly placed category names and "count (percent)" pairs must be seen
/// as chart labels, and as neither a table nor dense prose.
#[test]
fn test_dense_prose_image_ocr_rejects_scattered_chart_labels() {
    // (line number, left, top, width, text).
    let layout: &[(u32, u32, u32, u32, &str)] = &[
        (1, 20, 20, 80, "Participation"),
        (1, 120, 20, 70, "of"),
        (1, 210, 20, 90, "Institutions"),
        (2, 310, 50, 50, "57"),
        (2, 380, 50, 60, "(24%)"),
        (3, 290, 86, 40, "20"),
        (3, 345, 86, 50, "(8%)"),
        (4, 80, 124, 120, "Government"),
        (4, 260, 124, 90, "Other"),
        (4, 360, 124, 60, "State"),
        (5, 70, 160, 80, "Civil"),
        (5, 170, 160, 80, "Society"),
        (5, 280, 160, 110, "Organizations"),
        (6, 300, 194, 50, "31"),
        (6, 365, 194, 60, "(13%)"),
        (7, 35, 228, 120, "Educational"),
        (7, 180, 228, 100, "Institution"),
        (8, 250, 262, 40, "16"),
        (8, 305, 262, 50, "(7%)"),
    ];

    let words: Vec<OcrWord> = layout
        .iter()
        .map(|&(line, left, top, width, text)| word_at((1, 1, line), left, top, width, text))
        .collect();

    assert!(looks_like_chart_label_ocr(&words));
    assert!(!looks_like_table_ocr(&words));
    assert!(!looks_like_dense_prose_image_ocr(&words));
}
5020
/// A word layout made of left-hand numeric ticks, right-hand legend phrases and
/// a bottom row of year labels must be classified as chart labels and not as a
/// table.
#[test]
fn test_chart_label_ocr_detects_stacked_bar_chart_legend_layout() {
    // (line number, left, top, width, text); OCR noise tokens are kept verbatim.
    let layout: &[(u32, u32, u32, u32, &str)] = &[
        (1, 10, 15, 22, "ano"),
        (1, 10, 8, 24, "MW."),
        (2, 410, 25, 38, "Waste"),
        (2, 452, 25, 55, "materials"),
        (3, 11, 38, 21, "350"),
        (4, 11, 61, 21, "300"),
        (4, 411, 56, 38, "Biogas"),
        (5, 7, 79, 25, "250"),
        (5, 399, 87, 8, "'™"),
        (5, 411, 87, 75, "Construction"),
        (5, 490, 86, 33, "wood"),
        (5, 527, 87, 35, "waste"),
        (6, 11, 106, 21, "200"),
        (7, 411, 117, 59, "General"),
        (7, 467, 116, 27, "wood"),
        (7, 499, 116, 54, "(10MWs)"),
        (8, 11, 129, 21, "150"),
        (9, 11, 152, 21, "100"),
        (9, 399, 148, 7, "="),
        (9, 411, 135, 46, "General"),
        (9, 464, 135, 27, "wood"),
        (9, 498, 146, 56, "(<LOMW)"),
        (10, 13, 163, 18, "50"),
        (10, 399, 178, 7, "="),
        (10, 411, 176, 73, "Unutilised"),
        (10, 480, 166, 29, "wood"),
        (10, 516, 176, 45, "(2MWs)"),
        (11, 24, 197, 7, "o"),
        (12, 399, 208, 8, "m="),
        (12, 411, 206, 59, "Unutilised"),
        (12, 474, 206, 33, "wood"),
        (12, 512, 206, 48, "(<2MW)"),
        (13, 51, 217, 32, "12-13"),
        (13, 96, 217, 28, "2014"),
        (13, 139, 217, 28, "2015"),
        (13, 182, 217, 28, "2016"),
        (13, 225, 217, 28, "2017"),
        (13, 268, 217, 28, "2018"),
        (13, 311, 217, 28, "2019"),
        (13, 354, 217, 28, "2020"),
    ];

    let words: Vec<OcrWord> = layout
        .iter()
        .map(|&(line, left, top, width, text)| word_at((1, 1, line), left, top, width, text))
        .collect();

    assert!(looks_like_chart_label_ocr(&words));
    assert!(!looks_like_table_ocr(&words));
}
5069
/// Two full 12-word lines with three sparse 3-word lines between them must be
/// treated as chart labels: no table classification and no numeric table built.
#[test]
fn test_build_numeric_table_border_rejects_sparse_chart_layout() {
    let image = image_chunk();
    let columns: [u32; 12] = [20, 55, 90, 125, 160, 195, 230, 265, 300, 335, 370, 405];

    // Line 1: labels H1..H12 across every column.
    let mut words: Vec<OcrWord> = columns
        .iter()
        .enumerate()
        .map(|(idx, &left)| word_at((1, 1, 1), left, 20, 22, &format!("H{}", idx + 1)))
        .collect();

    // Lines 2-4: three numbers each, at staggered columns.
    // (line number, top, left offsets, value of the first word on the line).
    let sparse_lines: [(u32, u32, [u32; 3], usize); 3] = [
        (2, 52, [20, 160, 300], 1),
        (3, 84, [55, 195, 335], 4),
        (4, 116, [90, 230, 370], 7),
    ];
    for (line, top, lefts, first_value) in sparse_lines {
        for (offset, left) in lefts.into_iter().enumerate() {
            let text = (offset + first_value).to_string();
            words.push(word_at((1, 1, line), left, top, 22, &text));
        }
    }

    // Line 5: numbers 10..=21 across every column.
    words.extend(
        columns
            .iter()
            .enumerate()
            .map(|(idx, &left)| word_at((1, 1, 5), left, 148, 22, &(idx + 10).to_string())),
    );

    assert!(looks_like_chart_label_ocr(&words));
    assert!(!looks_like_table_ocr(&words));
    assert!(!looks_like_numeric_table_ocr(&words));
    assert!(build_numeric_table_border(&words, &image).is_none());
}
5097
/// Two side-by-side two-line word groups must come back as exactly two text
/// values, each joining its group's two OCR lines.
#[test]
fn test_lines_from_ocr_words_merges_wrapped_lines_into_blocks() {
    // (line number, left, top, width, text): lines 1-2 sit on the left, 3-4 on the right.
    let layout: &[(u32, u32, u32, u32, &str)] = &[
        (1, 20, 20, 64, "Copyright"),
        (1, 100, 20, 56, "protects"),
        (2, 20, 38, 52, "creative"),
        (2, 84, 38, 36, "work"),
        (3, 240, 20, 52, "Public"),
        (3, 304, 20, 40, "domain"),
        (4, 240, 38, 60, "expires"),
        (4, 312, 38, 44, "later"),
    ];
    let words: Vec<OcrWord> = layout
        .iter()
        .map(|&(line, left, top, width, text)| word_at((1, 1, line), left, top, width, text))
        .collect();

    let image = image_chunk();
    let recovered = lines_from_ocr_words(&words, &image, 400, 400, &[]);

    assert_eq!(recovered.len(), 2);
    let (first, second) = (&recovered[0], &recovered[1]);
    assert_eq!(first.value, "Copyright protects creative work");
    assert_eq!(second.value, "Public domain expires later");
}
5117
/// Enriching an empty 2x2 table from a page raster that shows a light-fill bar
/// chart must leave every cell's `content` empty.
#[test]
fn test_page_raster_ocr_skips_bar_chart_tables() {
    const BASELINE_Y: u32 = 222;
    // (first column, one-past-last column, first row, fill shade) of each bar.
    let bars: [(u32, u32, u32, u8); 4] = [
        (46, 82, 132, 222),
        (104, 140, 84, 214),
        (162, 198, 62, 206),
        (220, 256, 144, 228),
    ];

    let mut chart = GrayImage::from_pixel(420, 260, Luma([255]));
    for col in 24..396 {
        chart.put_pixel(col, BASELINE_Y, Luma([170]));
    }
    for (x1, x2, y1, shade) in bars {
        // Bars end one row above the baseline.
        for row in y1..BASELINE_Y {
            for col in x1..x2 {
                chart.put_pixel(col, row, Luma([shade]));
            }
        }
    }

    // Builders for content-free cells and rows; only position and bbox vary.
    let blank_cell = |row, col, bbox: BoundingBox| TableBorderCell {
        bbox,
        index: None,
        level: None,
        row_number: row,
        col_number: col,
        row_span: 1,
        col_span: 1,
        content: Vec::new(),
        contents: Vec::new(),
        semantic_type: None,
    };
    let blank_row = |row, bbox: BoundingBox, cells: Vec<TableBorderCell>| TableBorderRow {
        bbox,
        index: None,
        level: None,
        row_number: row,
        cells,
        semantic_type: None,
    };

    let first_row = blank_row(
        0,
        BoundingBox::new(Some(1), 0.0, 130.0, 420.0, 260.0),
        vec![
            blank_cell(0, 0, BoundingBox::new(Some(1), 0.0, 130.0, 210.0, 260.0)),
            blank_cell(0, 1, BoundingBox::new(Some(1), 210.0, 130.0, 420.0, 260.0)),
        ],
    );
    let second_row = blank_row(
        1,
        BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 130.0),
        vec![
            blank_cell(1, 0, BoundingBox::new(Some(1), 0.0, 0.0, 210.0, 130.0)),
            blank_cell(1, 1, BoundingBox::new(Some(1), 210.0, 0.0, 420.0, 130.0)),
        ],
    );

    let page_bbox = BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0);
    let mut table = TableBorder {
        bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0),
        index: None,
        level: None,
        x_coordinates: vec![0.0, 210.0, 420.0],
        x_widths: vec![0.0; 3],
        y_coordinates: vec![260.0, 130.0, 0.0],
        y_widths: vec![0.0; 3],
        rows: vec![first_row, second_row],
        num_rows: 2,
        num_columns: 2,
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    };

    enrich_empty_table_from_page_raster(&chart, &page_bbox, &mut table);

    let all_cells_empty = table
        .rows
        .iter()
        .flat_map(|row| row.cells.iter())
        .all(|cell| cell.content.is_empty());
    assert!(all_cells_empty);
}
5230
/// The page-wide character count must exceed the raster-OCR limit while the
/// count restricted to the table bbox is 4 (the length of "1234").
#[test]
fn test_native_text_chars_in_region_ignores_distant_page_text() {
    let table_bbox = BoundingBox::new(Some(1), 40.0, 120.0, 360.0, 280.0);

    // Long enough on its own to push the page total past the limit.
    let long_value = "A".repeat(MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR + 40);
    let elements = vec![
        ContentElement::TextChunk(text_chunk(
            &long_value,
            BoundingBox::new(Some(1), 40.0, 500.0, 380.0, 560.0),
        )),
        ContentElement::TextChunk(text_chunk(
            "1234",
            BoundingBox::new(Some(1), 60.0, 160.0, 100.0, 176.0),
        )),
    ];

    let page_total = page_native_text_chars(&elements);
    assert!(page_total > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR);

    let in_region = native_text_chars_in_region(&elements, &table_bbox);
    assert_eq!(in_region, 4);
}
5247
/// A 5x5 table in which only the first cell holds a token must still be
/// reported as needing page-raster OCR.
///
/// Every row shares one bbox and every cell shares another; the fixture does
/// not model per-cell geometry.
#[test]
fn test_table_needs_page_raster_ocr_for_sparse_partial_table() {
    let rows: Vec<TableBorderRow> = (0..5)
        .map(|row_idx| TableBorderRow {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 300.0, 200.0),
            index: None,
            level: None,
            row_number: row_idx,
            cells: (0..5)
                .map(|col_idx| TableBorderCell {
                    bbox: BoundingBox::new(Some(1), 0.0, 0.0, 60.0, 40.0),
                    index: None,
                    level: None,
                    row_number: row_idx,
                    col_number: col_idx,
                    row_span: 1,
                    col_span: 1,
                    content: Vec::new(),
                    contents: Vec::new(),
                    semantic_type: None,
                })
                .collect(),
            semantic_type: None,
        })
        .collect();

    let mut table = TableBorder {
        bbox: BoundingBox::new(Some(1), 0.0, 0.0, 300.0, 200.0),
        index: None,
        level: None,
        x_coordinates: vec![0.0, 60.0, 120.0, 180.0, 240.0, 300.0],
        x_widths: vec![0.0; 6],
        y_coordinates: vec![200.0, 160.0, 120.0, 80.0, 40.0, 0.0],
        y_widths: vec![0.0; 6],
        rows,
        num_rows: 5,
        num_columns: 5,
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    };

    // Populate exactly one of the 25 cells.
    let lone_token = TableToken {
        base: text_chunk("12", BoundingBox::new(Some(1), 0.0, 0.0, 20.0, 10.0)),
        token_type: TableTokenType::Text,
    };
    table.rows[0].cells[0].content.push(lone_token);

    assert!(table_needs_page_raster_ocr(&table));
}
5300
/// When a native text chunk already carries the same sentence the OCR words
/// spell out, no OCR lines may be recovered.
#[test]
fn test_lines_from_ocr_words_dedupes_against_native_text() {
    // (line number, left, top, width, text).
    let layout: &[(u32, u32, u32, u32, &str)] = &[
        (1, 20, 20, 64, "Copyright"),
        (1, 100, 20, 56, "protects"),
        (2, 20, 38, 52, "creative"),
        (2, 84, 38, 36, "work"),
    ];
    let words: Vec<OcrWord> = layout
        .iter()
        .map(|&(line, left, top, width, text)| word_at((1, 1, line), left, top, width, text))
        .collect();

    // Same as the shared `text_chunk` fixture except for font name and PDF layer.
    let native = vec![TextChunk {
        font_name: "Native".to_string(),
        pdf_layer: PdfLayer::Content,
        ..text_chunk(
            "Copyright protects creative work",
            BoundingBox::new(Some(1), 0.0, 0.0, 10.0, 10.0),
        )
    }];

    let image = image_chunk();
    let recovered = lines_from_ocr_words(&words, &image, 400, 400, &native);

    assert!(recovered.is_empty());
}
5332 }
5333}