1use std::collections::{BTreeMap, HashMap, HashSet};
4use std::env;
5use std::fs;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8use std::sync::OnceLock;
9use std::time::{SystemTime, UNIX_EPOCH};
10
11use image::{GenericImageView, GrayImage, Luma};
12use serde::Deserialize;
13
14use crate::models::bbox::BoundingBox;
15use crate::models::chunks::{ImageChunk, TextChunk};
16use crate::models::content::ContentElement;
17use crate::models::enums::{PdfLayer, TextFormat, TextType};
18use crate::models::table::{
19 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
20};
21
22const MIN_IMAGE_WIDTH_RATIO: f64 = 0.40;
24const MIN_IMAGE_AREA_RATIO: f64 = 0.035;
25const MAX_NATIVE_TEXT_CHARS_IN_IMAGE: usize = 250;
26const MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE: usize = 12;
27const MIN_OCR_WORD_CONFIDENCE: f64 = 6.0;
30const MAX_OCR_WORD_CONFIDENCE: f64 = 101.0;
32const RASTER_DARK_THRESHOLD: u8 = 180;
33const RASTER_CHART_INK_THRESHOLD: u8 = 240;
34const MIN_BORDERED_VERTICAL_LINES: usize = 3;
35const MIN_BORDERED_HORIZONTAL_LINES: usize = 3;
36const MIN_LINE_DARK_RATIO: f64 = 0.28;
38const MIN_CELL_SIZE_PX: u32 = 10;
39const CELL_INSET_PX: u32 = 5;
40const TABLE_RASTER_OCR_BORDER_PX: u32 = 14;
41const PDFTOPPM_DPI: u32 = 150;
46const OCR_SCALE_FACTOR: u32 = 2;
47const TESSERACT_EFFECTIVE_DPI: u32 = PDFTOPPM_DPI * OCR_SCALE_FACTOR;
49const MIN_DOMINANT_IMAGE_WIDTH_RATIO: f64 = 0.65;
50const MIN_DOMINANT_IMAGE_AREA_RATIO: f64 = 0.40;
51const MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE: usize = 80;
52const MIN_DOMINANT_IMAGE_OCR_WORDS: usize = 18;
53const MIN_DOMINANT_IMAGE_TEXT_LINES: usize = 6;
54const MIN_DENSE_PROSE_BLOCK_LINES: usize = 3;
55const MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO: f64 = 0.32;
56const MIN_TRUE_GRID_LINE_CONTINUITY: f64 = 0.60;
58const MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR: usize = 180;
59const MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR: f64 = 0.08;
60const MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR: usize = 24;
61const LOCAL_BINARIZATION_RADIUS: u32 = 14;
62const MIN_BINARIZATION_BLOCK_PIXELS: usize = 81;
63const MIN_RASTER_TABLE_TEXT_CELL_RATIO: f64 = 0.05;
65const MIN_RASTER_TABLE_ROWS_WITH_TEXT: usize = 1;
66const MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO: f64 = 0.40;
67const MIN_BORDERED_CELL_DARK_RATIO: f64 = 0.03;
68const MIN_BORDERED_INKED_CELL_RATIO: f64 = 0.18;
69const MIN_BORDERED_ROWS_WITH_INK: usize = 2;
70const MIN_BRIGHT_PHOTO_MID_TONE_RATIO: f64 = 0.24;
71const MIN_BRIGHT_PHOTO_HISTOGRAM_BINS: usize = 8;
72const MIN_BRIGHT_PHOTO_ENTROPY: f64 = 1.6;
73
74#[derive(Debug, Clone)]
75struct OcrWord {
76 line_key: (u32, u32, u32),
77 left: u32,
78 top: u32,
79 width: u32,
80 height: u32,
81 text: String,
82 confidence: f64,
83}
84
85#[derive(Debug, Clone)]
86struct XCluster {
87 center: f64,
88 count: usize,
89 lines: HashSet<(u32, u32, u32)>,
90}
91
92#[derive(Clone)]
93struct OcrRowBuild {
94 top_y: f64,
95 bottom_y: f64,
96 cell_texts: Vec<String>,
97}
98
99#[derive(Debug, Clone)]
100struct EmptyCellRaster {
101 row_idx: usize,
102 cell_idx: usize,
103 x1: u32,
104 y1: u32,
105 x2: u32,
106 y2: u32,
107}
108
109#[derive(Debug, Clone)]
110struct RasterTableGrid {
111 vertical_lines: Vec<u32>,
112 horizontal_lines: Vec<u32>,
113}
114
115#[derive(Debug, Clone)]
116struct OcrCandidateScore {
117 words: Vec<OcrWord>,
118 score: f64,
119}
120
121#[derive(Debug, Clone)]
122struct PdfImagesListEntry {
123 image_type: String,
124}
125
126#[derive(Debug, Clone, Copy, PartialEq, Eq)]
127enum OcrEngine {
128 Tesseract,
129 RapidOcr,
130}
131
132#[derive(Debug, Deserialize)]
133struct RapidOcrLine {
134 left: u32,
135 top: u32,
136 width: u32,
137 height: u32,
138 text: String,
139 confidence: f64,
140}
141
142static OCR_ENGINE: OnceLock<OcrEngine> = OnceLock::new();
143static RAPIDOCR_PYTHON: OnceLock<Option<String>> = OnceLock::new();
144
145const RAPIDOCR_RUNNER: &str = r#"
146import json, sys
147from rapidocr import RapidOCR
148
149engine = RapidOCR()
150result = engine(sys.argv[1], use_det=True, use_cls=True, use_rec=True)
151
152if result is None:
153 print('[]')
154 raise SystemExit(0)
155
156boxes = getattr(result, 'boxes', []) or []
157txts = getattr(result, 'txts', []) or []
158scores = getattr(result, 'scores', []) or []
159out = []
160for box, text, score in zip(boxes, txts, scores):
161 if not text or not str(text).strip():
162 continue
163 xs = [pt[0] for pt in box]
164 ys = [pt[1] for pt in box]
165 out.append({
166 'left': int(min(xs)),
167 'top': int(min(ys)),
168 'width': max(1, int(max(xs) - min(xs))),
169 'height': max(1, int(max(ys) - min(ys))),
170 'text': str(text),
171 'confidence': float(score),
172 })
173print(json.dumps(out, ensure_ascii=False))
174"#;
175
176fn selected_ocr_engine() -> OcrEngine {
177 *OCR_ENGINE.get_or_init(|| match env::var("EDGEPARSE_OCR_ENGINE") {
178 Ok(value) => match value.to_ascii_lowercase().as_str() {
179 "rapidocr" if rapidocr_python_command().is_some() => OcrEngine::RapidOcr,
180 "rapidocr" => OcrEngine::Tesseract,
181 _ => OcrEngine::Tesseract,
182 },
183 Err(_) => OcrEngine::Tesseract,
184 })
185}
186
187fn rapidocr_python_command() -> Option<&'static str> {
188 RAPIDOCR_PYTHON
189 .get_or_init(|| {
190 let preferred = env::var("EDGEPARSE_OCR_PYTHON").ok();
191 let mut candidates = Vec::new();
192 if let Some(cmd) = preferred {
193 candidates.push(cmd);
194 }
195 candidates.push("python3".to_string());
196 candidates.push("python".to_string());
197
198 for candidate in candidates {
199 let ok = Command::new(&candidate)
200 .arg("-c")
201 .arg("import rapidocr")
202 .output()
203 .ok()
204 .is_some_and(|out| out.status.success());
205 if ok {
206 return Some(candidate);
207 }
208 }
209 None
210 })
211 .as_deref()
212}
213
214fn rapidocr_lines_to_words(lines: Vec<RapidOcrLine>) -> Vec<OcrWord> {
215 let mut words = Vec::new();
216
217 for (line_idx, line) in lines.into_iter().enumerate() {
218 let tokens: Vec<&str> = line.text.split_whitespace().collect();
219 if tokens.is_empty() {
220 continue;
221 }
222
223 let total_chars: u32 = tokens
224 .iter()
225 .map(|token| token.chars().count() as u32)
226 .sum();
227 if total_chars == 0 {
228 continue;
229 }
230
231 let mut cursor = line.left;
232 let mut remaining_width = line.width.max(tokens.len() as u32);
233 let mut remaining_chars = total_chars;
234
235 for (token_idx, token) in tokens.iter().enumerate() {
236 let token_chars = token.chars().count() as u32;
237 let width = if token_idx == tokens.len() - 1 || remaining_chars <= token_chars {
238 remaining_width.max(1)
239 } else {
240 let proportional = ((remaining_width as f64) * (token_chars as f64)
241 / (remaining_chars as f64))
242 .round() as u32;
243 proportional.max(1).min(remaining_width)
244 };
245
246 words.push(OcrWord {
247 line_key: (0, line_idx as u32, 0),
248 left: cursor,
249 top: line.top,
250 width,
251 height: line.height.max(1),
252 text: (*token).to_string(),
253 confidence: line.confidence,
254 });
255
256 cursor = cursor.saturating_add(width);
257 remaining_width = remaining_width.saturating_sub(width);
258 remaining_chars = remaining_chars.saturating_sub(token_chars);
259 }
260 }
261
262 words
263}
264
265fn run_rapidocr_words(image: &GrayImage) -> Option<Vec<OcrWord>> {
266 let python = rapidocr_python_command()?;
267 let temp_dir = create_temp_dir(0).ok()?;
268 let image_path = temp_dir.join("ocr.png");
269 if image.save(&image_path).is_err() {
270 let _ = fs::remove_dir_all(&temp_dir);
271 return None;
272 }
273
274 let output = Command::new(python)
275 .current_dir(&temp_dir)
276 .arg("-c")
277 .arg(RAPIDOCR_RUNNER)
278 .arg("ocr.png")
279 .output()
280 .ok()?;
281 let _ = fs::remove_dir_all(&temp_dir);
282 if !output.status.success() {
283 return None;
284 }
285
286 let json = String::from_utf8_lossy(&output.stdout);
287 let lines: Vec<RapidOcrLine> = serde_json::from_str(&json).ok()?;
288 let words = rapidocr_lines_to_words(lines);
289 (!words.is_empty()).then_some(words)
290}
291
292pub fn recover_raster_table_text_chunks(
294 input_path: &Path,
295 page_bbox: &BoundingBox,
296 page_number: u32,
297 text_chunks: &[TextChunk],
298 image_chunks: &[ImageChunk],
299) -> Vec<TextChunk> {
300 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
301 return Vec::new();
302 }
303
304 let candidates: Vec<&ImageChunk> = image_chunks
305 .iter()
306 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
307 .collect();
308 if candidates.is_empty() {
309 return Vec::new();
310 }
311
312 let temp_dir = match create_temp_dir(page_number) {
313 Ok(dir) => dir,
314 Err(_) => return Vec::new(),
315 };
316
317 let result =
318 recover_from_page_images(input_path, &temp_dir, page_number, candidates, text_chunks);
319
320 let _ = fs::remove_dir_all(&temp_dir);
321 result
322}
323
324pub fn recover_dominant_image_text_chunks(
331 input_path: &Path,
332 page_bbox: &BoundingBox,
333 page_number: u32,
334 text_chunks: &[TextChunk],
335 image_chunks: &[ImageChunk],
336) -> Vec<TextChunk> {
337 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
338 return Vec::new();
339 }
340
341 let candidates: Vec<&ImageChunk> = image_chunks
342 .iter()
343 .filter(|image| is_dominant_image_text_candidate(image, page_bbox, text_chunks))
344 .collect();
345 if candidates.is_empty() {
346 return Vec::new();
347 }
348
349 let temp_dir = match create_temp_dir(page_number) {
350 Ok(dir) => dir,
351 Err(_) => return Vec::new(),
352 };
353
354 let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
355 Some(files) => files,
356 None => {
357 let _ = fs::remove_dir_all(&temp_dir);
358 return Vec::new();
359 }
360 };
361
362 let mut recovered = Vec::new();
363 for image in candidates {
364 let Some(image_index) = image.index else {
365 continue;
366 };
367 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
368 continue;
369 };
370 let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
371 continue;
372 };
373 if recover_bordered_raster_table_from_gray(&gray, image).is_some()
374 || is_obvious_bar_chart_raster(&gray)
375 || is_natural_photograph_raster(&gray)
376 || is_dark_ui_screenshot_raster(&gray)
377 {
378 continue;
379 }
380
381 let Some(words) = run_tesseract_tsv_words_best(&gray, &["11", "6"], |candidate| {
382 looks_like_dense_prose_image_ocr(candidate)
383 }) else {
384 continue;
385 };
386
387 recovered.extend(lines_from_ocr_words(
388 &words,
389 image,
390 gray.width(),
391 gray.height(),
392 text_chunks,
393 ));
394 }
395
396 let _ = fs::remove_dir_all(&temp_dir);
397 recovered
398}
399
400pub fn recover_raster_table_borders(
402 input_path: &Path,
403 page_bbox: &BoundingBox,
404 page_number: u32,
405 text_chunks: &[TextChunk],
406 image_chunks: &[ImageChunk],
407) -> Vec<TableBorder> {
408 if page_bbox.area() <= 0.0 || image_chunks.is_empty() {
409 return Vec::new();
410 }
411
412 let candidates: Vec<&ImageChunk> = image_chunks
413 .iter()
414 .filter(|image| is_ocr_candidate(image, page_bbox, text_chunks))
415 .collect();
416 if candidates.is_empty() {
417 return Vec::new();
418 }
419
420 let temp_dir = match create_temp_dir(page_number) {
421 Ok(dir) => dir,
422 Err(_) => return Vec::new(),
423 };
424
425 let image_files = match extract_visible_page_image_files(input_path, page_number, &temp_dir) {
426 Some(files) => files,
427 None => {
428 let _ = fs::remove_dir_all(&temp_dir);
429 return Vec::new();
430 }
431 };
432
433 let mut tables = Vec::new();
434 for image in candidates {
435 let Some(image_index) = image.index else {
436 continue;
437 };
438 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
439 continue;
440 };
441 let Ok(gray) = image::open(image_path).map(|img| img.to_luma8()) else {
442 continue;
443 };
444 if is_obvious_bar_chart_raster(&gray)
445 || is_natural_photograph_raster(&gray)
446 || is_dark_ui_screenshot_raster(&gray)
447 {
448 continue;
449 }
450 if let Some(table) = recover_bordered_raster_table_from_gray(&gray, image) {
451 let chart_words = run_tesseract_tsv_words_best(&gray, &["6", "11"], |_| true);
452 if chart_words
453 .as_deref()
454 .is_some_and(looks_like_chart_label_ocr)
455 {
456 continue;
457 }
458 tables.push(table);
459 continue;
460 }
461 let Some(words) = run_tesseract_tsv_words_best(&gray, &["6", "11"], |candidate| {
462 looks_like_table_ocr(candidate)
463 }) else {
464 continue;
465 };
466
467 if looks_like_numeric_table_ocr(&words) {
468 if let Some(table) = build_numeric_table_border(&words, image) {
469 if is_matrixish_ocr_artifact_table(&table) {
470 continue;
471 }
472 tables.push(table);
473 continue;
474 }
475 }
476
477 if let Some(table) = build_structured_ocr_table_border(&words, image) {
478 if is_matrixish_ocr_artifact_table(&table) {
479 continue;
480 }
481 tables.push(table);
482 }
483 }
484
485 let _ = fs::remove_dir_all(&temp_dir);
486 tables
487}
488
489pub fn recover_page_raster_table_cell_text(
495 input_path: &Path,
496 page_bbox: &BoundingBox,
497 page_number: u32,
498 elements: &mut [ContentElement],
499) {
500 if page_bbox.area() <= 0.0 {
501 return;
502 }
503
504 let native_text_chars = page_native_text_chars(elements);
505 if native_text_chars > MAX_NATIVE_TEXT_CHARS_FOR_PAGE_RASTER_OCR {
506 return;
507 }
508
509 let candidate_indices: Vec<usize> = elements
510 .iter()
511 .enumerate()
512 .filter_map(|(idx, elem)| {
513 table_candidate_ref(elem)
514 .filter(|table| table_needs_page_raster_ocr(table))
515 .map(|_| idx)
516 })
517 .take(MAX_EMPTY_TABLES_FOR_PAGE_RASTER_OCR)
518 .collect();
519 if candidate_indices.is_empty() {
520 return;
521 }
522
523 let coverage: f64 = candidate_indices
524 .iter()
525 .filter_map(|idx| table_candidate_ref(&elements[*idx]).map(|table| table.bbox.area()))
526 .sum::<f64>()
527 / page_bbox.area().max(1.0);
528 if coverage < MIN_EMPTY_TABLE_COVERAGE_FOR_PAGE_RASTER_OCR {
529 return;
530 }
531
532 let temp_dir = match create_temp_dir(page_number) {
533 Ok(dir) => dir,
534 Err(_) => return,
535 };
536 let prefix = temp_dir.join("page");
537 let status = Command::new("pdftoppm")
538 .arg("-png")
539 .arg("-f")
540 .arg(page_number.to_string())
541 .arg("-l")
542 .arg(page_number.to_string())
543 .arg("-singlefile")
544 .arg(input_path)
545 .arg(&prefix)
546 .status();
547 match status {
548 Ok(s) if s.success() => {}
549 _ => {
550 let _ = fs::remove_dir_all(&temp_dir);
551 return;
552 }
553 }
554
555 let page_image_path = prefix.with_extension("png");
556 let gray = match image::open(&page_image_path) {
557 Ok(img) => img.to_luma8(),
558 Err(_) => {
559 let _ = fs::remove_dir_all(&temp_dir);
560 return;
561 }
562 };
563
564 for idx in candidate_indices {
565 let Some(elem) = elements.get_mut(idx) else {
566 continue;
567 };
568 let Some(table) = table_candidate_mut(elem) else {
569 continue;
570 };
571 enrich_empty_table_from_page_raster(&gray, page_bbox, table);
572 }
573
574 let _ = fs::remove_dir_all(&temp_dir);
575}
576
577fn table_candidate_ref(elem: &ContentElement) -> Option<&TableBorder> {
578 match elem {
579 ContentElement::TableBorder(table) => Some(table),
580 ContentElement::Table(table) => Some(&table.table_border),
581 _ => None,
582 }
583}
584
585fn table_candidate_mut(elem: &mut ContentElement) -> Option<&mut TableBorder> {
586 match elem {
587 ContentElement::TableBorder(table) => Some(table),
588 ContentElement::Table(table) => Some(&mut table.table_border),
589 _ => None,
590 }
591}
592
593fn recover_from_page_images(
594 input_path: &Path,
595 temp_dir: &Path,
596 page_number: u32,
597 candidates: Vec<&ImageChunk>,
598 text_chunks: &[TextChunk],
599) -> Vec<TextChunk> {
600 let image_files = match extract_visible_page_image_files(input_path, page_number, temp_dir) {
601 Some(files) => files,
602 None => return Vec::new(),
603 };
604 if image_files.is_empty() {
605 return Vec::new();
606 }
607
608 let mut recovered = Vec::new();
609 for image in candidates {
610 let Some(image_index) = image.index else {
611 continue;
612 };
613 let Some(image_path) = image_files.get(image_index.saturating_sub(1) as usize) else {
614 continue;
615 };
616 let bordered_table = recover_bordered_raster_table(image_path, image);
617 if let Some(caption) = recover_bordered_raster_caption(image_path, image) {
618 recovered.push(caption);
619 }
620 if bordered_table.is_some() {
621 continue;
622 }
623 let Some(file_name) = image_path.file_name().and_then(|name| name.to_str()) else {
624 continue;
625 };
626 let native_dpi = PDFTOPPM_DPI.to_string();
630 let Ok(tsv_output) = Command::new("tesseract")
631 .current_dir(temp_dir)
632 .arg(file_name)
633 .arg("stdout")
634 .arg("--dpi")
635 .arg(&native_dpi)
636 .arg("--psm")
637 .arg("6")
638 .arg("-c")
639 .arg("load_system_dawg=0")
640 .arg("-c")
641 .arg("load_freq_dawg=0")
642 .arg("tsv")
643 .output()
644 else {
645 continue;
646 };
647 if !tsv_output.status.success() {
648 continue;
649 }
650
651 let tsv = String::from_utf8_lossy(&tsv_output.stdout);
652 let words = parse_tesseract_tsv(&tsv);
653 if !looks_like_table_ocr(&words) {
654 continue;
655 }
656
657 recovered.extend(words_to_text_chunks(&words, image, text_chunks));
658 }
659
660 recovered
661}
662
663fn page_native_text_chars(elements: &[ContentElement]) -> usize {
664 elements
665 .iter()
666 .map(|elem| match elem {
667 ContentElement::Paragraph(p) => p.base.value().chars().count(),
668 ContentElement::Heading(h) => h.base.base.value().chars().count(),
669 ContentElement::NumberHeading(h) => h.base.base.base.value().chars().count(),
670 ContentElement::TextBlock(tb) => tb.value().chars().count(),
671 ContentElement::TextLine(tl) => tl.value().chars().count(),
672 ContentElement::TextChunk(tc) => tc.value.chars().count(),
673 ContentElement::List(list) => list
674 .list_items
675 .iter()
676 .flat_map(|item| item.contents.iter())
677 .map(|content| match content {
678 ContentElement::Paragraph(p) => p.base.value().chars().count(),
679 ContentElement::TextBlock(tb) => tb.value().chars().count(),
680 ContentElement::TextLine(tl) => tl.value().chars().count(),
681 ContentElement::TextChunk(tc) => tc.value.chars().count(),
682 _ => 0,
683 })
684 .sum(),
685 _ => 0,
686 })
687 .sum()
688}
689
690fn table_needs_page_raster_ocr(table: &TableBorder) -> bool {
691 table.num_rows >= 1
692 && table.num_columns >= 2
693 && table
694 .rows
695 .iter()
696 .flat_map(|row| row.cells.iter())
697 .all(|cell| {
698 !cell
699 .content
700 .iter()
701 .any(|token| matches!(token.token_type, TableTokenType::Text))
702 })
703}
704
705fn enrich_empty_table_from_page_raster(
706 gray: &GrayImage,
707 page_bbox: &BoundingBox,
708 table: &mut TableBorder,
709) {
710 let mut empty_cells: Vec<EmptyCellRaster> = Vec::new();
713 for (row_idx, row) in table.rows.iter().enumerate() {
714 for (cell_idx, cell) in row.cells.iter().enumerate() {
715 if cell
716 .content
717 .iter()
718 .any(|token| matches!(token.token_type, TableTokenType::Text))
719 {
720 continue;
721 }
722 let Some((x1, y1, x2, y2)) = page_bbox_to_raster_box(gray, page_bbox, &cell.bbox)
723 else {
724 continue;
725 };
726 empty_cells.push(EmptyCellRaster {
727 row_idx,
728 cell_idx,
729 x1,
730 y1,
731 x2,
732 y2,
733 });
734 }
735 }
736 if empty_cells.is_empty() {
737 return;
738 }
739
740 let Some((tx1, ty1, tx2, ty2)) = page_bbox_to_raster_box(gray, page_bbox, &table.bbox) else {
742 fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
743 return;
744 };
745
746 let pad = CELL_INSET_PX * 2;
747 let crop_left = tx1.saturating_sub(pad);
748 let crop_top = ty1.saturating_sub(pad);
749 let crop_right = (tx2 + pad).min(gray.width());
750 let crop_bottom = (ty2 + pad).min(gray.height());
751 if crop_right <= crop_left || crop_bottom <= crop_top {
752 fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
753 return;
754 }
755
756 let crop_width = crop_right - crop_left;
757 let crop_height = crop_bottom - crop_top;
758 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
759 fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
760 return;
761 }
762
763 let cropped = gray
764 .view(crop_left, crop_top, crop_width, crop_height)
765 .to_image();
766 if is_obvious_bar_chart_raster(&cropped)
767 || is_natural_photograph_raster(&cropped)
768 || is_dark_ui_screenshot_raster(&cropped)
769 {
770 return;
771 }
772 let bordered = expand_white_border(&cropped, TABLE_RASTER_OCR_BORDER_PX);
773 let scaled = image::imageops::resize(
774 &bordered,
775 bordered.width() * OCR_SCALE_FACTOR,
776 bordered.height() * OCR_SCALE_FACTOR,
777 image::imageops::FilterType::Lanczos3,
778 );
779
780 let Some(words) = run_tesseract_tsv_words(&scaled, "6") else {
781 fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
782 return;
783 };
784 if words.is_empty() {
785 fill_cells_with_per_cell_ocr(gray, table, &empty_cells);
786 return;
787 }
788 if looks_like_chart_label_ocr(&words) {
789 return;
790 }
791
792 let mut buckets: Vec<Vec<(u32, u32, String)>> = vec![Vec::new(); empty_cells.len()];
793 let scale = f64::from(OCR_SCALE_FACTOR);
794 let border = f64::from(TABLE_RASTER_OCR_BORDER_PX);
795
796 for word in &words {
797 let cx_scaled = f64::from(word.left) + f64::from(word.width) / 2.0;
798 let cy_scaled = f64::from(word.top) + f64::from(word.height) / 2.0;
799
800 let cx_crop = cx_scaled / scale - border;
801 let cy_crop = cy_scaled / scale - border;
802 if cx_crop < 0.0 || cy_crop < 0.0 {
803 continue;
804 }
805
806 let cx_page = match u32::try_from(cx_crop.round() as i64) {
807 Ok(v) => crop_left.saturating_add(v),
808 Err(_) => continue,
809 };
810 let cy_page = match u32::try_from(cy_crop.round() as i64) {
811 Ok(v) => crop_top.saturating_add(v),
812 Err(_) => continue,
813 };
814
815 for (idx, cell) in empty_cells.iter().enumerate() {
816 if cx_page >= cell.x1 && cx_page < cell.x2 && cy_page >= cell.y1 && cy_page < cell.y2 {
817 buckets[idx].push((cy_page, cx_page, word.text.clone()));
818 break;
819 }
820 }
821 }
822
823 for (idx, cell) in empty_cells.iter().enumerate() {
824 let Some(row) = table.rows.get_mut(cell.row_idx) else {
825 continue;
826 };
827 let Some(target) = row.cells.get_mut(cell.cell_idx) else {
828 continue;
829 };
830 if target
831 .content
832 .iter()
833 .any(|token| matches!(token.token_type, TableTokenType::Text))
834 {
835 continue;
836 }
837 let mut parts = std::mem::take(&mut buckets[idx]);
838 if parts.is_empty() {
839 continue;
840 }
841 parts.sort_by(|a, b| (a.0, a.1).cmp(&(b.0, b.1)));
842 let raw = parts
843 .into_iter()
844 .map(|(_, _, t)| t)
845 .collect::<Vec<_>>()
846 .join(" ");
847 let text = normalize_page_raster_cell_text(&target.bbox, raw);
848 if text.is_empty() {
849 continue;
850 }
851 target.content.push(TableToken {
852 base: TextChunk {
853 value: text,
854 bbox: target.bbox.clone(),
855 font_name: "OCR".to_string(),
856 font_size: target.bbox.height().max(6.0),
857 font_weight: 400.0,
858 italic_angle: 0.0,
859 font_color: "#000000".to_string(),
860 contrast_ratio: 21.0,
861 symbol_ends: Vec::new(),
862 text_format: TextFormat::Normal,
863 text_type: TextType::Regular,
864 pdf_layer: PdfLayer::Content,
865 ocg_visible: true,
866 index: None,
867 page_number: target.bbox.page_number,
868 level: None,
869 mcid: None,
870 },
871 token_type: TableTokenType::Text,
872 });
873 }
874}
875
876fn fill_cells_with_per_cell_ocr(
877 gray: &GrayImage,
878 table: &mut TableBorder,
879 empty_cells: &[EmptyCellRaster],
880) {
881 for cell in empty_cells {
882 let Some(row) = table.rows.get_mut(cell.row_idx) else {
883 continue;
884 };
885 let Some(target) = row.cells.get_mut(cell.cell_idx) else {
886 continue;
887 };
888 if target
889 .content
890 .iter()
891 .any(|token| matches!(token.token_type, TableTokenType::Text))
892 {
893 continue;
894 }
895 let Some(text) =
896 extract_page_raster_cell_text(gray, &target.bbox, cell.x1, cell.y1, cell.x2, cell.y2)
897 else {
898 continue;
899 };
900 if text.is_empty() {
901 continue;
902 }
903 target.content.push(TableToken {
904 base: TextChunk {
905 value: text,
906 bbox: target.bbox.clone(),
907 font_name: "OCR".to_string(),
908 font_size: target.bbox.height().max(6.0),
909 font_weight: 400.0,
910 italic_angle: 0.0,
911 font_color: "#000000".to_string(),
912 contrast_ratio: 21.0,
913 symbol_ends: Vec::new(),
914 text_format: TextFormat::Normal,
915 text_type: TextType::Regular,
916 pdf_layer: PdfLayer::Content,
917 ocg_visible: true,
918 index: None,
919 page_number: target.bbox.page_number,
920 level: None,
921 mcid: None,
922 },
923 token_type: TableTokenType::Text,
924 });
925 }
926}
927
928fn page_bbox_to_raster_box(
929 gray: &GrayImage,
930 page_bbox: &BoundingBox,
931 bbox: &BoundingBox,
932) -> Option<(u32, u32, u32, u32)> {
933 if page_bbox.width() <= 0.0 || page_bbox.height() <= 0.0 {
934 return None;
935 }
936
937 let left = ((bbox.left_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
938 .clamp(0.0, f64::from(gray.width()));
939 let right = ((bbox.right_x - page_bbox.left_x) / page_bbox.width() * f64::from(gray.width()))
940 .clamp(0.0, f64::from(gray.width()));
941 let top = ((page_bbox.top_y - bbox.top_y) / page_bbox.height() * f64::from(gray.height()))
942 .clamp(0.0, f64::from(gray.height()));
943 let bottom = ((page_bbox.top_y - bbox.bottom_y) / page_bbox.height()
944 * f64::from(gray.height()))
945 .clamp(0.0, f64::from(gray.height()));
946
947 let x1 = left.floor() as u32;
948 let x2 = right.ceil() as u32;
949 let y1 = top.floor() as u32;
950 let y2 = bottom.ceil() as u32;
951 (x2 > x1 && y2 > y1).then_some((x1, y1, x2, y2))
952}
953
954fn extract_page_raster_cell_text(
955 gray: &GrayImage,
956 cell_bbox: &BoundingBox,
957 x1: u32,
958 y1: u32,
959 x2: u32,
960 y2: u32,
961) -> Option<String> {
962 let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
963 let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
964 let crop_left = x1 + inset_x;
965 let crop_top = y1 + inset_y;
966 let crop_width = x2.saturating_sub(x1 + inset_x * 2);
967 let crop_height = y2.saturating_sub(y1 + inset_y * 2);
968 if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
969 return Some(String::new());
970 }
971
972 let cropped = gray
973 .view(crop_left, crop_top, crop_width, crop_height)
974 .to_image();
975 let bordered = expand_white_border(&cropped, 12);
976 let scaled = image::imageops::resize(
977 &bordered,
978 bordered.width() * OCR_SCALE_FACTOR,
979 bordered.height() * OCR_SCALE_FACTOR,
980 image::imageops::FilterType::Lanczos3,
981 );
982
983 let aspect_ratio = cell_bbox.width() / cell_bbox.height();
985 let is_vertical = aspect_ratio < 0.8;
986
987 let psm_modes: [&str; 5] = if is_vertical {
997 ["7", "8", "6", "11", "13"]
998 } else {
999 ["6", "7", "8", "11", "13"]
1000 };
1001
1002 let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
1003 Some(normalize_page_raster_cell_text(cell_bbox, raw_text))
1004}
1005
1006fn normalize_page_raster_cell_text(cell_bbox: &BoundingBox, text: String) -> String {
1007 let normalized = text
1008 .replace('|', " ")
1009 .replace('—', "-")
1010 .replace(['“', '”'], "\"")
1011 .replace('’', "'")
1012 .split_whitespace()
1013 .collect::<Vec<_>>()
1014 .join(" ");
1015
1016 if normalized.is_empty() {
1017 return normalized;
1018 }
1019
1020 let narrow_cell = cell_bbox.width() <= cell_bbox.height() * 1.15;
1021 if narrow_cell && normalized.len() <= 3 && !normalized.chars().any(|ch| ch.is_ascii_digit()) {
1022 return String::new();
1023 }
1024
1025 normalized
1026}
1027
1028fn is_ocr_candidate(
1029 image: &ImageChunk,
1030 page_bbox: &BoundingBox,
1031 text_chunks: &[TextChunk],
1032) -> bool {
1033 let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1034 let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1035 if width_ratio < MIN_IMAGE_WIDTH_RATIO || area_ratio < MIN_IMAGE_AREA_RATIO {
1036 return false;
1037 }
1038
1039 let overlapping_chunks: Vec<&TextChunk> = text_chunks
1040 .iter()
1041 .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1042 .collect();
1043 let native_text_chars: usize = overlapping_chunks
1044 .iter()
1045 .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1046 .sum();
1047
1048 native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_IMAGE
1049 || overlapping_chunks.len() <= MAX_NATIVE_TEXT_CHUNKS_IN_IMAGE
1050}
1051
1052fn is_dominant_image_text_candidate(
1053 image: &ImageChunk,
1054 page_bbox: &BoundingBox,
1055 text_chunks: &[TextChunk],
1056) -> bool {
1057 let width_ratio = image.bbox.width() / page_bbox.width().max(1.0);
1058 let area_ratio = image.bbox.area() / page_bbox.area().max(1.0);
1059 if width_ratio < MIN_DOMINANT_IMAGE_WIDTH_RATIO || area_ratio < MIN_DOMINANT_IMAGE_AREA_RATIO {
1060 return false;
1061 }
1062
1063 let native_text_chars: usize = text_chunks
1064 .iter()
1065 .filter(|chunk| image.bbox.intersection_percent(&chunk.bbox) >= 0.7)
1066 .map(|chunk| chunk.value.chars().filter(|ch| !ch.is_whitespace()).count())
1067 .sum();
1068
1069 native_text_chars <= MAX_NATIVE_TEXT_CHARS_IN_DOMINANT_IMAGE
1070}
1071
1072fn parse_tesseract_tsv(tsv: &str) -> Vec<OcrWord> {
1073 let mut words = Vec::new();
1074 for line in tsv.lines().skip(1) {
1075 let mut cols = line.splitn(12, '\t');
1076 let level = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1077 if level != 5 {
1078 continue;
1079 }
1080 let _page_num = cols.next();
1081 let block_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1082 let par_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1083 let line_num = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1084 let _word_num = cols.next();
1085 let left = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1086 let top = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1087 let width = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1088 let height = cols.next().and_then(|s| s.parse::<u32>().ok()).unwrap_or(0);
1089 let confidence = cols
1090 .next()
1091 .and_then(|s| s.parse::<f64>().ok())
1092 .unwrap_or(-1.0);
1093 let text = cols.next().unwrap_or("").trim().to_string();
1094 if !(MIN_OCR_WORD_CONFIDENCE..=MAX_OCR_WORD_CONFIDENCE).contains(&confidence)
1095 || text.is_empty()
1096 || width == 0
1097 || height == 0
1098 || !text.chars().any(|ch| ch.is_alphanumeric())
1099 {
1100 continue;
1101 }
1102 words.push(OcrWord {
1103 line_key: (block_num, par_num, line_num),
1104 left,
1105 top,
1106 width,
1107 height,
1108 text,
1109 confidence,
1110 });
1111 }
1112 words
1113}
1114
/// Heuristic: does this OCR word cloud look like chart labels (axis ticks,
/// legends, titles) rather than table or prose content?
///
/// Accepts only when all of these hold: at least half of the words sit in the
/// outer 18% band of the word-cloud bounding box, at most half sit in the
/// inner 22%-78% region, at least 60% of lines are short peripheral labels,
/// at least 60% of lines have <= 3 words, at least two short peripheral lines
/// contain numeric tokens (axis-tick shaped), and at most two wide
/// sentence-like lines exist. A table-shaped counter-signal (>= 3 stable
/// x-clusters spanning the full width with >= 4 lines aligned to them)
/// forces `false` first.
fn looks_like_chart_label_ocr(words: &[OcrWord]) -> bool {
    // Too few words to establish any spatial pattern.
    if words.len() < 8 {
        return false;
    }

    // Bounding box of the whole word cloud, in raster pixels.
    let min_left = words.iter().map(|word| word.left).min().unwrap_or(0);
    let min_top = words.iter().map(|word| word.top).min().unwrap_or(0);
    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    let max_bottom = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()
        .unwrap_or(0);
    let image_width = max_right.saturating_sub(min_left);
    let image_height = max_bottom.saturating_sub(min_top);
    // Tiny regions cannot meaningfully separate "outer band" from "interior".
    if image_width < 160 || image_height < 120 {
        return false;
    }

    let width_f = f64::from(image_width);
    let height_f = f64::from(image_height);
    // Outer band: 18% margin on each side; inner region: central 22%-78%.
    let outer_x = width_f * 0.18;
    let outer_y = height_f * 0.18;
    let inner_left = width_f * 0.22;
    let inner_right = width_f * 0.78;
    let inner_top = height_f * 0.22;
    let inner_bottom = height_f * 0.78;

    // Group words by Tesseract's (block, paragraph, line) key and classify
    // each word's centre as outer-band and/or inner-region.
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    let mut outer_words = 0usize;
    let mut inner_words = 0usize;

    for word in words {
        by_line.entry(word.line_key).or_default().push(word);

        // Centres are taken relative to the word-cloud origin.
        let center_x = f64::from(word.left.saturating_sub(min_left)) + f64::from(word.width) / 2.0;
        let center_y = f64::from(word.top.saturating_sub(min_top)) + f64::from(word.height) / 2.0;

        if center_x <= outer_x
            || center_x >= width_f - outer_x
            || center_y <= outer_y
            || center_y >= height_f - outer_y
        {
            outer_words += 1;
        }

        if center_x >= inner_left
            && center_x <= inner_right
            && center_y >= inner_top
            && center_y <= inner_bottom
        {
            inner_words += 1;
        }
    }

    if by_line.len() < 5 {
        return false;
    }

    // Cluster word x-centres to detect recurring column positions; tolerance
    // is 3.5% of the rightmost extent, but at least 18 px.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            // NOTE: cluster centres use absolute `word.left`, not cloud-relative.
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                // Running mean keeps the cluster centre stable as words join.
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // "Stable" columns recur on >= 4 distinct lines with >= 4 members.
    let stable_centers: Vec<f64> = clusters
        .iter()
        .filter(|cluster| cluster.lines.len() >= 4 && cluster.count >= 4)
        .map(|cluster| cluster.center)
        .collect();
    let mut sorted_stable_centers = stable_centers.clone();
    sorted_stable_centers
        .sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let max_stable_gap = sorted_stable_centers
        .windows(2)
        .map(|pair| pair[1] - pair[0])
        .fold(0.0, f64::max);
    // Table counter-signal: >= 3 stable columns that reach both the left
    // quarter and the right quarter, include one interior column, and have
    // no gap wider than 45% of the cloud width.
    let spans_full_table_width = stable_centers.len() >= 3
        && stable_centers
            .iter()
            .any(|center| *center - f64::from(min_left) <= width_f * 0.25)
        && stable_centers
            .iter()
            .any(|center| *center - f64::from(min_left) >= width_f * 0.75)
        && stable_centers.iter().any(|center| {
            let rel = *center - f64::from(min_left);
            rel >= inner_left && rel <= inner_right
        })
        && max_stable_gap <= width_f * 0.45;
    if spans_full_table_width {
        // If enough lines align with >= 3 stable columns, this is a table,
        // not chart labels.
        let table_like_lines = by_line
            .values()
            .filter(|line_words| {
                let mut seen = HashSet::<usize>::new();
                for word in *line_words {
                    let center = f64::from(word.left) + f64::from(word.width) / 2.0;
                    for (idx, stable_center) in stable_centers.iter().enumerate() {
                        if (center - stable_center).abs() <= tolerance {
                            seen.insert(idx);
                        }
                    }
                }
                seen.len() >= 3
            })
            .count();
        if table_like_lines >= 4 {
            return false;
        }
    }

    // Per-line shape statistics.
    let mut short_lines = 0usize;
    let mut peripheral_label_lines = 0usize;
    let mut wide_sentence_lines = 0usize;
    let mut axisish_numeric_lines = 0usize;

    for line_words in by_line.values() {
        let line_left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
        let line_top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
        let line_right = line_words
            .iter()
            .map(|word| word.left.saturating_add(word.width))
            .max()
            .unwrap_or(0);
        let line_bottom = line_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        // Degenerate boxes carry no geometry information.
        if line_right <= line_left || line_bottom <= line_top {
            continue;
        }

        let word_count = line_words.len();
        let numeric_in_line = line_words
            .iter()
            .filter(|word| is_numeric_like(&word.text))
            .count();
        let line_width_ratio =
            f64::from(line_right.saturating_sub(line_left)) / f64::from(image_width.max(1));
        // A line "touches" the outer band when any of its edges falls in it.
        let touches_outer_band = f64::from(line_left.saturating_sub(min_left)) <= outer_x
            || f64::from(line_right.saturating_sub(min_left)) >= width_f - outer_x
            || f64::from(line_top.saturating_sub(min_top)) <= outer_y
            || f64::from(line_bottom.saturating_sub(min_top)) >= height_f - outer_y;

        if word_count <= 3 {
            short_lines += 1;
        }
        if touches_outer_band && word_count <= 4 {
            peripheral_label_lines += 1;
        }
        // Axis ticks: short peripheral lines containing numbers.
        if touches_outer_band && word_count <= 3 && numeric_in_line > 0 {
            axisish_numeric_lines += 1;
        }
        // Wide, number-free, multi-word lines look like prose sentences.
        if word_count >= 4 && line_width_ratio >= 0.45 && numeric_in_line == 0 {
            wide_sentence_lines += 1;
        }
    }

    // Integer ratio checks (x * 10 >= y * k) avoid float comparison:
    // outer >= 50%, inner <= 50%, label lines >= 60%, short lines >= 60%.
    let total_lines = by_line.len();
    let outer_dominant = outer_words * 10 >= words.len() * 5;
    let inner_sparse = inner_words * 10 <= words.len() * 5;
    let label_dominant = peripheral_label_lines * 10 >= total_lines * 6;
    let short_line_dominant = short_lines * 10 >= total_lines * 6;
    let axis_signal = axisish_numeric_lines >= 2;

    outer_dominant
        && inner_sparse
        && label_dominant
        && short_line_dominant
        && axis_signal
        && wide_sentence_lines <= 2
}
1310
1311fn looks_like_matrix_formula_ocr(words: &[OcrWord]) -> bool {
1312 if words.len() < 6 {
1313 return false;
1314 }
1315
1316 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1317 for word in words {
1318 by_line.entry(word.line_key).or_default().push(word);
1319 }
1320
1321 if by_line.len() < 2 || by_line.len() > 4 {
1322 return false;
1323 }
1324
1325 let substantive_words = words
1326 .iter()
1327 .filter(|word| is_substantive_table_word(&word.text))
1328 .count();
1329 let short_formulaish_words = words
1330 .iter()
1331 .filter(|word| is_short_formulaish_word(&word.text))
1332 .count();
1333 let slash_words = words.iter().filter(|word| word.text.contains('/')).count();
1334 let equation_label_words = words
1335 .iter()
1336 .filter(|word| looks_like_equation_label_word(&word.text))
1337 .count();
1338 let dense_lines = by_line.values().filter(|line| line.len() >= 3).count();
1339 let short_lines = by_line
1340 .values()
1341 .filter(|line| line.iter().all(|word| is_short_formulaish_word(&word.text)))
1342 .count();
1343
1344 substantive_words == 0
1345 && dense_lines >= 2
1346 && short_lines * 10 >= by_line.len() * 7
1347 && short_formulaish_words * 10 >= words.len() * 7
1348 && (slash_words > 0 || equation_label_words >= 2)
1349}
1350
/// Returns `true` when a token carries real table content: a word of four or
/// more letters, a multi-digit number with at least one digit other than 0/1
/// (so OCR "ll"/"II" noise rendered as 1s does not count), or a mixed token
/// of length >= 5 containing at least two letters.
fn is_substantive_table_word(text: &str) -> bool {
    // Keep only alphanumeric characters, lowercased.
    let mut normalized = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            normalized.extend(ch.to_lowercase());
        }
    }
    if normalized.is_empty() {
        return false;
    }

    let mut letters = 0usize;
    let mut digits = 0usize;
    let mut non_binary_digit = false;
    for ch in normalized.chars() {
        if ch.is_alphabetic() {
            letters += 1;
        }
        if ch.is_ascii_digit() {
            digits += 1;
            if !matches!(ch, '0' | '1') {
                non_binary_digit = true;
            }
        }
    }

    if letters >= 4 {
        return true;
    }
    if letters == 0 && digits >= 2 && non_binary_digit {
        return true;
    }
    normalized.len() >= 5 && letters >= 2
}
1371
/// Returns `true` for tokens that look like formula fragments: pure
/// punctuation (nothing alphanumeric), anything with at most three
/// alphanumeric characters, or a four-character token containing a slash
/// (e.g. a fraction like "ab/cd").
fn is_short_formulaish_word(text: &str) -> bool {
    let mut normalized = String::new();
    for ch in text.chars().filter(|ch| ch.is_alphanumeric()) {
        normalized.extend(ch.to_lowercase());
    }
    match normalized.len() {
        0..=3 => true,
        4 => text.contains('/'),
        _ => false,
    }
}
1384
/// Returns `true` for tokens shaped like equation labels: an ASCII uppercase
/// letter followed by one to three ASCII digits, ignoring surrounding
/// punctuation (e.g. "E1", "(E12)", "A101").
///
/// Fix: the original checked `first.is_ascii_alphabetic()` before
/// `first.is_ascii_uppercase()`, but the latter already implies the former,
/// so the first test was redundant and has been dropped.
fn looks_like_equation_label_word(text: &str) -> bool {
    // Strip non-alphanumeric punctuation from both ends, e.g. "(E12)" -> "E12".
    let trimmed = text.trim_matches(|ch: char| !ch.is_alphanumeric());
    let mut chars = trimmed.chars();
    let Some(first) = chars.next() else {
        return false;
    };
    // `is_ascii_uppercase` only matches 'A'..='Z', so it already guarantees
    // the character is ASCII-alphabetic.
    if !first.is_ascii_uppercase() {
        return false;
    }

    let remainder: String = chars.collect();
    !remainder.is_empty() && remainder.len() <= 3 && remainder.chars().all(|ch| ch.is_ascii_digit())
}
1398
/// Heuristic: does this OCR word cloud look like a table?
///
/// Lines are grouped by Tesseract's (block, paragraph, line) key; word
/// x-centres are clustered with a width-relative tolerance; a table is
/// declared when enough lines align to at least three recurring column
/// centres (or two centres plus repeated numeric tokens). Chart-label and
/// matrix/formula shapes are rejected up front, and an almost entirely
/// alphabetic cloud with few columns is treated as prose.
fn looks_like_table_ocr(words: &[OcrWord]) -> bool {
    if words.len() < 8 {
        return false;
    }

    // Axis/legend text would otherwise cluster into fake columns.
    if looks_like_chart_label_ocr(words) {
        return false;
    }

    // Small symbolic snippets (matrices, equations) also mimic columns.
    if looks_like_matrix_formula_ocr(words) {
        return false;
    }

    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // Keep only lines that plausibly hold a table row: >= 3 words or >= 2
    // numeric-looking tokens. Track the rightmost extent for the tolerance.
    let mut qualifying_lines = Vec::new();
    let mut numeric_like_count = 0usize;
    let mut max_right = 0u32;
    for line_words in by_line.values_mut() {
        line_words.sort_by_key(|word| word.left);
        let numeric_words = line_words
            .iter()
            .filter(|word| is_numeric_like(&word.text))
            .count();
        numeric_like_count += numeric_words;
        if line_words.len() >= 3 || numeric_words >= 2 {
            max_right = max_right.max(
                line_words
                    .iter()
                    .map(|word| word.left.saturating_add(word.width))
                    .max()
                    .unwrap_or(0),
            );
            qualifying_lines.push(line_words.clone());
        }
    }

    if qualifying_lines.len() < 2 {
        return false;
    }

    // Column tolerance: 3.5% of the overall width, but at least 18 px.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);
    let mut clusters: Vec<XCluster> = Vec::new();
    for line in &qualifying_lines {
        for word in line {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                // Running mean keeps the cluster centre stable as words join.
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // A real column must recur on at least two distinct lines.
    let repeated_clusters: Vec<&XCluster> = clusters
        .iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .collect();
    if repeated_clusters.len() < 3 {
        return false;
    }

    let repeated_centers: Vec<f64> = repeated_clusters
        .iter()
        .map(|cluster| cluster.center)
        .collect();
    // Lines whose words hit >= 3 recurring columns (or >= 2 columns plus two
    // numeric tokens) count as structured table rows.
    let structured_lines = qualifying_lines
        .iter()
        .filter(|line| {
            let mut seen = HashSet::<usize>::new();
            for word in *line {
                let center = f64::from(word.left) + f64::from(word.width) / 2.0;
                for (idx, repeated_center) in repeated_centers.iter().enumerate() {
                    if (center - repeated_center).abs() <= tolerance {
                        seen.insert(idx);
                    }
                }
            }
            seen.len() >= 3
                || (seen.len() >= 2
                    && line.iter().filter(|w| is_numeric_like(&w.text)).count() >= 2)
        })
        .count();

    let alphabetic_words = words
        .iter()
        .filter(|word| word.text.chars().any(|ch| ch.is_alphabetic()))
        .count();

    // No numbers, >= 90% alphabetic words, and few columns: probably prose
    // that happens to wrap consistently, not a table.
    if numeric_like_count == 0
        && alphabetic_words * 10 >= words.len() * 9
        && repeated_clusters.len() <= 4
    {
        return false;
    }

    structured_lines >= 3
        || (structured_lines >= 2 && numeric_like_count >= 6 && repeated_clusters.len() >= 4)
}
1516
1517fn looks_like_numeric_table_ocr(words: &[OcrWord]) -> bool {
1518 if !looks_like_table_ocr(words) {
1519 return false;
1520 }
1521
1522 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1523 for word in words {
1524 by_line.entry(word.line_key).or_default().push(word);
1525 }
1526
1527 let numeric_like_count = words
1528 .iter()
1529 .filter(|word| is_numeric_like(&word.text))
1530 .count();
1531 let numeric_lines = by_line
1532 .values()
1533 .filter(|line| {
1534 line.iter()
1535 .filter(|word| is_numeric_like(&word.text))
1536 .count()
1537 >= 2
1538 })
1539 .count();
1540
1541 numeric_like_count >= 12 && numeric_lines >= 3
1542}
1543
1544fn looks_like_dense_prose_image_ocr(words: &[OcrWord]) -> bool {
1545 if words.len() < MIN_DOMINANT_IMAGE_OCR_WORDS || looks_like_table_ocr(words) {
1546 return false;
1547 }
1548
1549 if looks_like_chart_label_ocr(words) {
1550 return false;
1551 }
1552
1553 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
1554 let mut alphabetic_words = 0usize;
1555 let mut numeric_like_words = 0usize;
1556 for word in words {
1557 by_line.entry(word.line_key).or_default().push(word);
1558 if word.text.chars().any(|ch| ch.is_alphabetic()) {
1559 alphabetic_words += 1;
1560 }
1561 if is_numeric_like(&word.text) {
1562 numeric_like_words += 1;
1563 }
1564 }
1565
1566 if by_line.len() < MIN_DOMINANT_IMAGE_TEXT_LINES || alphabetic_words * 3 < words.len() * 2 {
1567 return false;
1568 }
1569 if numeric_like_words * 4 > words.len() {
1570 return false;
1571 }
1572
1573 let multiword_lines = by_line
1574 .values()
1575 .filter(|line| line.iter().filter(|word| word.text.len() >= 2).count() >= 3)
1576 .count();
1577 multiword_lines >= 4 && has_dense_prose_block_geometry(words)
1578}
1579
/// Checks whether the OCR lines form at least one geometrically coherent
/// paragraph block: the longest run of consecutive (top-sorted) lines that
/// share block geometry must contain at least `MIN_DENSE_PROSE_BLOCK_LINES`
/// lines and span at least `MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO` of the image
/// width. Adjacency is delegated to `spatial_lines_share_block_geometry`
/// (defined elsewhere in this file).
fn has_dense_prose_block_geometry(words: &[OcrWord]) -> bool {
    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // Build one spatial box per line with >= 3 words; shorter lines are
    // ignored as captions/labels.
    let mut spatial_lines = Vec::new();
    for line_words in by_line.values() {
        if line_words.len() < 3 {
            continue;
        }

        let left = line_words.iter().map(|word| word.left).min().unwrap_or(0);
        let right = line_words
            .iter()
            .map(|word| word.left.saturating_add(word.width))
            .max()
            .unwrap_or(0);
        let top = line_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom = line_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);

        // Skip degenerate boxes.
        if right <= left || bottom <= top {
            continue;
        }

        spatial_lines.push(SpatialOcrLine {
            left,
            top,
            right,
            bottom,
            // Only geometry matters here; the text field stays empty.
            text: String::new(),
            word_count: line_words.len(),
            line_count: 1,
            line_height_sum: bottom.saturating_sub(top).max(1),
        });
    }

    // Reading order: top-to-bottom, then left-to-right.
    spatial_lines.sort_by_key(|line| (line.top, line.left));
    if spatial_lines.len() < MIN_DENSE_PROSE_BLOCK_LINES {
        return false;
    }

    let image_width = spatial_lines
        .iter()
        .map(|line| line.right)
        .max()
        .unwrap_or(0);
    if image_width == 0 {
        return false;
    }

    // Median line height feeds the adjacency test below.
    let median_height = {
        let mut heights: Vec<u32> = spatial_lines
            .iter()
            .map(|line| line.bottom.saturating_sub(line.top).max(1))
            .collect();
        heights.sort_unstable();
        heights[heights.len() / 2]
    };

    // Scan consecutive line pairs, tracking the current run of
    // geometry-compatible lines and the best (longest) run seen so far.
    // Indexing [0] is safe: the length check above guarantees >= 3 lines.
    let mut best_line_count = 1usize;
    let mut best_left = spatial_lines[0].left;
    let mut best_right = spatial_lines[0].right;
    let mut current_line_count = 1usize;
    let mut current_left = spatial_lines[0].left;
    let mut current_right = spatial_lines[0].right;

    for pair in spatial_lines.windows(2) {
        let prev = &pair[0];
        let curr = &pair[1];
        if spatial_lines_share_block_geometry(prev, curr, image_width, median_height) {
            current_line_count += 1;
            current_left = current_left.min(curr.left);
            current_right = current_right.max(curr.right);
        } else {
            if current_line_count > best_line_count {
                best_line_count = current_line_count;
                best_left = current_left;
                best_right = current_right;
            }
            // Start a new run at the current line.
            current_line_count = 1;
            current_left = curr.left;
            current_right = curr.right;
        }
    }

    // The final run is never flushed inside the loop; compare it here.
    if current_line_count > best_line_count {
        best_line_count = current_line_count;
        best_left = current_left;
        best_right = current_right;
    }

    let block_width_ratio =
        f64::from(best_right.saturating_sub(best_left)) / f64::from(image_width);
    best_line_count >= MIN_DENSE_PROSE_BLOCK_LINES
        && block_width_ratio >= MIN_DENSE_PROSE_BLOCK_WIDTH_RATIO
}
1681
/// Builds a [`TableBorder`] from OCR words of a numeric-looking table.
///
/// Word x-centres are clustered into column centres; each Tesseract line that
/// fills enough of those columns (>= 3 cells, or >= 2 numeric cells) becomes a
/// row. Pixel coordinates are mapped into the page-space bbox of `image`.
/// Returns `None` unless >= 3 columns recur, >= 2 rows qualify, and the
/// median per-row fill ratio reaches `MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO`.
fn build_numeric_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
    // `?` on max() also bails out when `words` is empty.
    let image_width = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()?;
    let image_height = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()?;
    if image_width == 0 || image_height == 0 {
        return None;
    }

    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // NOTE(review): `max_right` recomputes the same value as `image_width`
    // above; kept as-is to stay byte-identical with the original logic.
    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    // Column tolerance: 3.5% of the overall width, but at least 18 px.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);

    // Cluster word x-centres into candidate column centres (running mean).
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }
    // Keep only columns that recur on >= 2 lines, sorted left to right.
    let mut centers: Vec<f64> = clusters
        .into_iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .map(|cluster| cluster.center)
        .collect();
    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    if centers.len() < 3 {
        return None;
    }

    let mut built_rows = Vec::<OcrRowBuild>::new();
    let mut row_fill_counts = Vec::<usize>::new();
    for line_words in by_line.values() {
        let mut sorted_words = line_words.clone();
        sorted_words.sort_by_key(|word| word.left);

        // Assign each word to the nearest column centre, if within tolerance.
        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
        for word in &sorted_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some((col_idx, distance)) = centers
                .iter()
                .enumerate()
                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
            {
                if distance <= tolerance {
                    cells[col_idx].push(word);
                }
            }
        }

        // A row must fill >= 3 cells or have >= 2 numeric cells.
        let filled_cells = cells.iter().filter(|cell| !cell.is_empty()).count();
        let numeric_cells = cells
            .iter()
            .filter(|cell| cell.iter().any(|word| is_numeric_like(&word.text)))
            .count();
        if filled_cells < 3 && numeric_cells < 2 {
            continue;
        }
        row_fill_counts.push(filled_cells);

        // Map the row's pixel extent to page coordinates: page y decreases
        // as the pixel row index increases (top_y is the page-space top).
        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom_px = sorted_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        let top_y =
            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
        let bottom_y = image.bbox.top_y
            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
        // Join each cell's words (already left-sorted) with single spaces.
        let cell_texts = cells
            .iter()
            .map(|cell_words| {
                cell_words
                    .iter()
                    .map(|word| word.text.as_str())
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .collect();
        built_rows.push(OcrRowBuild {
            top_y,
            bottom_y,
            cell_texts,
        });
    }

    if built_rows.len() < 2 {
        return None;
    }
    if row_fill_counts.is_empty() {
        return None;
    }

    // Gate on the median fraction of filled cells per row.
    let mut sorted_fill_counts = row_fill_counts.clone();
    sorted_fill_counts.sort_unstable();
    let median_fill_ratio =
        sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
    if median_fill_ratio < MIN_NUMERIC_TABLE_MEDIAN_FILL_RATIO {
        return None;
    }

    // Sort rows by descending page-space top_y, i.e. top of page first.
    built_rows.sort_by(|a, b| {
        b.top_y
            .partial_cmp(&a.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let x_coordinates =
        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
    let row_bounds: Vec<(f64, f64)> = built_rows
        .iter()
        .map(|row| (row.top_y, row.bottom_y))
        .collect();
    let y_coordinates = build_row_boundaries(&row_bounds);
    // N columns/rows require exactly N+1 boundaries each.
    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
        return None;
    }

    // Materialise the grid: one TableBorderRow per built row, one
    // TableBorderCell per column, each holding a synthetic OCR TextChunk.
    let mut rows = Vec::new();
    for (row_idx, row_build) in built_rows.iter().enumerate() {
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::new();
        for col_idx in 0..centers.len() {
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            let text = row_build
                .cell_texts
                .get(col_idx)
                .cloned()
                .unwrap_or_default();
            let mut content = Vec::new();
            if !text.trim().is_empty() {
                content.push(TableToken {
                    base: TextChunk {
                        value: text.trim().to_string(),
                        bbox: cell_bbox.clone(),
                        // Marker font name for OCR-derived text.
                        font_name: "OCR".to_string(),
                        // Approximate the font size by the row height,
                        // clamped to at least 6 pt.
                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
                        font_weight: 400.0,
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }
            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }
        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    Some(TableBorder {
        bbox: image.bbox.clone(),
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows: built_rows.len(),
        num_columns: centers.len(),
        is_bad_table: false,
        // Marks the table as synthesised (not from native PDF borders).
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
1917
/// Builds a [`TableBorder`] from general (not necessarily numeric) OCR words.
///
/// Same column-clustering approach as `build_numeric_table_border`, but with
/// stricter structural gates: a row must fill >= 2 cells and either >= 3
/// cells or span >= 3 column positions; >= 3 rows are required; >= 3 columns
/// must be occupied by >= 2 rows; and the median per-row fill ratio must be
/// >= 0.5. The first row is rendered bold (treated as a header).
fn build_structured_ocr_table_border(words: &[OcrWord], image: &ImageChunk) -> Option<TableBorder> {
    // `?` on max() also bails out when `words` is empty.
    let image_width = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()?;
    let image_height = words
        .iter()
        .map(|word| word.top.saturating_add(word.height))
        .max()?;
    if image_width == 0 || image_height == 0 {
        return None;
    }

    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    // NOTE(review): duplicates `image_width` above; kept byte-identical.
    let max_right = words
        .iter()
        .map(|word| word.left.saturating_add(word.width))
        .max()
        .unwrap_or(0);
    // Column tolerance: 3.5% of the overall width, but at least 18 px.
    let tolerance = (f64::from(max_right) * 0.035).max(18.0);

    // Cluster word x-centres into candidate column centres (running mean).
    let mut clusters: Vec<XCluster> = Vec::new();
    for line_words in by_line.values() {
        for word in line_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some(cluster) = clusters
                .iter_mut()
                .find(|cluster| (cluster.center - center).abs() <= tolerance)
            {
                cluster.center =
                    (cluster.center * cluster.count as f64 + center) / (cluster.count as f64 + 1.0);
                cluster.count += 1;
                cluster.lines.insert(word.line_key);
            } else {
                let mut lines = HashSet::new();
                lines.insert(word.line_key);
                clusters.push(XCluster {
                    center,
                    count: 1,
                    lines,
                });
            }
        }
    }

    // Keep only columns that recur on >= 2 lines, sorted left to right.
    let mut centers: Vec<f64> = clusters
        .into_iter()
        .filter(|cluster| cluster.lines.len() >= 2 && cluster.count >= 2)
        .map(|cluster| cluster.center)
        .collect();
    centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    if centers.len() < 3 {
        return None;
    }

    let mut built_rows = Vec::<OcrRowBuild>::new();
    let mut row_fill_counts = Vec::<usize>::new();
    // Per-column count of rows that placed a word there.
    let mut occupied_columns = vec![0usize; centers.len()];

    for line_words in by_line.values() {
        let mut sorted_words = line_words.clone();
        sorted_words.sort_by_key(|word| word.left);

        // Assign each word to the nearest column centre, if within tolerance.
        let mut cells = vec![Vec::<&OcrWord>::new(); centers.len()];
        for word in &sorted_words {
            let center = f64::from(word.left) + f64::from(word.width) / 2.0;
            if let Some((col_idx, distance)) = centers
                .iter()
                .enumerate()
                .map(|(idx, col_center)| (idx, (center - col_center).abs()))
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
            {
                if distance <= tolerance {
                    cells[col_idx].push(word);
                }
            }
        }

        let filled_indices: Vec<usize> = cells
            .iter()
            .enumerate()
            .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
            .collect();
        if filled_indices.len() < 2 {
            continue;
        }

        // Column span covered by this row (first to last filled index).
        let span = filled_indices.last().unwrap_or(&0) - filled_indices.first().unwrap_or(&0) + 1;
        if filled_indices.len() < 3 && span < 3 {
            continue;
        }

        row_fill_counts.push(filled_indices.len());
        for idx in &filled_indices {
            if let Some(count) = occupied_columns.get_mut(*idx) {
                *count += 1;
            }
        }

        // Map the row's pixel extent to page coordinates: page y decreases
        // as the pixel row index increases (top_y is the page-space top).
        let top_px = sorted_words.iter().map(|word| word.top).min().unwrap_or(0);
        let bottom_px = sorted_words
            .iter()
            .map(|word| word.top.saturating_add(word.height))
            .max()
            .unwrap_or(0);
        let top_y =
            image.bbox.top_y - image.bbox.height() * (f64::from(top_px) / f64::from(image_height));
        let bottom_y = image.bbox.top_y
            - image.bbox.height() * (f64::from(bottom_px) / f64::from(image_height));
        // Join each cell's words left-to-right with single spaces.
        let cell_texts = cells
            .iter()
            .map(|cell_words| {
                let mut sorted_cell_words = cell_words.clone();
                sorted_cell_words.sort_by_key(|word| word.left);
                sorted_cell_words
                    .iter()
                    .map(|word| word.text.as_str())
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .collect();
        built_rows.push(OcrRowBuild {
            top_y,
            bottom_y,
            cell_texts,
        });
    }

    if built_rows.len() < 3 || row_fill_counts.is_empty() {
        return None;
    }

    // At least three columns must be used by two or more rows.
    let repeated_columns = occupied_columns.iter().filter(|count| **count >= 2).count();
    if repeated_columns < 3 {
        return None;
    }

    // Gate on the median fraction of filled cells per row (>= 50%).
    let mut sorted_fill_counts = row_fill_counts.clone();
    sorted_fill_counts.sort_unstable();
    let median_fill_ratio =
        sorted_fill_counts[sorted_fill_counts.len() / 2] as f64 / centers.len() as f64;
    if median_fill_ratio < 0.5 {
        return None;
    }

    // Sort rows by descending page-space top_y, i.e. top of page first.
    built_rows.sort_by(|a, b| {
        b.top_y
            .partial_cmp(&a.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let x_coordinates =
        build_boundaries_from_centers(&centers, image.bbox.left_x, image.bbox.right_x);
    let row_bounds: Vec<(f64, f64)> = built_rows
        .iter()
        .map(|row| (row.top_y, row.bottom_y))
        .collect();
    let y_coordinates = build_row_boundaries(&row_bounds);
    // N columns/rows require exactly N+1 boundaries each.
    if x_coordinates.len() != centers.len() + 1 || y_coordinates.len() != built_rows.len() + 1 {
        return None;
    }

    // Materialise the grid: one TableBorderRow per built row, one
    // TableBorderCell per column, each holding a synthetic OCR TextChunk.
    let mut rows = Vec::new();
    for (row_idx, row_build) in built_rows.iter().enumerate() {
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::new();
        for col_idx in 0..centers.len() {
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            let text = row_build
                .cell_texts
                .get(col_idx)
                .cloned()
                .unwrap_or_default();
            let mut content = Vec::new();
            if !text.trim().is_empty() {
                content.push(TableToken {
                    base: TextChunk {
                        value: text.trim().to_string(),
                        bbox: cell_bbox.clone(),
                        // Marker font name for OCR-derived text.
                        font_name: "OCR".to_string(),
                        // Approximate the font size by the row height,
                        // clamped to at least 6 pt.
                        font_size: (row_build.top_y - row_build.bottom_y).max(6.0),
                        // First row is treated as the header and bolded.
                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }
            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }
        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    Some(TableBorder {
        bbox: image.bbox.clone(),
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows: built_rows.len(),
        num_columns: centers.len(),
        is_bad_table: false,
        // Marks the table as synthesised (not from native PDF borders).
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
2171
2172fn is_matrixish_ocr_artifact_table(table: &TableBorder) -> bool {
2173 if !table.is_table_transformer
2174 || table.num_rows < 2
2175 || table.num_rows > 4
2176 || table.num_columns < 3
2177 || table.bbox.height() > table.bbox.width() * 0.55
2178 {
2179 return false;
2180 }
2181
2182 let texts: Vec<String> = table
2183 .rows
2184 .iter()
2185 .flat_map(|row| row.cells.iter())
2186 .map(table_cell_text)
2187 .filter(|text| !text.is_empty())
2188 .collect();
2189 if texts.len() < 6 {
2190 return false;
2191 }
2192
2193 let substantive_cells = texts
2194 .iter()
2195 .filter(|text| is_substantive_ocr_cell_text(text))
2196 .count();
2197 let short_cells = texts
2198 .iter()
2199 .filter(|text| is_short_ocr_cell_text(text))
2200 .count();
2201 let ambiguous_cells = texts
2202 .iter()
2203 .filter(|text| is_ambiguous_matrix_cell_text(text))
2204 .count();
2205
2206 substantive_cells == 0
2207 && short_cells * 10 >= texts.len() * 8
2208 && ambiguous_cells * 10 >= texts.len() * 5
2209}
2210
2211fn table_cell_text(cell: &TableBorderCell) -> String {
2212 cell.content
2213 .iter()
2214 .map(|token| token.base.value.trim())
2215 .filter(|value| !value.is_empty())
2216 .collect::<Vec<_>>()
2217 .join(" ")
2218}
2219
2220fn is_substantive_ocr_cell_text(text: &str) -> bool {
2221 text.split_whitespace().any(is_substantive_table_word)
2222}
2223
/// Returns `true` when a cell's text has between one and four alphanumeric
/// characters after normalisation (lowercased, punctuation stripped).
fn is_short_ocr_cell_text(text: &str) -> bool {
    let mut normalized = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            normalized.extend(ch.to_lowercase());
        }
    }
    matches!(normalized.len(), 1..=4)
}
2232
/// Returns `true` for cell text that plausibly came from a misread matrix or
/// formula: it contains structural delimiters (slashes, brackets, equals,
/// pipes) or, after normalisation, is a short token (<= 4 chars) made only of
/// glyphs that OCR confuses with 0/1 ('o', 'd', 'q', 'i', 'l').
fn is_ambiguous_matrix_cell_text(text: &str) -> bool {
    // Structural delimiters are an immediate signal.
    if text.contains(['/', '\\', '=', '|', '[', ']', '{', '}', '(', ')']) {
        return true;
    }

    let mut normalized = String::new();
    for ch in text.chars().filter(|ch| ch.is_alphanumeric()) {
        normalized.extend(ch.to_lowercase());
    }
    if normalized.is_empty() || normalized.len() > 4 {
        return false;
    }
    normalized
        .chars()
        .all(|ch| matches!(ch, '0' | '1' | 'o' | 'd' | 'q' | 'i' | 'l'))
}
2249
2250fn recover_bordered_raster_caption(image_path: &Path, image: &ImageChunk) -> Option<TextChunk> {
2251 let gray = image::open(image_path).ok()?.to_luma8();
2252 recover_bordered_raster_caption_from_gray(&gray, image)
2253}
2254
/// Recovers a caption line sitting above a bordered table inside a rasterized
/// image.
///
/// Detects the table grid, crops the horizontal strip above the first grid
/// line, runs single-line OCR on it, and maps the result back into page
/// coordinates as a bold caption-style `TextChunk`. Returns `None` when no
/// grid is found, the strip is too thin, or OCR yields no alphabetic text.
fn recover_bordered_raster_caption_from_gray(
    gray: &GrayImage,
    image: &ImageChunk,
) -> Option<TextChunk> {
    let grid = detect_bordered_raster_grid(gray)?;
    let first_h = *grid.horizontal_lines.first()?;
    // Fewer than 3 px above the top border — nothing readable can fit there.
    if first_h <= 2 {
        return None;
    }

    // PSM "7" = treat the crop as a single text line.
    let crop = gray.view(0, 0, gray.width(), first_h).to_image();
    let caption_text = normalize_caption_text(&run_tesseract_plain_text(&crop, "7")?);
    if caption_text.is_empty() || !caption_text.chars().any(|ch| ch.is_alphabetic()) {
        return None;
    }

    // Map the pixel-space strip back into page coordinates; the .max(1)
    // guards keep the division inside raster_box_to_page_bbox well-defined.
    let bbox = raster_box_to_page_bbox(
        image,
        0,
        0,
        gray.width(),
        first_h.max(1),
        gray.width().max(1),
        gray.height().max(1),
    )?;
    // Synthetic font metadata: captions render bold with a size derived from
    // the strip height, clamped to a plausible caption range.
    let font_size = (bbox.height() * 0.55).clamp(10.0, 16.0);
    Some(TextChunk {
        value: caption_text,
        bbox,
        font_name: "OCR".to_string(),
        font_size,
        font_weight: 700.0,
        italic_angle: 0.0,
        font_color: "#000000".to_string(),
        contrast_ratio: 21.0,
        symbol_ends: Vec::new(),
        text_format: TextFormat::Normal,
        text_type: TextType::Regular,
        pdf_layer: PdfLayer::Content,
        ocg_visible: true,
        index: None,
        page_number: image.bbox.page_number,
        level: None,
        mcid: None,
    })
}
2301
2302fn recover_bordered_raster_table(image_path: &Path, image: &ImageChunk) -> Option<TableBorder> {
2303 let gray = image::open(image_path).ok()?.to_luma8();
2304 recover_bordered_raster_table_from_gray(&gray, image)
2305}
2306
/// Rebuilds a structured `TableBorder` from a bordered table drawn inside a
/// rasterized image, using grid-line detection plus per-cell OCR.
///
/// Returns `None` when no plausible grid is found, the grid is smaller than
/// 2x2 cells, the cell interiors carry no ink, or too few cells/rows yield
/// OCR text to trust the result.
fn recover_bordered_raster_table_from_gray(
    gray: &GrayImage,
    image: &ImageChunk,
) -> Option<TableBorder> {
    let grid = detect_bordered_raster_grid(gray)?;
    // N grid lines bound N-1 cells in each direction.
    let num_cols = grid.vertical_lines.len().checked_sub(1)?;
    let num_rows = grid.horizontal_lines.len().checked_sub(1)?;
    if num_cols < 2 || num_rows < 2 {
        return None;
    }
    // Outer grid lines mapped from pixel space into page coordinates.
    let table_bbox = raster_box_to_page_bbox(
        image,
        *grid.vertical_lines.first()?,
        *grid.horizontal_lines.first()?,
        *grid.vertical_lines.last()?,
        *grid.horizontal_lines.last()?,
        gray.width(),
        gray.height(),
    )?;

    let x_coordinates = raster_boundaries_to_page(
        &grid.vertical_lines,
        image.bbox.left_x,
        image.bbox.right_x,
        gray.width(),
    )?;
    // Raster y grows downward while page y grows upward, hence the
    // descending mapping for row boundaries.
    let y_coordinates = raster_boundaries_to_page_desc(
        &grid.horizontal_lines,
        image.bbox.bottom_y,
        image.bbox.top_y,
        gray.height(),
    )?;

    // Skip OCR entirely when the grid interior is blank (decorative frames).
    if !bordered_grid_has_cell_ink(gray, &grid) {
        return None;
    }

    let mut rows = Vec::with_capacity(num_rows);
    let mut non_empty_cells = 0usize;
    let mut rows_with_text = 0usize;
    let mut total_cells = 0usize;
    for row_idx in 0..num_rows {
        let row_bbox = BoundingBox::new(
            image.bbox.page_number,
            image.bbox.left_x,
            y_coordinates[row_idx + 1],
            image.bbox.right_x,
            y_coordinates[row_idx],
        );
        let mut cells = Vec::with_capacity(num_cols);
        let mut row_has_text = false;

        for col_idx in 0..num_cols {
            // Pixel-space cell bounds (used for the OCR crop).
            let x1 = grid.vertical_lines[col_idx];
            let x2 = grid.vertical_lines[col_idx + 1];
            let y1 = grid.horizontal_lines[row_idx];
            let y2 = grid.horizontal_lines[row_idx + 1];
            // Page-space cell bounds (used for the output model).
            let cell_bbox = BoundingBox::new(
                image.bbox.page_number,
                x_coordinates[col_idx],
                y_coordinates[row_idx + 1],
                x_coordinates[col_idx + 1],
                y_coordinates[row_idx],
            );
            let text = extract_raster_cell_text(gray, row_idx, col_idx, x1, y1, x2, y2)
                .unwrap_or_default();
            total_cells += 1;

            let mut content = Vec::new();
            if !text.is_empty() {
                row_has_text = true;
                non_empty_cells += 1;
                content.push(TableToken {
                    base: TextChunk {
                        value: text,
                        bbox: cell_bbox.clone(),
                        font_name: "OCR".to_string(),
                        // Synthetic size derived from cell height.
                        font_size: (cell_bbox.height() * 0.55).max(6.0),
                        // First row is treated as a header and rendered bold.
                        font_weight: if row_idx == 0 { 700.0 } else { 400.0 },
                        italic_angle: 0.0,
                        font_color: "#000000".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Content,
                        ocg_visible: true,
                        index: None,
                        page_number: image.bbox.page_number,
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                });
            }

            cells.push(TableBorderCell {
                bbox: cell_bbox,
                index: None,
                level: None,
                row_number: row_idx,
                col_number: col_idx,
                row_span: 1,
                col_span: 1,
                content,
                contents: Vec::new(),
                semantic_type: None,
            });
        }

        if row_has_text {
            rows_with_text += 1;
        }

        rows.push(TableBorderRow {
            bbox: row_bbox,
            index: None,
            level: None,
            row_number: row_idx,
            cells,
            semantic_type: None,
        });
    }

    if total_cells == 0 {
        return None;
    }
    // Reject grids where OCR recovered too little text to be a credible table.
    let text_cell_ratio = non_empty_cells as f64 / total_cells as f64;
    if text_cell_ratio < MIN_RASTER_TABLE_TEXT_CELL_RATIO
        || rows_with_text < MIN_RASTER_TABLE_ROWS_WITH_TEXT
    {
        return None;
    }

    Some(TableBorder {
        bbox: table_bbox,
        index: None,
        level: None,
        x_coordinates: x_coordinates.clone(),
        x_widths: vec![0.0; x_coordinates.len()],
        y_coordinates: y_coordinates.clone(),
        y_widths: vec![0.0; y_coordinates.len()],
        rows,
        num_rows,
        num_columns: num_cols,
        is_bad_table: false,
        is_table_transformer: true,
        previous_table: None,
        next_table: None,
    })
}
2458
2459fn is_obvious_bar_chart_raster(gray: &GrayImage) -> bool {
2460 let width = gray.width();
2461 let height = gray.height();
2462 if width < 160 || height < 120 {
2463 return false;
2464 }
2465
2466 let min_ink_pixels = (f64::from(width) * 0.35).ceil() as u32;
2467 let min_run_height = (height / 80).max(6);
2468 let wide_ink_row_runs = merge_runs(
2469 (0..height)
2470 .filter(|&y| count_ink_in_row(gray, y, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels),
2471 );
2472 let thick_runs = wide_ink_row_runs
2473 .into_iter()
2474 .filter(|(start, end)| end.saturating_sub(*start) + 1 >= min_run_height)
2475 .count();
2476
2477 thick_runs >= 3 || is_obvious_vertical_bar_chart_raster(gray)
2478}
2479
/// Heuristically detects a vertical bar chart: several wide, tall, well-filled
/// columns of ink whose bottoms align on a common baseline.
fn is_obvious_vertical_bar_chart_raster(gray: &GrayImage) -> bool {
    let width = gray.width();
    let height = gray.height();
    // Too small to classify reliably.
    if width < 160 || height < 120 {
        return false;
    }

    // Thresholds are scaled from image size, with absolute floors.
    let min_ink_pixels = (f64::from(height) * 0.08).ceil() as u32;
    let min_bar_width = (width / 28).max(10);
    let min_bar_height = (height / 8).max(16);
    let max_baseline_delta = (height / 14).max(8);
    let min_fill_ratio = 0.10;

    // Columns with enough ink, merged into contiguous candidate bars.
    let candidate_runs =
        merge_runs((0..width).filter(|&x| {
            count_ink_in_column(gray, x, RASTER_CHART_INK_THRESHOLD) >= min_ink_pixels
        }));
    let mut baselines = Vec::new();
    let mut has_dominant_bar = false;
    let mut qualifying_bars = 0usize;

    for (start, end) in candidate_runs {
        let run_width = end.saturating_sub(start) + 1;
        if run_width < min_bar_width {
            continue;
        }

        // Compute the ink bounding box and pixel count for this run.
        let mut top = height;
        let mut bottom = 0u32;
        let mut ink_pixels = 0usize;
        for x in start..=end {
            for y in 0..height {
                if gray.get_pixel(x, y).0[0] < RASTER_CHART_INK_THRESHOLD {
                    top = top.min(y);
                    bottom = bottom.max(y);
                    ink_pixels += 1;
                }
            }
        }

        // No ink found, or degenerate vertical extent.
        if top >= height || bottom <= top {
            continue;
        }

        let run_height = bottom.saturating_sub(top) + 1;
        if run_height < min_bar_height {
            continue;
        }

        let bbox_area = run_width as usize * run_height as usize;
        if bbox_area == 0 {
            continue;
        }

        // Bars are solid; sparse runs (axis labels, gridlines) are rejected.
        let fill_ratio = ink_pixels as f64 / bbox_area as f64;
        if fill_ratio < min_fill_ratio {
            continue;
        }

        qualifying_bars += 1;
        if run_width >= min_bar_width.saturating_mul(2) {
            has_dominant_bar = true;
        }
        baselines.push(bottom);
    }

    if baselines.len() < 2 {
        return false;
    }

    // Bars in a chart share a baseline: compare each bottom to the median.
    baselines.sort_unstable();
    let median_baseline = baselines[baselines.len() / 2];
    let aligned_baselines = baselines
        .iter()
        .filter(|baseline| baseline.abs_diff(median_baseline) <= max_baseline_delta)
        .count();

    aligned_baselines >= 2 && (has_dominant_bar || (qualifying_bars >= 4 && aligned_baselines >= 4))
}
2559
/// Heuristically decides whether a grayscale raster looks like a natural
/// photograph (continuous tones) rather than line art or rendered text.
fn is_natural_photograph_raster(gray: &GrayImage) -> bool {
    let total = (gray.width() * gray.height()) as usize;
    if total < 400 {
        return false;
    }

    let mut histogram = [0usize; 256];
    for pixel in gray.pixels() {
        histogram[pixel[0] as usize] += 1;
    }

    // Fast accept: >= 30% of pixels in the mid-tone band (40..=215) implies
    // continuous tones rather than black-on-white line art.
    let mid_tone_count: usize = histogram[40..=215].iter().sum();
    if mid_tone_count * 10 >= total * 3 {
        return true;
    }

    // Otherwise require a broad, high-entropy brightness distribution over a
    // coarse 16-bin histogram.
    let mut coarse_histogram = [0usize; 16];
    for (value, count) in histogram.iter().enumerate() {
        coarse_histogram[value / 16] += count;
    }

    // Bins holding at least 1% of all pixels.
    let occupied_bins = coarse_histogram
        .iter()
        .filter(|count| **count as f64 >= total as f64 * 0.01)
        .count();
    // Shannon entropy of the coarse brightness distribution.
    let entropy = coarse_histogram.iter().fold(0.0, |acc, count| {
        if *count == 0 {
            return acc;
        }
        let probability = *count as f64 / total as f64;
        acc - probability * probability.log2()
    });

    mid_tone_count as f64 / total as f64 >= MIN_BRIGHT_PHOTO_MID_TONE_RATIO
        && occupied_bins >= MIN_BRIGHT_PHOTO_HISTOGRAM_BINS
        && entropy >= MIN_BRIGHT_PHOTO_ENTROPY
}
2609
2610fn is_dark_ui_screenshot_raster(gray: &GrayImage) -> bool {
2613 let total = (gray.width() * gray.height()) as usize;
2614 if total < 400 {
2615 return false;
2616 }
2617
2618 let very_dark_count = gray.pixels().filter(|p| p[0] <= 39).count();
2619 let non_extreme_count = gray.pixels().filter(|p| p[0] >= 15 && p[0] <= 240).count();
2620 let bright_detail_count = gray.pixels().filter(|p| p[0] >= 180 && p[0] <= 245).count();
2621
2622 very_dark_count * 20 >= total * 13
2623 && non_extreme_count * 2 >= total
2624 && bright_detail_count * 20 >= total
2625}
2626
/// Checks whether a detected grid actually contains cell content: enough of
/// the cell interiors (inset away from the border lines) have a meaningful
/// ratio of dark pixels, across enough distinct rows.
fn bordered_grid_has_cell_ink(gray: &GrayImage, grid: &RasterTableGrid) -> bool {
    let num_cols = match grid.vertical_lines.len().checked_sub(1) {
        Some(value) => value,
        None => return false,
    };
    let num_rows = match grid.horizontal_lines.len().checked_sub(1) {
        Some(value) => value,
        None => return false,
    };
    if num_cols == 0 || num_rows == 0 {
        return false;
    }

    let mut total_cells = 0usize;
    let mut inked_cells = 0usize;
    let mut rows_with_ink = 0usize;

    for row_idx in 0..num_rows {
        let mut row_has_ink = false;
        for col_idx in 0..num_cols {
            total_cells += 1;
            let x1 = grid.vertical_lines[col_idx];
            let x2 = grid.vertical_lines[col_idx + 1];
            let y1 = grid.horizontal_lines[row_idx];
            let y2 = grid.horizontal_lines[row_idx + 1];

            // Inset the sample window so the border lines themselves do not
            // count as cell ink; capped at a quarter of the cell size.
            let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
            let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
            let crop_left = x1 + inset_x;
            let crop_top = y1 + inset_y;
            let crop_width = x2.saturating_sub(x1 + inset_x * 2);
            let crop_height = y2.saturating_sub(y1 + inset_y * 2);
            // Tiny cells are skipped entirely (still counted in total_cells).
            if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
                continue;
            }

            let dark_pixels = (crop_top..crop_top + crop_height)
                .flat_map(|y| (crop_left..crop_left + crop_width).map(move |x| (x, y)))
                .filter(|&(x, y)| gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
                .count();
            let area = (crop_width as usize) * (crop_height as usize);
            if area == 0 {
                continue;
            }

            let dark_ratio = dark_pixels as f64 / area as f64;
            if dark_ratio >= MIN_BORDERED_CELL_DARK_RATIO {
                inked_cells += 1;
                row_has_ink = true;
            }
        }
        if row_has_ink {
            rows_with_ink += 1;
        }
    }

    if total_cells == 0 {
        return false;
    }

    (inked_cells as f64 / total_cells as f64) >= MIN_BORDERED_INKED_CELL_RATIO
        && rows_with_ink >= MIN_BORDERED_ROWS_WITH_INK
}
2690
2691fn detect_bordered_raster_grid(gray: &GrayImage) -> Option<RasterTableGrid> {
2692 let mut best_grid: Option<(RasterTableGrid, f64)> = None;
2693 for variant in build_ocr_variants(gray) {
2694 let Some((grid, score)) = detect_bordered_raster_grid_single(&variant) else {
2695 continue;
2696 };
2697 match &best_grid {
2698 Some((_, best_score)) if *best_score >= score => {}
2699 _ => best_grid = Some((grid, score)),
2700 }
2701 }
2702 best_grid.map(|(grid, _)| grid)
2703}
2704
/// Detects a bordered table grid in a single grayscale image and scores it.
///
/// Finds columns/rows whose dark-pixel count exceeds a width/height-relative
/// threshold, collapses adjacent ones to line centers, drops lines that are
/// not continuous across the rough table extent, and rejects grids with too
/// few lines or cells below the minimum size. The returned score combines
/// line continuity (dominant term) with line count.
fn detect_bordered_raster_grid_single(gray: &GrayImage) -> Option<(RasterTableGrid, f64)> {
    let width = gray.width();
    let height = gray.height();
    if width < 100 || height < 80 {
        return None;
    }

    let min_vertical_dark = (f64::from(height) * MIN_LINE_DARK_RATIO).ceil() as u32;
    let min_horizontal_dark = (f64::from(width) * MIN_LINE_DARK_RATIO).ceil() as u32;

    // Candidate line positions: columns/rows with enough dark pixels,
    // merged so a thick drawn line yields one run.
    let vertical_runs =
        merge_runs((0..width).filter(|&x| count_dark_in_column(gray, x) >= min_vertical_dark));
    let horizontal_runs =
        merge_runs((0..height).filter(|&y| count_dark_in_row(gray, y) >= min_horizontal_dark));
    if vertical_runs.len() < MIN_BORDERED_VERTICAL_LINES
        || horizontal_runs.len() < MIN_BORDERED_HORIZONTAL_LINES
    {
        return None;
    }

    // Collapse each run to its center pixel.
    let mut vertical_lines: Vec<u32> = vertical_runs
        .into_iter()
        .map(|(start, end)| (start + end) / 2)
        .collect();
    let mut horizontal_lines: Vec<u32> = horizontal_runs
        .into_iter()
        .map(|(start, end)| (start + end) / 2)
        .collect();

    let (&rough_min_x, &rough_max_x) = vertical_lines.first().zip(vertical_lines.last())?;
    let (&rough_min_y, &rough_max_y) = horizontal_lines.first().zip(horizontal_lines.last())?;
    if rough_max_x <= rough_min_x || rough_max_y <= rough_min_y {
        return None;
    }

    // Keep only lines that are dark along most of the table's extent; this
    // filters text rows/columns that merely look dense.
    vertical_lines.retain(|&x| {
        dark_ratio_in_column(gray, x, rough_min_y, rough_max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY
    });
    horizontal_lines.retain(|&y| {
        dark_ratio_in_row(gray, y, rough_min_x, rough_max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY
    });
    if vertical_lines.len() < MIN_BORDERED_VERTICAL_LINES
        || horizontal_lines.len() < MIN_BORDERED_HORIZONTAL_LINES
    {
        return None;
    }

    // Cells narrower/shorter than MIN_CELL_SIZE_PX are not a real table.
    if vertical_lines
        .windows(2)
        .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
        || horizontal_lines
            .windows(2)
            .any(|w| w[1] <= w[0] + MIN_CELL_SIZE_PX)
    {
        return None;
    }
    if !grid_lines_are_continuous(&vertical_lines, &horizontal_lines, gray) {
        return None;
    }

    // Continuity dominates the score; line count breaks ties.
    let continuity = grid_continuity_score(&vertical_lines, &horizontal_lines, gray);
    let line_score = vertical_lines.len() as f64 + horizontal_lines.len() as f64;
    let score = continuity * 100.0 + line_score;

    Some((
        RasterTableGrid {
            vertical_lines,
            horizontal_lines,
        },
        score,
    ))
}
2777
2778fn grid_lines_are_continuous(
2779 vertical_lines: &[u32],
2780 horizontal_lines: &[u32],
2781 gray: &GrayImage,
2782) -> bool {
2783 let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2784 return false;
2785 };
2786 let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2787 return false;
2788 };
2789 if max_x <= min_x || max_y <= min_y {
2790 return false;
2791 }
2792
2793 vertical_lines
2794 .iter()
2795 .all(|&x| dark_ratio_in_column(gray, x, min_y, max_y) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2796 && horizontal_lines
2797 .iter()
2798 .all(|&y| dark_ratio_in_row(gray, y, min_x, max_x) >= MIN_TRUE_GRID_LINE_CONTINUITY)
2799}
2800
2801fn grid_continuity_score(
2802 vertical_lines: &[u32],
2803 horizontal_lines: &[u32],
2804 gray: &GrayImage,
2805) -> f64 {
2806 let Some((&min_x, &max_x)) = vertical_lines.first().zip(vertical_lines.last()) else {
2807 return 0.0;
2808 };
2809 let Some((&min_y, &max_y)) = horizontal_lines.first().zip(horizontal_lines.last()) else {
2810 return 0.0;
2811 };
2812 if max_x <= min_x || max_y <= min_y {
2813 return 0.0;
2814 }
2815
2816 let mut samples = 0usize;
2817 let mut sum = 0.0;
2818 for &x in vertical_lines {
2819 sum += dark_ratio_in_column(gray, x, min_y, max_y);
2820 samples += 1;
2821 }
2822 for &y in horizontal_lines {
2823 sum += dark_ratio_in_row(gray, y, min_x, max_x);
2824 samples += 1;
2825 }
2826 if samples == 0 {
2827 0.0
2828 } else {
2829 sum / samples as f64
2830 }
2831}
2832
/// Counts pixels darker than `RASTER_DARK_THRESHOLD` in column `x`.
fn count_dark_in_column(gray: &GrayImage, x: u32) -> u32 {
    count_ink_in_column(gray, x, RASTER_DARK_THRESHOLD)
}
2836
2837fn count_ink_in_column(gray: &GrayImage, x: u32, threshold: u8) -> u32 {
2838 (0..gray.height())
2839 .filter(|&y| gray.get_pixel(x, y).0[0] < threshold)
2840 .count() as u32
2841}
2842
/// Counts pixels darker than `RASTER_DARK_THRESHOLD` in row `y`.
fn count_dark_in_row(gray: &GrayImage, y: u32) -> u32 {
    count_ink_in_row(gray, y, RASTER_DARK_THRESHOLD)
}
2846
2847fn count_ink_in_row(gray: &GrayImage, y: u32, threshold: u8) -> u32 {
2848 (0..gray.width())
2849 .filter(|&x| gray.get_pixel(x, y).0[0] < threshold)
2850 .count() as u32
2851}
2852
2853fn dark_ratio_in_column(gray: &GrayImage, x: u32, y1: u32, y2: u32) -> f64 {
2854 if y2 <= y1 || x >= gray.width() {
2855 return 0.0;
2856 }
2857 let dark = (y1..=y2)
2858 .filter(|&y| y < gray.height() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
2859 .count();
2860 dark as f64 / f64::from(y2 - y1 + 1)
2861}
2862
2863fn dark_ratio_in_row(gray: &GrayImage, y: u32, x1: u32, x2: u32) -> f64 {
2864 if x2 <= x1 || y >= gray.height() {
2865 return 0.0;
2866 }
2867 let dark = (x1..=x2)
2868 .filter(|&x| x < gray.width() && gray.get_pixel(x, y).0[0] < RASTER_DARK_THRESHOLD)
2869 .count();
2870 dark as f64 / f64::from(x2 - x1 + 1)
2871}
2872
/// Merges an ascending stream of coordinates into inclusive runs of
/// consecutive values, e.g. `[1, 2, 3, 7, 8]` -> `[(1, 3), (7, 8)]`.
///
/// Rewritten from a three-variable state machine (which carried a redundant
/// `start = Some(s)` reassignment and could overflow-panic on `prev + 1` at
/// `u32::MAX`) to extend the last run in place via `checked_add`.
fn merge_runs(values: impl Iterator<Item = u32>) -> Vec<(u32, u32)> {
    let mut runs: Vec<(u32, u32)> = Vec::new();
    for value in values {
        match runs.last_mut() {
            // Extend the current run when `value` is its immediate successor.
            Some((_, end)) if end.checked_add(1) == Some(value) => *end = value,
            // Otherwise open a new single-element run.
            _ => runs.push((value, value)),
        }
    }
    runs
}
2899
/// Builds cell boundaries from line centers: the outer edges, plus the
/// midpoint between every adjacent pair of centers.
fn build_boundaries_from_centers(centers: &[f64], left_edge: f64, right_edge: f64) -> Vec<f64> {
    let midpoints = centers.windows(2).map(|pair| (pair[0] + pair[1]) / 2.0);
    std::iter::once(left_edge)
        .chain(midpoints)
        .chain(std::iter::once(right_edge))
        .collect()
}
2909
/// Builds row boundaries from `(top, bottom)` extents: the first row's top,
/// the midpoint of each inter-row gap, and the last row's bottom.
///
/// Fix: the original indexed `rows[0]` / `rows[len - 1]` and panicked on an
/// empty slice; an empty input now yields an empty boundary list.
fn build_row_boundaries(rows: &[(f64, f64)]) -> Vec<f64> {
    let (Some(first), Some(last)) = (rows.first(), rows.last()) else {
        return Vec::new();
    };
    let mut boundaries = Vec::with_capacity(rows.len() + 1);
    boundaries.push(first.0);
    for pair in rows.windows(2) {
        // Boundary between two rows is the midpoint of the gap between them.
        boundaries.push((pair[0].1 + pair[1].0) / 2.0);
    }
    boundaries.push(last.1);
    boundaries
}
2919
/// Maps pixel-space x positions onto the page interval
/// `[left_edge, right_edge]` by linear interpolation; `None` when the image
/// has zero width.
fn raster_boundaries_to_page(
    lines: &[u32],
    left_edge: f64,
    right_edge: f64,
    image_width: u32,
) -> Option<Vec<f64>> {
    (image_width > 0).then(|| {
        let scale = (right_edge - left_edge) / f64::from(image_width);
        lines
            .iter()
            .map(|&px| left_edge + f64::from(px) * scale)
            .collect()
    })
}
2937
/// Maps pixel-space y positions (growing downward) onto page y coordinates
/// (growing upward): pixel 0 maps to `top_edge`, pixel `image_height` to
/// `bottom_edge`. `None` when the image has zero height.
fn raster_boundaries_to_page_desc(
    lines: &[u32],
    bottom_edge: f64,
    top_edge: f64,
    image_height: u32,
) -> Option<Vec<f64>> {
    if image_height == 0 {
        return None;
    }
    let page_height = top_edge - bottom_edge;
    let denominator = f64::from(image_height);
    let mapped = lines
        .iter()
        .map(|&px| top_edge - f64::from(px) / denominator * page_height)
        .collect();
    Some(mapped)
}
2955
2956fn raster_box_to_page_bbox(
2957 image: &ImageChunk,
2958 x1: u32,
2959 y1: u32,
2960 x2: u32,
2961 y2: u32,
2962 image_width: u32,
2963 image_height: u32,
2964) -> Option<BoundingBox> {
2965 if x2 <= x1 || y2 <= y1 || image_width == 0 || image_height == 0 {
2966 return None;
2967 }
2968 let left_x = image.bbox.left_x + image.bbox.width() * (f64::from(x1) / f64::from(image_width));
2969 let right_x = image.bbox.left_x + image.bbox.width() * (f64::from(x2) / f64::from(image_width));
2970 let top_y = image.bbox.top_y - image.bbox.height() * (f64::from(y1) / f64::from(image_height));
2971 let bottom_y =
2972 image.bbox.top_y - image.bbox.height() * (f64::from(y2) / f64::from(image_height));
2973 Some(BoundingBox::new(
2974 image.bbox.page_number,
2975 left_x,
2976 bottom_y,
2977 right_x,
2978 top_y,
2979 ))
2980}
2981
/// OCRs a single table cell cropped from the raster.
///
/// The crop is inset away from the border lines, padded with a white margin,
/// upscaled for OCR, then run through the best-of-several-PSM pipeline.
/// Returns `Some(String::new())` for cells too small to OCR, and `None` only
/// when the OCR invocation itself fails.
fn extract_raster_cell_text(
    gray: &GrayImage,
    row_idx: usize,
    col_idx: usize,
    x1: u32,
    y1: u32,
    x2: u32,
    y2: u32,
) -> Option<String> {
    // Inset so the border lines are excluded; capped at a quarter of the cell.
    let inset_x = CELL_INSET_PX.min((x2 - x1) / 4);
    let inset_y = CELL_INSET_PX.min((y2 - y1) / 4);
    let crop_left = x1 + inset_x;
    let crop_top = y1 + inset_y;
    let crop_width = x2.saturating_sub(x1 + inset_x * 2);
    let crop_height = y2.saturating_sub(y1 + inset_y * 2);
    if crop_width < MIN_CELL_SIZE_PX || crop_height < MIN_CELL_SIZE_PX {
        return Some(String::new());
    }

    let cropped = gray
        .view(crop_left, crop_top, crop_width, crop_height)
        .to_image();
    // White margin plus Lanczos upscaling improves Tesseract accuracy on
    // tightly cropped cell snippets.
    let bordered = expand_white_border(&cropped, 12);
    let scaled = image::imageops::resize(
        &bordered,
        bordered.width() * OCR_SCALE_FACTOR,
        bordered.height() * OCR_SCALE_FACTOR,
        image::imageops::FilterType::Lanczos3,
    );
    // Header cells (row 0) try block-oriented segmentation first; body cells
    // try single-line segmentation first.
    let psm_modes: [&str; 3] = if row_idx == 0 {
        ["6", "11", "7"]
    } else {
        ["7", "6", "11"]
    };
    let raw_text = run_tesseract_cell_text_best(&scaled, &psm_modes)?;
    Some(normalize_raster_cell_text(row_idx, col_idx, raw_text))
}
3019
3020fn expand_white_border(image: &GrayImage, border: u32) -> GrayImage {
3021 let mut expanded = GrayImage::from_pixel(
3022 image.width() + border * 2,
3023 image.height() + border * 2,
3024 Luma([255]),
3025 );
3026 for y in 0..image.height() {
3027 for x in 0..image.width() {
3028 expanded.put_pixel(x + border, y + border, *image.get_pixel(x, y));
3029 }
3030 }
3031 expanded
3032}
3033
/// Runs word-level OCR on `image` with the configured engine.
///
/// RapidOCR ignores the `psm` argument; Tesseract is invoked with the given
/// page-segmentation mode and OEM "3".
fn run_tesseract_tsv_words(image: &GrayImage, psm: &str) -> Option<Vec<OcrWord>> {
    match selected_ocr_engine() {
        OcrEngine::RapidOcr => run_rapidocr_words(image),
        OcrEngine::Tesseract => run_tesseract_tsv_words_with_oem(image, psm, "3"),
    }
}
3040
/// Invokes the `tesseract` CLI on `image` and parses its TSV word output.
///
/// The image is written to a fresh temp directory (removed before returning,
/// on both success and failure paths). Returns `None` when the temp dir or
/// image write fails, the process cannot be spawned, or it exits non-zero.
fn run_tesseract_tsv_words_with_oem(
    image: &GrayImage,
    psm: &str,
    oem: &str,
) -> Option<Vec<OcrWord>> {
    let temp_dir = create_temp_dir(0).ok()?;
    let image_path = temp_dir.join("ocr.png");
    if image.save(&image_path).is_err() {
        let _ = fs::remove_dir_all(&temp_dir);
        return None;
    }

    // Declare the effective DPI of the upscaled input so Tesseract sizes
    // its internal thresholds correctly.
    let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
    let output = Command::new("tesseract")
        .current_dir(&temp_dir)
        .arg("ocr.png")
        .arg("stdout")
        .arg("--dpi")
        .arg(&dpi)
        .arg("--oem")
        .arg(oem)
        .arg("--psm")
        .arg(psm)
        // Disable dictionary-based corrections, which mangle table tokens
        // (codes, IDs, numbers) that are not dictionary words.
        .arg("-c")
        .arg("load_system_dawg=0")
        .arg("-c")
        .arg("load_freq_dawg=0")
        .arg("tsv")
        .output()
        .ok()?;
    let _ = fs::remove_dir_all(&temp_dir);
    if !output.status.success() {
        return None;
    }

    let tsv = String::from_utf8_lossy(&output.stdout);
    Some(parse_tesseract_tsv(&tsv))
}
3084
/// Produces the best OCR text for a single cell image by scoring several
/// strategies and keeping the highest-scoring non-empty result.
///
/// Order of attempts: (1) Tesseract-only consensus words across PSM/OEM
/// combinations; (2) only if consensus produced nothing, a sweep over
/// preprocessing variants x PSM modes using TSV words, plain-text output
/// (scored by normalized length — note this score is not on the same scale
/// as `score_ocr_words`), and RapidOCR words. Returns `None` when every
/// strategy yields empty text.
fn run_tesseract_cell_text_best(image: &GrayImage, psm_modes: &[&str]) -> Option<String> {
    let mut best: Option<(String, f64)> = None;

    if matches!(selected_ocr_engine(), OcrEngine::Tesseract) {
        let consensus_words = collect_consensus_words(image, psm_modes);
        if !consensus_words.is_empty() {
            let text = words_to_plain_line_text(&consensus_words);
            if !text.is_empty() {
                let score = score_ocr_words(&consensus_words, image.width(), image.height());
                best = Some((text, score));
            }
        }
    }

    // Fallback sweep only runs when consensus found nothing.
    if best.is_none() {
        for variant in build_ocr_variants(image) {
            for psm in psm_modes {
                let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
                    continue;
                };
                if words.is_empty() {
                    continue;
                }
                let text = words_to_plain_line_text(&words);
                if text.is_empty() {
                    continue;
                }
                let score = score_ocr_words(&words, variant.width(), variant.height());
                match &best {
                    Some((_, best_score)) if *best_score >= score => {}
                    _ => best = Some((text, score)),
                }

                // Also try Tesseract's plain-text output, which sometimes
                // recovers text the TSV path drops.
                if let Some(text) = run_tesseract_plain_text_with_variant(&variant, psm) {
                    let norm_len = normalize_text(&text).len() as f64;
                    if norm_len > 0.0 {
                        match &best {
                            Some((_, best_score)) if *best_score >= norm_len => {}
                            _ => best = Some((text, norm_len)),
                        }
                    }
                }
            }

            // RapidOCR as an independent second opinion per variant.
            if let Some(words) = run_rapidocr_words(&variant) {
                let text = words_to_plain_line_text(&words);
                if !text.is_empty() {
                    let score = score_ocr_words(&words, variant.width(), variant.height());
                    match &best {
                        Some((_, best_score)) if *best_score >= score => {}
                        _ => best = Some((text, score)),
                    }
                }
            }
        }
    }

    best.map(|(text, _)| text)
}
3150
/// Collects OCR words that multiple Tesseract configurations agree on.
///
/// Each (PSM, OEM) pair is treated as an independent "perspective". A word
/// (keyed by lower-cased text) is kept only when at least two perspectives
/// produced it; the highest-confidence instance wins. The result is sorted
/// top-to-bottom, then left-to-right.
fn collect_consensus_words(image: &GrayImage, psm_modes: &[&str]) -> Vec<OcrWord> {
    let variants = build_ocr_variants(image);

    // Best word per (psm, oem, lower-cased text) across all variants.
    let oems = ["1", "3"]; let mut perspective_best: HashMap<(String, String, String), OcrWord> = HashMap::new();

    for variant in &variants {
        for psm in psm_modes {
            for oem in oems {
                let Some(words) = run_tesseract_tsv_words_with_oem(variant, psm, oem) else {
                    continue;
                };
                for word in words {
                    let key = (psm.to_string(), oem.to_string(), word.text.to_lowercase());
                    perspective_best
                        .entry(key)
                        .and_modify(|best| {
                            // Keep the highest-confidence sighting of this word.
                            if word.confidence > best.confidence {
                                *best = word.clone();
                            }
                        })
                        .or_insert(word);
                }
            }
        }
    }

    // A word must be seen by at least this many distinct (psm, oem) pairs.
    const MIN_PERSPECTIVES: usize = 2;

    let mut text_to_perspectives: HashMap<String, HashSet<(String, String)>> = HashMap::new();
    for (psm, oem, norm_text) in perspective_best.keys() {
        text_to_perspectives
            .entry(norm_text.clone())
            .or_default()
            .insert((psm.clone(), oem.clone()));
    }

    // For each agreed-upon text, emit its single highest-confidence word.
    let mut consensus: Vec<OcrWord> = text_to_perspectives
        .iter()
        .filter(|(_, perspectives)| perspectives.len() >= MIN_PERSPECTIVES)
        .filter_map(|(norm_text, _)| {
            perspective_best
                .iter()
                .filter(|((_, _, t), _)| t == norm_text)
                .max_by(|(_, a), (_, b)| {
                    a.confidence
                        .partial_cmp(&b.confidence)
                        .unwrap_or(std::cmp::Ordering::Equal)
                })
                .map(|(_, w)| w.clone())
        })
        .collect();

    consensus.sort_by_key(|w| (w.top, w.left));
    consensus
}
3223
/// Drops OCR words that look like isolated speckle: narrow words separated
/// from their neighbor by a gap much larger than the typical word height.
///
/// Words are grouped by Tesseract line key; within a line, a word is removed
/// when it is narrower than half the median height AND its gap to a neighbor
/// exceeds three median heights. A singleton line survives only if the word
/// is at least ~40% of the median height in both dimensions.
fn filter_words_by_spatial_coherence(words: &[OcrWord]) -> Vec<OcrWord> {
    if words.len() <= 1 {
        return words.to_vec();
    }

    // Median word height, used as the scale for all thresholds below.
    let median_h: u32 = {
        let mut heights: Vec<u32> = words.iter().map(|w| w.height.max(1)).collect();
        heights.sort_unstable();
        heights[heights.len() / 2]
    };
    let gap_threshold = (median_h * 3).max(8);
    let narrow_threshold = (median_h / 2).max(4);
    let min_iso_width = (median_h * 2 / 5).max(4);
    let min_iso_height = (median_h * 2 / 5).max(3);

    let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
    for word in words {
        by_line.entry(word.line_key).or_default().push(word);
    }

    let mut filtered = Vec::new();

    for line_words in by_line.values_mut() {
        if line_words.len() <= 1 {
            // Singleton line: keep only when the word is plausibly sized.
            if let Some(word) = line_words.first() {
                if word.width >= min_iso_width && word.height >= min_iso_height {
                    filtered.push((*word).clone());
                }
            }
            continue;
        }

        line_words.sort_by_key(|word| word.left);

        for (i, word) in line_words.iter().enumerate() {
            // NOTE(review): non-first words are only compared against their
            // LEFT neighbor; the right-neighbor branch applies solely to the
            // first word. Confirm this asymmetry is intended.
            let is_isolated = if i > 0 {
                let prev = line_words[i - 1];
                let gap = word
                    .left
                    .saturating_sub(prev.left.saturating_add(prev.width));
                gap > gap_threshold && word.width < narrow_threshold
            } else if i < line_words.len() - 1 {
                let next = line_words[i + 1];
                let gap = next
                    .left
                    .saturating_sub(word.left.saturating_add(word.width));
                gap > gap_threshold && word.width < narrow_threshold
            } else {
                false
            };

            if !is_isolated {
                filtered.push((*word).clone());
            }
        }
    }

    filtered
}
3296
/// Greedily clusters OCR words by proximity after sorting them by
/// (top, left): a word joins the current cluster when its vertical offset
/// from the cluster's last word is within half the median word height AND
/// its horizontal gap is within `gap_tolerance`; otherwise it starts a new
/// cluster.
fn cluster_words_by_proximity(words: &[OcrWord], gap_tolerance: u32) -> Vec<Vec<OcrWord>> {
    if words.is_empty() {
        return Vec::new();
    }

    let mut sorted_words = words.to_vec();
    sorted_words.sort_by_key(|w| (w.top, w.left));

    // Median word height sets the vertical tolerance scale.
    let median_h: i32 = {
        let mut heights: Vec<u32> = sorted_words.iter().map(|w| w.height.max(1)).collect();
        heights.sort_unstable();
        heights[heights.len() / 2] as i32
    };
    let vertical_tolerance = (median_h / 2).max(2);

    let mut clusters: Vec<Vec<OcrWord>> = Vec::new();
    let mut current_cluster = vec![sorted_words[0].clone()];

    for word in &sorted_words[1..] {
        if let Some(last) = current_cluster.last() {
            let vertical_gap = (word.top as i32 - last.top as i32).abs();
            let horizontal_gap = word
                .left
                .saturating_sub(last.left.saturating_add(last.width));

            if vertical_gap <= vertical_tolerance && horizontal_gap <= gap_tolerance {
                current_cluster.push(word.clone());
            } else {
                clusters.push(current_cluster);
                current_cluster = vec![word.clone()];
            }
        }
    }

    // current_cluster is never empty here (it always holds at least one
    // word); the check is purely defensive.
    if !current_cluster.is_empty() {
        clusters.push(current_cluster);
    }

    clusters
}
3340
/// Flattens OCR words into a single line of text: speckle is filtered out,
/// words are clustered by proximity (gap tolerance = 80% of the average word
/// width), each cluster is joined left-to-right, and the cluster strings are
/// joined with single spaces.
fn words_to_plain_line_text(words: &[OcrWord]) -> String {
    let filtered_words = filter_words_by_spatial_coherence(words);

    if filtered_words.is_empty() {
        return String::new();
    }

    let avg_word_width =
        filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
    let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
    let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);

    let mut lines: Vec<String> = Vec::new();
    for cluster in clusters {
        // Reading order within a cluster is strictly left-to-right.
        let mut sorted_cluster = cluster;
        sorted_cluster.sort_by_key(|w| w.left);

        let line = sorted_cluster
            .iter()
            .map(|word| word.text.as_str())
            .collect::<Vec<_>>()
            .join(" ")
            .trim()
            .to_string();

        if !line.is_empty() {
            lines.push(line);
        }
    }

    lines.join(" ")
}
3375
/// Sweeps every preprocessing variant x PSM mode, keeps only word sets the
/// `accept` predicate approves, and returns the set with the highest
/// `score_ocr_words` score (ties favor the earlier candidate).
fn run_tesseract_tsv_words_best<F>(
    image: &GrayImage,
    psm_modes: &[&str],
    accept: F,
) -> Option<Vec<OcrWord>>
where
    F: Fn(&[OcrWord]) -> bool,
{
    let variants = build_ocr_variants(image);
    let mut best: Option<OcrCandidateScore> = None;

    for variant in variants {
        for psm in psm_modes {
            let Some(words) = run_tesseract_tsv_words(&variant, psm) else {
                continue;
            };
            // Caller-supplied gate, e.g. minimum word count or confidence.
            if !accept(&words) {
                continue;
            }
            let score = score_ocr_words(&words, variant.width(), variant.height());
            match &best {
                Some(current) if current.score >= score => {}
                _ => {
                    best = Some(OcrCandidateScore { words, score });
                }
            }
        }
    }

    best.map(|candidate| candidate.words)
}
3411
3412fn score_ocr_words(words: &[OcrWord], width: u32, height: u32) -> f64 {
3413 if words.is_empty() || width == 0 || height == 0 {
3414 return 0.0;
3415 }
3416
3417 let mut by_line: BTreeMap<(u32, u32, u32), Vec<&OcrWord>> = BTreeMap::new();
3418 let mut alpha_words = 0usize;
3419 let mut area_coverage = 0f64;
3420 let mut vertical_spread_top = height;
3421 let mut vertical_spread_bottom = 0u32;
3422 let mut total_confidence = 0f64;
3423
3424 for word in words {
3425 by_line.entry(word.line_key).or_default().push(word);
3426 if word.text.chars().any(|ch| ch.is_alphabetic()) {
3427 alpha_words += 1;
3428 }
3429 area_coverage += f64::from(word.width.saturating_mul(word.height));
3430 vertical_spread_top = vertical_spread_top.min(word.top);
3431 vertical_spread_bottom = vertical_spread_bottom.max(word.top.saturating_add(word.height));
3432 total_confidence += word.confidence;
3433 }
3434
3435 let line_count = by_line.len() as f64;
3436 let alpha_ratio = alpha_words as f64 / words.len() as f64;
3437 let density = (area_coverage / f64::from(width.saturating_mul(height))).clamp(0.0, 1.0);
3438 let spread = if vertical_spread_bottom > vertical_spread_top {
3439 f64::from(vertical_spread_bottom - vertical_spread_top) / f64::from(height)
3440 } else {
3441 0.0
3442 };
3443 let avg_confidence = total_confidence / words.len() as f64;
3444 let confidence_bonus = (avg_confidence / 100.0).clamp(0.0, 1.0);
3446
3447 let horizontal_spread = if words.is_empty() {
3449 0.0
3450 } else {
3451 let min_left = words.iter().map(|w| w.left).min().unwrap_or(0);
3452 let max_right = words
3453 .iter()
3454 .map(|w| w.left + w.width)
3455 .max()
3456 .unwrap_or(width);
3457 f64::from(max_right.saturating_sub(min_left)) / f64::from(width)
3458 };
3459
3460 words.len() as f64
3461 + line_count * 1.5
3462 + alpha_ratio * 6.0
3463 + density * 25.0
3464 + spread * 3.0
3465 + horizontal_spread * 2.0
3466 + confidence_bonus * 5.0 }
3468
3469fn build_ocr_variants(gray: &GrayImage) -> Vec<GrayImage> {
3470 vec![
3471 gray.clone(),
3472 contrast_stretch(gray),
3473 global_otsu_binarize(gray),
3474 local_mean_binarize(gray, LOCAL_BINARIZATION_RADIUS),
3475 morphological_clean(gray),
3477 unsharp_mask(gray, 1.5),
3480 gamma_correct(gray, 0.6),
3482 ]
3483}
3484
3485fn unsharp_mask(gray: &GrayImage, amount: f32) -> GrayImage {
3489 let width = gray.width() as i32;
3490 let height = gray.height() as i32;
3491 let mut out = GrayImage::new(gray.width(), gray.height());
3492 for y in 0..height {
3493 for x in 0..width {
3494 let mut sum = 0i32;
3495 let mut count = 0i32;
3496 for dy in -1i32..=1 {
3497 for dx in -1i32..=1 {
3498 let nx = x + dx;
3499 let ny = y + dy;
3500 if nx >= 0 && ny >= 0 && nx < width && ny < height {
3501 sum += gray.get_pixel(nx as u32, ny as u32).0[0] as i32;
3502 count += 1;
3503 }
3504 }
3505 }
3506 let blurred = if count > 0 {
3507 sum / count
3508 } else {
3509 gray.get_pixel(x as u32, y as u32).0[0] as i32
3510 };
3511 let original = gray.get_pixel(x as u32, y as u32).0[0] as i32;
3512 let sharpened = original + ((original - blurred) as f32 * amount) as i32;
3513 out.put_pixel(x as u32, y as u32, Luma([sharpened.clamp(0, 255) as u8]));
3514 }
3515 }
3516 out
3517}
3518
3519fn gamma_correct(gray: &GrayImage, gamma: f32) -> GrayImage {
3522 let mut out = GrayImage::new(gray.width(), gray.height());
3523 for (x, y, pixel) in gray.enumerate_pixels() {
3524 let v = pixel.0[0] as f32 / 255.0;
3525 let corrected = (v.powf(gamma) * 255.0).round() as u8;
3526 out.put_pixel(x, y, Luma([corrected]));
3527 }
3528 out
3529}
3530
3531fn contrast_stretch(gray: &GrayImage) -> GrayImage {
3532 let mut min_val = u8::MAX;
3533 let mut max_val = u8::MIN;
3534 for pixel in gray.pixels() {
3535 let value = pixel.0[0];
3536 min_val = min_val.min(value);
3537 max_val = max_val.max(value);
3538 }
3539
3540 if max_val <= min_val {
3541 return gray.clone();
3542 }
3543
3544 let in_range = (max_val - min_val) as f64;
3545 let mut out = GrayImage::new(gray.width(), gray.height());
3546 for (x, y, pixel) in gray.enumerate_pixels() {
3547 let value = pixel.0[0];
3548 let normalized = ((value.saturating_sub(min_val)) as f64 / in_range * 255.0).round() as u8;
3549 out.put_pixel(x, y, Luma([normalized]));
3550 }
3551 out
3552}
3553
3554fn global_otsu_binarize(gray: &GrayImage) -> GrayImage {
3555 let threshold = otsu_threshold(gray);
3556 let mut out = GrayImage::new(gray.width(), gray.height());
3557 for (x, y, pixel) in gray.enumerate_pixels() {
3558 let value = if pixel.0[0] <= threshold { 0 } else { 255 };
3559 out.put_pixel(x, y, Luma([value]));
3560 }
3561 out
3562}
3563
/// Computes Otsu's global binarization threshold from the image histogram.
///
/// Scans all 256 candidate thresholds and returns the one maximizing the
/// between-class variance of background vs. foreground pixels. Returns 127
/// for an empty image.
fn otsu_threshold(gray: &GrayImage) -> u8 {
    let mut histogram = [0u64; 256];
    for pixel in gray.pixels() {
        histogram[pixel.0[0] as usize] += 1;
    }

    let total = (gray.width() as u64) * (gray.height() as u64);
    if total == 0 {
        return 127;
    }

    // Intensity-weighted sum of all pixels; used to derive the foreground mean.
    let sum_total: f64 = histogram
        .iter()
        .enumerate()
        .map(|(idx, count)| idx as f64 * *count as f64)
        .sum();

    let mut sum_background = 0f64;
    let mut weight_background = 0f64;
    let mut max_variance = -1f64;
    let mut best_threshold = 127u8;

    for (idx, count) in histogram.iter().enumerate() {
        weight_background += *count as f64;
        if weight_background <= 0.0 {
            // No pixels at or below this intensity yet.
            continue;
        }

        let weight_foreground = total as f64 - weight_background;
        if weight_foreground <= 0.0 {
            // Every pixel is now background; later thresholds are degenerate.
            break;
        }

        sum_background += idx as f64 * *count as f64;
        let mean_background = sum_background / weight_background;
        let mean_foreground = (sum_total - sum_background) / weight_foreground;
        // Otsu criterion: maximize inter-class variance.
        let between_class_variance =
            weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);

        if between_class_variance > max_variance {
            max_variance = between_class_variance;
            best_threshold = idx as u8;
        }
    }

    best_threshold
}
3611
/// Adaptive binarization: each pixel is compared against the mean of its
/// `radius`-neighborhood (computed in O(1) via a summed-area table), minus a
/// small offset that biases toward white to suppress background noise.
fn local_mean_binarize(gray: &GrayImage, radius: u32) -> GrayImage {
    if gray.width() == 0 || gray.height() == 0 {
        return gray.clone();
    }

    let width = gray.width() as usize;
    let height = gray.height() as usize;
    let (integral, stride) = integral_image(gray);
    let mut out = GrayImage::new(gray.width(), gray.height());

    for y in 0..height {
        for x in 0..width {
            // Clamp the sampling window to the image bounds.
            let x1 = x.saturating_sub(radius as usize);
            let y1 = y.saturating_sub(radius as usize);
            let x2 = (x + radius as usize).min(width - 1);
            let y2 = (y + radius as usize).min(height - 1);

            let area = (x2 - x1 + 1) * (y2 - y1 + 1);
            let sum = region_sum(&integral, stride, x1, y1, x2, y2);
            let local_mean = (sum as f64) / (area as f64);
            // Larger windows use a stronger white bias.
            let offset = if area >= MIN_BINARIZATION_BLOCK_PIXELS {
                8.0
            } else {
                4.0
            };
            let threshold = (local_mean - offset).clamp(0.0, 255.0);

            let pixel_value = gray.get_pixel(x as u32, y as u32).0[0] as f64;
            let value = if pixel_value <= threshold { 0 } else { 255 };
            out.put_pixel(x as u32, y as u32, Luma([value]));
        }
    }

    out
}
3647
3648fn morphological_clean(gray: &GrayImage) -> GrayImage {
3651 if gray.width() == 0 || gray.height() == 0 {
3652 return gray.clone();
3653 }
3654
3655 let binary = global_otsu_binarize(gray);
3657
3658 let dilated = morphological_dilate(&binary, 2);
3660 morphological_erode(&dilated, 2)
3661}
3662
3663fn morphological_dilate(gray: &GrayImage, iterations: u32) -> GrayImage {
3664 let mut result = gray.clone();
3665 for _ in 0..iterations {
3666 let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3667
3668 for y in 1..gray.height().saturating_sub(1) {
3669 for x in 1..gray.width().saturating_sub(1) {
3670 let mut has_black = false;
3672 for dy in 0..3 {
3673 for dx in 0..3 {
3674 let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3675 if px < 128 {
3676 has_black = true;
3677 break;
3678 }
3679 }
3680 if has_black {
3681 break;
3682 }
3683 }
3684 next.put_pixel(x, y, if has_black { Luma([0]) } else { Luma([255]) });
3685 }
3686 }
3687 result = next;
3688 }
3689 result
3690}
3691
3692fn morphological_erode(gray: &GrayImage, iterations: u32) -> GrayImage {
3693 let mut result = gray.clone();
3694 for _ in 0..iterations {
3695 let mut next = GrayImage::from_pixel(gray.width(), gray.height(), Luma([255]));
3696
3697 for y in 1..gray.height().saturating_sub(1) {
3698 for x in 1..gray.width().saturating_sub(1) {
3699 let mut all_black = true;
3702 for dy in 0..3 {
3703 for dx in 0..3 {
3704 let px = result.get_pixel(x + dx - 1, y + dy - 1).0[0];
3705 if px >= 128 {
3706 all_black = false;
3707 break;
3708 }
3709 }
3710 if !all_black {
3711 break;
3712 }
3713 }
3714 next.put_pixel(x, y, if all_black { Luma([0]) } else { Luma([255]) });
3715 }
3716 }
3717 result = next;
3718 }
3719 result
3720}
3721
/// Builds a summed-area table with a one-pixel zero border so `region_sum`
/// can evaluate any rectangle sum in O(1).
///
/// Returns the flattened table and its row stride (`width + 1`).
fn integral_image(gray: &GrayImage) -> (Vec<u64>, usize) {
    let width = gray.width() as usize;
    let height = gray.height() as usize;
    let stride = width + 1;
    let mut integral = vec![0u64; (width + 1) * (height + 1)];

    for y in 0..height {
        let mut row_sum = 0u64;
        for x in 0..width {
            row_sum += gray.get_pixel(x as u32, y as u32).0[0] as u64;
            // integral[y+1][x+1] = integral[y][x+1] + running sum of row y.
            let idx = (y + 1) * stride + (x + 1);
            integral[idx] = integral[y * stride + (x + 1)] + row_sum;
        }
    }

    (integral, stride)
}
3739
/// Sums the values of the inclusive rectangle `(x1, y1)..=(x2, y2)` using a
/// summed-area table produced by `integral_image`.
fn region_sum(integral: &[u64], stride: usize, x1: usize, y1: usize, x2: usize, y2: usize) -> u64 {
    let top_left = integral[y1 * stride + x1];
    let top_right = integral[y1 * stride + (x2 + 1)];
    let bottom_left = integral[(y2 + 1) * stride + x1];
    let bottom_right = integral[(y2 + 1) * stride + (x2 + 1)];
    // Standard inclusion-exclusion over the four corners.
    bottom_right + top_left - top_right - bottom_left
}
3747
/// Convenience wrapper: runs plain-text OCR on the image as-is via
/// `run_tesseract_plain_text_with_variant`.
fn run_tesseract_plain_text(image: &GrayImage, psm: &str) -> Option<String> {
    run_tesseract_plain_text_with_variant(image, psm)
}
3751
3752fn run_tesseract_plain_text_with_variant(image: &GrayImage, psm: &str) -> Option<String> {
3753 if matches!(selected_ocr_engine(), OcrEngine::RapidOcr) {
3754 return run_rapidocr_words(image).map(|words| words_to_plain_line_text(&words));
3755 }
3756
3757 let temp_dir = create_temp_dir(0).ok()?;
3758 let image_path = temp_dir.join("ocr.png");
3759 if image.save(&image_path).is_err() {
3760 let _ = fs::remove_dir_all(&temp_dir);
3761 return None;
3762 }
3763
3764 let dpi = TESSERACT_EFFECTIVE_DPI.to_string();
3765 let output = Command::new("tesseract")
3766 .current_dir(&temp_dir)
3767 .arg("ocr.png")
3768 .arg("stdout")
3769 .arg("--dpi")
3770 .arg(&dpi)
3771 .arg("--oem")
3772 .arg("3")
3773 .arg("--psm")
3774 .arg(psm)
3775 .arg("-c")
3776 .arg("load_system_dawg=0")
3777 .arg("-c")
3778 .arg("load_freq_dawg=0")
3779 .output()
3780 .ok()?;
3781 let _ = fs::remove_dir_all(&temp_dir);
3782 if !output.status.success() {
3783 return None;
3784 }
3785
3786 Some(
3787 String::from_utf8_lossy(&output.stdout)
3788 .replace('\n', " ")
3789 .split_whitespace()
3790 .collect::<Vec<_>>()
3791 .join(" "),
3792 )
3793}
3794
3795fn words_to_text_chunks(
3796 words: &[OcrWord],
3797 image: &ImageChunk,
3798 text_chunks: &[TextChunk],
3799) -> Vec<TextChunk> {
3800 let mut image_size = (0u32, 0u32);
3801 for word in words {
3802 image_size.0 = image_size.0.max(word.left.saturating_add(word.width));
3803 image_size.1 = image_size.1.max(word.top.saturating_add(word.height));
3804 }
3805 if image_size.0 == 0 || image_size.1 == 0 {
3806 return Vec::new();
3807 }
3808
3809 let mut dedupe: HashMap<String, usize> = HashMap::new();
3810 for chunk in text_chunks {
3811 dedupe.insert(normalize_text(&chunk.value), dedupe.len());
3812 }
3813
3814 let mut recovered = Vec::new();
3815 for word in words {
3816 let normalized = normalize_text(&word.text);
3817 if normalized.len() >= 4 && dedupe.contains_key(&normalized) {
3818 continue;
3819 }
3820
3821 let left_ratio = f64::from(word.left) / f64::from(image_size.0);
3822 let right_ratio = f64::from(word.left.saturating_add(word.width)) / f64::from(image_size.0);
3823 let top_ratio = f64::from(word.top) / f64::from(image_size.1);
3824 let bottom_ratio =
3825 f64::from(word.top.saturating_add(word.height)) / f64::from(image_size.1);
3826
3827 let left_x = image.bbox.left_x + image.bbox.width() * left_ratio;
3828 let right_x = image.bbox.left_x + image.bbox.width() * right_ratio;
3829 let top_y = image.bbox.top_y - image.bbox.height() * top_ratio;
3830 let bottom_y = image.bbox.top_y - image.bbox.height() * bottom_ratio;
3831 if right_x <= left_x || top_y <= bottom_y {
3832 continue;
3833 }
3834
3835 recovered.push(TextChunk {
3836 value: word.text.clone(),
3837 bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
3838 font_name: "OCR".to_string(),
3839 font_size: (top_y - bottom_y).max(6.0),
3840 font_weight: 400.0,
3841 italic_angle: 0.0,
3842 font_color: "#000000".to_string(),
3843 contrast_ratio: 21.0,
3844 symbol_ends: Vec::new(),
3845 text_format: TextFormat::Normal,
3846 text_type: TextType::Regular,
3847 pdf_layer: PdfLayer::Content,
3848 ocg_visible: true,
3849 index: None,
3850 page_number: image.bbox.page_number,
3851 level: None,
3852 mcid: None,
3853 });
3854 }
3855
3856 recovered
3857}
3858
3859fn lines_from_ocr_words(
3860 words: &[OcrWord],
3861 image: &ImageChunk,
3862 image_width: u32,
3863 image_height: u32,
3864 text_chunks: &[TextChunk],
3865) -> Vec<TextChunk> {
3866 if image_width == 0 || image_height == 0 {
3867 return Vec::new();
3868 }
3869
3870 let mut dedupe: HashMap<String, usize> = HashMap::new();
3871 for chunk in text_chunks {
3872 dedupe.insert(normalize_text(&chunk.value), dedupe.len());
3873 }
3874
3875 let spatial_lines = build_spatial_ocr_lines(words);
3876 if spatial_lines.is_empty() {
3877 return Vec::new();
3878 }
3879
3880 let blocks = merge_spatial_ocr_lines_into_blocks(&spatial_lines, image_width);
3881 if blocks.is_empty() {
3882 return Vec::new();
3883 }
3884
3885 let mut recovered = Vec::new();
3886 for block in blocks {
3887 let normalized = normalize_text(&block.text);
3888 if normalized.len() >= 8 && dedupe.contains_key(&normalized) {
3889 continue;
3890 }
3891
3892 if block.right <= block.left || block.bottom <= block.top {
3893 continue;
3894 }
3895
3896 let left_x = image.bbox.left_x
3897 + image.bbox.width() * (f64::from(block.left) / f64::from(image_width));
3898 let right_x = image.bbox.left_x
3899 + image.bbox.width() * (f64::from(block.right) / f64::from(image_width));
3900 let top_y = image.bbox.top_y
3901 - image.bbox.height() * (f64::from(block.top) / f64::from(image_height));
3902 let bottom_y = image.bbox.top_y
3903 - image.bbox.height() * (f64::from(block.bottom) / f64::from(image_height));
3904 if right_x <= left_x || top_y <= bottom_y {
3905 continue;
3906 }
3907
3908 recovered.push(TextChunk {
3909 value: block.text,
3910 bbox: BoundingBox::new(image.bbox.page_number, left_x, bottom_y, right_x, top_y),
3911 font_name: "OCR".to_string(),
3912 font_size: (f64::from(block.line_height_sum) / block.line_count.max(1) as f64).max(6.0),
3913 font_weight: 400.0,
3914 italic_angle: 0.0,
3915 font_color: "#000000".to_string(),
3916 contrast_ratio: 21.0,
3917 symbol_ends: Vec::new(),
3918 text_format: TextFormat::Normal,
3919 text_type: TextType::Regular,
3920 pdf_layer: PdfLayer::Content,
3921 ocg_visible: true,
3922 index: None,
3923 page_number: image.bbox.page_number,
3924 level: None,
3925 mcid: None,
3926 });
3927 }
3928
3929 recovered
3930}
3931
/// A horizontal run of OCR words — or a block of such runs merged by
/// `merge_spatial_ocr_lines_into_blocks` — with its pixel-space extent in the
/// source raster.
#[derive(Debug, Clone)]
struct SpatialOcrLine {
    left: u32,   // leftmost pixel extent
    top: u32,    // topmost pixel extent
    right: u32,  // rightmost pixel extent
    bottom: u32, // bottommost pixel extent
    text: String,
    word_count: usize,    // total OCR words contained
    line_count: usize,    // physical lines merged (1 for a raw line)
    line_height_sum: u32, // sum of per-line heights; averaged for font size
}
3943
3944fn build_spatial_ocr_lines(words: &[OcrWord]) -> Vec<SpatialOcrLine> {
3945 let filtered_words = filter_words_by_spatial_coherence(words);
3946 if filtered_words.is_empty() {
3947 return Vec::new();
3948 }
3949
3950 let avg_word_width =
3951 filtered_words.iter().map(|w| w.width).sum::<u32>() as f64 / filtered_words.len() as f64;
3952 let gap_tolerance = (avg_word_width * 0.8).ceil() as u32;
3953 let clusters = cluster_words_by_proximity(&filtered_words, gap_tolerance);
3954
3955 let mut lines = Vec::new();
3956 for mut cluster in clusters {
3957 cluster.sort_by_key(|word| word.left);
3958 let text = cluster
3959 .iter()
3960 .map(|word| word.text.as_str())
3961 .collect::<Vec<_>>()
3962 .join(" ")
3963 .trim()
3964 .to_string();
3965 if text.is_empty() {
3966 continue;
3967 }
3968
3969 let left = cluster.iter().map(|word| word.left).min().unwrap_or(0);
3970 let right = cluster
3971 .iter()
3972 .map(|word| word.left.saturating_add(word.width))
3973 .max()
3974 .unwrap_or(0);
3975 let top = cluster.iter().map(|word| word.top).min().unwrap_or(0);
3976 let bottom = cluster
3977 .iter()
3978 .map(|word| word.top.saturating_add(word.height))
3979 .max()
3980 .unwrap_or(0);
3981 if right <= left || bottom <= top {
3982 continue;
3983 }
3984
3985 lines.push(SpatialOcrLine {
3986 left,
3987 top,
3988 right,
3989 bottom,
3990 text,
3991 word_count: cluster.len(),
3992 line_count: 1,
3993 line_height_sum: bottom.saturating_sub(top).max(1),
3994 });
3995 }
3996
3997 lines.sort_by_key(|line| (line.top, line.left));
3998 lines
3999}
4000
/// Merges vertically adjacent OCR lines into paragraph-like blocks, then
/// filters out blocks too short to be meaningful text.
///
/// Lines must arrive sorted top-to-bottom (as `build_spatial_ocr_lines`
/// produces them); each line merges into the most recent compatible block.
fn merge_spatial_ocr_lines_into_blocks(
    lines: &[SpatialOcrLine],
    image_width: u32,
) -> Vec<SpatialOcrLine> {
    if lines.is_empty() {
        return Vec::new();
    }

    // Median line height drives the vertical-gap tolerances below.
    let median_height = {
        let mut heights: Vec<u32> = lines
            .iter()
            .map(|line| line.bottom.saturating_sub(line.top).max(1))
            .collect();
        heights.sort_unstable();
        heights[heights.len() / 2]
    };
    let vertical_tolerance = (median_height / 2).max(3);
    let max_vertical_gap = median_height.saturating_mul(2).max(8);

    let mut blocks: Vec<SpatialOcrLine> = Vec::new();
    for line in lines {
        // rposition: prefer merging into the most recently grown block.
        let merge_idx = blocks.iter().rposition(|block| {
            let vertical_gap = line.top.saturating_sub(block.bottom);
            if vertical_gap > max_vertical_gap {
                return false;
            }
            // Reject lines that start well above the block's bottom edge
            // (more than the tolerance of overlap).
            if line.top + vertical_tolerance < block.bottom {
                return false;
            }

            spatial_lines_share_block_geometry(block, line, image_width, median_height)
        });

        if let Some(merge_idx) = merge_idx {
            let block = &mut blocks[merge_idx];
            block.left = block.left.min(line.left);
            block.top = block.top.min(line.top);
            block.right = block.right.max(line.right);
            block.bottom = block.bottom.max(line.bottom);
            block.word_count += line.word_count;
            block.line_count += line.line_count;
            block.line_height_sum = block.line_height_sum.saturating_add(line.line_height_sum);
            // A trailing hyphen suggests a broken word: join without a space.
            if !block.text.ends_with('-') {
                block.text.push(' ');
            }
            block.text.push_str(&line.text);
            continue;
        }

        blocks.push(line.clone());
    }

    // Keep only blocks with enough text to be worth emitting; multi-word
    // blocks get a lower character threshold.
    blocks
        .into_iter()
        .filter_map(|mut block| {
            block.text = block.text.split_whitespace().collect::<Vec<_>>().join(" ");
            let alphabetic = block.text.chars().filter(|ch| ch.is_alphabetic()).count();
            let min_chars = if block.word_count >= 4 { 10 } else { 16 };
            if block.text.len() < min_chars || alphabetic < 4 {
                return None;
            }
            Some(block)
        })
        .collect()
}
4066
4067fn spatial_lines_share_block_geometry(
4068 upper: &SpatialOcrLine,
4069 lower: &SpatialOcrLine,
4070 image_width: u32,
4071 median_height: u32,
4072) -> bool {
4073 let overlap_left = upper.left.max(lower.left);
4074 let overlap_right = upper.right.min(lower.right);
4075 let overlap = overlap_right.saturating_sub(overlap_left);
4076 let upper_width = upper.right.saturating_sub(upper.left).max(1);
4077 let lower_width = lower.right.saturating_sub(lower.left).max(1);
4078 let min_width = upper_width.min(lower_width);
4079 let max_width = upper_width.max(lower_width);
4080 let overlap_ratio = overlap as f64 / min_width as f64;
4081 let width_ratio = min_width as f64 / max_width as f64;
4082 let max_left_shift = ((f64::from(image_width) * 0.045).round() as u32)
4083 .max(median_height.saturating_mul(2))
4084 .max(8);
4085 let left_shift = upper.left.abs_diff(lower.left);
4086
4087 overlap_ratio >= 0.40
4088 || (overlap_ratio >= 0.15 && left_shift <= max_left_shift && width_ratio >= 0.55)
4089}
4090
/// Returns true when the text contains at least one ASCII digit.
fn is_numeric_like(text: &str) -> bool {
    text.bytes().any(|byte| byte.is_ascii_digit())
}
4094
/// Lowercases and keeps only alphanumeric characters, producing a comparison
/// key that is stable across spacing and punctuation differences.
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
4101
/// Repairs trademark-glyph OCR artifacts in figure captions and trims the
/// surrounding whitespace.
fn normalize_caption_text(text: &str) -> String {
    let repaired = text
        .replace("CarolinaBLUTM", "CarolinaBLU™")
        .replace("CarolinaBLU™™", "CarolinaBLU™");
    repaired.trim().to_string()
}
4108
/// Cleans a single OCR'd table-cell string: strips grid artifacts, repairs
/// common character confusions, drops junk-only body cells, and fixes
/// microliter unit misreads.
fn normalize_raster_cell_text(row_idx: usize, _col_idx: usize, text: String) -> String {
    let mut cleaned = text
        .replace('|', " ")
        .replace('—', "-")
        .replace("AorB", "A or B")
        .replace("Aor B", "A or B")
        .replace("H,O", "H2O")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ");

    // Body rows (row_idx > 0): discard tiny digit-free fragments and strings
    // made purely of glyphs commonly hallucinated in empty cells.
    if row_idx > 0 {
        let has_digit = cleaned.chars().any(|ch| ch.is_ascii_digit());
        if !has_digit && cleaned.len() <= 2 {
            return String::new();
        }
        if cleaned.chars().all(|ch| matches!(ch, 'O' | 'o' | 'S' | 'B')) {
            return String::new();
        }
    }

    cleaned = cleaned
        .replace(" ywL", " μL")
        .replace(" yuL", " μL")
        .replace(" yL", " μL")
        .replace(" wL", " μL")
        .replace(" uL", " μL")
        .replace(" pL", " μL");

    cleaned.trim().to_string()
}
4141
/// Creates a unique per-process scratch directory for raster-OCR artifacts.
/// The caller is responsible for removing it when finished.
fn create_temp_dir(page_number: u32) -> std::io::Result<PathBuf> {
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    let name = format!(
        "edgeparse-raster-ocr-{}-{}-{}",
        std::process::id(),
        page_number,
        nanos
    );
    let dir = std::env::temp_dir().join(name);
    fs::create_dir_all(&dir)?;
    Ok(dir)
}
4156
/// Extracts the page's visible raster images to PNG files in `temp_dir` via
/// `pdfimages`, returning their paths.
///
/// First runs `pdfimages -list` to learn which rows are real images (as
/// opposed to e.g. soft masks), then extracts PNGs and selects the files at
/// the matching indices. Returns `None` when either `pdfimages` invocation
/// fails, `Some(vec![])` when the page has no visible images.
///
/// NOTE(review): this assumes `pdfimages -png` emits exactly one file per
/// listed row, in list order, so that sorted filenames align with the list
/// indices — TODO confirm against pdfimages behavior for multi-file formats.
fn extract_visible_page_image_files(
    input_path: &Path,
    page_number: u32,
    temp_dir: &Path,
) -> Option<Vec<PathBuf>> {
    let list_output = Command::new("pdfimages")
        .arg("-f")
        .arg(page_number.to_string())
        .arg("-l")
        .arg(page_number.to_string())
        .arg("-list")
        .arg(input_path)
        .output()
        .ok()?;
    if !list_output.status.success() {
        return None;
    }

    // Keep the indices of rows whose type column is literally "image".
    let entries = parse_pdfimages_list(&String::from_utf8_lossy(&list_output.stdout));
    let visible_indices: Vec<usize> = entries
        .iter()
        .enumerate()
        .filter_map(|(idx, entry)| (entry.image_type == "image").then_some(idx))
        .collect();
    if visible_indices.is_empty() {
        return Some(Vec::new());
    }

    // Second pass actually extracts the page's images as PNGs.
    let prefix = temp_dir.join("img");
    let status = Command::new("pdfimages")
        .arg("-f")
        .arg(page_number.to_string())
        .arg("-l")
        .arg(page_number.to_string())
        .arg("-png")
        .arg(input_path)
        .arg(&prefix)
        .status()
        .ok()?;
    if !status.success() {
        return None;
    }

    // pdfimages numbers output files, so lexicographic sort restores
    // extraction order.
    let mut image_files: Vec<PathBuf> = fs::read_dir(temp_dir)
        .ok()?
        .filter_map(|entry| entry.ok().map(|e| e.path()))
        .filter(|path| path.extension().and_then(|ext| ext.to_str()) == Some("png"))
        .collect();
    image_files.sort();

    let visible_files: Vec<PathBuf> = visible_indices
        .into_iter()
        .filter_map(|idx| image_files.get(idx).cloned())
        .collect();
    Some(visible_files)
}
4213
4214fn parse_pdfimages_list(output: &str) -> Vec<PdfImagesListEntry> {
4215 let mut entries = Vec::new();
4216 let mut in_rows = false;
4217
4218 for line in output.lines() {
4219 let trimmed = line.trim();
4220 if trimmed.is_empty() {
4221 continue;
4222 }
4223 if trimmed.starts_with("---") {
4224 in_rows = true;
4225 continue;
4226 }
4227 if !in_rows {
4228 continue;
4229 }
4230
4231 let mut cols = trimmed.split_whitespace();
4232 let Some(_page) = cols.next() else {
4233 continue;
4234 };
4235 let Some(_num) = cols.next() else {
4236 continue;
4237 };
4238 let Some(image_type) = cols.next() else {
4239 continue;
4240 };
4241
4242 entries.push(PdfImagesListEntry {
4243 image_type: image_type.to_string(),
4244 });
4245 }
4246
4247 entries
4248}
4249
4250#[cfg(test)]
4251mod tests {
4252 use super::*;
4253 use image::GrayImage;
4254
    // Fixture: a 400x400pt image chunk on page 1, used to verify the
    // OCR-pixel to page-space coordinate mapping.
    fn image_chunk() -> ImageChunk {
        ImageChunk {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 400.0, 400.0),
            index: Some(1),
            level: None,
        }
    }
4262
    // Fixture: an OCR word on the given tesseract line key at column `left`,
    // with fixed 40x12 size, top row 0, and high confidence.
    fn word(line: (u32, u32, u32), left: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top: 0,
            width: 40,
            height: 12,
            text: text.to_string(),
            confidence: 90.0,
        }
    }
4274
    // Fixture: like `word`, but with explicit position and width for tests
    // that depend on spatial layout.
    fn word_at(line: (u32, u32, u32), left: u32, top: u32, width: u32, text: &str) -> OcrWord {
        OcrWord {
            line_key: line,
            left,
            top,
            width,
            height: 12,
            text: text.to_string(),
            confidence: 90.0,
        }
    }
4286
4287 fn test_cell_text(cell: &TableBorderCell) -> String {
4288 cell.content
4289 .iter()
4290 .map(|token| token.base.value.trim())
4291 .filter(|value| !value.is_empty())
4292 .collect::<Vec<_>>()
4293 .join(" ")
4294 }
4295
    // Repeated header/value column pairs across rows should classify as a
    // table, not as chart axis labels.
    #[test]
    fn test_table_like_ocr_detects_repeated_columns() {
        let words = vec![
            word((1, 1, 1), 10, "Temperature"),
            word((1, 1, 1), 120, "Viscosity"),
            word((1, 1, 1), 240, "Temperature"),
            word((1, 1, 1), 360, "Viscosity"),
            word((1, 1, 2), 10, "0"),
            word((1, 1, 2), 120, "1.793E-06"),
            word((1, 1, 2), 240, "25"),
            word((1, 1, 2), 360, "8.930E-07"),
            word((1, 1, 3), 10, "1"),
            word((1, 1, 3), 120, "1.732E-06"),
            word((1, 1, 3), 240, "26"),
            word((1, 1, 3), 360, "8.760E-07"),
        ];
        assert!(!looks_like_chart_label_ocr(&words));
        assert!(looks_like_table_ocr(&words));
    }
4315
    // A 4x3 grid of mostly textual cells should still be recovered as a
    // structured table with the right dimensions and cell contents.
    #[test]
    fn test_structured_ocr_table_border_recovers_non_numeric_table() {
        let image = image_chunk();
        let words = vec![
            word_at((1, 1, 1), 10, 10, 80, "Tube"),
            word_at((1, 1, 1), 145, 10, 110, "Enzyme"),
            word_at((1, 1, 1), 305, 10, 70, "DNA"),
            word_at((1, 1, 2), 10, 42, 80, "1"),
            word_at((1, 1, 2), 145, 42, 110, "BamHI"),
            word_at((1, 1, 2), 305, 42, 70, "pUC19"),
            word_at((1, 1, 3), 10, 74, 80, "2"),
            word_at((1, 1, 3), 145, 74, 110, "HindIII"),
            word_at((1, 1, 3), 305, 74, 70, "lambda"),
            word_at((1, 1, 4), 10, 106, 80, "3"),
            word_at((1, 1, 4), 145, 106, 110, "EcoRI"),
            word_at((1, 1, 4), 305, 106, 70, "control"),
        ];

        assert!(!looks_like_chart_label_ocr(&words));
        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
        assert_eq!(table.num_columns, 3);
        assert_eq!(table.num_rows, 4);
        assert_eq!(test_cell_text(&table.rows[0].cells[0]), "Tube");
        assert_eq!(test_cell_text(&table.rows[1].cells[1]), "BamHI");
        assert_eq!(test_cell_text(&table.rows[3].cells[2]), "control");
    }
4342
    // Adding a fifth row to the enzyme-table fixture must not flip the
    // chart-label heuristic: it should still look like a table.
    #[test]
    fn test_chart_label_ocr_does_not_reject_five_row_table() {
        let words = vec![
            word_at((1, 1, 1), 10, 10, 80, "Tube"),
            word_at((1, 1, 1), 145, 10, 110, "Enzyme"),
            word_at((1, 1, 1), 305, 10, 70, "DNA"),
            word_at((1, 1, 2), 10, 42, 80, "1"),
            word_at((1, 1, 2), 145, 42, 110, "BamHI"),
            word_at((1, 1, 2), 305, 42, 70, "pUC19"),
            word_at((1, 1, 3), 10, 74, 80, "2"),
            word_at((1, 1, 3), 145, 74, 110, "HindIII"),
            word_at((1, 1, 3), 305, 74, 70, "lambda"),
            word_at((1, 1, 4), 10, 106, 80, "3"),
            word_at((1, 1, 4), 145, 106, 110, "EcoRI"),
            word_at((1, 1, 4), 305, 106, 70, "control"),
            word_at((1, 1, 5), 10, 138, 80, "4"),
            word_at((1, 1, 5), 145, 138, 110, "NotI"),
            word_at((1, 1, 5), 305, 138, 70, "sample"),
        ];

        assert!(!looks_like_chart_label_ocr(&words));
        assert!(looks_like_table_ocr(&words));
    }
4366
    // Two prose-like columns of headings should not be misread as a
    // structured table.
    #[test]
    fn test_structured_ocr_table_border_rejects_two_column_prose_layout() {
        let image = image_chunk();
        let words = vec![
            word_at((1, 1, 1), 10, 10, 90, "Summary"),
            word_at((1, 1, 1), 220, 10, 120, "Detailed findings"),
            word_at((1, 1, 2), 10, 42, 90, "Background"),
            word_at((1, 1, 2), 220, 42, 120, "Additional context"),
            word_at((1, 1, 3), 10, 74, 90, "Notes"),
            word_at((1, 1, 3), 220, 74, 120, "Further explanation"),
        ];

        assert!(build_structured_ocr_table_border(&words, &image).is_none());
    }
4381
    // The parser must surface the type column verbatim for every row so the
    // caller can filter "image" rows from "smask" rows.
    #[test]
    fn test_parse_pdfimages_list_ignores_smask_entries() {
        let output = "page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio\n--------------------------------------------------------------------------------------------\n 1 0 image 1320 358 icc 3 8 image no 46 0 208 208 63.5K 4.6%\n 1 1 smask 1320 358 gray 1 8 image no 46 0 208 208 483B 0.1%\n";

        let entries = parse_pdfimages_list(output);
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].image_type, "image");
        assert_eq!(entries[1].image_type, "smask");
    }
4391
    // A single caption line has no row structure and must not be treated as
    // a table.
    #[test]
    fn test_table_like_ocr_rejects_single_line_caption() {
        let words = vec![
            word((1, 1, 1), 10, "Figure"),
            word((1, 1, 1), 90, "7.2"),
            word((1, 1, 1), 150, "Viscosity"),
            word((1, 1, 1), 260, "of"),
            word((1, 1, 1), 300, "Water"),
        ];
        assert!(!looks_like_table_ocr(&words));
    }
4403
    // Unit misreads ("ywL" -> "μL"), junk-only body cells ("OS"), and
    // chemistry misreads ("H,O") are all normalized.
    #[test]
    fn test_normalize_raster_cell_text_fixes_units_and_artifacts() {
        assert_eq!(
            normalize_raster_cell_text(1, 1, "3 ywL".to_string()),
            "3 μL"
        );
        assert_eq!(normalize_raster_cell_text(1, 4, "OS".to_string()), "");
        assert_eq!(normalize_raster_cell_text(0, 6, "H,O".to_string()), "H2O");
    }
4413
    // Draws 4 vertical and 4 horizontal full-length black rules on a white
    // canvas and expects the grid detector to find all of them.
    #[test]
    fn test_detect_bordered_raster_grid_finds_strong_lines() {
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for x in [10, 40, 80, 110] {
            for y in 10..71 {
                image.put_pixel(x, y, Luma([0]));
            }
        }
        for y in [10, 30, 50, 70] {
            for x in 10..111 {
                image.put_pixel(x, y, Luma([0]));
            }
        }

        let grid = detect_bordered_raster_grid(&image).expect("grid");
        assert_eq!(grid.vertical_lines.len(), 4);
        assert_eq!(grid.horizontal_lines.len(), 4);
    }
4432
    // Four wide horizontal gray bands mimic a horizontal bar chart, which
    // the classifier should flag.
    #[test]
    fn test_obvious_bar_chart_raster_is_rejected() {
        let mut image = GrayImage::from_pixel(320, 200, Luma([255]));
        for &(y1, y2) in &[(25, 40), (70, 85), (115, 130), (160, 175)] {
            for y in y1..y2 {
                for x in 40..280 {
                    image.put_pixel(x, y, Luma([80]));
                }
            }
        }

        assert!(is_obvious_bar_chart_raster(&image));
    }
4446
    // Four columns of differing heights rising to a common baseline mimic a
    // vertical bar chart, which the classifier should flag.
    #[test]
    fn test_vertical_bar_chart_raster_is_rejected() {
        let mut image = GrayImage::from_pixel(360, 240, Luma([255]));
        for &(x1, x2, y1) in &[
            (40, 78, 52),
            (92, 126, 118),
            (140, 170, 146),
            (184, 210, 162),
        ] {
            for x in x1..x2 {
                for y in y1..212 {
                    image.put_pixel(x, y, Luma([90]));
                }
            }
        }

        assert!(is_obvious_bar_chart_raster(&image));
    }
4465
    // Light-gray bar fills over an axis line should still register as a bar
    // chart despite the low contrast.
    #[test]
    fn test_light_fill_vertical_bar_chart_raster_is_rejected() {
        let mut image = GrayImage::from_pixel(420, 260, Luma([255]));
        // Horizontal axis line under the bars.
        for x in 24..396 {
            image.put_pixel(x, 222, Luma([170]));
        }
        for &(x1, x2, y1, shade) in &[
            (46, 82, 132, 222),
            (104, 140, 84, 214),
            (162, 198, 62, 206),
            (220, 256, 144, 228),
        ] {
            for x in x1..x2 {
                for y in y1..222 {
                    image.put_pixel(x, y, Luma([shade]));
                }
            }
        }

        assert!(is_obvious_bar_chart_raster(&image));
    }
4487
    // Pairs of adjacent bars (a grouped bar chart) over an axis line should
    // also be flagged.
    #[test]
    fn test_grouped_vertical_bar_chart_raster_is_rejected() {
        let mut image = GrayImage::from_pixel(420, 240, Luma([255]));
        // Horizontal axis line under the bars.
        for x in 28..392 {
            image.put_pixel(x, 214, Luma([175]));
        }
        for &(x1, x2, y1, shade) in &[
            (44, 60, 98, 210),
            (64, 80, 140, 225),
            (108, 124, 116, 214),
            (128, 144, 148, 229),
            (172, 188, 88, 206),
            (192, 208, 128, 222),
            (236, 252, 104, 212),
            (256, 272, 156, 228),
        ] {
            for x in x1..x2 {
                for y in y1..214 {
                    image.put_pixel(x, y, Luma([shade]));
                }
            }
        }

        assert!(is_obvious_bar_chart_raster(&image));
    }
4513
4514 #[test]
4515 fn test_natural_photograph_raster_is_detected() {
4516 let w = 100u32;
4518 let h = 100u32;
4519 let mut image = GrayImage::new(w, h);
4520 for y in 0..h {
4522 for x in 0..w {
4523 let v = ((x + y) * 255 / (w + h - 2)) as u8;
4524 image.put_pixel(x, y, Luma([v]));
4525 }
4526 }
4527 assert!(is_natural_photograph_raster(&image));
4529 }
4530
4531 #[test]
4532 fn test_chart_image_is_not_classified_as_photograph() {
4533 let mut image = GrayImage::from_pixel(200, 160, Luma([255]));
4535 for x in 20..180 {
4537 image.put_pixel(x, 20, Luma([0]));
4538 image.put_pixel(x, 80, Luma([0]));
4539 image.put_pixel(x, 140, Luma([0]));
4540 }
4541 for y in 20..141 {
4542 image.put_pixel(20, y, Luma([0]));
4543 image.put_pixel(180, y, Luma([0]));
4544 }
4545 assert!(!is_natural_photograph_raster(&image));
4547 assert!(!is_dark_ui_screenshot_raster(&image));
4548 }
4549
4550 #[test]
4551 fn test_bright_natural_photograph_raster_is_detected() {
4552 let mut image = GrayImage::from_pixel(240, 180, Luma([250]));
4553 for y in 24..148 {
4554 for x in 52..156 {
4555 let tone = 72 + (((x - 52) * 11 + (y - 24) * 7) % 132) as u8;
4556 image.put_pixel(x, y, Luma([tone]));
4557 }
4558 }
4559
4560 assert!(is_natural_photograph_raster(&image));
4561 }
4562
4563 #[test]
4564 fn test_dark_ui_screenshot_raster_is_detected() {
4565 let mut image = GrayImage::from_pixel(260, 180, Luma([20]));
4566 for x in 18..242 {
4567 for y in 18..34 {
4568 image.put_pixel(x, y, Luma([210]));
4569 }
4570 }
4571 for &(x1, y1, x2, y2, shade) in &[
4572 (26, 58, 84, 108, 198),
4573 (94, 58, 152, 108, 210),
4574 (162, 58, 220, 108, 192),
4575 (26, 118, 220, 134, 224),
4576 ] {
4577 for x in x1..x2 {
4578 for y in y1..y2 {
4579 image.put_pixel(x, y, Luma([shade]));
4580 }
4581 }
4582 }
4583
4584 assert!(is_dark_ui_screenshot_raster(&image));
4585 }
4586
    #[test]
    fn test_table_like_ocr_rejects_matrix_formula_layout() {
        // Truth-table-style layout: terse alphanumeric headers ("B23", "C1"…)
        // over rows of binary-digit cells. word_at (helper defined above)
        // builds an OcrWord from (line_key, left, top, width, text).
        let words = vec![
            word_at((1, 1, 1), 14, 10, 36, "B23"),
            word_at((1, 1, 1), 160, 10, 22, "C1"),
            word_at((1, 1, 1), 230, 10, 22, "C2"),
            word_at((1, 1, 1), 300, 10, 22, "C3"),
            word_at((1, 1, 2), 20, 44, 24, "0/0"),
            word_at((1, 1, 2), 150, 44, 18, "0"),
            word_at((1, 1, 2), 220, 44, 28, "001"),
            word_at((1, 1, 2), 300, 44, 28, "000"),
            word_at((1, 1, 3), 20, 76, 24, "0/1"),
            word_at((1, 1, 3), 150, 76, 28, "000"),
            word_at((1, 1, 3), 220, 76, 28, "010"),
            word_at((1, 1, 3), 300, 76, 28, "000"),
        ];

        // The matrix/formula detector claims this layout; the generic table
        // detector must decline it.
        assert!(looks_like_matrix_formula_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
    }
4607
    #[test]
    fn test_table_like_ocr_keeps_small_numeric_table_with_real_headers() {
        // Same small numeric grid shape as the matrix test above, but with a
        // real word header row ("Year", "Q1"…) and year labels — this should be
        // accepted as a genuine table, not a matrix artifact.
        let words = vec![
            word_at((1, 1, 1), 10, 10, 64, "Year"),
            word_at((1, 1, 1), 130, 10, 28, "Q1"),
            word_at((1, 1, 1), 220, 10, 28, "Q2"),
            word_at((1, 1, 1), 310, 10, 28, "Q3"),
            word_at((1, 1, 2), 10, 42, 64, "2022"),
            word_at((1, 1, 2), 130, 42, 24, "10"),
            word_at((1, 1, 2), 220, 42, 24, "25"),
            word_at((1, 1, 2), 310, 42, 24, "30"),
            word_at((1, 1, 3), 10, 74, 64, "2023"),
            word_at((1, 1, 3), 130, 74, 24, "11"),
            word_at((1, 1, 3), 220, 74, 24, "26"),
            word_at((1, 1, 3), 310, 74, 24, "31"),
        ];

        assert!(!looks_like_matrix_formula_ocr(&words));
        assert!(looks_like_table_ocr(&words));
    }
4628
    #[test]
    fn test_matrixish_small_ocr_table_is_rejected_after_build() {
        // Same matrix-style word layout as the classifier test above, but run
        // through the full structured-table build: even once a table structure
        // is produced, the post-build check must flag it as a matrix artifact.
        let image = ImageChunk {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 120.0),
            index: Some(1),
            level: None,
        };
        let words = vec![
            word_at((1, 1, 1), 14, 10, 36, "B23"),
            word_at((1, 1, 1), 160, 10, 22, "C1"),
            word_at((1, 1, 1), 230, 10, 22, "C2"),
            word_at((1, 1, 1), 300, 10, 22, "C3"),
            word_at((1, 1, 2), 20, 44, 24, "0/0"),
            word_at((1, 1, 2), 150, 44, 18, "0"),
            word_at((1, 1, 2), 220, 44, 28, "001"),
            word_at((1, 1, 2), 300, 44, 28, "000"),
            word_at((1, 1, 3), 20, 76, 24, "0/1"),
            word_at((1, 1, 3), 150, 76, 28, "000"),
            word_at((1, 1, 3), 220, 76, 28, "010"),
            word_at((1, 1, 3), 300, 76, 28, "000"),
        ];

        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
        assert!(is_matrixish_ocr_artifact_table(&table));
    }
4654
    #[test]
    fn test_small_numeric_table_with_real_headers_is_not_rejected_after_build() {
        // Counterpart to the matrix rejection test above: the same build path
        // must keep a small numeric table whose header row contains real words.
        let image = ImageChunk {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 440.0, 140.0),
            index: Some(1),
            level: None,
        };
        let words = vec![
            word_at((1, 1, 1), 10, 10, 64, "Year"),
            word_at((1, 1, 1), 130, 10, 28, "Q1"),
            word_at((1, 1, 1), 220, 10, 28, "Q2"),
            word_at((1, 1, 1), 310, 10, 28, "Q3"),
            word_at((1, 1, 2), 10, 42, 64, "2022"),
            word_at((1, 1, 2), 130, 42, 24, "10"),
            word_at((1, 1, 2), 220, 42, 24, "25"),
            word_at((1, 1, 2), 310, 42, 24, "30"),
            word_at((1, 1, 3), 10, 74, 64, "2023"),
            word_at((1, 1, 3), 130, 74, 24, "11"),
            word_at((1, 1, 3), 220, 74, 24, "26"),
            word_at((1, 1, 3), 310, 74, 24, "31"),
        ];

        let table = build_structured_ocr_table_border(&words, &image).expect("structured table");
        assert!(!is_matrixish_ocr_artifact_table(&table));
    }
4680
4681 #[test]
4682 fn test_bordered_table_raster_is_not_rejected_as_chart() {
4683 let mut image = GrayImage::from_pixel(320, 200, Luma([255]));
4684 for x in [20, 110, 210, 300] {
4685 for y in 20..181 {
4686 image.put_pixel(x, y, Luma([0]));
4687 }
4688 }
4689 for y in [20, 70, 120, 180] {
4690 for x in 20..301 {
4691 image.put_pixel(x, y, Luma([0]));
4692 }
4693 }
4694
4695 assert!(!is_obvious_bar_chart_raster(&image));
4696 }
4697
4698 #[test]
4699 fn test_morphological_erode_preserves_white_background() {
4700 let image = GrayImage::from_fn(9, 9, |x, y| {
4701 if x == 4 || y == 4 {
4702 Luma([0])
4703 } else {
4704 Luma([255])
4705 }
4706 });
4707
4708 let eroded = morphological_erode(&image, 1);
4709
4710 assert_eq!(eroded.get_pixel(0, 0).0[0], 255);
4711 assert_eq!(eroded.get_pixel(8, 8).0[0], 255);
4712 assert_eq!(eroded.get_pixel(4, 4).0[0], 255);
4713 }
4714
4715 #[test]
4716 fn test_dense_prose_image_ocr_detects_infographic_text() {
4717 let mut words = Vec::new();
4718 let mut top = 20;
4719 for line_num in 1..=8 {
4720 for (idx, (left, text)) in [
4721 (20, "Copyright"),
4722 (120, "protects"),
4723 (240, "creative"),
4724 (350, "work"),
4725 ]
4726 .into_iter()
4727 .enumerate()
4728 {
4729 words.push(OcrWord {
4730 line_key: (1, 1, line_num),
4731 left,
4732 top,
4733 width: 60,
4734 height: 14,
4735 confidence: 85.0,
4736 text: if idx == 0 && line_num % 2 == 0 {
4737 "Creators".to_string()
4738 } else {
4739 text.to_string()
4740 },
4741 });
4742 }
4743 top += 22;
4744 }
4745
4746 assert!(looks_like_dense_prose_image_ocr(&words));
4747 }
4748
    #[test]
    fn test_dense_prose_image_ocr_rejects_chart_like_words() {
        // Mostly numeric tokens with only a few scattered labels — the token
        // mix of an axis/value chart, not of running prose. word (helper
        // defined above) builds an OcrWord from (line_key, left, text).
        let words = vec![
            word((1, 1, 1), 10, "70.2"),
            word((1, 1, 1), 90, "75.6"),
            word((1, 1, 1), 170, "92.4"),
            word((1, 1, 2), 10, "80.4"),
            word((1, 1, 2), 90, "94.2"),
            word((1, 1, 2), 170, "95.5"),
            word((1, 1, 3), 10, "Company"),
            word((1, 1, 3), 90, "A"),
            word((1, 1, 3), 170, "B"),
            word((1, 1, 4), 10, "Scene"),
            word((1, 1, 4), 90, "Document"),
            word((1, 1, 5), 10, "65"),
            word((1, 1, 5), 90, "70"),
            word((1, 1, 5), 170, "75"),
            word((1, 1, 6), 10, "80"),
            word((1, 1, 6), 90, "85"),
            word((1, 1, 6), 170, "90"),
            word((1, 1, 7), 10, "95"),
            word((1, 1, 7), 90, "100"),
        ];

        assert!(!looks_like_dense_prose_image_ocr(&words));
    }
4775
    #[test]
    fn test_dense_prose_image_ocr_rejects_scattered_chart_labels() {
        // A pie/segment-chart layout: a title line, then short count+percent
        // pairs ("57 (24%)") and category names scattered at varying x
        // positions — chart labels, not prose and not a table.
        let words = vec![
            word_at((1, 1, 1), 20, 20, 80, "Participation"),
            word_at((1, 1, 1), 120, 20, 70, "of"),
            word_at((1, 1, 1), 210, 20, 90, "Institutions"),
            word_at((1, 1, 2), 310, 50, 50, "57"),
            word_at((1, 1, 2), 380, 50, 60, "(24%)"),
            word_at((1, 1, 3), 290, 86, 40, "20"),
            word_at((1, 1, 3), 345, 86, 50, "(8%)"),
            word_at((1, 1, 4), 80, 124, 120, "Government"),
            word_at((1, 1, 4), 260, 124, 90, "Other"),
            word_at((1, 1, 4), 360, 124, 60, "State"),
            word_at((1, 1, 5), 70, 160, 80, "Civil"),
            word_at((1, 1, 5), 170, 160, 80, "Society"),
            word_at((1, 1, 5), 280, 160, 110, "Organizations"),
            word_at((1, 1, 6), 300, 194, 50, "31"),
            word_at((1, 1, 6), 365, 194, 60, "(13%)"),
            word_at((1, 1, 7), 35, 228, 120, "Educational"),
            word_at((1, 1, 7), 180, 228, 100, "Institution"),
            word_at((1, 1, 8), 250, 262, 40, "16"),
            word_at((1, 1, 8), 305, 262, 50, "(7%)"),
        ];

        // Chart-label detection wins; both the table and dense-prose
        // classifiers must decline.
        assert!(looks_like_chart_label_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
        assert!(!looks_like_dense_prose_image_ocr(&words));
    }
4804
    #[test]
    fn test_chart_label_ocr_detects_stacked_bar_chart_legend_layout() {
        // Realistic OCR dump of a stacked bar chart: y-axis tick numbers
        // ("350", "300", …) down the left edge, legend entries (with OCR
        // noise like "'™" and "m=") on the right, and year labels along the
        // bottom. This must be recognized as chart labels, never as a table.
        let words = vec![
            word_at((1, 1, 1), 10, 15, 22, "ano"),
            word_at((1, 1, 1), 10, 8, 24, "MW."),
            word_at((1, 1, 2), 410, 25, 38, "Waste"),
            word_at((1, 1, 2), 452, 25, 55, "materials"),
            word_at((1, 1, 3), 11, 38, 21, "350"),
            word_at((1, 1, 4), 11, 61, 21, "300"),
            word_at((1, 1, 4), 411, 56, 38, "Biogas"),
            word_at((1, 1, 5), 7, 79, 25, "250"),
            word_at((1, 1, 5), 399, 87, 8, "'™"),
            word_at((1, 1, 5), 411, 87, 75, "Construction"),
            word_at((1, 1, 5), 490, 86, 33, "wood"),
            word_at((1, 1, 5), 527, 87, 35, "waste"),
            word_at((1, 1, 6), 11, 106, 21, "200"),
            word_at((1, 1, 7), 411, 117, 59, "General"),
            word_at((1, 1, 7), 467, 116, 27, "wood"),
            word_at((1, 1, 7), 499, 116, 54, "(10MWs)"),
            word_at((1, 1, 8), 11, 129, 21, "150"),
            word_at((1, 1, 9), 11, 152, 21, "100"),
            word_at((1, 1, 9), 399, 148, 7, "="),
            word_at((1, 1, 9), 411, 135, 46, "General"),
            word_at((1, 1, 9), 464, 135, 27, "wood"),
            word_at((1, 1, 9), 498, 146, 56, "(<LOMW)"),
            word_at((1, 1, 10), 13, 163, 18, "50"),
            word_at((1, 1, 10), 399, 178, 7, "="),
            word_at((1, 1, 10), 411, 176, 73, "Unutilised"),
            word_at((1, 1, 10), 480, 166, 29, "wood"),
            word_at((1, 1, 10), 516, 176, 45, "(2MWs)"),
            word_at((1, 1, 11), 24, 197, 7, "o"),
            word_at((1, 1, 12), 399, 208, 8, "m="),
            word_at((1, 1, 12), 411, 206, 59, "Unutilised"),
            word_at((1, 1, 12), 474, 206, 33, "wood"),
            word_at((1, 1, 12), 512, 206, 48, "(<2MW)"),
            word_at((1, 1, 13), 51, 217, 32, "12-13"),
            word_at((1, 1, 13), 96, 217, 28, "2014"),
            word_at((1, 1, 13), 139, 217, 28, "2015"),
            word_at((1, 1, 13), 182, 217, 28, "2016"),
            word_at((1, 1, 13), 225, 217, 28, "2017"),
            word_at((1, 1, 13), 268, 217, 28, "2018"),
            word_at((1, 1, 13), 311, 217, 28, "2019"),
            word_at((1, 1, 13), 354, 217, 28, "2020"),
        ];

        assert!(looks_like_chart_label_ocr(&words));
        assert!(!looks_like_table_ocr(&words));
    }
4853
4854 #[test]
4855 fn test_build_numeric_table_border_rejects_sparse_chart_layout() {
4856 let image = image_chunk();
4857 let mut words = Vec::new();
4858 let columns = [20, 55, 90, 125, 160, 195, 230, 265, 300, 335, 370, 405];
4859
4860 for (idx, left) in columns.iter().enumerate() {
4861 words.push(word_at((1, 1, 1), *left, 20, 22, &format!("H{}", idx + 1)));
4862 }
4863 for (idx, left) in [20, 160, 300].into_iter().enumerate() {
4864 words.push(word_at((1, 1, 2), left, 52, 22, &format!("{}", idx + 1)));
4865 }
4866 for (idx, left) in [55, 195, 335].into_iter().enumerate() {
4867 words.push(word_at((1, 1, 3), left, 84, 22, &format!("{}", idx + 4)));
4868 }
4869 for (idx, left) in [90, 230, 370].into_iter().enumerate() {
4870 words.push(word_at((1, 1, 4), left, 116, 22, &format!("{}", idx + 7)));
4871 }
4872 for (idx, left) in columns.iter().enumerate() {
4873 words.push(word_at((1, 1, 5), *left, 148, 22, &format!("{}", idx + 10)));
4874 }
4875
4876 assert!(looks_like_chart_label_ocr(&words));
4877 assert!(!looks_like_table_ocr(&words));
4878 assert!(!looks_like_numeric_table_ocr(&words));
4879 assert!(build_numeric_table_border(&words, &image).is_none());
4880 }
4881
4882 #[test]
4883 fn test_lines_from_ocr_words_merges_wrapped_lines_into_blocks() {
4884 let words = vec![
4885 word_at((1, 1, 1), 20, 20, 64, "Copyright"),
4886 word_at((1, 1, 1), 100, 20, 56, "protects"),
4887 word_at((1, 1, 2), 20, 38, 52, "creative"),
4888 word_at((1, 1, 2), 84, 38, 36, "work"),
4889 word_at((1, 1, 3), 240, 20, 52, "Public"),
4890 word_at((1, 1, 3), 304, 20, 40, "domain"),
4891 word_at((1, 1, 4), 240, 38, 60, "expires"),
4892 word_at((1, 1, 4), 312, 38, 44, "later"),
4893 ];
4894
4895 let recovered = lines_from_ocr_words(&words, &image_chunk(), 400, 400, &[]);
4896
4897 assert_eq!(recovered.len(), 2);
4898 assert_eq!(recovered[0].value, "Copyright protects creative work");
4899 assert_eq!(recovered[1].value, "Public domain expires later");
4900 }
4901
4902 #[test]
4903 fn test_page_raster_ocr_skips_bar_chart_tables() {
4904 let mut chart = GrayImage::from_pixel(420, 260, Luma([255]));
4905 for x in 24..396 {
4906 chart.put_pixel(x, 222, Luma([170]));
4907 }
4908 for &(x1, x2, y1, shade) in &[
4909 (46, 82, 132, 222),
4910 (104, 140, 84, 214),
4911 (162, 198, 62, 206),
4912 (220, 256, 144, 228),
4913 ] {
4914 for x in x1..x2 {
4915 for y in y1..222 {
4916 chart.put_pixel(x, y, Luma([shade]));
4917 }
4918 }
4919 }
4920
4921 let page_bbox = BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0);
4922 let mut table = TableBorder {
4923 bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 260.0),
4924 index: None,
4925 level: None,
4926 x_coordinates: vec![0.0, 210.0, 420.0],
4927 x_widths: vec![0.0; 3],
4928 y_coordinates: vec![260.0, 130.0, 0.0],
4929 y_widths: vec![0.0; 3],
4930 rows: vec![
4931 TableBorderRow {
4932 bbox: BoundingBox::new(Some(1), 0.0, 130.0, 420.0, 260.0),
4933 index: None,
4934 level: None,
4935 row_number: 0,
4936 cells: vec![
4937 TableBorderCell {
4938 bbox: BoundingBox::new(Some(1), 0.0, 130.0, 210.0, 260.0),
4939 index: None,
4940 level: None,
4941 row_number: 0,
4942 col_number: 0,
4943 row_span: 1,
4944 col_span: 1,
4945 content: Vec::new(),
4946 contents: Vec::new(),
4947 semantic_type: None,
4948 },
4949 TableBorderCell {
4950 bbox: BoundingBox::new(Some(1), 210.0, 130.0, 420.0, 260.0),
4951 index: None,
4952 level: None,
4953 row_number: 0,
4954 col_number: 1,
4955 row_span: 1,
4956 col_span: 1,
4957 content: Vec::new(),
4958 contents: Vec::new(),
4959 semantic_type: None,
4960 },
4961 ],
4962 semantic_type: None,
4963 },
4964 TableBorderRow {
4965 bbox: BoundingBox::new(Some(1), 0.0, 0.0, 420.0, 130.0),
4966 index: None,
4967 level: None,
4968 row_number: 1,
4969 cells: vec![
4970 TableBorderCell {
4971 bbox: BoundingBox::new(Some(1), 0.0, 0.0, 210.0, 130.0),
4972 index: None,
4973 level: None,
4974 row_number: 1,
4975 col_number: 0,
4976 row_span: 1,
4977 col_span: 1,
4978 content: Vec::new(),
4979 contents: Vec::new(),
4980 semantic_type: None,
4981 },
4982 TableBorderCell {
4983 bbox: BoundingBox::new(Some(1), 210.0, 0.0, 420.0, 130.0),
4984 index: None,
4985 level: None,
4986 row_number: 1,
4987 col_number: 1,
4988 row_span: 1,
4989 col_span: 1,
4990 content: Vec::new(),
4991 contents: Vec::new(),
4992 semantic_type: None,
4993 },
4994 ],
4995 semantic_type: None,
4996 },
4997 ],
4998 num_rows: 2,
4999 num_columns: 2,
5000 is_bad_table: false,
5001 is_table_transformer: true,
5002 previous_table: None,
5003 next_table: None,
5004 };
5005
5006 enrich_empty_table_from_page_raster(&chart, &page_bbox, &mut table);
5007
5008 assert!(table
5009 .rows
5010 .iter()
5011 .flat_map(|row| row.cells.iter())
5012 .all(|cell| cell.content.is_empty()));
5013 }
5014
    #[test]
    fn test_lines_from_ocr_words_dedupes_against_native_text() {
        // OCR words spelling "Copyright protects creative work" over two lines.
        let words = vec![
            word_at((1, 1, 1), 20, 20, 64, "Copyright"),
            word_at((1, 1, 1), 100, 20, 56, "protects"),
            word_at((1, 1, 2), 20, 38, 52, "creative"),
            word_at((1, 1, 2), 84, 38, 36, "work"),
        ];
        // A native (non-OCR) text chunk that already carries the exact same
        // sentence; only `value` matters for dedup here, the remaining fields
        // are plain defaults to satisfy the struct.
        let native = vec![TextChunk {
            value: "Copyright protects creative work".to_string(),
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 10.0, 10.0),
            font_name: "Native".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: Vec::new(),
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Content,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }];

        let recovered = lines_from_ocr_words(&words, &image_chunk(), 400, 400, &native);

        // The native layer already contains the text, so OCR recovery must
        // return nothing rather than duplicate it.
        assert!(recovered.is_empty());
    }
5047}