1use crate::oarocr::TextRegion;
15use oar_ocr_core::domain::structure::{
16 FormulaResult, LayoutElement, LayoutElementType, StructureResult, TableCell, TableResult,
17};
18use oar_ocr_core::processors::{
19 BoundingBox, SplitConfig as OcrSplitConfig, create_expanded_ocr_for_table, parse_cell_grid_info,
20};
21use std::cmp::Ordering;
22
/// Provenance tag attached to every OCR candidate considered for table-cell matching.
#[derive(Debug, Clone, Copy)]
enum OcrSource {
    /// Candidate with no index into the original text-region list: either a
    /// region produced by cross-cell splitting or a synthetic formula region.
    Split,
    /// Candidate taken from the original text regions; the payload is its index
    /// in that list, which is what gets recorded as "used" after a match.
    Original(usize),
}
32
33const EXCLUDED_FROM_OCR_LABELS: [LayoutElementType; 3] = [
43 LayoutElementType::Table,
44 LayoutElementType::Seal,
45 LayoutElementType::Formula, ];
47
/// Tunable thresholds and switches for [`ResultStitcher`].
///
/// Field notes marked "presumably" describe intent inferred from the field
/// name; the code that consumes those fields is not part of this section.
#[derive(Clone, Debug)]
pub struct StitchConfig {
    /// Presumably the minimum overlap, in pixels, for two boxes to count as
    /// overlapping — TODO confirm against `is_overlapping`.
    pub overlap_min_pixels: f32,
    /// Presumably the minimum intersection-over-area for assigning text to a
    /// table cell — TODO confirm.
    pub cell_text_min_ioa: f32,
    /// Presumably requires the text box centre to lie inside the cell — TODO confirm.
    pub require_text_center_inside_cell: bool,
    /// Presumably the minimum IoU for merging cell boxes — TODO confirm.
    pub cell_merge_min_iou: f32,
    /// Presumably the minimum IoU for attaching a formula to a cell — TODO confirm.
    pub formula_to_cell_min_iou: f32,
    /// Vertical tolerance passed to the row-aware table matcher when grouping
    /// cell boxes into rows by their top edge.
    pub same_line_y_tolerance: f32,
    /// Presumably the vertical-overlap threshold for treating two boxes as
    /// being on the same text line — TODO confirm.
    pub line_height_iou_threshold: f32,
    /// When `true` (and the table is not using end-to-end cells without
    /// detected cell boxes), OCR boxes that span several cells are split
    /// before matching.
    pub enable_cross_cell_split: bool,
}
64
65impl Default for StitchConfig {
66 fn default() -> Self {
67 Self {
68 overlap_min_pixels: 3.0,
69 cell_text_min_ioa: 0.6,
70 require_text_center_inside_cell: true,
71 cell_merge_min_iou: 0.3,
72 formula_to_cell_min_iou: 0.01,
73 same_line_y_tolerance: 10.0,
74 line_height_iou_threshold: 0.6,
75 enable_cross_cell_split: true,
76 }
77 }
78}
79
/// Stateless entry point for stitching: all routines are associated functions
/// that operate on a `StructureResult` passed in by the caller.
pub struct ResultStitcher;
82
83impl ResultStitcher {
84 pub fn stitch(result: &mut StructureResult) {
93 let cfg = StitchConfig::default();
94 Self::stitch_with_config(result, &cfg);
95 }
96
97 pub fn stitch_with_config(result: &mut StructureResult, cfg: &StitchConfig) {
98 let mut used_region_indices = std::collections::HashSet::new();
100
101 let mut regions = result.text_regions.clone().unwrap_or_default();
103
104 tracing::debug!("Stitching: {} text regions", regions.len());
105
106 Self::stitch_tables(
111 &mut result.tables,
112 ®ions,
113 &result.formulas,
114 &mut used_region_indices,
115 cfg,
116 );
117
118 tracing::debug!(
119 "After stitch_tables: {} regions used",
120 used_region_indices.len()
121 );
122
123 Self::fill_formula_elements(&mut result.layout_elements, &result.formulas, cfg);
126
127 Self::inject_inline_formulas(&mut result.layout_elements, &mut regions, cfg);
133
134 Self::stitch_layout_elements(
138 &mut result.layout_elements,
139 ®ions,
140 &mut used_region_indices,
141 cfg,
142 );
143
144 tracing::debug!(
145 "After stitch_layout_elements: {} regions used",
146 used_region_indices.len()
147 );
148
149 for element in &result.layout_elements {
160 if element.element_type == LayoutElementType::Seal {
161 for (idx, region) in regions.iter().enumerate() {
162 if Self::is_overlapping(&element.bbox, ®ion.bounding_box, cfg) {
163 used_region_indices.insert(idx);
164 }
165 }
166 }
167 }
168
169 let table_bboxes: Vec<&BoundingBox> = result
174 .layout_elements
175 .iter()
176 .filter(|e| e.element_type == LayoutElementType::Table)
177 .map(|e| &e.bbox)
178 .collect();
179
180 let image_chart_bboxes: Vec<&BoundingBox> = result
181 .layout_elements
182 .iter()
183 .filter(|e| {
184 matches!(
185 e.element_type,
186 LayoutElementType::Image | LayoutElementType::Chart
187 )
188 })
189 .map(|e| &e.bbox)
190 .collect();
191
192 let figure_caption_bboxes: Vec<&BoundingBox> = result
197 .layout_elements
198 .iter()
199 .filter(|e| {
200 matches!(
201 e.element_type,
202 LayoutElementType::FigureTitle
203 | LayoutElementType::ChartTitle
204 | LayoutElementType::FigureTableChartTitle
205 )
206 })
207 .map(|e| &e.bbox)
208 .collect();
209
210 let content_element_bboxes: Vec<&BoundingBox> = result
213 .layout_elements
214 .iter()
215 .filter(|e| {
216 matches!(
217 e.element_type,
218 LayoutElementType::Text
219 | LayoutElementType::DocTitle
220 | LayoutElementType::ParagraphTitle
221 | LayoutElementType::Abstract
222 )
223 })
224 .map(|e| &e.bbox)
225 .collect();
226
227 let original_element_count = result.layout_elements.len();
228 let mut new_elements = Vec::new();
229 for (idx, region) in regions.iter().enumerate() {
230 if !used_region_indices.contains(&idx)
231 && let Some(text) = ®ion.text
232 {
233 let overlaps_table = table_bboxes
236 .iter()
237 .any(|table_bbox| region.bounding_box.ioa(table_bbox) > 0.3);
238
239 if overlaps_table {
240 continue;
242 }
243
244 let overlaps_image_chart = image_chart_bboxes
246 .iter()
247 .any(|bbox| region.bounding_box.ioa(bbox) > 0.5);
248
249 if overlaps_image_chart {
250 continue;
251 }
252
253 let in_inferred_figure_region = figure_caption_bboxes.iter().any(|cap| {
259 let orphan_bb = ®ion.bounding_box;
260 let above_caption = orphan_bb.y_max() < cap.y_max();
262 let x_margin = (cap.x_max() - cap.x_min()) * 0.1;
264 let in_x_range = orphan_bb.x_min() >= (cap.x_min() - x_margin)
265 && orphan_bb.x_max() <= (cap.x_max() + x_margin);
266 above_caption && in_x_range
267 });
268
269 if in_inferred_figure_region {
270 let inside_content_element = content_element_bboxes
272 .iter()
273 .any(|bbox| region.bounding_box.ioa(bbox) > 0.5);
274 if !inside_content_element {
275 continue;
276 }
277 }
278
279 let element_type = if region.is_formula() {
283 LayoutElementType::Formula
284 } else {
285 LayoutElementType::Text
286 };
287
288 let element = LayoutElement::new(
289 region.bounding_box.clone(),
290 element_type,
291 region.confidence.unwrap_or(0.0),
292 )
293 .with_text(text.as_ref().to_string());
294
295 new_elements.push(element);
296 }
297 }
298
299 if let Some(ref mut region_blocks) = result.region_blocks {
302 for (new_idx, new_element) in new_elements.iter().enumerate() {
303 let element_index = original_element_count + new_idx;
304
305 let mut best_region_idx: Option<usize> = None;
307 let mut best_overlap = 0.0f32;
308
309 for (region_idx, region) in region_blocks.iter().enumerate() {
310 let overlap = new_element.bbox.intersection_area(®ion.bbox);
312 if overlap > best_overlap {
313 best_overlap = overlap;
314 best_region_idx = Some(region_idx);
315 }
316 }
317
318 if let Some(region_idx) = best_region_idx {
320 region_blocks[region_idx]
321 .element_indices
322 .push(element_index);
323 }
324 }
325 }
326
327 result.layout_elements.extend(new_elements);
328
329 let width = if let Some(img) = &result.rectified_img {
333 img.width() as f32
334 } else {
335 result
337 .layout_elements
338 .iter()
339 .map(|e| e.bbox.x_max())
340 .fold(0.0f32, f32::max)
341 .max(1000.0) };
343
344 if result.region_blocks.is_none() {
348 let height = if let Some(img) = &result.rectified_img {
349 img.height() as f32
350 } else {
351 result
352 .layout_elements
353 .iter()
354 .map(|e| e.bbox.y_max())
355 .fold(0.0f32, f32::max)
356 .max(1000.0)
357 };
358 Self::sort_layout_elements_enhanced(&mut result.layout_elements, width, height);
359 }
360
361 Self::assign_order_indices(&mut result.layout_elements);
363 }
364
365 fn assign_order_indices(elements: &mut [LayoutElement]) {
370 let mut order_index = 1u32;
371 for element in elements.iter_mut() {
372 if Self::should_have_order_index(element.element_type) {
375 element.order_index = Some(order_index);
376 order_index += 1;
377 }
378 }
379 }
380
381 fn should_have_order_index(element_type: LayoutElementType) -> bool {
385 matches!(
386 element_type,
387 LayoutElementType::Text
388 | LayoutElementType::Content
389 | LayoutElementType::Abstract
390 | LayoutElementType::DocTitle
391 | LayoutElementType::ParagraphTitle
392 | LayoutElementType::Table
393 | LayoutElementType::Image
394 | LayoutElementType::Chart
395 | LayoutElementType::Formula
396 | LayoutElementType::Seal
397 | LayoutElementType::Reference
398 | LayoutElementType::ReferenceContent
399 | LayoutElementType::List
400 | LayoutElementType::FigureTitle
401 | LayoutElementType::TableTitle
402 | LayoutElementType::ChartTitle
403 | LayoutElementType::FigureTableChartTitle
404 )
405 }
406
407 fn stitch_tables(
408 tables: &mut [TableResult],
409 text_regions: &[TextRegion],
410 formulas: &[FormulaResult],
411 used_indices: &mut std::collections::HashSet<usize>,
412 cfg: &StitchConfig,
413 ) {
414 for (table_idx, table) in tables.iter_mut().enumerate() {
415 if table.cells.is_empty() {
416 continue;
417 }
418 let has_detected_cells = table.detected_cell_bboxes.is_some();
421 let e2e_like_cells = table.is_e2e && !has_detected_cells;
422
423 let table_bbox = table.bbox.clone(); let relevant_indices: Vec<usize> = text_regions
426 .iter()
427 .enumerate()
428 .filter(|(idx, region)| {
429 !used_indices.contains(idx)
430 && Self::is_overlapping(&table_bbox, ®ion.bounding_box, cfg)
431 })
432 .map(|(idx, _)| idx)
433 .collect();
434
435 let (split_regions, split_ocr_indices, _split_cell_assignments) =
439 if cfg.enable_cross_cell_split && !e2e_like_cells {
440 Self::split_cross_cell_ocr_boxes(text_regions, &relevant_indices, &table.cells)
441 } else {
442 (
443 Vec::new(),
444 std::collections::HashSet::new(),
445 std::collections::HashMap::new(),
446 )
447 };
448
449 let mut ocr_candidates: Vec<(OcrSource, TextRegion)> = Vec::new();
451
452 for region in &split_regions {
453 let mut normalized_region = region.clone();
454 Self::normalize_tiny_symbol_for_paddlex(&mut normalized_region);
455
456 if normalized_region
457 .text
458 .as_ref()
459 .map(|t| !t.trim().is_empty())
460 .unwrap_or(false)
461 {
462 ocr_candidates.push((OcrSource::Split, normalized_region));
463 }
464 }
465
466 for &ocr_idx in &relevant_indices {
468 if split_ocr_indices.contains(&ocr_idx) {
469 used_indices.insert(ocr_idx);
470 continue;
471 }
472
473 if let Some(region) = text_regions.get(ocr_idx) {
474 let mut normalized_region = region.clone();
475 Self::normalize_tiny_symbol_for_paddlex(&mut normalized_region);
476
477 if normalized_region
478 .text
479 .as_ref()
480 .map(|t| !t.trim().is_empty())
481 .unwrap_or(false)
482 {
483 ocr_candidates.push((OcrSource::Original(ocr_idx), normalized_region));
484 }
485 }
486 }
487
488 for formula in formulas {
492 let w = formula.bbox.x_max() - formula.bbox.x_min();
493 let h = formula.bbox.y_max() - formula.bbox.y_min();
494 if w <= 1.0 || h <= 1.0 {
495 continue;
496 }
497 if !Self::is_overlapping(&table_bbox, &formula.bbox, cfg) {
498 continue;
499 }
500 let latex = &formula.latex;
501 let formatted = if latex.starts_with('$') && latex.ends_with('$') {
502 latex.clone()
503 } else {
504 format!("${}$", latex)
505 };
506 let mut formula_region = TextRegion::new(formula.bbox.clone());
507 formula_region.text = Some(formatted.into());
508 formula_region.confidence = Some(1.0);
509 ocr_candidates.push((OcrSource::Split, formula_region));
510 }
511
512 let structure_tokens = table.structure_tokens.clone();
513
514 let mut td_to_cell_mapping: Option<Vec<Option<usize>>> = None;
517 if !e2e_like_cells
518 && let Some(tokens) = structure_tokens.as_deref()
519 && !ocr_candidates.is_empty()
520 && let Some((mapping, matched_candidate_indices)) =
521 Self::match_table_cells_with_structure_rows(
522 &mut table.cells,
523 tokens,
524 &ocr_candidates,
525 cfg.same_line_y_tolerance,
526 table.detected_cell_bboxes.as_deref(),
527 )
528 {
529 td_to_cell_mapping = Some(mapping);
530 for matched_idx in matched_candidate_indices {
531 if let Some((OcrSource::Original(region_idx), _)) =
532 ocr_candidates.get(matched_idx)
533 {
534 used_indices.insert(*region_idx);
535 }
536 }
537 }
538
539 if td_to_cell_mapping.is_none() {
541 let (cell_to_ocr, matched_candidate_indices) =
542 Self::match_table_and_ocr_by_iou_distance(
543 &table.cells,
544 &ocr_candidates,
545 !e2e_like_cells, e2e_like_cells, );
548
549 for matched_idx in matched_candidate_indices {
550 if let Some((OcrSource::Original(region_idx), _)) =
551 ocr_candidates.get(matched_idx)
552 {
553 used_indices.insert(*region_idx);
554 }
555 }
556
557 for (cell_idx, cell) in table.cells.iter_mut().enumerate() {
558 let has_text = cell
559 .text
560 .as_ref()
561 .map(|t| !t.trim().is_empty())
562 .unwrap_or(false);
563 if has_text {
564 continue;
565 }
566
567 if let Some(candidate_indices) = cell_to_ocr.get(&cell_idx) {
568 if e2e_like_cells {
569 let joined = Self::join_ocr_texts_paddlex_style(
570 candidate_indices,
571 &ocr_candidates,
572 );
573 if !joined.is_empty() {
574 cell.text = Some(joined);
575 }
576 } else {
577 let mut cell_text_regions: Vec<(&TextRegion, &str)> = candidate_indices
578 .iter()
579 .filter_map(|&idx| {
580 ocr_candidates
581 .get(idx)
582 .and_then(|(_, r)| r.text.as_deref().map(|t| (r, t)))
583 })
584 .collect();
585
586 Self::sort_and_join_texts(
587 &mut cell_text_regions,
588 Some(&cell.bbox),
589 cfg,
590 |joined| {
591 if !joined.is_empty() {
592 cell.text = Some(joined);
593 }
594 },
595 );
596 }
597 }
598 }
599 }
600
601 Self::normalize_checkbox_symbols_in_table(&mut table.cells);
608
609 if let Some(tokens) = structure_tokens.as_deref() {
611 let cell_texts: Vec<Option<String>> =
612 if let Some(ref td_mapping) = td_to_cell_mapping {
613 td_mapping
615 .iter()
616 .map(|cell_idx| {
617 cell_idx
618 .and_then(|idx| table.cells.get(idx))
619 .and_then(|cell| cell.text.clone())
620 })
621 .collect()
622 } else {
623 Self::collect_cell_texts_for_tokens(&table.cells, tokens)
627 };
628
629 let html_structure =
630 crate::processors::wrap_table_html_with_content(tokens, &cell_texts);
631 table.html_structure = Some(html_structure);
632 table.cell_texts = Some(cell_texts);
633 }
634
635 tracing::debug!("Table {}: matching complete.", table_idx);
636 }
637 }
638
639 fn match_table_and_ocr_by_iou_distance(
645 cells: &[TableCell],
646 ocr_candidates: &[(OcrSource, TextRegion)],
647 require_positive_iou: bool,
648 use_paddlex_distance: bool,
649 ) -> (
650 std::collections::HashMap<usize, Vec<usize>>,
651 std::collections::HashSet<usize>,
652 ) {
653 let mut cell_to_ocr: std::collections::HashMap<usize, Vec<usize>> =
654 std::collections::HashMap::new();
655 let mut matched_candidate_indices = std::collections::HashSet::new();
656
657 if cells.is_empty() || ocr_candidates.is_empty() {
658 return (cell_to_ocr, matched_candidate_indices);
659 }
660
661 for (candidate_idx, (_, region)) in ocr_candidates.iter().enumerate() {
662 let ocr_bbox = ®ion.bounding_box;
663
664 let ocr_cx = (ocr_bbox.x_min() + ocr_bbox.x_max()) / 2.0;
669 let ocr_cy = (ocr_bbox.y_min() + ocr_bbox.y_max()) / 2.0;
670 let center_cell = cells.iter().enumerate().find(|(_, cell)| {
671 ocr_cx >= cell.bbox.x_min()
672 && ocr_cx <= cell.bbox.x_max()
673 && ocr_cy >= cell.bbox.y_min()
674 && ocr_cy <= cell.bbox.y_max()
675 && ocr_bbox.ioa(&cell.bbox) > 0.7
676 });
677
678 if let Some((cell_idx, _)) = center_cell {
679 cell_to_ocr.entry(cell_idx).or_default().push(candidate_idx);
680 matched_candidate_indices.insert(candidate_idx);
681 continue;
682 }
683
684 let mut best_cell_idx: Option<usize> = None;
686 let mut min_cost = (f32::MAX, f32::MAX);
687 let mut candidate_costs: Vec<(usize, (f32, f32))> = Vec::new();
688
689 for (cell_idx, cell) in cells.iter().enumerate() {
690 let iou = Self::calculate_iou(®ion.bounding_box, &cell.bbox);
691 if require_positive_iou && iou <= 0.0 {
692 continue;
693 }
694
695 let dist = if use_paddlex_distance {
696 Self::paddlex_distance(&cell.bbox, ®ion.bounding_box)
697 } else {
698 Self::l1_distance(®ion.bounding_box, &cell.bbox)
699 };
700 let cost = (1.0 - iou, dist);
701 candidate_costs.push((cell_idx, cost));
702 if Self::is_better_paddlex_match_cost(cost, min_cost, cell_idx, best_cell_idx) {
703 min_cost = cost;
704 best_cell_idx = Some(cell_idx);
705 }
706 }
707
708 if let Some(mut cell_idx) = best_cell_idx {
709 if use_paddlex_distance {
710 cell_idx = Self::maybe_prefer_upper_boundary_cell(
711 cells,
712 ®ion.bounding_box,
713 cell_idx,
714 min_cost,
715 &candidate_costs,
716 );
717 }
718 cell_to_ocr.entry(cell_idx).or_default().push(candidate_idx);
719 matched_candidate_indices.insert(candidate_idx);
720 }
721 }
722
723 (cell_to_ocr, matched_candidate_indices)
724 }
725
/// Decides whether `candidate_cost` should replace `current_cost` as the best match.
///
/// Costs are compared lexicographically with a tolerance of `1e-4` on each
/// component; a full tie is broken in favour of the smaller index.
/// A candidate with a non-finite component never wins; otherwise it always
/// wins when there is no current match or the current cost is non-finite.
fn is_better_paddlex_match_cost(
    candidate_cost: (f32, f32),
    current_cost: (f32, f32),
    candidate_idx: usize,
    current_idx: Option<usize>,
) -> bool {
    const COST_EPS: f32 = 1e-4;

    let (cand_primary, cand_secondary) = candidate_cost;
    let (cur_primary, cur_secondary) = current_cost;

    if !(cand_primary.is_finite() && cand_secondary.is_finite()) {
        return false;
    }

    let Some(existing_idx) = current_idx else {
        return true;
    };
    if !(cur_primary.is_finite() && cur_secondary.is_finite()) {
        return true;
    }

    // Primary component: clearly better wins, clearly worse loses.
    if cand_primary + COST_EPS < cur_primary {
        return true;
    }
    if (cand_primary - cur_primary).abs() > COST_EPS {
        return false;
    }

    // Primary components tie: fall back to the secondary one, then the index.
    if cand_secondary + COST_EPS < cur_secondary {
        return true;
    }
    (cand_secondary - cur_secondary).abs() <= COST_EPS && candidate_idx < existing_idx
}
765
766 fn maybe_prefer_upper_boundary_cell(
774 cells: &[TableCell],
775 ocr_box: &BoundingBox,
776 best_cell_idx: usize,
777 best_cost: (f32, f32),
778 candidate_costs: &[(usize, (f32, f32))],
779 ) -> usize {
780 const BOUNDARY_COST_IOU_DELTA: f32 = 0.12;
781 const BOUNDARY_OVERLAP_MIN: f32 = 0.35;
782
783 let Some(best_cell) = cells.get(best_cell_idx) else {
784 return best_cell_idx;
785 };
786 let (Some(best_row), Some(best_col)) = (best_cell.row, best_cell.col) else {
787 return best_cell_idx;
788 };
789 if best_row == 0 {
790 return best_cell_idx;
791 }
792
793 let upper_cell_idx = cells
794 .iter()
795 .position(|cell| cell.row == Some(best_row - 1) && cell.col == Some(best_col));
796 let Some(upper_cell_idx) = upper_cell_idx else {
797 return best_cell_idx;
798 };
799
800 let boundary_y = best_cell.bbox.y_min();
801 if !(ocr_box.y_min() < boundary_y && ocr_box.y_max() > boundary_y) {
802 return best_cell_idx;
803 }
804
805 let best_inter = Self::compute_inter(&best_cell.bbox, ocr_box);
806 let Some(upper_cell) = cells.get(upper_cell_idx) else {
807 return best_cell_idx;
808 };
809 let upper_inter = Self::compute_inter(&upper_cell.bbox, ocr_box);
810 if best_inter < BOUNDARY_OVERLAP_MIN || upper_inter < BOUNDARY_OVERLAP_MIN {
811 return best_cell_idx;
812 }
813
814 let upper_cost = candidate_costs
815 .iter()
816 .find_map(|(idx, cost)| (*idx == upper_cell_idx).then_some(*cost));
817 let Some(upper_cost) = upper_cost else {
818 return best_cell_idx;
819 };
820 if !upper_cost.0.is_finite() || !upper_cost.1.is_finite() {
821 return best_cell_idx;
822 }
823
824 if upper_cost.0 <= best_cost.0 + BOUNDARY_COST_IOU_DELTA {
825 upper_cell_idx
826 } else {
827 best_cell_idx
828 }
829 }
830
831 fn normalize_tiny_symbol_for_paddlex(region: &mut TextRegion) {
836 let Some(text) = region.text.as_deref() else {
837 return;
838 };
839 if text.chars().count() != 1 {
840 return;
841 }
842 let Some(score) = region.confidence else {
843 return;
844 };
845
846 let width = (region.bounding_box.x_max() - region.bounding_box.x_min()).max(0.0);
847 let height = (region.bounding_box.y_max() - region.bounding_box.y_min()).max(0.0);
848
849 let replacement = if text == "=" && score < 0.45 && width <= 9.5 && height <= 7.5 {
850 Some(",")
851 } else if text == "=" && score < 0.45 && width <= 12.5 && height > 7.5 && height <= 10.5 {
852 Some("-")
853 } else if text == "0" && score < 0.20 && width <= 14.5 && height <= 14.5 {
854 Some(";")
855 } else {
856 None
857 };
858
859 if let Some(value) = replacement {
860 region.text = Some(std::sync::Arc::<str>::from(value));
861 }
862 }
863
864 fn normalize_checkbox_symbols_in_table(cells: &mut [TableCell]) {
865 let mut has_positive_candidate = false;
866 let mut has_negative_candidate = false;
867
868 for cell in cells.iter() {
869 let Some(text) = cell.text.as_deref() else {
870 continue;
871 };
872 let trimmed = text.trim();
873 if trimmed.chars().count() != 1 {
874 continue;
875 }
876 match trimmed.chars().next().unwrap_or_default() {
877 '✓' | 'ü' | 'Ü' | 'L' | '√' | '☑' => has_positive_candidate = true,
878 '✗' | 'X' | 'x' | '✕' | '✖' | '☒' => has_negative_candidate = true,
879 _ => {}
880 }
881 }
882
883 for cell in cells.iter_mut() {
884 let Some(text) = cell.text.clone() else {
885 continue;
886 };
887 let trimmed = text.trim();
888 if trimmed.chars().count() != 1 {
889 continue;
890 }
891 let mapped = match trimmed.chars().next().unwrap_or_default() {
892 'ü' | 'Ü' | '√' | '☑' => Some("✓"),
894 'L' if has_positive_candidate && has_negative_candidate => Some("✓"),
896 '✕' | '✖' | '☒' => Some("✗"),
898 'X' | 'x' if has_positive_candidate && has_negative_candidate => Some("✗"),
900 _ => None,
901 };
902
903 if let Some(symbol) = mapped {
904 cell.text = Some(symbol.to_string());
905 }
906 }
907 }
908
909 fn join_ocr_texts_paddlex_style(
911 candidate_indices: &[usize],
912 ocr_candidates: &[(OcrSource, TextRegion)],
913 ) -> String {
914 let mut joined = String::new();
915
916 for (i, &candidate_idx) in candidate_indices.iter().enumerate() {
917 let Some((_, region)) = ocr_candidates.get(candidate_idx) else {
918 continue;
919 };
920 let Some(text) = region.text.as_deref() else {
921 continue;
922 };
923
924 let mut content = text.to_string();
925 if candidate_indices.len() > 1 {
926 if content.is_empty() {
927 continue;
928 }
929 if content.starts_with(' ') {
930 content = content[1..].to_string();
931 }
932 if content.starts_with("<b>") {
933 content = content[3..].to_string();
934 }
935 if content.ends_with("</b>") {
936 content.truncate(content.len().saturating_sub(4));
937 }
938 if content.is_empty() {
939 continue;
940 }
941 if i != candidate_indices.len() - 1 && !content.ends_with(' ') {
942 content.push_str("<br/>");
943 }
944 }
945 joined.push_str(&content);
946 }
947
948 joined
949 }
950
/// Matches OCR candidates to table cells row by row, guided by the structure tokens.
///
/// Cell boxes are grouped into visual rows, those rows are aligned with the
/// `<tr>` rows found in `structure_tokens`, and within each aligned row every
/// OCR candidate whose `ioa` with a cell box exceeds 0.7 is attached to that
/// cell. The tokens are then walked once more to build, for every `<td>`, the
/// index of the cell it maps to, and to write the composed text into cells
/// that do not already have non-blank text.
///
/// * `row_y_tolerance` — tolerance used when grouping cell boxes into rows.
/// * `cell_bboxes_override` — only its presence is used here: when `Some`, an
///   OCR candidate matched in an earlier row is not offered to later rows.
///
/// Returns `None` when any input is empty, when no row can be derived from
/// the cells or the tokens, or when the tokens contain no `<td>`. Otherwise
/// returns the per-`<td>` cell mapping and the set of matched candidate indices.
fn match_table_cells_with_structure_rows(
    cells: &mut [TableCell],
    structure_tokens: &[String],
    ocr_candidates: &[(OcrSource, TextRegion)],
    row_y_tolerance: f32,
    cell_bboxes_override: Option<&[BoundingBox]>,
) -> Option<(Vec<Option<usize>>, std::collections::HashSet<usize>)> {
    if cells.is_empty() || structure_tokens.is_empty() || ocr_candidates.is_empty() {
        return None;
    }

    // `cell_sorted_indices`: cell indices in row-major visual order.
    // `cell_row_flags`: cumulative row boundaries into that order, starting at 0.
    let (cell_sorted_indices, cell_row_flags) =
        Self::sort_table_cells_boxes(cells, row_y_tolerance);

    if cell_sorted_indices.is_empty() || cell_row_flags.is_empty() {
        return None;
    }

    // For each `<tr>`, the running `<td>` count at its first cell.
    let mut row_start_index = Self::find_row_start_index(structure_tokens);
    if row_start_index.is_empty() {
        return None;
    }

    // Align visual rows with structure rows, then append end sentinels so
    // that row `k` spans `[cell_aligned[k], cell_aligned[k + 1])`.
    let mut cell_aligned = Self::map_and_get_max(&cell_row_flags, &row_start_index);
    cell_aligned.push(cell_sorted_indices.len());
    row_start_index.push(
        structure_tokens
            .iter()
            .filter(|t| Self::is_td_end_token(t))
            .count(),
    );

    let use_cross_row_dedup = cell_bboxes_override.is_some();
    let mut globally_matched_ocr: std::collections::HashSet<usize> =
        std::collections::HashSet::new();
    // One map per aligned row: position of the cell within the row -> matched candidate indices.
    let mut all_matched: Vec<std::collections::HashMap<usize, Vec<usize>>> = Vec::new();

    for k in 0..cell_aligned.len().saturating_sub(1) {
        let row_start = cell_aligned[k].min(cell_sorted_indices.len());
        let row_end = cell_aligned[k + 1].min(cell_sorted_indices.len());

        let mut matched: std::collections::HashMap<usize, Vec<usize>> =
            std::collections::HashMap::new();

        for (local_idx, &cell_idx) in cell_sorted_indices[row_start..row_end].iter().enumerate()
        {
            // `cells` is non-empty here, so the clamp keeps the index in range.
            let cell_box = &cells[cell_idx.min(cells.len() - 1)].bbox;

            for (ocr_idx, (_, ocr_region)) in ocr_candidates.iter().enumerate() {
                if use_cross_row_dedup && globally_matched_ocr.contains(&ocr_idx) {
                    continue;
                }
                let ioa = ocr_region.bounding_box.ioa(cell_box);
                if ioa > 0.7 {
                    matched.entry(local_idx).or_default().push(ocr_idx);
                }
            }
        }

        // Candidates become unavailable only to later rows; cells within the
        // same row may still share a candidate.
        if use_cross_row_dedup {
            for indices in matched.values() {
                globally_matched_ocr.extend(indices.iter().copied());
            }
        }

        all_matched.push(matched);
    }

    let mut td_to_cell_mapping: Vec<Option<usize>> = Vec::new();
    let mut matched_candidate_indices: std::collections::HashSet<usize> =
        std::collections::HashSet::new();

    // `td_index`: position of the current `<td>` within its `<tr>`.
    // `td_count`: number of `<td>` seen so far across the whole table.
    // `matched_row_idx`: which aligned row the current `<td>` belongs to.
    let mut td_index = 0usize;
    let mut td_count = 0usize;
    let mut matched_row_idx = 0usize;

    for tag in structure_tokens {
        if tag == "<tr>" {
            td_index = 0; continue;
        }
        if !Self::is_td_end_token(tag) {
            continue;
        }

        let row_matches = all_matched.get(matched_row_idx);
        let matched_ocr_indices = row_matches.and_then(|m| m.get(&td_index));
        let matched_text = matched_ocr_indices
            .and_then(|indices| Self::compose_matched_cell_text(indices, ocr_candidates));

        if let Some(indices) = matched_ocr_indices {
            matched_candidate_indices.extend(indices.iter().copied());
        }

        // Cell for this `<td>`: start of the aligned row plus the position in the row.
        let mapped_cell_idx = cell_aligned
            .get(matched_row_idx)
            .copied()
            .and_then(|row_start| {
                let sorted_pos = row_start + td_index;
                cell_sorted_indices.get(sorted_pos).copied()
            })
            .filter(|&idx| idx < cells.len());

        td_to_cell_mapping.push(mapped_cell_idx);

        if let (Some(cell_idx), Some(text)) = (mapped_cell_idx, matched_text)
            && let Some(cell) = cells.get_mut(cell_idx)
        {
            // Existing non-blank cell text is never overwritten.
            let has_text = cell
                .text
                .as_ref()
                .map(|t| !t.trim().is_empty())
                .unwrap_or(false);
            if !has_text {
                cell.text = Some(text);
            }
        }

        td_index += 1;
        td_count += 1;

        // Move to the next aligned row once its first `<td>` has been reached.
        if matched_row_idx + 1 < row_start_index.len()
            && td_count >= row_start_index[matched_row_idx + 1]
        {
            matched_row_idx += 1;
        }
    }

    if td_to_cell_mapping.is_empty() {
        None
    } else {
        Some((td_to_cell_mapping, matched_candidate_indices))
    }
}
1122
1123 fn collect_cell_texts_for_tokens(
1129 cells: &[TableCell],
1130 tokens: &[String],
1131 ) -> Vec<Option<String>> {
1132 if cells.is_empty() {
1133 return Vec::new();
1134 }
1135
1136 let token_grid = parse_cell_grid_info(tokens);
1138 let td_count = token_grid.len();
1139
1140 let mut grid_to_cell: std::collections::HashMap<(usize, usize), usize> =
1142 std::collections::HashMap::new();
1143 let mut has_grid_info = false;
1144
1145 for (cell_idx, cell) in cells.iter().enumerate() {
1146 if let (Some(row), Some(col)) = (cell.row, cell.col) {
1147 grid_to_cell.insert((row, col), cell_idx);
1148 has_grid_info = true;
1149 }
1150 }
1151
1152 if has_grid_info {
1153 token_grid
1155 .iter()
1156 .map(|gi| {
1157 grid_to_cell
1158 .get(&(gi.row, gi.col))
1159 .and_then(|&idx| cells.get(idx))
1160 .and_then(|cell| cell.text.clone())
1161 })
1162 .collect()
1163 } else {
1164 (0..td_count)
1166 .map(|i| cells.get(i).and_then(|cell| cell.text.clone()))
1167 .collect()
1168 }
1169 }
1170
1171 fn sort_table_cells_boxes(
1175 cells: &[TableCell],
1176 row_y_tolerance: f32,
1177 ) -> (Vec<usize>, Vec<usize>) {
1178 if cells.is_empty() {
1179 return (Vec::new(), Vec::new());
1180 }
1181
1182 let mut by_y: Vec<usize> = (0..cells.len()).collect();
1183 by_y.sort_by(|&a, &b| {
1184 cells[a]
1185 .bbox
1186 .y_min()
1187 .partial_cmp(&cells[b].bbox.y_min())
1188 .unwrap_or(Ordering::Equal)
1189 });
1190
1191 let mut rows: Vec<Vec<usize>> = Vec::new();
1192 let mut current_row: Vec<usize> = Vec::new();
1193 let mut current_y: Option<f32> = None;
1194
1195 for idx in by_y {
1196 let y = cells[idx].bbox.y_min();
1197 match current_y {
1198 None => {
1199 current_row.push(idx);
1200 current_y = Some(y);
1201 }
1202 Some(row_y) if (y - row_y).abs() <= row_y_tolerance => {
1203 current_row.push(idx);
1204 }
1205 Some(_) => {
1206 current_row.sort_by(|&a, &b| {
1207 cells[a]
1208 .bbox
1209 .x_min()
1210 .partial_cmp(&cells[b].bbox.x_min())
1211 .unwrap_or(Ordering::Equal)
1212 });
1213 rows.push(current_row);
1214 current_row = vec![idx];
1215 current_y = Some(y);
1216 }
1217 }
1218 }
1219
1220 if !current_row.is_empty() {
1221 current_row.sort_by(|&a, &b| {
1222 cells[a]
1223 .bbox
1224 .x_min()
1225 .partial_cmp(&cells[b].bbox.x_min())
1226 .unwrap_or(Ordering::Equal)
1227 });
1228 rows.push(current_row);
1229 }
1230
1231 let mut sorted = Vec::with_capacity(cells.len());
1232 let mut flags = Vec::with_capacity(rows.len() + 1);
1233 flags.push(0);
1234
1235 for row in rows {
1236 sorted.extend(row.iter().copied());
1237 let next = flags.last().copied().unwrap_or(0) + row.len();
1238 flags.push(next);
1239 }
1240
1241 (sorted, flags)
1242 }
1243
1244 fn find_row_start_index(structure_tokens: &[String]) -> Vec<usize> {
1246 let mut row_start_indices = Vec::new();
1247 let mut current_index = 0usize;
1248 let mut inside_row = false;
1249
1250 for token in structure_tokens {
1251 if token == "<tr>" {
1252 inside_row = true;
1253 } else if token == "</tr>" {
1254 inside_row = false;
1255 } else if Self::is_td_end_token(token) && inside_row {
1256 row_start_indices.push(current_index);
1257 inside_row = false;
1258 }
1259
1260 if Self::is_td_end_token(token) {
1261 current_index += 1;
1262 }
1263 }
1264
1265 row_start_indices
1266 }
1267
/// Maps each entry of `row_start_index` to the running maximum of the flags consumed so far.
///
/// Flags are consumed from the front of `table_cells_flag` for as long as the
/// next one is `<=` the current row start; consumption never rewinds. If no
/// flag has been consumed yet, the row start itself is used.
fn map_and_get_max(table_cells_flag: &[usize], row_start_index: &[usize]) -> Vec<usize> {
    let mut pending_flags = table_cells_flag.iter().copied().peekable();
    let mut running_max: Option<usize> = None;

    row_start_index
        .iter()
        .map(|&row_start| {
            while let Some(flag) = pending_flags.next_if(|&flag| flag <= row_start) {
                running_max = Some(running_max.map_or(flag, |seen| seen.max(flag)));
            }
            running_max.unwrap_or(row_start)
        })
        .collect()
}
1285
/// Reports whether `token` completes a table cell.
///
/// True for a bare `</td>`, for `<td></td>`, and for any token containing
/// both `<td` and `</td>` (for example a cell with attributes).
fn is_td_end_token(token: &str) -> bool {
    let is_plain_cell = matches!(token, "<td></td>" | "</td>");
    let is_inline_cell = token.contains("<td") && token.contains("</td>");
    is_plain_cell || is_inline_cell
}
1292
1293 fn compose_matched_cell_text(
1295 matched_indices: &[usize],
1296 ocr_candidates: &[(OcrSource, TextRegion)],
1297 ) -> Option<String> {
1298 if matched_indices.is_empty() {
1299 return None;
1300 }
1301
1302 let mut merged = String::new();
1303
1304 for (i, &ocr_idx) in matched_indices.iter().enumerate() {
1305 let Some((_, region)) = ocr_candidates.get(ocr_idx) else {
1306 continue;
1307 };
1308 let Some(raw_text) = region.text.as_deref() else {
1309 continue;
1310 };
1311
1312 let mut content = raw_text.to_string();
1313 if matched_indices.len() > 1 {
1314 if content.starts_with(' ') {
1315 content = content.chars().skip(1).collect();
1316 }
1317 content = content.replace("<b>", "");
1318 content = content.replace("</b>", "");
1319 if content.is_empty() {
1320 continue;
1321 }
1322 if i != matched_indices.len() - 1 && !content.ends_with(' ') {
1323 content.push_str("<br/>");
1324 }
1325 }
1326
1327 merged.push_str(&content);
1328 }
1329
1330 let merged = merged.trim_end().to_string();
1331 if merged.is_empty() {
1332 None
1333 } else {
1334 Some(merged)
1335 }
1336 }
1337
1338 fn compute_inter(rec1: &BoundingBox, rec2: &BoundingBox) -> f32 {
1340 let x_left = rec1.x_min().max(rec2.x_min());
1341 let y_top = rec1.y_min().max(rec2.y_min());
1342 let x_right = rec1.x_max().min(rec2.x_max());
1343 let y_bottom = rec1.y_max().min(rec2.y_max());
1344
1345 let inter_width = (x_right - x_left).max(0.0);
1346 let inter_height = (y_bottom - y_top).max(0.0);
1347 let inter_area = inter_width * inter_height;
1348
1349 let rec2_area = (rec2.x_max() - rec2.x_min()) * (rec2.y_max() - rec2.y_min());
1350 if rec2_area <= 0.0 {
1351 0.0
1352 } else {
1353 inter_area / rec2_area
1354 }
1355 }
1356
1357 fn split_cross_cell_ocr_boxes(
1364 text_regions: &[TextRegion],
1365 relevant_indices: &[usize],
1366 cells: &[oar_ocr_core::domain::structure::TableCell],
1367 ) -> (
1368 Vec<TextRegion>,
1369 std::collections::HashSet<usize>,
1370 std::collections::HashMap<usize, Vec<usize>>,
1371 ) {
1372 let mut split_regions: Vec<TextRegion> = Vec::new();
1373 let mut split_ocr_indices: std::collections::HashSet<usize> =
1374 std::collections::HashSet::new();
1375 let mut cell_assignments: std::collections::HashMap<usize, Vec<usize>> =
1376 std::collections::HashMap::new();
1377
1378 let table_regions: Vec<TextRegion> = relevant_indices
1380 .iter()
1381 .map(|&idx| text_regions[idx].clone())
1382 .collect();
1383
1384 if table_regions.is_empty() || cells.is_empty() {
1385 return (split_regions, split_ocr_indices, cell_assignments);
1386 }
1387
1388 let split_config = OcrSplitConfig::default();
1390 let (expanded, processed_local_indices) =
1391 create_expanded_ocr_for_table(&table_regions, cells, Some(&split_config));
1392
1393 for local_idx in processed_local_indices {
1395 if local_idx < relevant_indices.len() {
1396 split_ocr_indices.insert(relevant_indices[local_idx]);
1397 }
1398 }
1399
1400 for region in expanded {
1402 let region_idx = split_regions.len();
1403
1404 let mut best_cell_idx = None;
1406 let mut best_iou = 0.0f32;
1407
1408 for (cell_idx, cell) in cells.iter().enumerate() {
1409 let iou = region.bounding_box.iou(&cell.bbox);
1410 if iou > best_iou {
1411 best_iou = iou;
1412 best_cell_idx = Some(cell_idx);
1413 }
1414 }
1415
1416 if let Some(cell_idx) = best_cell_idx {
1418 cell_assignments
1419 .entry(cell_idx)
1420 .or_default()
1421 .push(region_idx);
1422 }
1423
1424 split_regions.push(region);
1425 }
1426
1427 tracing::debug!(
1428 "Cross-cell OCR splitting: {} original regions processed, {} new regions created",
1429 split_ocr_indices.len(),
1430 split_regions.len()
1431 );
1432
1433 (split_regions, split_ocr_indices, cell_assignments)
1434 }
1435
1436 fn calculate_iou(bbox1: &BoundingBox, bbox2: &BoundingBox) -> f32 {
1438 let x1_min = bbox1.x_min();
1439 let y1_min = bbox1.y_min();
1440 let x1_max = bbox1.x_max();
1441 let y1_max = bbox1.y_max();
1442
1443 let x2_min = bbox2.x_min();
1444 let y2_min = bbox2.y_min();
1445 let x2_max = bbox2.x_max();
1446 let y2_max = bbox2.y_max();
1447
1448 let inter_x_min = x1_min.max(x2_min);
1449 let inter_y_min = y1_min.max(y2_min);
1450 let inter_x_max = x1_max.min(x2_max);
1451 let inter_y_max = y1_max.min(y2_max);
1452
1453 let inter_w = (inter_x_max - inter_x_min).max(0.0);
1454 let inter_h = (inter_y_max - inter_y_min).max(0.0);
1455 let inter_area = inter_w * inter_h;
1456
1457 let area1 = (x1_max - x1_min) * (y1_max - y1_min);
1458 let area2 = (x2_max - x2_min) * (y2_max - y2_min);
1459 let union_area = area1 + area2 - inter_area;
1460
1461 if union_area > 0.0 {
1462 inter_area / union_area
1463 } else {
1464 0.0
1465 }
1466 }
1467
1468 fn l1_distance(bbox1: &BoundingBox, bbox2: &BoundingBox) -> f32 {
1470 let b1 = [bbox1.x_min(), bbox1.y_min(), bbox1.x_max(), bbox1.y_max()];
1471 let b2 = [bbox2.x_min(), bbox2.y_min(), bbox2.x_max(), bbox2.y_max()];
1472
1473 (b2[0] - b1[0]).abs()
1474 + (b2[1] - b1[1]).abs()
1475 + (b2[2] - b1[2]).abs()
1476 + (b2[3] - b1[3]).abs()
1477 }
1478
1479 fn paddlex_distance(table_box: &BoundingBox, ocr_box: &BoundingBox) -> f32 {
1481 let x1 = table_box.x_min();
1482 let y1 = table_box.y_min();
1483 let x2 = table_box.x_max();
1484 let y2 = table_box.y_max();
1485 let x3 = ocr_box.x_min();
1486 let y3 = ocr_box.y_min();
1487 let x4 = ocr_box.x_max();
1488 let y4 = ocr_box.y_max();
1489
1490 let dis = (x3 - x1).abs() + (y3 - y1).abs() + (x4 - x2).abs() + (y4 - y2).abs();
1491 let dis_2 = (x3 - x1).abs() + (y3 - y1).abs();
1492 let dis_3 = (x4 - x2).abs() + (y4 - y2).abs();
1493 dis + dis_2.min(dis_3)
1494 }
1495
1496 fn inject_inline_formulas(
1507 elements: &mut [LayoutElement],
1508 _text_regions: &mut Vec<TextRegion>,
1509 _cfg: &StitchConfig,
1510 ) {
1511 use oar_ocr_core::domain::structure::LayoutElementType;
1512
1513 let mut inline_formula_indices: Vec<usize> = Vec::new();
1514
1515 const INLINE_FORMULA_MAX_AREA: f32 = 80000.0;
1517
1518 for (idx, element) in elements.iter().enumerate() {
1519 if element.element_type != LayoutElementType::Formula {
1520 continue;
1521 }
1522
1523 let formula_text = if let Some(text) = &element.text {
1525 if !text.is_empty() {
1526 text
1527 } else {
1528 continue;
1529 }
1530 } else {
1531 continue;
1532 };
1533
1534 let formula_area = element.bbox.area();
1535 tracing::debug!(
1536 "Formula idx {}: area={:.1}, text={}",
1537 idx,
1538 formula_area,
1539 formula_text
1540 );
1541
1542 if formula_area < INLINE_FORMULA_MAX_AREA {
1544 inline_formula_indices.push(idx);
1545 tracing::debug!(
1546 "Marking formula idx {} as inline (area {:.1} < {})",
1547 idx,
1548 formula_area,
1549 INLINE_FORMULA_MAX_AREA
1550 );
1551 }
1552 }
1553
1554 for idx in &inline_formula_indices {
1556 if let Some(element) = elements.get_mut(*idx) {
1557 tracing::debug!(
1558 "Clearing inline formula idx {} to use TextRegion with label=formula",
1559 idx
1560 );
1561 element.text = None;
1562 element.order_index = None;
1563 }
1564 }
1565
1566 if !inline_formula_indices.is_empty() {
1567 tracing::debug!("Marked {} formulas as inline", inline_formula_indices.len());
1568 }
1569 }
1570
1571 fn stitch_layout_elements(
1572 elements: &mut [LayoutElement],
1573 text_regions: &[TextRegion],
1574 used_indices: &mut std::collections::HashSet<usize>,
1575 cfg: &StitchConfig,
1576 ) {
1577 tracing::debug!(
1578 "stitch_layout_elements: {} elements, {} regions, {} already used",
1579 elements.len(),
1580 text_regions.len(),
1581 used_indices.len()
1582 );
1583
1584 for (elem_idx, element) in elements.iter_mut().enumerate() {
1585 if EXCLUDED_FROM_OCR_LABELS.contains(&element.element_type) {
1591 continue;
1592 }
1593
1594 let mut element_texts: Vec<(&TextRegion, &str)> = Vec::new();
1595
1596 for (idx, region) in text_regions.iter().enumerate() {
1597 if let Some(text) = ®ion.text
1598 && Self::is_overlapping(&element.bbox, ®ion.bounding_box, cfg)
1599 {
1600 element_texts.push((region, text));
1601 used_indices.insert(idx);
1605 }
1606 }
1607
1608 if !element_texts.is_empty() {
1609 tracing::debug!(
1610 "Element {} ({:?}): matched {} regions",
1611 elem_idx,
1612 element.element_type,
1613 element_texts.len()
1614 );
1615
1616 for (region, text) in &element_texts {
1618 tracing::debug!(" - region with label={:?}, text={:?}", region.label, text);
1619 }
1620
1621 let mut sorted_for_meta = element_texts.clone();
1624 sorted_for_meta.sort_by(|(r1, _), (r2, _)| {
1625 r1.bounding_box
1626 .center()
1627 .y
1628 .partial_cmp(&r2.bounding_box.center().y)
1629 .unwrap_or(Ordering::Equal)
1630 });
1631 let mut lines = Vec::new();
1632 let mut current_line = Vec::new();
1633 for item in std::mem::take(&mut sorted_for_meta) {
1634 if current_line.is_empty() {
1635 current_line.push(item);
1636 } else {
1637 let first_in_line = ¤t_line[0].0.bounding_box;
1638 if Self::is_same_text_line_bbox(first_in_line, &item.0.bounding_box, cfg) {
1639 current_line.push(item);
1640 } else {
1641 current_line.sort_by(|(r1, _), (r2, _)| {
1642 r1.bounding_box
1643 .center()
1644 .x
1645 .partial_cmp(&r2.bounding_box.center().x)
1646 .unwrap_or(Ordering::Equal)
1647 });
1648 lines.push(current_line);
1649 current_line = vec![item];
1650 }
1651 }
1652 }
1653 if !current_line.is_empty() {
1654 current_line.sort_by(|(r1, _), (r2, _)| {
1655 r1.bounding_box
1656 .center()
1657 .x
1658 .partial_cmp(&r2.bounding_box.center().x)
1659 .unwrap_or(Ordering::Equal)
1660 });
1661 lines.push(current_line);
1662 }
1663 for mut line in lines {
1664 sorted_for_meta.append(&mut line);
1665 }
1666
1667 element.seg_start_x = Some(sorted_for_meta[0].0.bounding_box.x_min());
1669 element.seg_end_x = Some(sorted_for_meta.last().unwrap().0.bounding_box.x_max());
1671
1672 let mut num_lines = 1u32;
1674 let mut prev_bbox = &sorted_for_meta[0].0.bounding_box;
1675 for (region, _) in &sorted_for_meta[1..] {
1676 if !Self::is_same_text_line_bbox(prev_bbox, ®ion.bounding_box, cfg) {
1677 num_lines += 1;
1678 prev_bbox = ®ion.bounding_box;
1679 }
1680 }
1681 element.num_lines = Some(num_lines);
1682 }
1683
1684 Self::sort_and_join_texts(&mut element_texts, Some(&element.bbox), cfg, |joined| {
1685 element.text = Some(joined);
1686 });
1687 }
1688 }
1689
1690 fn fill_formula_elements(
1695 elements: &mut [LayoutElement],
1696 formulas: &[FormulaResult],
1697 _cfg: &StitchConfig,
1698 ) {
1699 for element in elements.iter_mut() {
1700 if element.element_type != LayoutElementType::Formula {
1701 continue;
1702 }
1703
1704 if element.text.is_some() {
1706 continue;
1707 }
1708
1709 let mut best_formula: Option<&FormulaResult> = None;
1713 let mut best_score = 0.0f32;
1714
1715 for formula in formulas {
1716 let ioa_element = element.bbox.ioa(&formula.bbox);
1717 let ioa_formula = formula.bbox.ioa(&element.bbox);
1718 let score = ioa_element.max(ioa_formula);
1719 if score > best_score {
1720 best_score = score;
1721 best_formula = Some(formula);
1722 }
1723 }
1724
1725 if best_score < 0.05 {
1728 let elem_center = element.bbox.center();
1729 let mut best_dist = f32::MAX;
1730
1731 for formula in formulas {
1732 let fc = formula.bbox.center();
1733 let fc_inside = fc.x >= element.bbox.x_min()
1734 && fc.x <= element.bbox.x_max()
1735 && fc.y >= element.bbox.y_min()
1736 && fc.y <= element.bbox.y_max();
1737 let ec_inside = elem_center.x >= formula.bbox.x_min()
1738 && elem_center.x <= formula.bbox.x_max()
1739 && elem_center.y >= formula.bbox.y_min()
1740 && elem_center.y <= formula.bbox.y_max();
1741
1742 if fc_inside || ec_inside {
1743 let dx = fc.x - elem_center.x;
1744 let dy = fc.y - elem_center.y;
1745 let dist = dx * dx + dy * dy;
1746 if dist < best_dist {
1747 best_dist = dist;
1748 best_formula = Some(formula);
1749 best_score = 0.05;
1750 }
1751 }
1752 }
1753 }
1754
1755 if best_score >= 0.05
1756 && let Some(formula) = best_formula
1757 {
1758 element.text = Some(formula.latex.clone());
1759 }
1760 }
1761 }
1762
1763 fn is_overlapping(bbox1: &BoundingBox, bbox2: &BoundingBox, cfg: &StitchConfig) -> bool {
1766 let x1_min = bbox1.x_min();
1767 let y1_min = bbox1.y_min();
1768 let x1_max = bbox1.x_max();
1769 let y1_max = bbox1.y_max();
1770
1771 let x2_min = bbox2.x_min();
1772 let y2_min = bbox2.y_min();
1773 let x2_max = bbox2.x_max();
1774 let y2_max = bbox2.y_max();
1775
1776 let inter_x_min = x1_min.max(x2_min);
1777 let inter_y_min = y1_min.max(y2_min);
1778 let inter_x_max = x1_max.min(x2_max);
1779 let inter_y_max = y1_max.min(y2_max);
1780
1781 let inter_w = inter_x_max - inter_x_min;
1782 let inter_h = inter_y_max - inter_y_min;
1783
1784 inter_w > cfg.overlap_min_pixels && inter_h > cfg.overlap_min_pixels
1785 }
1786
1787 fn is_same_text_line_bbox(
1793 bbox1: &BoundingBox,
1794 bbox2: &BoundingBox,
1795 cfg: &StitchConfig,
1796 ) -> bool {
1797 let h1 = (bbox1.y_max() - bbox1.y_min()).max(1.0);
1798 let h2 = (bbox2.y_max() - bbox2.y_min()).max(1.0);
1799 let inter_h =
1800 (bbox1.y_max().min(bbox2.y_max()) - bbox1.y_min().max(bbox2.y_min())).max(0.0);
1801 let overlap_ratio = inter_h / h1.min(h2);
1802 if overlap_ratio >= cfg.line_height_iou_threshold {
1803 return true;
1804 }
1805
1806 let adaptive_tol = (h1.min(h2) * 0.5).max(1.0);
1807 let center_delta = (bbox1.center().y - bbox2.center().y).abs();
1808 center_delta <= adaptive_tol.max(cfg.same_line_y_tolerance * 0.25)
1809 }
1810
1811 fn sort_and_join_texts<F>(
1812 texts: &mut Vec<(&TextRegion, &str)>,
1813 container_bbox: Option<&BoundingBox>,
1814 cfg: &StitchConfig,
1815 update_fn: F,
1816 ) where
1817 F: FnOnce(String),
1818 {
1819 if texts.is_empty() {
1820 return;
1821 }
1822
1823 texts.sort_by(|(r1, _), (r2, _)| {
1825 r1.bounding_box
1826 .center()
1827 .y
1828 .partial_cmp(&r2.bounding_box.center().y)
1829 .unwrap_or(Ordering::Equal)
1830 });
1831 let mut lines = Vec::new();
1832 let mut current_line = Vec::new();
1833 for item in std::mem::take(texts) {
1834 if current_line.is_empty() {
1835 current_line.push(item);
1836 } else {
1837 let first_in_line = ¤t_line[0].0.bounding_box;
1838 if Self::is_same_text_line_bbox(first_in_line, &item.0.bounding_box, cfg) {
1839 current_line.push(item);
1840 } else {
1841 current_line.sort_by(|(r1, _), (r2, _)| {
1842 r1.bounding_box
1843 .center()
1844 .x
1845 .partial_cmp(&r2.bounding_box.center().x)
1846 .unwrap_or(Ordering::Equal)
1847 });
1848 lines.push(current_line);
1849 current_line = vec![item];
1850 }
1851 }
1852 }
1853 if !current_line.is_empty() {
1854 current_line.sort_by(|(r1, _), (r2, _)| {
1855 r1.bounding_box
1856 .center()
1857 .x
1858 .partial_cmp(&r2.bounding_box.center().x)
1859 .unwrap_or(Ordering::Equal)
1860 });
1861 lines.push(current_line);
1862 }
1863 for mut line in lines {
1864 texts.append(&mut line);
1865 }
1866
1867 let mut result = String::new();
1872 let mut prev_region: Option<&TextRegion> = None;
1873
1874 tracing::debug!(
1875 "sort_and_join_texts: processing {} text regions",
1876 texts.len()
1877 );
1878
1879 for (region, text) in texts.iter() {
1880 if text.is_empty() {
1881 continue;
1882 }
1883
1884 if let Some(last_region) = prev_region {
1885 if !Self::is_same_text_line_bbox(
1886 &last_region.bounding_box,
1887 ®ion.bounding_box,
1888 cfg,
1889 ) {
1890 let mut add_newline = false;
1893 let mut is_line_wrap = false;
1894
1895 if let Some(container) = container_bbox {
1896 let container_width = container.x_max() - container.x_min();
1897 let right_gap = container.x_max() - last_region.bounding_box.x_max();
1898 let tail_char = last_non_whitespace_char(&result);
1899 let ends_with_non_break_punct =
1900 tail_char.is_some_and(is_non_break_line_end_punctuation);
1901 let paragraph_gap_ratio =
1903 if tail_char.is_some_and(|c| c.is_ascii_alphabetic()) {
1904 0.5
1905 } else {
1906 0.3
1907 };
1908
1909 if !ends_with_non_break_punct
1910 && right_gap > container_width * paragraph_gap_ratio
1911 {
1912 add_newline = true;
1914 } else {
1915 is_line_wrap = true;
1917 }
1918 }
1919
1920 let prev_ends_hyphen = result.ends_with('-');
1927 if prev_ends_hyphen && is_line_wrap {
1928 result.pop();
1930 } else if add_newline {
1932 if !result.ends_with('\n') {
1933 result.push('\n');
1934 }
1935 } else {
1936 if let Some(last_char) = result.chars().last()
1938 && last_char != '\n'
1939 && needs_space_after(last_char)
1940 {
1941 result.push(' ');
1942 }
1943 }
1944 } else {
1945 let needs_spacing = if let Some(last_char) = result.chars().last()
1948 && last_char != '\n'
1949 && needs_space_after(last_char)
1950 {
1951 true
1952 } else {
1953 last_region.is_formula()
1955 };
1956
1957 if needs_spacing {
1958 result.push(' ');
1959 }
1960 }
1961 }
1962
1963 let is_formula = region.is_formula();
1967 let text_to_add = if is_formula {
1968 let already_wrapped =
1970 text.starts_with('$') || text.starts_with("\\(") || text.starts_with("\\[");
1971 if already_wrapped {
1972 text.to_string()
1973 } else {
1974 let is_display = result.is_empty() || result.ends_with('\n');
1977
1978 if is_display {
1979 format!("$${}$$", text)
1981 } else {
1982 format!("${}$", text)
1984 }
1985 }
1986 } else {
1987 text.to_string()
1988 };
1989
1990 result.push_str(&text_to_add);
1991 prev_region = Some(region);
1992 }
1993
1994 let joined = result.trim_end().to_string();
1996 update_fn(joined);
1997 }
1998
1999 fn sort_layout_elements_enhanced(
2004 elements: &mut Vec<LayoutElement>,
2005 page_width: f32,
2006 page_height: f32,
2007 ) {
2008 use oar_ocr_core::processors::layout_sorting::{SortableElement, sort_layout_enhanced};
2009
2010 if elements.is_empty() {
2011 return;
2012 }
2013
2014 let sortable_elements: Vec<_> = elements
2015 .iter()
2016 .map(|e| SortableElement {
2017 bbox: e.bbox.clone(),
2018 element_type: e.element_type,
2019 num_lines: e.num_lines,
2020 })
2021 .collect();
2022
2023 let sorted_indices = sort_layout_enhanced(&sortable_elements, page_width, page_height);
2024 if sorted_indices.len() != elements.len() {
2025 return;
2026 }
2027
2028 let sorted_elements: Vec<_> = sorted_indices
2029 .into_iter()
2030 .map(|idx| elements[idx].clone())
2031 .collect();
2032 *elements = sorted_elements;
2033 }
2034
2035 #[allow(dead_code)]
2037 fn sort_layout_elements(elements: &mut Vec<LayoutElement>, _width: f32, _cfg: &StitchConfig) {
2038 if elements.len() <= 1 {
2039 return;
2040 }
2041
2042 let bboxes: Vec<BoundingBox> = elements.iter().map(|e| e.bbox.clone()).collect();
2044 let order = crate::processors::sort_by_xycut(
2045 &bboxes,
2046 crate::processors::SortDirection::Vertical,
2047 1,
2048 );
2049
2050 if order.len() != elements.len() {
2051 return;
2052 }
2053
2054 let mut reordered = Vec::with_capacity(elements.len());
2055 for idx in order {
2056 reordered.push(elements[idx].clone());
2057 }
2058
2059 *elements = reordered;
2060 }
2061}
2062
/// Returns `true` when `c` is an ASCII letter, i.e. when a separating space should follow
/// it before more text is appended.
fn needs_space_after(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z')
}
2068
/// Returns the last character of `text` that is not whitespace, or `None` when `text` is
/// empty or consists only of whitespace.
fn last_non_whitespace_char(text: &str) -> Option<char> {
    text.trim_end().chars().next_back()
}
2072
2073fn is_non_break_line_end_punctuation(c: char) -> bool {
2075 matches!(c, ',' | ',' | '、' | ';' | ';' | ':' | ':')
2076}
2077
2078#[cfg(test)]
2079mod tests {
2080 use super::*;
2081 use crate::oarocr::TextRegion;
2082 use oar_ocr_core::processors::BoundingBox;
2083
2084 fn make_region(bbox: BoundingBox, text: &str) -> TextRegion {
2085 TextRegion {
2086 bounding_box: bbox.clone(),
2087 dt_poly: Some(bbox.clone()),
2088 rec_poly: Some(bbox),
2089 text: Some(text.into()),
2090 confidence: Some(0.9),
2091 orientation_angle: None,
2092 word_boxes: None,
2093 label: None,
2094 }
2095 }
2096
2097 #[test]
2098 fn test_normalize_tiny_symbol_for_paddlex_dash() {
2099 let mut region = make_region(BoundingBox::from_coords(0.0, 0.0, 10.0, 9.0), "=");
2100 region.confidence = Some(0.33);
2101 ResultStitcher::normalize_tiny_symbol_for_paddlex(&mut region);
2102 assert_eq!(region.text.as_deref(), Some("-"));
2103 }
2104
2105 #[test]
2106 fn test_normalize_tiny_symbol_for_paddlex_comma() {
2107 let mut region = make_region(BoundingBox::from_coords(0.0, 0.0, 7.0, 6.0), "=");
2108 region.confidence = Some(0.40);
2109 ResultStitcher::normalize_tiny_symbol_for_paddlex(&mut region);
2110 assert_eq!(region.text.as_deref(), Some(","));
2111 }
2112
2113 #[test]
2114 fn test_normalize_tiny_symbol_for_paddlex_semicolon() {
2115 let mut region = make_region(BoundingBox::from_coords(0.0, 0.0, 12.0, 13.0), "0");
2116 region.confidence = Some(0.13);
2117 ResultStitcher::normalize_tiny_symbol_for_paddlex(&mut region);
2118 assert_eq!(region.text.as_deref(), Some(";"));
2119 }
2120
2121 #[test]
2122 fn test_is_overlapping_threshold() {
2123 let b1 = BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0);
2124 let b2 = BoundingBox::from_coords(5.0, 5.0, 20.0, 20.0);
2125 let cfg = StitchConfig::default();
2126 assert!(ResultStitcher::is_overlapping(&b1, &b2, &cfg));
2127 let cfg2 = StitchConfig {
2128 overlap_min_pixels: 5.0,
2129 ..cfg.clone()
2130 };
2131 assert!(!ResultStitcher::is_overlapping(&b1, &b2, &cfg2));
2132 }
2133
2134 #[test]
2135 fn test_sort_and_join_texts_tolerance() {
2136 let b1 = BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0);
2137 let b2 = BoundingBox::from_coords(12.0, 1.0, 20.0, 11.0);
2138 let r1 = TextRegion {
2139 bounding_box: b1.clone(),
2140 dt_poly: Some(b1.clone()),
2141 rec_poly: Some(b1),
2142 text: Some("A".into()),
2143 confidence: Some(0.9),
2144 orientation_angle: None,
2145 word_boxes: None,
2146 label: None,
2147 };
2148 let r2 = TextRegion {
2149 bounding_box: b2.clone(),
2150 dt_poly: Some(b2.clone()),
2151 rec_poly: Some(b2),
2152 text: Some("B".into()),
2153 confidence: Some(0.9),
2154 orientation_angle: None,
2155 word_boxes: None,
2156 label: None,
2157 };
2158 let mut texts = vec![(&r1, "A"), (&r2, "B")];
2159 let cfg = StitchConfig::default();
2160 let mut joined = String::new();
2161 ResultStitcher::sort_and_join_texts(&mut texts, None, &cfg, |j| {
2162 joined = j;
2163 });
2164 assert_eq!(joined, "A B");
2165 }
2166
2167 #[test]
2168 fn test_sort_and_join_texts_english_line_uses_larger_paragraph_gap_threshold() {
2169 let r1 = make_region(BoundingBox::from_coords(0.0, 0.0, 60.0, 10.0), "Line");
2170 let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next");
2171 let mut texts = vec![(&r1, "Line"), (&r2, "next")];
2172 let cfg = StitchConfig::default();
2173 let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0);
2174 let mut joined = String::new();
2175 ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j);
2176 assert_eq!(joined, "Line next");
2177 }
2178
2179 #[test]
2180 fn test_sort_and_join_texts_non_english_tail_keeps_original_paragraph_gap_threshold() {
2181 let r1 = make_region(BoundingBox::from_coords(0.0, 0.0, 60.0, 10.0), "2024");
2182 let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next");
2183 let mut texts = vec![(&r1, "2024"), (&r2, "next")];
2184 let cfg = StitchConfig::default();
2185 let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0);
2186 let mut joined = String::new();
2187 ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j);
2188 assert_eq!(joined, "2024\nnext");
2189 }
2190
2191 #[test]
2192 fn test_sort_and_join_texts_non_break_punctuation_suppresses_newline() {
2193 let r1 = make_region(BoundingBox::from_coords(0.0, 0.0, 20.0, 10.0), "Note:");
2194 let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next");
2195 let mut texts = vec![(&r1, "Note:"), (&r2, "next")];
2196 let cfg = StitchConfig::default();
2197 let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0);
2198 let mut joined = String::new();
2199 ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j);
2200 assert_eq!(joined, "Note:next");
2201 }
2202
2203 #[test]
2204 fn test_normalize_checkbox_symbols_in_table_checkbox_like() {
2205 let mut cells = vec![
2206 TableCell::new(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), 1.0).with_text("ü"),
2207 TableCell::new(BoundingBox::from_coords(10.0, 0.0, 20.0, 10.0), 1.0).with_text("X"),
2208 TableCell::new(BoundingBox::from_coords(20.0, 0.0, 30.0, 10.0), 1.0).with_text("L"),
2209 ];
2210
2211 ResultStitcher::normalize_checkbox_symbols_in_table(&mut cells);
2212
2213 assert_eq!(cells[0].text.as_deref(), Some("✓"));
2214 assert_eq!(cells[1].text.as_deref(), Some("✗"));
2215 assert_eq!(cells[2].text.as_deref(), Some("✓"));
2216 }
2217
2218 #[test]
2219 fn test_normalize_checkbox_symbols_in_table_keeps_ambiguous_when_not_checkbox_like() {
2220 let mut cells = vec![
2221 TableCell::new(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), 1.0).with_text("L"),
2222 TableCell::new(BoundingBox::from_coords(10.0, 0.0, 20.0, 10.0), 1.0).with_text("A"),
2223 ];
2224
2225 ResultStitcher::normalize_checkbox_symbols_in_table(&mut cells);
2226
2227 assert_eq!(cells[0].text.as_deref(), Some("L"));
2228 assert_eq!(cells[1].text.as_deref(), Some("A"));
2229 }
2230
2231 #[test]
2232 fn test_find_row_start_index_with_compact_td_tokens() {
2233 let tokens = vec![
2234 "<table>".to_string(),
2235 "<tbody>".to_string(),
2236 "<tr>".to_string(),
2237 "<td></td>".to_string(),
2238 "<td></td>".to_string(),
2239 "</tr>".to_string(),
2240 "<tr>".to_string(),
2241 "<td rowspan=\"2\"></td>".to_string(),
2242 "<td></td>".to_string(),
2243 "</tr>".to_string(),
2244 "</tbody>".to_string(),
2245 "</table>".to_string(),
2246 ];
2247
2248 let row_start = ResultStitcher::find_row_start_index(&tokens);
2249 assert_eq!(row_start, vec![0, 2]);
2250 }
2251
2252 #[test]
2253 fn test_match_table_cells_with_structure_rows() {
2254 let mut cells = vec![
2255 TableCell::new(BoundingBox::from_coords(50.0, 0.0, 100.0, 20.0), 1.0), TableCell::new(BoundingBox::from_coords(0.0, 0.0, 50.0, 20.0), 1.0), TableCell::new(BoundingBox::from_coords(0.0, 20.0, 50.0, 40.0), 1.0), TableCell::new(BoundingBox::from_coords(50.0, 20.0, 100.0, 40.0), 1.0), ];
2260
2261 let structure_tokens = vec![
2262 "<table>".to_string(),
2263 "<tbody>".to_string(),
2264 "<tr>".to_string(),
2265 "<td></td>".to_string(),
2266 "<td></td>".to_string(),
2267 "</tr>".to_string(),
2268 "<tr>".to_string(),
2269 "<td></td>".to_string(),
2270 "<td></td>".to_string(),
2271 "</tr>".to_string(),
2272 "</tbody>".to_string(),
2273 "</table>".to_string(),
2274 ];
2275
2276 let ocr_candidates = vec![
2277 (
2278 OcrSource::Original(0),
2279 make_region(BoundingBox::from_coords(2.0, 2.0, 48.0, 18.0), "A"),
2280 ),
2281 (
2282 OcrSource::Original(1),
2283 make_region(BoundingBox::from_coords(52.0, 2.0, 98.0, 18.0), "B"),
2284 ),
2285 (
2286 OcrSource::Original(2),
2287 make_region(BoundingBox::from_coords(2.0, 22.0, 48.0, 38.0), "C"),
2288 ),
2289 (
2290 OcrSource::Original(3),
2291 make_region(BoundingBox::from_coords(52.0, 22.0, 98.0, 38.0), "D"),
2292 ),
2293 ];
2294
2295 let (mapping, matched) = ResultStitcher::match_table_cells_with_structure_rows(
2296 &mut cells,
2297 &structure_tokens,
2298 &ocr_candidates,
2299 10.0,
2300 None,
2301 )
2302 .expect("expected row-aware matching result");
2303
2304 assert_eq!(mapping, vec![Some(1), Some(0), Some(2), Some(3)]);
2305 assert_eq!(matched.len(), 4);
2306
2307 assert_eq!(cells[1].text.as_deref(), Some("A"));
2308 assert_eq!(cells[0].text.as_deref(), Some("B"));
2309 assert_eq!(cells[2].text.as_deref(), Some("C"));
2310 assert_eq!(cells[3].text.as_deref(), Some("D"));
2311 }
2312
2313 #[test]
2314 fn test_match_table_and_ocr_by_iou_distance_prefers_first_cell_on_exact_tie() {
2315 let cells = vec![
2316 TableCell::new(BoundingBox::from_coords(0.0, 0.0, 20.0, 20.0), 1.0),
2317 TableCell::new(BoundingBox::from_coords(0.0, 0.0, 20.0, 20.0), 1.0),
2318 ];
2319 let ocr_candidates = vec![(
2320 OcrSource::Original(0),
2321 make_region(BoundingBox::from_coords(2.0, 2.0, 18.0, 18.0), "X"),
2322 )];
2323
2324 let (mapping, matched) = ResultStitcher::match_table_and_ocr_by_iou_distance(
2325 &cells,
2326 &ocr_candidates,
2327 false,
2328 true,
2329 );
2330
2331 assert_eq!(matched.len(), 1);
2332 assert_eq!(mapping.get(&0), Some(&vec![0]));
2333 assert!(!mapping.contains_key(&1));
2334 }
2335
2336 #[test]
2337 fn test_match_table_and_ocr_by_iou_distance_boundary_near_tie_stays_stable() {
2338 let cells = vec![
2340 TableCell::new(BoundingBox::from_coords(0.0, 0.0, 20.0, 20.0), 1.0),
2341 TableCell::new(BoundingBox::from_coords(0.0, 9.99995, 20.0, 29.99995), 1.0),
2342 ];
2343 let ocr_candidates = vec![(
2344 OcrSource::Original(0),
2345 make_region(BoundingBox::from_coords(0.0, 10.0, 20.0, 20.0), "Y"),
2346 )];
2347
2348 let (mapping, _) = ResultStitcher::match_table_and_ocr_by_iou_distance(
2349 &cells,
2350 &ocr_candidates,
2351 false,
2352 true,
2353 );
2354
2355 assert_eq!(mapping.get(&0), Some(&vec![0]));
2357 assert!(!mapping.contains_key(&1));
2358 }
2359
2360 #[test]
2361 fn test_match_table_and_ocr_by_iou_distance_boundary_straddle_prefers_upper_row() {
2362 let cells = vec![
2365 TableCell::new(
2366 BoundingBox::from_coords(564.6841, 142.27391, 584.9476, 157.74164),
2367 1.0,
2368 )
2369 .with_position(2, 2),
2370 TableCell::new(
2371 BoundingBox::from_coords(565.3968, 158.34259, 584.0292, 171.04494),
2372 1.0,
2373 )
2374 .with_position(3, 2),
2375 ];
2376 let ocr_candidates = vec![(
2377 OcrSource::Original(0),
2378 make_region(BoundingBox::from_coords(567.0, 151.0, 583.0, 166.0), "84"),
2379 )];
2380
2381 let (mapping, matched) = ResultStitcher::match_table_and_ocr_by_iou_distance(
2382 &cells,
2383 &ocr_candidates,
2384 false,
2385 true,
2386 );
2387
2388 assert_eq!(matched.len(), 1);
2389 assert_eq!(mapping.get(&0), Some(&vec![0]));
2390 assert!(!mapping.contains_key(&1));
2391 }
2392}