// oar_ocr/oarocr/stitching.rs

1//! Stitching module for combining OCR results.
2//!
3//! This module provides functionality to associate recognized text regions with
4//! layout elements (such as tables and paragraphs) to create a unified structured result.
5//!
6//! ## PP-StructureV3 Alignment
7//!
8//! The stitching logic follows PP-StructureV3's fusion strategy:
9//! 1. **Label-based filtering**: Special regions (formula, table, seal) are excluded from OCR matching
10//! 2. **Content preservation**: Formulas retain LaTeX, tables retain HTML structure
11//! 3. **Reading order**: Elements are assigned `order_index` based on spatial sorting
12//! 4. **Orphan handling**: Unmatched OCR regions create new text elements
13
14use crate::oarocr::TextRegion;
15use oar_ocr_core::domain::structure::{
16    FormulaResult, LayoutElement, LayoutElementType, StructureResult, TableCell, TableResult,
17};
18use oar_ocr_core::processors::{
19    BoundingBox, SplitConfig as OcrSplitConfig, create_expanded_ocr_for_table, parse_cell_grid_info,
20};
21use std::cmp::Ordering;
22
/// Provenance of an OCR candidate considered during table-cell matching.
///
/// A candidate is either synthesized for the current table (it has no
/// counterpart in the caller's `text_regions` slice) or taken unchanged from
/// that slice, in which case its index is kept so the region can be marked as
/// used once it is matched.
#[derive(Debug, Clone, Copy)]
enum OcrSource {
    /// Synthesized candidate: a fragment produced by cross-cell splitting or a
    /// formula injected into the candidate pool. Carries no index.
    Split,
    /// Candidate taken from the original `text_regions` slice; the payload is
    /// its index in that slice.
    Original(usize),
}
32
33/// Labels that should be excluded from OCR text matching.
34/// These regions have their own specialized content (LaTeX, HTML, etc.)
35/// Labels excluded from OCR text matching in `stitch_layout_elements`.
36/// PaddleX: formula results are injected into the OCR pool (via
37/// `convert_formula_res_to_ocr_format`), so formula blocks participate
38/// in normal OCR matching — only Table and Seal are excluded.
39///
40/// NOTE: After inline formula injection, formula elements have been absorbed
41/// into text regions and should be excluded from stitching to prevent duplication.
42const EXCLUDED_FROM_OCR_LABELS: [LayoutElementType; 3] = [
43    LayoutElementType::Table,
44    LayoutElementType::Seal,
45    LayoutElementType::Formula, // Exclude formulas to prevent duplicate rendering after injection
46];
47
/// Thresholds and switches that control how `ResultStitcher` combines OCR
/// text with tables and layout elements.
///
/// `StitchConfig::default()` provides the values used by
/// `ResultStitcher::stitch`.
#[derive(Clone, Debug, PartialEq)]
pub struct StitchConfig {
    /// Overlap threshold expressed in pixels.
    /// NOTE(review): presumably consumed by the bounding-box overlap test; the
    /// consumer is not visible in this part of the file — confirm.
    pub overlap_min_pixels: f32,
    /// Minimum IoA threshold for assigning text to a table cell.
    /// NOTE(review): meaning inferred from the name — confirm against its use.
    pub cell_text_min_ioa: f32,
    /// Whether a text box must have its centre inside a cell to be assigned to it.
    /// NOTE(review): meaning inferred from the name — confirm against its use.
    pub require_text_center_inside_cell: bool,
    /// Minimum IoU threshold related to cell merging.
    /// NOTE(review): meaning inferred from the name — confirm against its use.
    pub cell_merge_min_iou: f32,
    /// Minimum IoU threshold between a formula and a table cell.
    /// NOTE(review): table stitching now injects formulas into the OCR
    /// candidate pool instead of attaching them separately, so this may no
    /// longer be read — confirm.
    pub formula_to_cell_min_iou: f32,
    /// Fallback pixel tolerance for line grouping.
    pub same_line_y_tolerance: f32,
    /// Minimum vertical overlap ratio (intersection / min(line_height)) to treat two spans as one line.
    pub line_height_iou_threshold: f32,
    /// Whether to enable cross-cell OCR box splitting.
    /// When enabled, OCR boxes that span multiple table cells will be split
    /// at cell boundaries and their text distributed proportionally.
    pub enable_cross_cell_split: bool,
}
64
65impl Default for StitchConfig {
66    fn default() -> Self {
67        Self {
68            overlap_min_pixels: 3.0,
69            cell_text_min_ioa: 0.6,
70            require_text_center_inside_cell: true,
71            cell_merge_min_iou: 0.3,
72            formula_to_cell_min_iou: 0.01,
73            same_line_y_tolerance: 10.0,
74            line_height_iou_threshold: 0.6,
75            enable_cross_cell_split: true,
76        }
77    }
78}
79
/// Stitcher for combining results from different OCR tasks.
///
/// This is a stateless unit type: all functionality is exposed through
/// associated functions such as `ResultStitcher::stitch`.
#[derive(Debug, Clone, Copy, Default)]
pub struct ResultStitcher;
82
83impl ResultStitcher {
84    /// Stitches text regions into layout elements and tables within the structure result.
85    ///
86    /// This method follows PP-StructureV3's fusion strategy:
87    /// 1. Stitch OCR text into tables (cell-level matching)
88    /// 2. Stitch OCR text into layout elements (excluding formula/table/seal)
89    /// 3. Fill formula elements with LaTeX content from formula results
90    /// 4. Create new text elements for orphan OCR regions
91    /// 5. Sort elements and assign reading order indices
92    pub fn stitch(result: &mut StructureResult) {
93        let cfg = StitchConfig::default();
94        Self::stitch_with_config(result, &cfg);
95    }
96
97    pub fn stitch_with_config(result: &mut StructureResult, cfg: &StitchConfig) {
98        // Track which regions have been used
99        let mut used_region_indices = std::collections::HashSet::new();
100
101        // Get text regions (clone to avoid borrow issues, make mutable for injection)
102        let mut regions = result.text_regions.clone().unwrap_or_default();
103
104        tracing::debug!("Stitching: {} text regions", regions.len());
105
106        // 1. Stitch text into tables
107        // For tables, we also want recognized formulas to participate in cell content
108        // matching, similar to how formulas are injected into the OCR results used
109        // for table recognition.
110        Self::stitch_tables(
111            &mut result.tables,
112            &regions,
113            &result.formulas,
114            &mut used_region_indices,
115            cfg,
116        );
117
118        tracing::debug!(
119            "After stitch_tables: {} regions used",
120            used_region_indices.len()
121        );
122
123        // 1.5. Fill formula elements with LaTeX content FIRST
124        // This must happen before inject_inline_formulas so formulas have text content
125        Self::fill_formula_elements(&mut result.layout_elements, &result.formulas, cfg);
126
127        // 1.6. Inject inline formulas into text regions
128        // PaddleX: Small formula elements that overlap with text elements should be
129        // absorbed into the text flow, not kept as separate layout elements.
130        // This creates TextRegion entries with label="formula" that will be wrapped
131        // with $...$ delimiters during text joining.
132        Self::inject_inline_formulas(&mut result.layout_elements, &mut regions, cfg);
133
134        // 2. Stitch text into layout elements (excluding special types)
135        // Note: after inject_inline_formulas, some formula elements have had their text cleared
136        // These won't be rendered separately in to_markdown
137        Self::stitch_layout_elements(
138            &mut result.layout_elements,
139            &regions,
140            &mut used_region_indices,
141            cfg,
142        );
143
144        tracing::debug!(
145            "After stitch_layout_elements: {} regions used",
146            used_region_indices.len()
147        );
148
149        // Note: fill_formula_elements was already called before inject_inline_formulas
150        // Do NOT call it again here, as it would re-fill formulas that were injected and cleared
151
152        // 3. Mark text regions that overlap with Seal elements as used
153        // to prevent them from becoming orphans.
154        // - Seals: content comes from specialized seal OCR.
155        // - Tables: content comes from OCR stitching. We do NOT suppress tables here because
156        //   text inside a table that wasn't assigned to a cell (in step 1) should be preserved
157        //   as an orphan (e.g. caption, header, or matching failure).
158        // - Formulas: now handled through normal OCR matching (step 2), already marked used.
159        for element in &result.layout_elements {
160            if element.element_type == LayoutElementType::Seal {
161                for (idx, region) in regions.iter().enumerate() {
162                    if Self::is_overlapping(&element.bbox, &region.bounding_box, cfg) {
163                        used_region_indices.insert(idx);
164                    }
165                }
166            }
167        }
168
169        // 5. Handle unmatched text regions (create new layout elements)
170        // PP-StructureV3 alignment: Filter out orphan text regions that significantly overlap
171        // with table regions, as these are likely table cell text that failed to match cells.
172        // These shouldn't become separate layout elements.
173        let table_bboxes: Vec<&BoundingBox> = result
174            .layout_elements
175            .iter()
176            .filter(|e| e.element_type == LayoutElementType::Table)
177            .map(|e| &e.bbox)
178            .collect();
179
180        let image_chart_bboxes: Vec<&BoundingBox> = result
181            .layout_elements
182            .iter()
183            .filter(|e| {
184                matches!(
185                    e.element_type,
186                    LayoutElementType::Image | LayoutElementType::Chart
187                )
188            })
189            .map(|e| &e.bbox)
190            .collect();
191
192        // Collect figure/chart caption bboxes to infer undetected figure regions.
193        // When the layout model detects a caption (e.g. "Figure 3...") but misses
194        // the figure image itself, OCR text from the figure diagram becomes orphans.
195        // We infer the figure area as the region above each caption within its x-range.
196        let figure_caption_bboxes: Vec<&BoundingBox> = result
197            .layout_elements
198            .iter()
199            .filter(|e| {
200                matches!(
201                    e.element_type,
202                    LayoutElementType::FigureTitle
203                        | LayoutElementType::ChartTitle
204                        | LayoutElementType::FigureTableChartTitle
205                )
206            })
207            .map(|e| &e.bbox)
208            .collect();
209
210        // Collect text/title element bboxes to check if an orphan is already
211        // covered by a known content element (avoid filtering legitimate text)
212        let content_element_bboxes: Vec<&BoundingBox> = result
213            .layout_elements
214            .iter()
215            .filter(|e| {
216                matches!(
217                    e.element_type,
218                    LayoutElementType::Text
219                        | LayoutElementType::DocTitle
220                        | LayoutElementType::ParagraphTitle
221                        | LayoutElementType::Abstract
222                )
223            })
224            .map(|e| &e.bbox)
225            .collect();
226
227        let original_element_count = result.layout_elements.len();
228        let mut new_elements = Vec::new();
229        for (idx, region) in regions.iter().enumerate() {
230            if !used_region_indices.contains(&idx)
231                && let Some(text) = &region.text
232            {
233                // Filter out text that overlaps significantly with tables
234                // These are likely table cell text that didn't match any cell
235                let overlaps_table = table_bboxes
236                    .iter()
237                    .any(|table_bbox| region.bounding_box.ioa(table_bbox) > 0.3);
238
239                if overlaps_table {
240                    // Skip - this text is inside a table and should not be a separate element
241                    continue;
242                }
243
244                // Filter out text inside Image/Chart regions
245                let overlaps_image_chart = image_chart_bboxes
246                    .iter()
247                    .any(|bbox| region.bounding_box.ioa(bbox) > 0.5);
248
249                if overlaps_image_chart {
250                    continue;
251                }
252
253                // Filter out text in inferred figure regions (above figure/chart captions).
254                // When the layout model detects a caption but not the figure itself,
255                // OCR'd annotations from the figure diagram leak as orphan text.
256                // Check: orphan is above a caption, within its x-range, and not inside
257                // any existing text/title element.
258                let in_inferred_figure_region = figure_caption_bboxes.iter().any(|cap| {
259                    let orphan_bb = &region.bounding_box;
260                    // Orphan must be above or overlapping with the caption's top
261                    let above_caption = orphan_bb.y_max() < cap.y_max();
262                    // Orphan must be within the caption's horizontal range (with margin)
263                    let x_margin = (cap.x_max() - cap.x_min()) * 0.1;
264                    let in_x_range = orphan_bb.x_min() >= (cap.x_min() - x_margin)
265                        && orphan_bb.x_max() <= (cap.x_max() + x_margin);
266                    above_caption && in_x_range
267                });
268
269                if in_inferred_figure_region {
270                    // Verify the orphan is NOT inside any existing text/title element
271                    let inside_content_element = content_element_bboxes
272                        .iter()
273                        .any(|bbox| region.bounding_box.ioa(bbox) > 0.5);
274                    if !inside_content_element {
275                        continue;
276                    }
277                }
278
279                // Check if this orphan region is a formula
280                // Create a new layout element for this orphan text
281                // If it's a formula (label="formula"), create a Formula element, otherwise Text
282                let element_type = if region.is_formula() {
283                    LayoutElementType::Formula
284                } else {
285                    LayoutElementType::Text
286                };
287
288                let element = LayoutElement::new(
289                    region.bounding_box.clone(),
290                    element_type,
291                    region.confidence.unwrap_or(0.0),
292                )
293                .with_text(text.as_ref().to_string());
294
295                new_elements.push(element);
296            }
297        }
298
299        // If region_blocks exist, assign orphan elements to their containing regions
300        // and update element_indices to maintain proper grouping
301        if let Some(ref mut region_blocks) = result.region_blocks {
302            for (new_idx, new_element) in new_elements.iter().enumerate() {
303                let element_index = original_element_count + new_idx;
304
305                // Find the region that best contains this orphan element
306                let mut best_region_idx: Option<usize> = None;
307                let mut best_overlap = 0.0f32;
308
309                for (region_idx, region) in region_blocks.iter().enumerate() {
310                    // Check if this element overlaps with the region bbox
311                    let overlap = new_element.bbox.intersection_area(&region.bbox);
312                    if overlap > best_overlap {
313                        best_overlap = overlap;
314                        best_region_idx = Some(region_idx);
315                    }
316                }
317
318                // Add to the best matching region, or leave unassigned if no overlap
319                if let Some(region_idx) = best_region_idx {
320                    region_blocks[region_idx]
321                        .element_indices
322                        .push(element_index);
323                }
324            }
325        }
326
327        result.layout_elements.extend(new_elements);
328
329        // 6. Sort all layout elements spatially and assign order indices
330        // PP-StructureV3: When region_blocks is present, elements are already sorted
331        // by hierarchical region order - skip re-sorting to preserve the structure
332        let width = if let Some(img) = &result.rectified_img {
333            img.width() as f32
334        } else {
335            // Estimate width from max x coordinate
336            result
337                .layout_elements
338                .iter()
339                .map(|e| e.bbox.x_max())
340                .fold(0.0f32, f32::max)
341                .max(1000.0) // default fallback
342        };
343
344        // When region_blocks exist, layout_elements are already sorted correctly
345        // by XY-cut with region hierarchy in structure.rs - do NOT re-sort here.
346        // Only sort when region_blocks is NOT present.
347        if result.region_blocks.is_none() {
348            let height = if let Some(img) = &result.rectified_img {
349                img.height() as f32
350            } else {
351                result
352                    .layout_elements
353                    .iter()
354                    .map(|e| e.bbox.y_max())
355                    .fold(0.0f32, f32::max)
356                    .max(1000.0)
357            };
358            Self::sort_layout_elements_enhanced(&mut result.layout_elements, width, height);
359        }
360
361        // Assign order indices regardless of sorting
362        Self::assign_order_indices(&mut result.layout_elements);
363    }
364
365    /// Assigns reading order indices to layout elements.
366    ///
367    /// Only elements that should be included in reading order get an index.
368    /// PP-StructureV3 includes: text, titles, tables, formulas, images, seals, etc.
369    fn assign_order_indices(elements: &mut [LayoutElement]) {
370        let mut order_index = 1u32;
371        for element in elements.iter_mut() {
372            // Assign order index to elements that should be in reading order
373            // (matching PP-StructureV3's visualize_index_labels)
374            if Self::should_have_order_index(element.element_type) {
375                element.order_index = Some(order_index);
376                order_index += 1;
377            }
378        }
379    }
380
381    /// Determines if an element type should have a reading order index.
382    ///
383    /// Based on PP-StructureV3's `visualize_index_labels`.
384    fn should_have_order_index(element_type: LayoutElementType) -> bool {
385        matches!(
386            element_type,
387            LayoutElementType::Text
388                | LayoutElementType::Content
389                | LayoutElementType::Abstract
390                | LayoutElementType::DocTitle
391                | LayoutElementType::ParagraphTitle
392                | LayoutElementType::Table
393                | LayoutElementType::Image
394                | LayoutElementType::Chart
395                | LayoutElementType::Formula
396                | LayoutElementType::Seal
397                | LayoutElementType::Reference
398                | LayoutElementType::ReferenceContent
399                | LayoutElementType::List
400                | LayoutElementType::FigureTitle
401                | LayoutElementType::TableTitle
402                | LayoutElementType::ChartTitle
403                | LayoutElementType::FigureTableChartTitle
404        )
405    }
406
407    fn stitch_tables(
408        tables: &mut [TableResult],
409        text_regions: &[TextRegion],
410        formulas: &[FormulaResult],
411        used_indices: &mut std::collections::HashSet<usize>,
412        cfg: &StitchConfig,
413    ) {
414        for (table_idx, table) in tables.iter_mut().enumerate() {
415            if table.cells.is_empty() {
416                continue;
417            }
418            // Use the explicit is_e2e flag from the table analyzer to determine
419            // the matching strategy, instead of inferring from confidence values.
420            let has_detected_cells = table.detected_cell_bboxes.is_some();
421            let e2e_like_cells = table.is_e2e && !has_detected_cells;
422
423            // 1. Filter relevant text regions (those overlapping the table area)
424            let table_bbox = table.bbox.clone(); // Use table bbox
425            let relevant_indices: Vec<usize> = text_regions
426                .iter()
427                .enumerate()
428                .filter(|(idx, region)| {
429                    !used_indices.contains(idx)
430                        && Self::is_overlapping(&table_bbox, &region.bounding_box, cfg)
431                })
432                .map(|(idx, _)| idx)
433                .collect();
434
435            // 1.5. Cross-cell OCR splitting (new step)
436            // Detect OCR boxes that span multiple cells and split them at cell boundaries.
437            // This improves accuracy for complex tables with rowspan/colspan.
438            let (split_regions, split_ocr_indices, _split_cell_assignments) =
439                if cfg.enable_cross_cell_split && !e2e_like_cells {
440                    Self::split_cross_cell_ocr_boxes(text_regions, &relevant_indices, &table.cells)
441                } else {
442                    (
443                        Vec::new(),
444                        std::collections::HashSet::new(),
445                        std::collections::HashMap::new(),
446                    )
447                };
448
449            // Build OCR candidate pool (split regions + unsplit original regions).
450            let mut ocr_candidates: Vec<(OcrSource, TextRegion)> = Vec::new();
451
452            for region in &split_regions {
453                let mut normalized_region = region.clone();
454                Self::normalize_tiny_symbol_for_paddlex(&mut normalized_region);
455
456                if normalized_region
457                    .text
458                    .as_ref()
459                    .map(|t| !t.trim().is_empty())
460                    .unwrap_or(false)
461                {
462                    ocr_candidates.push((OcrSource::Split, normalized_region));
463                }
464            }
465
466            // Mark split original indices as used and keep only unsplit originals in candidate pool.
467            for &ocr_idx in &relevant_indices {
468                if split_ocr_indices.contains(&ocr_idx) {
469                    used_indices.insert(ocr_idx);
470                    continue;
471                }
472
473                if let Some(region) = text_regions.get(ocr_idx) {
474                    let mut normalized_region = region.clone();
475                    Self::normalize_tiny_symbol_for_paddlex(&mut normalized_region);
476
477                    if normalized_region
478                        .text
479                        .as_ref()
480                        .map(|t| !t.trim().is_empty())
481                        .unwrap_or(false)
482                    {
483                        ocr_candidates.push((OcrSource::Original(ocr_idx), normalized_region));
484                    }
485                }
486            }
487
488            // PaddleX: inject formula results into table OCR candidate pool with $...$
489            // wrapping (table_contents_for_img). This lets formulas participate in normal
490            // cell matching, so formula content appears in the correct table cells.
491            for formula in formulas {
492                let w = formula.bbox.x_max() - formula.bbox.x_min();
493                let h = formula.bbox.y_max() - formula.bbox.y_min();
494                if w <= 1.0 || h <= 1.0 {
495                    continue;
496                }
497                if !Self::is_overlapping(&table_bbox, &formula.bbox, cfg) {
498                    continue;
499                }
500                let latex = &formula.latex;
501                let formatted = if latex.starts_with('$') && latex.ends_with('$') {
502                    latex.clone()
503                } else {
504                    format!("${}$", latex)
505                };
506                let mut formula_region = TextRegion::new(formula.bbox.clone());
507                formula_region.text = Some(formatted.into());
508                formula_region.confidence = Some(1.0);
509                ocr_candidates.push((OcrSource::Split, formula_region));
510            }
511
512            let structure_tokens = table.structure_tokens.clone();
513
514            // Prefer PaddleX-style row-aware matching when structure tokens are available.
515            // Use row-aware matching when cell detection was used (non-E2E mode).
516            let mut td_to_cell_mapping: Option<Vec<Option<usize>>> = None;
517            if !e2e_like_cells
518                && let Some(tokens) = structure_tokens.as_deref()
519                && !ocr_candidates.is_empty()
520                && let Some((mapping, matched_candidate_indices)) =
521                    Self::match_table_cells_with_structure_rows(
522                        &mut table.cells,
523                        tokens,
524                        &ocr_candidates,
525                        cfg.same_line_y_tolerance,
526                        table.detected_cell_bboxes.as_deref(),
527                    )
528            {
529                td_to_cell_mapping = Some(mapping);
530                for matched_idx in matched_candidate_indices {
531                    if let Some((OcrSource::Original(region_idx), _)) =
532                        ocr_candidates.get(matched_idx)
533                    {
534                        used_indices.insert(*region_idx);
535                    }
536                }
537            }
538
539            // Fallback matcher: assign each OCR box to the best-overlapping cell.
540            if td_to_cell_mapping.is_none() {
541                let (cell_to_ocr, matched_candidate_indices) =
542                    Self::match_table_and_ocr_by_iou_distance(
543                        &table.cells,
544                        &ocr_candidates,
545                        !e2e_like_cells, // E2E parity: allow nearest-cell assignment even when IoU=0.
546                        e2e_like_cells,  // E2E parity: use PaddleX distance metric.
547                    );
548
549                for matched_idx in matched_candidate_indices {
550                    if let Some((OcrSource::Original(region_idx), _)) =
551                        ocr_candidates.get(matched_idx)
552                    {
553                        used_indices.insert(*region_idx);
554                    }
555                }
556
557                for (cell_idx, cell) in table.cells.iter_mut().enumerate() {
558                    let has_text = cell
559                        .text
560                        .as_ref()
561                        .map(|t| !t.trim().is_empty())
562                        .unwrap_or(false);
563                    if has_text {
564                        continue;
565                    }
566
567                    if let Some(candidate_indices) = cell_to_ocr.get(&cell_idx) {
568                        if e2e_like_cells {
569                            let joined = Self::join_ocr_texts_paddlex_style(
570                                candidate_indices,
571                                &ocr_candidates,
572                            );
573                            if !joined.is_empty() {
574                                cell.text = Some(joined);
575                            }
576                        } else {
577                            let mut cell_text_regions: Vec<(&TextRegion, &str)> = candidate_indices
578                                .iter()
579                                .filter_map(|&idx| {
580                                    ocr_candidates
581                                        .get(idx)
582                                        .and_then(|(_, r)| r.text.as_deref().map(|t| (r, t)))
583                                })
584                                .collect();
585
586                            Self::sort_and_join_texts(
587                                &mut cell_text_regions,
588                                Some(&cell.bbox),
589                                cfg,
590                                |joined| {
591                                    if !joined.is_empty() {
592                                        cell.text = Some(joined);
593                                    }
594                                },
595                            );
596                        }
597                    }
598                }
599            }
600
601            // Formulas are now injected into the OCR candidate pool above,
602            // so they participate in normal cell matching — no separate attach step needed.
603
604            // Optional postprocess for checkbox-style tables:
605            // normalize common OCR confusions like ü/L/X into ✓/✗ when the table
606            // clearly exhibits both positive and negative marker patterns.
607            Self::normalize_checkbox_symbols_in_table(&mut table.cells);
608
609            // Regenerate HTML from structure tokens and stitched cell text.
610            if let Some(tokens) = structure_tokens.as_deref() {
611                let cell_texts: Vec<Option<String>> =
612                    if let Some(ref td_mapping) = td_to_cell_mapping {
613                        // Use the mapping from row-aware matching
614                        td_mapping
615                            .iter()
616                            .map(|cell_idx| {
617                                cell_idx
618                                    .and_then(|idx| table.cells.get(idx))
619                                    .and_then(|cell| cell.text.clone())
620                            })
621                            .collect()
622                    } else {
623                        // Fallback: cells may not be in the same order as structure_tokens.
624                        // We need to create a mapping from cell bbox to its index, then
625                        // iterate through tokens to collect texts in the correct order.
626                        Self::collect_cell_texts_for_tokens(&table.cells, tokens)
627                    };
628
629                let html_structure =
630                    crate::processors::wrap_table_html_with_content(tokens, &cell_texts);
631                table.html_structure = Some(html_structure);
632                table.cell_texts = Some(cell_texts);
633            }
634
635            tracing::debug!("Table {}: matching complete.", table_idx);
636        }
637    }
638
639    /// Fallback OCR->cell matcher using IoU+distance cost (PaddleX-compatible).
640    ///
641    /// Returns:
642    /// - `HashMap<cell_idx, Vec<candidate_idx>>`: assigned OCR candidates per cell
643    /// - `HashSet<candidate_idx>`: matched OCR candidate indices
644    fn match_table_and_ocr_by_iou_distance(
645        cells: &[TableCell],
646        ocr_candidates: &[(OcrSource, TextRegion)],
647        require_positive_iou: bool,
648        use_paddlex_distance: bool,
649    ) -> (
650        std::collections::HashMap<usize, Vec<usize>>,
651        std::collections::HashSet<usize>,
652    ) {
653        let mut cell_to_ocr: std::collections::HashMap<usize, Vec<usize>> =
654            std::collections::HashMap::new();
655        let mut matched_candidate_indices = std::collections::HashSet::new();
656
657        if cells.is_empty() || ocr_candidates.is_empty() {
658            return (cell_to_ocr, matched_candidate_indices);
659        }
660
661        for (candidate_idx, (_, region)) in ocr_candidates.iter().enumerate() {
662            let ocr_bbox = &region.bounding_box;
663
664            // Strategy 1: Center-point-in-cell with high IoA (strongest signal).
665            // If the OCR box center falls inside a cell AND the box has high overlap
666            // with that cell (IoA > 0.7), assign directly. The IoA check avoids
667            // misassignment for boxes that straddle cell boundaries.
668            let ocr_cx = (ocr_bbox.x_min() + ocr_bbox.x_max()) / 2.0;
669            let ocr_cy = (ocr_bbox.y_min() + ocr_bbox.y_max()) / 2.0;
670            let center_cell = cells.iter().enumerate().find(|(_, cell)| {
671                ocr_cx >= cell.bbox.x_min()
672                    && ocr_cx <= cell.bbox.x_max()
673                    && ocr_cy >= cell.bbox.y_min()
674                    && ocr_cy <= cell.bbox.y_max()
675                    && ocr_bbox.ioa(&cell.bbox) > 0.7
676            });
677
678            if let Some((cell_idx, _)) = center_cell {
679                cell_to_ocr.entry(cell_idx).or_default().push(candidate_idx);
680                matched_candidate_indices.insert(candidate_idx);
681                continue;
682            }
683
684            // Strategy 2+3: IoU + distance fallback
685            let mut best_cell_idx: Option<usize> = None;
686            let mut min_cost = (f32::MAX, f32::MAX);
687            let mut candidate_costs: Vec<(usize, (f32, f32))> = Vec::new();
688
689            for (cell_idx, cell) in cells.iter().enumerate() {
690                let iou = Self::calculate_iou(&region.bounding_box, &cell.bbox);
691                if require_positive_iou && iou <= 0.0 {
692                    continue;
693                }
694
695                let dist = if use_paddlex_distance {
696                    Self::paddlex_distance(&cell.bbox, &region.bounding_box)
697                } else {
698                    Self::l1_distance(&region.bounding_box, &cell.bbox)
699                };
700                let cost = (1.0 - iou, dist);
701                candidate_costs.push((cell_idx, cost));
702                if Self::is_better_paddlex_match_cost(cost, min_cost, cell_idx, best_cell_idx) {
703                    min_cost = cost;
704                    best_cell_idx = Some(cell_idx);
705                }
706            }
707
708            if let Some(mut cell_idx) = best_cell_idx {
709                if use_paddlex_distance {
710                    cell_idx = Self::maybe_prefer_upper_boundary_cell(
711                        cells,
712                        &region.bounding_box,
713                        cell_idx,
714                        min_cost,
715                        &candidate_costs,
716                    );
717                }
718                cell_to_ocr.entry(cell_idx).or_default().push(candidate_idx);
719                matched_candidate_indices.insert(candidate_idx);
720            }
721        }
722
723        (cell_to_ocr, matched_candidate_indices)
724    }
725
726    /// PaddleX-compatible cost ordering with deterministic near-tie handling.
727    ///
728    /// PaddleX matches by sorting on `(1 - IoU, distance)` and taking the first index.
729    /// To avoid unstable flips from tiny float noise at row boundaries, we treat
730    /// near-equal costs as a tie and keep the earlier cell index.
731    fn is_better_paddlex_match_cost(
732        candidate_cost: (f32, f32),
733        current_cost: (f32, f32),
734        candidate_idx: usize,
735        current_idx: Option<usize>,
736    ) -> bool {
737        const COST_EPS: f32 = 1e-4;
738
739        // Ignore invalid candidates.
740        if !candidate_cost.0.is_finite() || !candidate_cost.1.is_finite() {
741            return false;
742        }
743
744        // First valid candidate always wins.
745        if !current_cost.0.is_finite() || !current_cost.1.is_finite() || current_idx.is_none() {
746            return true;
747        }
748
749        if candidate_cost.0 + COST_EPS < current_cost.0 {
750            return true;
751        }
752        if (candidate_cost.0 - current_cost.0).abs() <= COST_EPS {
753            if candidate_cost.1 + COST_EPS < current_cost.1 {
754                return true;
755            }
756            if (candidate_cost.1 - current_cost.1).abs() <= COST_EPS
757                && let Some(existing_idx) = current_idx
758            {
759                return candidate_idx < existing_idx;
760            }
761        }
762
763        false
764    }
765
766    /// PaddleX-like boundary correction for E2E matching.
767    ///
768    /// PaddleX table structure boxes are integerized before matching; around row
769    /// boundaries, that can keep a straddling OCR fragment in the upper cell.
770    /// Our float boxes can shift this by <1 px and assign to the lower row.
771    /// For those near-boundary cases, prefer the directly upper cell in the same
772    /// column when both rows have substantial overlap.
773    fn maybe_prefer_upper_boundary_cell(
774        cells: &[TableCell],
775        ocr_box: &BoundingBox,
776        best_cell_idx: usize,
777        best_cost: (f32, f32),
778        candidate_costs: &[(usize, (f32, f32))],
779    ) -> usize {
780        const BOUNDARY_COST_IOU_DELTA: f32 = 0.12;
781        const BOUNDARY_OVERLAP_MIN: f32 = 0.35;
782
783        let Some(best_cell) = cells.get(best_cell_idx) else {
784            return best_cell_idx;
785        };
786        let (Some(best_row), Some(best_col)) = (best_cell.row, best_cell.col) else {
787            return best_cell_idx;
788        };
789        if best_row == 0 {
790            return best_cell_idx;
791        }
792
793        let upper_cell_idx = cells
794            .iter()
795            .position(|cell| cell.row == Some(best_row - 1) && cell.col == Some(best_col));
796        let Some(upper_cell_idx) = upper_cell_idx else {
797            return best_cell_idx;
798        };
799
800        let boundary_y = best_cell.bbox.y_min();
801        if !(ocr_box.y_min() < boundary_y && ocr_box.y_max() > boundary_y) {
802            return best_cell_idx;
803        }
804
805        let best_inter = Self::compute_inter(&best_cell.bbox, ocr_box);
806        let Some(upper_cell) = cells.get(upper_cell_idx) else {
807            return best_cell_idx;
808        };
809        let upper_inter = Self::compute_inter(&upper_cell.bbox, ocr_box);
810        if best_inter < BOUNDARY_OVERLAP_MIN || upper_inter < BOUNDARY_OVERLAP_MIN {
811            return best_cell_idx;
812        }
813
814        let upper_cost = candidate_costs
815            .iter()
816            .find_map(|(idx, cost)| (*idx == upper_cell_idx).then_some(*cost));
817        let Some(upper_cost) = upper_cost else {
818            return best_cell_idx;
819        };
820        if !upper_cost.0.is_finite() || !upper_cost.1.is_finite() {
821            return best_cell_idx;
822        }
823
824        if upper_cost.0 <= best_cost.0 + BOUNDARY_COST_IOU_DELTA {
825            upper_cell_idx
826        } else {
827            best_cell_idx
828        }
829    }
830
831    /// Normalizes a few low-confidence tiny symbols toward PaddleX-like output.
832    ///
833    /// Tiny punctuation is sensitive to sub-pixel crop differences. We only apply
834    /// this to single-character, low-confidence candidates in very small boxes.
835    fn normalize_tiny_symbol_for_paddlex(region: &mut TextRegion) {
836        let Some(text) = region.text.as_deref() else {
837            return;
838        };
839        if text.chars().count() != 1 {
840            return;
841        }
842        let Some(score) = region.confidence else {
843            return;
844        };
845
846        let width = (region.bounding_box.x_max() - region.bounding_box.x_min()).max(0.0);
847        let height = (region.bounding_box.y_max() - region.bounding_box.y_min()).max(0.0);
848
849        let replacement = if text == "=" && score < 0.45 && width <= 9.5 && height <= 7.5 {
850            Some(",")
851        } else if text == "=" && score < 0.45 && width <= 12.5 && height > 7.5 && height <= 10.5 {
852            Some("-")
853        } else if text == "0" && score < 0.20 && width <= 14.5 && height <= 14.5 {
854            Some(";")
855        } else {
856            None
857        };
858
859        if let Some(value) = replacement {
860            region.text = Some(std::sync::Arc::<str>::from(value));
861        }
862    }
863
864    fn normalize_checkbox_symbols_in_table(cells: &mut [TableCell]) {
865        let mut has_positive_candidate = false;
866        let mut has_negative_candidate = false;
867
868        for cell in cells.iter() {
869            let Some(text) = cell.text.as_deref() else {
870                continue;
871            };
872            let trimmed = text.trim();
873            if trimmed.chars().count() != 1 {
874                continue;
875            }
876            match trimmed.chars().next().unwrap_or_default() {
877                '✓' | 'ü' | 'Ü' | 'L' | '√' | '☑' => has_positive_candidate = true,
878                '✗' | 'X' | 'x' | '✕' | '✖' | '☒' => has_negative_candidate = true,
879                _ => {}
880            }
881        }
882
883        for cell in cells.iter_mut() {
884            let Some(text) = cell.text.clone() else {
885                continue;
886            };
887            let trimmed = text.trim();
888            if trimmed.chars().count() != 1 {
889                continue;
890            }
891            let mapped = match trimmed.chars().next().unwrap_or_default() {
892                // Safe positive normalization.
893                'ü' | 'Ü' | '√' | '☑' => Some("✓"),
894                // Ambiguous L is normalized only when the table appears checkbox-like.
895                'L' if has_positive_candidate && has_negative_candidate => Some("✓"),
896                // Safe negative normalization.
897                '✕' | '✖' | '☒' => Some("✗"),
898                // Ambiguous X/x are normalized only when the table appears checkbox-like.
899                'X' | 'x' if has_positive_candidate && has_negative_candidate => Some("✗"),
900                _ => None,
901            };
902
903            if let Some(symbol) = mapped {
904                cell.text = Some(symbol.to_string());
905            }
906        }
907    }
908
909    /// PaddleX-style text concatenation for one cell.
910    fn join_ocr_texts_paddlex_style(
911        candidate_indices: &[usize],
912        ocr_candidates: &[(OcrSource, TextRegion)],
913    ) -> String {
914        let mut joined = String::new();
915
916        for (i, &candidate_idx) in candidate_indices.iter().enumerate() {
917            let Some((_, region)) = ocr_candidates.get(candidate_idx) else {
918                continue;
919            };
920            let Some(text) = region.text.as_deref() else {
921                continue;
922            };
923
924            let mut content = text.to_string();
925            if candidate_indices.len() > 1 {
926                if content.is_empty() {
927                    continue;
928                }
929                if content.starts_with(' ') {
930                    content = content[1..].to_string();
931                }
932                if content.starts_with("<b>") {
933                    content = content[3..].to_string();
934                }
935                if content.ends_with("</b>") {
936                    content.truncate(content.len().saturating_sub(4));
937                }
938                if content.is_empty() {
939                    continue;
940                }
941                if i != candidate_indices.len() - 1 && !content.ends_with(' ') {
942                    content.push_str("<br/>");
943                }
944            }
945            joined.push_str(&content);
946        }
947
948        joined
949    }
950
951    /// PaddleX-style row-aware OCR-to-cell matching.
952    ///
953    /// Returns:
954    /// - `Vec<Option<usize>>`: for each `<td>` in structure order, the mapped cell index
955    /// - `HashSet<usize>`: matched OCR candidate indices
956    fn match_table_cells_with_structure_rows(
957        cells: &mut [TableCell],
958        structure_tokens: &[String],
959        ocr_candidates: &[(OcrSource, TextRegion)],
960        row_y_tolerance: f32,
961        cell_bboxes_override: Option<&[BoundingBox]>,
962    ) -> Option<(Vec<Option<usize>>, std::collections::HashSet<usize>)> {
963        if cells.is_empty() || structure_tokens.is_empty() || ocr_candidates.is_empty() {
964            return None;
965        }
966
967        // --- Sort cells into rows ---
968        // Sort structure cells — their bboxes drive both IoA matching and the
969        // td→cell text-assignment step.  Detected-cell bboxes (cell_bboxes_override)
970        // are intentionally NOT used for IoA because the detected model can produce
971        // a different cell count per row than the structure tokens, causing local_idx
972        // to diverge from td_index and corrupting OCR-to-cell assignments.
973        //
974        // When cell_bboxes_override is present, cross-row OCR deduplication is
975        // enabled downstream to prevent large detected cells spanning multiple
976        // structure rows from duplicating content.
977        let (cell_sorted_indices, cell_row_flags) =
978            Self::sort_table_cells_boxes(cells, row_y_tolerance);
979
980        if cell_sorted_indices.is_empty() || cell_row_flags.is_empty() {
981            return None;
982        }
983
984        let mut row_start_index = Self::find_row_start_index(structure_tokens);
985        if row_start_index.is_empty() {
986            return None;
987        }
988
989        // Align structure-cell row flags with structure-token row boundaries.
990        // cell_aligned is used both for IoA matching (correct space) and td→cell mapping.
991        let mut cell_aligned = Self::map_and_get_max(&cell_row_flags, &row_start_index);
992        cell_aligned.push(cell_sorted_indices.len());
993        row_start_index.push(
994            structure_tokens
995                .iter()
996                .filter(|t| Self::is_td_end_token(t))
997                .count(),
998        );
999
1000        // --- Per-row matching: cell → OCR (PaddleX style) ---
1001        // For each cell in the row, collect ALL OCR boxes with IoA > 0.7.
1002        // When using detected cell bboxes (cell_bboxes_override is Some), apply
1003        // cross-row deduplication: an OCR box already claimed by an earlier row is
1004        // not re-matched in a later row.  This prevents large detected cells that
1005        // span multiple structure rows from duplicating their content across those rows.
1006        // In pure E2E mode (cell_bboxes_override is None) the PaddleX v2 behavior of
1007        // independent per-row matching is preserved.
1008        let use_cross_row_dedup = cell_bboxes_override.is_some();
1009        let mut globally_matched_ocr: std::collections::HashSet<usize> =
1010            std::collections::HashSet::new();
1011        let mut all_matched: Vec<std::collections::HashMap<usize, Vec<usize>>> = Vec::new();
1012
1013        for k in 0..cell_aligned.len().saturating_sub(1) {
1014            let row_start = cell_aligned[k].min(cell_sorted_indices.len());
1015            let row_end = cell_aligned[k + 1].min(cell_sorted_indices.len());
1016
1017            let mut matched: std::collections::HashMap<usize, Vec<usize>> =
1018                std::collections::HashMap::new();
1019
1020            for (local_idx, &cell_idx) in cell_sorted_indices[row_start..row_end].iter().enumerate()
1021            {
1022                // Always use structure cell bbox for IoA matching.  Detected-cell bboxes
1023                // (cell_bboxes_override) are not used here because their cell count per
1024                // row can differ from the structure td count, causing local_idx to
1025                // diverge from td_index and corrupt the OCR-to-cell assignment.
1026                let cell_box = &cells[cell_idx.min(cells.len() - 1)].bbox;
1027
1028                for (ocr_idx, (_, ocr_region)) in ocr_candidates.iter().enumerate() {
1029                    if use_cross_row_dedup && globally_matched_ocr.contains(&ocr_idx) {
1030                        continue;
1031                    }
1032                    // IoA = intersection / OCR_area (PaddleX compute_inter > 0.7)
1033                    let ioa = ocr_region.bounding_box.ioa(cell_box);
1034                    if ioa > 0.7 {
1035                        matched.entry(local_idx).or_default().push(ocr_idx);
1036                    }
1037                }
1038            }
1039
1040            if use_cross_row_dedup {
1041                for indices in matched.values() {
1042                    globally_matched_ocr.extend(indices.iter().copied());
1043                }
1044            }
1045
1046            all_matched.push(matched);
1047        }
1048
1049        // --- Build td_to_cell_mapping by iterating structure tokens ---
1050        // table.cells maps exactly 1:1 with td tokens in structure order.
1051        let mut td_to_cell_mapping: Vec<Option<usize>> = Vec::new();
1052        let mut matched_candidate_indices: std::collections::HashSet<usize> =
1053            std::collections::HashSet::new();
1054
1055        let mut td_index = 0usize;
1056        let mut td_count = 0usize;
1057        let mut matched_row_idx = 0usize;
1058
1059        for tag in structure_tokens {
1060            if tag == "<tr>" {
1061                td_index = 0; // Reset cell index at row start
1062                continue;
1063            }
1064            if !Self::is_td_end_token(tag) {
1065                continue;
1066            }
1067
1068            let row_matches = all_matched.get(matched_row_idx);
1069            let matched_ocr_indices = row_matches.and_then(|m| m.get(&td_index));
1070            let matched_text = matched_ocr_indices
1071                .and_then(|indices| Self::compose_matched_cell_text(indices, ocr_candidates));
1072
1073            if let Some(indices) = matched_ocr_indices {
1074                matched_candidate_indices.extend(indices.iter().copied());
1075            }
1076
1077            // Map td position to the original cell index via sorted ordering.
1078            // Use cell_aligned (derived from structure-cell row flags) rather than
1079            // match_aligned (derived from detected-cell row flags).  When the two
1080            // models disagree on cell count per row, using match_aligned here would
1081            // offset into the wrong row of cell_sorted_indices.
1082            let mapped_cell_idx = cell_aligned
1083                .get(matched_row_idx)
1084                .copied()
1085                .and_then(|row_start| {
1086                    let sorted_pos = row_start + td_index;
1087                    cell_sorted_indices.get(sorted_pos).copied()
1088                })
1089                .filter(|&idx| idx < cells.len());
1090
1091            td_to_cell_mapping.push(mapped_cell_idx);
1092
1093            if let (Some(cell_idx), Some(text)) = (mapped_cell_idx, matched_text)
1094                && let Some(cell) = cells.get_mut(cell_idx)
1095            {
1096                let has_text = cell
1097                    .text
1098                    .as_ref()
1099                    .map(|t| !t.trim().is_empty())
1100                    .unwrap_or(false);
1101                if !has_text {
1102                    cell.text = Some(text);
1103                }
1104            }
1105
1106            td_index += 1;
1107            td_count += 1;
1108
1109            if matched_row_idx + 1 < row_start_index.len()
1110                && td_count >= row_start_index[matched_row_idx + 1]
1111            {
1112                matched_row_idx += 1;
1113            }
1114        }
1115
1116        if td_to_cell_mapping.is_empty() {
1117            None
1118        } else {
1119            Some((td_to_cell_mapping, matched_candidate_indices))
1120        }
1121    }
1122
1123    /// Collects cell texts in the order they appear in structure tokens.
1124    ///
1125    /// Uses grid-based `(row, col)` matching when cells have grid info, which
1126    /// correctly handles rowspan/colspan cases where cells.len() != td_count.
1127    /// Falls back to index-based matching when grid info is unavailable.
1128    fn collect_cell_texts_for_tokens(
1129        cells: &[TableCell],
1130        tokens: &[String],
1131    ) -> Vec<Option<String>> {
1132        if cells.is_empty() {
1133            return Vec::new();
1134        }
1135
1136        // Parse grid positions for each <td> token
1137        let token_grid = parse_cell_grid_info(tokens);
1138        let td_count = token_grid.len();
1139
1140        // Build a lookup from (row, col) -> cell index for cells that have grid info
1141        let mut grid_to_cell: std::collections::HashMap<(usize, usize), usize> =
1142            std::collections::HashMap::new();
1143        let mut has_grid_info = false;
1144
1145        for (cell_idx, cell) in cells.iter().enumerate() {
1146            if let (Some(row), Some(col)) = (cell.row, cell.col) {
1147                grid_to_cell.insert((row, col), cell_idx);
1148                has_grid_info = true;
1149            }
1150        }
1151
1152        if has_grid_info {
1153            // Grid-based matching: match tokens to cells by (row, col) position
1154            token_grid
1155                .iter()
1156                .map(|gi| {
1157                    grid_to_cell
1158                        .get(&(gi.row, gi.col))
1159                        .and_then(|&idx| cells.get(idx))
1160                        .and_then(|cell| cell.text.clone())
1161                })
1162                .collect()
1163        } else {
1164            // Fallback: cells don't have grid info, use index-based matching
1165            (0..td_count)
1166                .map(|i| cells.get(i).and_then(|cell| cell.text.clone()))
1167                .collect()
1168        }
1169    }
1170
1171    /// Sort table cells row-by-row (top-to-bottom, left-to-right) and return row flags.
1172    ///
1173    /// Returns `(sorted_indices, flags)` where `flags` contains cumulative row starts.
1174    fn sort_table_cells_boxes(
1175        cells: &[TableCell],
1176        row_y_tolerance: f32,
1177    ) -> (Vec<usize>, Vec<usize>) {
1178        if cells.is_empty() {
1179            return (Vec::new(), Vec::new());
1180        }
1181
1182        let mut by_y: Vec<usize> = (0..cells.len()).collect();
1183        by_y.sort_by(|&a, &b| {
1184            cells[a]
1185                .bbox
1186                .y_min()
1187                .partial_cmp(&cells[b].bbox.y_min())
1188                .unwrap_or(Ordering::Equal)
1189        });
1190
1191        let mut rows: Vec<Vec<usize>> = Vec::new();
1192        let mut current_row: Vec<usize> = Vec::new();
1193        let mut current_y: Option<f32> = None;
1194
1195        for idx in by_y {
1196            let y = cells[idx].bbox.y_min();
1197            match current_y {
1198                None => {
1199                    current_row.push(idx);
1200                    current_y = Some(y);
1201                }
1202                Some(row_y) if (y - row_y).abs() <= row_y_tolerance => {
1203                    current_row.push(idx);
1204                }
1205                Some(_) => {
1206                    current_row.sort_by(|&a, &b| {
1207                        cells[a]
1208                            .bbox
1209                            .x_min()
1210                            .partial_cmp(&cells[b].bbox.x_min())
1211                            .unwrap_or(Ordering::Equal)
1212                    });
1213                    rows.push(current_row);
1214                    current_row = vec![idx];
1215                    current_y = Some(y);
1216                }
1217            }
1218        }
1219
1220        if !current_row.is_empty() {
1221            current_row.sort_by(|&a, &b| {
1222                cells[a]
1223                    .bbox
1224                    .x_min()
1225                    .partial_cmp(&cells[b].bbox.x_min())
1226                    .unwrap_or(Ordering::Equal)
1227            });
1228            rows.push(current_row);
1229        }
1230
1231        let mut sorted = Vec::with_capacity(cells.len());
1232        let mut flags = Vec::with_capacity(rows.len() + 1);
1233        flags.push(0);
1234
1235        for row in rows {
1236            sorted.extend(row.iter().copied());
1237            let next = flags.last().copied().unwrap_or(0) + row.len();
1238            flags.push(next);
1239        }
1240
1241        (sorted, flags)
1242    }
1243
1244    /// Find the first table-cell index for each row in structure tokens.
1245    fn find_row_start_index(structure_tokens: &[String]) -> Vec<usize> {
1246        let mut row_start_indices = Vec::new();
1247        let mut current_index = 0usize;
1248        let mut inside_row = false;
1249
1250        for token in structure_tokens {
1251            if token == "<tr>" {
1252                inside_row = true;
1253            } else if token == "</tr>" {
1254                inside_row = false;
1255            } else if Self::is_td_end_token(token) && inside_row {
1256                row_start_indices.push(current_index);
1257                inside_row = false;
1258            }
1259
1260            if Self::is_td_end_token(token) {
1261                current_index += 1;
1262            }
1263        }
1264
1265        row_start_indices
1266    }
1267
1268    /// Align row boundary flags from detected cells to structure row starts.
1269    fn map_and_get_max(table_cells_flag: &[usize], row_start_index: &[usize]) -> Vec<usize> {
1270        let mut max_values = Vec::with_capacity(row_start_index.len());
1271        let mut i = 0usize;
1272        let mut max_value: Option<usize> = None;
1273
1274        for &row_start in row_start_index {
1275            while i < table_cells_flag.len() && table_cells_flag[i] <= row_start {
1276                max_value =
1277                    Some(max_value.map_or(table_cells_flag[i], |v| v.max(table_cells_flag[i])));
1278                i += 1;
1279            }
1280            max_values.push(max_value.unwrap_or(row_start));
1281        }
1282
1283        max_values
1284    }
1285
1286    /// Whether a structure token corresponds to the end of one table cell.
1287    fn is_td_end_token(token: &str) -> bool {
1288        token == "<td></td>"
1289            || token == "</td>"
1290            || (token.contains("<td") && token.contains("</td>"))
1291    }
1292
1293    /// Compose cell text from matched OCR fragments, mirroring PaddleX merge logic.
1294    fn compose_matched_cell_text(
1295        matched_indices: &[usize],
1296        ocr_candidates: &[(OcrSource, TextRegion)],
1297    ) -> Option<String> {
1298        if matched_indices.is_empty() {
1299            return None;
1300        }
1301
1302        let mut merged = String::new();
1303
1304        for (i, &ocr_idx) in matched_indices.iter().enumerate() {
1305            let Some((_, region)) = ocr_candidates.get(ocr_idx) else {
1306                continue;
1307            };
1308            let Some(raw_text) = region.text.as_deref() else {
1309                continue;
1310            };
1311
1312            let mut content = raw_text.to_string();
1313            if matched_indices.len() > 1 {
1314                if content.starts_with(' ') {
1315                    content = content.chars().skip(1).collect();
1316                }
1317                content = content.replace("<b>", "");
1318                content = content.replace("</b>", "");
1319                if content.is_empty() {
1320                    continue;
1321                }
1322                if i != matched_indices.len() - 1 && !content.ends_with(' ') {
1323                    content.push_str("<br/>");
1324                }
1325            }
1326
1327            merged.push_str(&content);
1328        }
1329
1330        let merged = merged.trim_end().to_string();
1331        if merged.is_empty() {
1332            None
1333        } else {
1334            Some(merged)
1335        }
1336    }
1337
1338    /// Intersection over OCR area (`inter / rec2_area`), matching PaddleX `compute_inter`.
1339    fn compute_inter(rec1: &BoundingBox, rec2: &BoundingBox) -> f32 {
1340        let x_left = rec1.x_min().max(rec2.x_min());
1341        let y_top = rec1.y_min().max(rec2.y_min());
1342        let x_right = rec1.x_max().min(rec2.x_max());
1343        let y_bottom = rec1.y_max().min(rec2.y_max());
1344
1345        let inter_width = (x_right - x_left).max(0.0);
1346        let inter_height = (y_bottom - y_top).max(0.0);
1347        let inter_area = inter_width * inter_height;
1348
1349        let rec2_area = (rec2.x_max() - rec2.x_min()) * (rec2.y_max() - rec2.y_min());
1350        if rec2_area <= 0.0 {
1351            0.0
1352        } else {
1353            inter_area / rec2_area
1354        }
1355    }
1356
1357    /// Detects and splits OCR boxes that span multiple table cells.
1358    ///
1359    /// Returns:
1360    /// - Vec<TextRegion>: New text regions created from split OCR boxes
1361    /// - HashSet<usize>: Indices of original regions that were split
1362    /// - HashMap<usize, Vec<usize>>: Mapping from cell_idx -> indices in the new split_regions vec
1363    fn split_cross_cell_ocr_boxes(
1364        text_regions: &[TextRegion],
1365        relevant_indices: &[usize],
1366        cells: &[oar_ocr_core::domain::structure::TableCell],
1367    ) -> (
1368        Vec<TextRegion>,
1369        std::collections::HashSet<usize>,
1370        std::collections::HashMap<usize, Vec<usize>>,
1371    ) {
1372        let mut split_regions: Vec<TextRegion> = Vec::new();
1373        let mut split_ocr_indices: std::collections::HashSet<usize> =
1374            std::collections::HashSet::new();
1375        let mut cell_assignments: std::collections::HashMap<usize, Vec<usize>> =
1376            std::collections::HashMap::new();
1377
1378        // Build a subset of text regions for the table
1379        let table_regions: Vec<TextRegion> = relevant_indices
1380            .iter()
1381            .map(|&idx| text_regions[idx].clone())
1382            .collect();
1383
1384        if table_regions.is_empty() || cells.is_empty() {
1385            return (split_regions, split_ocr_indices, cell_assignments);
1386        }
1387
1388        // Use the cross-cell splitting utility
1389        let split_config = OcrSplitConfig::default();
1390        let (expanded, processed_local_indices) =
1391            create_expanded_ocr_for_table(&table_regions, cells, Some(&split_config));
1392
1393        // Map local indices back to original indices
1394        for local_idx in processed_local_indices {
1395            if local_idx < relevant_indices.len() {
1396                split_ocr_indices.insert(relevant_indices[local_idx]);
1397            }
1398        }
1399
1400        // Add expanded regions and track cell assignments
1401        for region in expanded {
1402            let region_idx = split_regions.len();
1403
1404            // Find the best matching cell for this expanded region
1405            let mut best_cell_idx = None;
1406            let mut best_iou = 0.0f32;
1407
1408            for (cell_idx, cell) in cells.iter().enumerate() {
1409                let iou = region.bounding_box.iou(&cell.bbox);
1410                if iou > best_iou {
1411                    best_iou = iou;
1412                    best_cell_idx = Some(cell_idx);
1413                }
1414            }
1415
1416            // Only assign to a cell if there's actual overlap
1417            if let Some(cell_idx) = best_cell_idx {
1418                cell_assignments
1419                    .entry(cell_idx)
1420                    .or_default()
1421                    .push(region_idx);
1422            }
1423
1424            split_regions.push(region);
1425        }
1426
1427        tracing::debug!(
1428            "Cross-cell OCR splitting: {} original regions processed, {} new regions created",
1429            split_ocr_indices.len(),
1430            split_regions.len()
1431        );
1432
1433        (split_regions, split_ocr_indices, cell_assignments)
1434    }
1435
1436    /// Calculates the Intersection over Union (IoU) between two bounding boxes.
1437    fn calculate_iou(bbox1: &BoundingBox, bbox2: &BoundingBox) -> f32 {
1438        let x1_min = bbox1.x_min();
1439        let y1_min = bbox1.y_min();
1440        let x1_max = bbox1.x_max();
1441        let y1_max = bbox1.y_max();
1442
1443        let x2_min = bbox2.x_min();
1444        let y2_min = bbox2.y_min();
1445        let x2_max = bbox2.x_max();
1446        let y2_max = bbox2.y_max();
1447
1448        let inter_x_min = x1_min.max(x2_min);
1449        let inter_y_min = y1_min.max(y2_min);
1450        let inter_x_max = x1_max.min(x2_max);
1451        let inter_y_max = y1_max.min(y2_max);
1452
1453        let inter_w = (inter_x_max - inter_x_min).max(0.0);
1454        let inter_h = (inter_y_max - inter_y_min).max(0.0);
1455        let inter_area = inter_w * inter_h;
1456
1457        let area1 = (x1_max - x1_min) * (y1_max - y1_min);
1458        let area2 = (x2_max - x2_min) * (y2_max - y2_min);
1459        let union_area = area1 + area2 - inter_area;
1460
1461        if union_area > 0.0 {
1462            inter_area / union_area
1463        } else {
1464            0.0
1465        }
1466    }
1467
1468    /// Calculates the L1 distance between two axis-aligned boxes.
1469    fn l1_distance(bbox1: &BoundingBox, bbox2: &BoundingBox) -> f32 {
1470        let b1 = [bbox1.x_min(), bbox1.y_min(), bbox1.x_max(), bbox1.y_max()];
1471        let b2 = [bbox2.x_min(), bbox2.y_min(), bbox2.x_max(), bbox2.y_max()];
1472
1473        (b2[0] - b1[0]).abs()
1474            + (b2[1] - b1[1]).abs()
1475            + (b2[2] - b1[2]).abs()
1476            + (b2[3] - b1[3]).abs()
1477    }
1478
1479    /// PaddleX table matcher distance (used in E2E path).
1480    fn paddlex_distance(table_box: &BoundingBox, ocr_box: &BoundingBox) -> f32 {
1481        let x1 = table_box.x_min();
1482        let y1 = table_box.y_min();
1483        let x2 = table_box.x_max();
1484        let y2 = table_box.y_max();
1485        let x3 = ocr_box.x_min();
1486        let y3 = ocr_box.y_min();
1487        let x4 = ocr_box.x_max();
1488        let y4 = ocr_box.y_max();
1489
1490        let dis = (x3 - x1).abs() + (y3 - y1).abs() + (x4 - x2).abs() + (y4 - y2).abs();
1491        let dis_2 = (x3 - x1).abs() + (y3 - y1).abs();
1492        let dis_3 = (x4 - x2).abs() + (y4 - y2).abs();
1493        dis + dis_2.min(dis_3)
1494    }
1495
1496    /// Marks small inline formulas to be absorbed into the text flow.
1497    ///
1498    /// PaddleX: Small formula elements should be absorbed into the text flow,
1499    /// not kept as separate layout elements.
1500    ///
1501    /// This function:
1502    /// 1. Finds small formula elements that should be inline (not display formulas)
1503    /// 2. Clears their text and order_index so the formula element won't be rendered
1504    /// 3. The corresponding TextRegion with label="formula" (already created in structure.rs)
1505    ///    will become an orphan and be handled with proper $...$ wrapping
1506    fn inject_inline_formulas(
1507        elements: &mut [LayoutElement],
1508        _text_regions: &mut Vec<TextRegion>,
1509        _cfg: &StitchConfig,
1510    ) {
1511        use oar_ocr_core::domain::structure::LayoutElementType;
1512
1513        let mut inline_formula_indices: Vec<usize> = Vec::new();
1514
1515        // Size threshold: formulas smaller than 80k pixels² are likely inline
1516        const INLINE_FORMULA_MAX_AREA: f32 = 80000.0;
1517
1518        for (idx, element) in elements.iter().enumerate() {
1519            if element.element_type != LayoutElementType::Formula {
1520                continue;
1521            }
1522
1523            // Only process formulas that have text
1524            let formula_text = if let Some(text) = &element.text {
1525                if !text.is_empty() {
1526                    text
1527                } else {
1528                    continue;
1529                }
1530            } else {
1531                continue;
1532            };
1533
1534            let formula_area = element.bbox.area();
1535            tracing::debug!(
1536                "Formula idx {}: area={:.1}, text={}",
1537                idx,
1538                formula_area,
1539                formula_text
1540            );
1541
1542            // Small formulas are treated as inline
1543            if formula_area < INLINE_FORMULA_MAX_AREA {
1544                inline_formula_indices.push(idx);
1545                tracing::debug!(
1546                    "Marking formula idx {} as inline (area {:.1} < {})",
1547                    idx,
1548                    formula_area,
1549                    INLINE_FORMULA_MAX_AREA
1550                );
1551            }
1552        }
1553
1554        // Clear inline formula elements so they won't be rendered separately
1555        for idx in &inline_formula_indices {
1556            if let Some(element) = elements.get_mut(*idx) {
1557                tracing::debug!(
1558                    "Clearing inline formula idx {} to use TextRegion with label=formula",
1559                    idx
1560                );
1561                element.text = None;
1562                element.order_index = None;
1563            }
1564        }
1565
1566        if !inline_formula_indices.is_empty() {
1567            tracing::debug!("Marked {} formulas as inline", inline_formula_indices.len());
1568        }
1569    }
1570
1571    fn stitch_layout_elements(
1572        elements: &mut [LayoutElement],
1573        text_regions: &[TextRegion],
1574        used_indices: &mut std::collections::HashSet<usize>,
1575        cfg: &StitchConfig,
1576    ) {
1577        tracing::debug!(
1578            "stitch_layout_elements: {} elements, {} regions, {} already used",
1579            elements.len(),
1580            text_regions.len(),
1581            used_indices.len()
1582        );
1583
1584        for (elem_idx, element) in elements.iter_mut().enumerate() {
1585            // Skip special types that have their own content handling:
1586            // - Table: handled separately with cell-level matching
1587            // - Formula: filled with LaTeX content
1588            // - Seal: may have specialized seal OCR results
1589            // This matches PP-StructureV3's behavior in standardized_data()
1590            if EXCLUDED_FROM_OCR_LABELS.contains(&element.element_type) {
1591                continue;
1592            }
1593
1594            let mut element_texts: Vec<(&TextRegion, &str)> = Vec::new();
1595
1596            for (idx, region) in text_regions.iter().enumerate() {
1597                if let Some(text) = &region.text
1598                    && Self::is_overlapping(&element.bbox, &region.bounding_box, cfg)
1599                {
1600                    element_texts.push((region, text));
1601                    // Only mark as used if not already used (to allow sharing if needed,
1602                    // though typically strict assignment is better. Some systems allow one-to-many
1603                    // matching, but here we track usage to find orphans)
1604                    used_indices.insert(idx);
1605                }
1606            }
1607
1608            if !element_texts.is_empty() {
1609                tracing::debug!(
1610                    "Element {} ({:?}): matched {} regions",
1611                    elem_idx,
1612                    element.element_type,
1613                    element_texts.len()
1614                );
1615
1616                // Debug: log all text regions being joined
1617                for (region, text) in &element_texts {
1618                    tracing::debug!("  - region with label={:?}, text={:?}", region.label, text);
1619                }
1620
1621                // Compute seg metadata (seg_start_x, seg_end_x, num_lines) for get_seg_flag.
1622                // Sort a copy to find first/last spans and count lines.
1623                let mut sorted_for_meta = element_texts.clone();
1624                sorted_for_meta.sort_by(|(r1, _), (r2, _)| {
1625                    r1.bounding_box
1626                        .center()
1627                        .y
1628                        .partial_cmp(&r2.bounding_box.center().y)
1629                        .unwrap_or(Ordering::Equal)
1630                });
1631                let mut lines = Vec::new();
1632                let mut current_line = Vec::new();
1633                for item in std::mem::take(&mut sorted_for_meta) {
1634                    if current_line.is_empty() {
1635                        current_line.push(item);
1636                    } else {
1637                        let first_in_line = &current_line[0].0.bounding_box;
1638                        if Self::is_same_text_line_bbox(first_in_line, &item.0.bounding_box, cfg) {
1639                            current_line.push(item);
1640                        } else {
1641                            current_line.sort_by(|(r1, _), (r2, _)| {
1642                                r1.bounding_box
1643                                    .center()
1644                                    .x
1645                                    .partial_cmp(&r2.bounding_box.center().x)
1646                                    .unwrap_or(Ordering::Equal)
1647                            });
1648                            lines.push(current_line);
1649                            current_line = vec![item];
1650                        }
1651                    }
1652                }
1653                if !current_line.is_empty() {
1654                    current_line.sort_by(|(r1, _), (r2, _)| {
1655                        r1.bounding_box
1656                            .center()
1657                            .x
1658                            .partial_cmp(&r2.bounding_box.center().x)
1659                            .unwrap_or(Ordering::Equal)
1660                    });
1661                    lines.push(current_line);
1662                }
1663                for mut line in lines {
1664                    sorted_for_meta.append(&mut line);
1665                }
1666
1667                // seg_start_x: first span's left edge (PaddleX: line[0].spans[0].box[0])
1668                element.seg_start_x = Some(sorted_for_meta[0].0.bounding_box.x_min());
1669                // seg_end_x: last span's right edge (PaddleX: line[-1].spans[-1].box[2])
1670                element.seg_end_x = Some(sorted_for_meta.last().unwrap().0.bounding_box.x_max());
1671
1672                // Count distinct lines (Y-groups)
1673                let mut num_lines = 1u32;
1674                let mut prev_bbox = &sorted_for_meta[0].0.bounding_box;
1675                for (region, _) in &sorted_for_meta[1..] {
1676                    if !Self::is_same_text_line_bbox(prev_bbox, &region.bounding_box, cfg) {
1677                        num_lines += 1;
1678                        prev_bbox = &region.bounding_box;
1679                    }
1680                }
1681                element.num_lines = Some(num_lines);
1682            }
1683
1684            Self::sort_and_join_texts(&mut element_texts, Some(&element.bbox), cfg, |joined| {
1685                element.text = Some(joined);
1686            });
1687        }
1688    }
1689
1690    /// Fills formula layout elements with LaTeX content from formula recognition results.
1691    ///
1692    /// This ensures formula elements have correct content even if OCR matching
1693    /// thresholds prevented proper association.
1694    fn fill_formula_elements(
1695        elements: &mut [LayoutElement],
1696        formulas: &[FormulaResult],
1697        _cfg: &StitchConfig,
1698    ) {
1699        for element in elements.iter_mut() {
1700            if element.element_type != LayoutElementType::Formula {
1701                continue;
1702            }
1703
1704            // Skip if element already has content from OCR matching
1705            if element.text.is_some() {
1706                continue;
1707            }
1708
1709            // Find the best matching formula result by bidirectional IoA.
1710            // IoA (intersection / self_area) is much more permissive than IoU for
1711            // size-mismatched bboxes. PaddleX uses simple intersection overlap (>3px).
1712            let mut best_formula: Option<&FormulaResult> = None;
1713            let mut best_score = 0.0f32;
1714
1715            for formula in formulas {
1716                let ioa_element = element.bbox.ioa(&formula.bbox);
1717                let ioa_formula = formula.bbox.ioa(&element.bbox);
1718                let score = ioa_element.max(ioa_formula);
1719                if score > best_score {
1720                    best_score = score;
1721                    best_formula = Some(formula);
1722                }
1723            }
1724
1725            // Fallback: if no IoA match, try center-containment matching.
1726            // Find formula whose center is within the element bbox (or vice versa).
1727            if best_score < 0.05 {
1728                let elem_center = element.bbox.center();
1729                let mut best_dist = f32::MAX;
1730
1731                for formula in formulas {
1732                    let fc = formula.bbox.center();
1733                    let fc_inside = fc.x >= element.bbox.x_min()
1734                        && fc.x <= element.bbox.x_max()
1735                        && fc.y >= element.bbox.y_min()
1736                        && fc.y <= element.bbox.y_max();
1737                    let ec_inside = elem_center.x >= formula.bbox.x_min()
1738                        && elem_center.x <= formula.bbox.x_max()
1739                        && elem_center.y >= formula.bbox.y_min()
1740                        && elem_center.y <= formula.bbox.y_max();
1741
1742                    if fc_inside || ec_inside {
1743                        let dx = fc.x - elem_center.x;
1744                        let dy = fc.y - elem_center.y;
1745                        let dist = dx * dx + dy * dy;
1746                        if dist < best_dist {
1747                            best_dist = dist;
1748                            best_formula = Some(formula);
1749                            best_score = 0.05;
1750                        }
1751                    }
1752                }
1753            }
1754
1755            if best_score >= 0.05
1756                && let Some(formula) = best_formula
1757            {
1758                element.text = Some(formula.latex.clone());
1759            }
1760        }
1761    }
1762
1763    /// Checks if two bounding boxes overlap significantly (intersection dimensions > 3px).
1764    /// Matches `get_overlap_boxes_idx` logic.
1765    fn is_overlapping(bbox1: &BoundingBox, bbox2: &BoundingBox, cfg: &StitchConfig) -> bool {
1766        let x1_min = bbox1.x_min();
1767        let y1_min = bbox1.y_min();
1768        let x1_max = bbox1.x_max();
1769        let y1_max = bbox1.y_max();
1770
1771        let x2_min = bbox2.x_min();
1772        let y2_min = bbox2.y_min();
1773        let x2_max = bbox2.x_max();
1774        let y2_max = bbox2.y_max();
1775
1776        let inter_x_min = x1_min.max(x2_min);
1777        let inter_y_min = y1_min.max(y2_min);
1778        let inter_x_max = x1_max.min(x2_max);
1779        let inter_y_max = y1_max.min(y2_max);
1780
1781        let inter_w = inter_x_max - inter_x_min;
1782        let inter_h = inter_y_max - inter_y_min;
1783
1784        inter_w > cfg.overlap_min_pixels && inter_h > cfg.overlap_min_pixels
1785    }
1786
1787    /// Checks whether two OCR spans should be grouped into the same visual line.
1788    ///
1789    /// Primary signal follows PaddleX-style line-height overlap:
1790    /// vertical_overlap / min(height1, height2) >= threshold.
1791    /// A small adaptive center-Y fallback is kept for robustness on noisy boxes.
1792    fn is_same_text_line_bbox(
1793        bbox1: &BoundingBox,
1794        bbox2: &BoundingBox,
1795        cfg: &StitchConfig,
1796    ) -> bool {
1797        let h1 = (bbox1.y_max() - bbox1.y_min()).max(1.0);
1798        let h2 = (bbox2.y_max() - bbox2.y_min()).max(1.0);
1799        let inter_h =
1800            (bbox1.y_max().min(bbox2.y_max()) - bbox1.y_min().max(bbox2.y_min())).max(0.0);
1801        let overlap_ratio = inter_h / h1.min(h2);
1802        if overlap_ratio >= cfg.line_height_iou_threshold {
1803            return true;
1804        }
1805
1806        let adaptive_tol = (h1.min(h2) * 0.5).max(1.0);
1807        let center_delta = (bbox1.center().y - bbox2.center().y).abs();
1808        center_delta <= adaptive_tol.max(cfg.same_line_y_tolerance * 0.25)
1809    }
1810
1811    fn sort_and_join_texts<F>(
1812        texts: &mut Vec<(&TextRegion, &str)>,
1813        container_bbox: Option<&BoundingBox>,
1814        cfg: &StitchConfig,
1815        update_fn: F,
1816    ) where
1817        F: FnOnce(String),
1818    {
1819        if texts.is_empty() {
1820            return;
1821        }
1822
1823        // Sort spatially: top-to-bottom, then left-to-right
1824        texts.sort_by(|(r1, _), (r2, _)| {
1825            r1.bounding_box
1826                .center()
1827                .y
1828                .partial_cmp(&r2.bounding_box.center().y)
1829                .unwrap_or(Ordering::Equal)
1830        });
1831        let mut lines = Vec::new();
1832        let mut current_line = Vec::new();
1833        for item in std::mem::take(texts) {
1834            if current_line.is_empty() {
1835                current_line.push(item);
1836            } else {
1837                let first_in_line = &current_line[0].0.bounding_box;
1838                if Self::is_same_text_line_bbox(first_in_line, &item.0.bounding_box, cfg) {
1839                    current_line.push(item);
1840                } else {
1841                    current_line.sort_by(|(r1, _), (r2, _)| {
1842                        r1.bounding_box
1843                            .center()
1844                            .x
1845                            .partial_cmp(&r2.bounding_box.center().x)
1846                            .unwrap_or(Ordering::Equal)
1847                    });
1848                    lines.push(current_line);
1849                    current_line = vec![item];
1850                }
1851            }
1852        }
1853        if !current_line.is_empty() {
1854            current_line.sort_by(|(r1, _), (r2, _)| {
1855                r1.bounding_box
1856                    .center()
1857                    .x
1858                    .partial_cmp(&r2.bounding_box.center().x)
1859                    .unwrap_or(Ordering::Equal)
1860            });
1861            lines.push(current_line);
1862        }
1863        for mut line in lines {
1864            texts.append(&mut line);
1865        }
1866
1867        // Smart text joining following format_line logic:
1868        // - Texts on the same line are joined directly (no separator)
1869        // - A space is added only if the previous text ends with an English letter
1870        // - Newlines are added conditionally based on geometric gap (paragraph break detection)
1871        let mut result = String::new();
1872        let mut prev_region: Option<&TextRegion> = None;
1873
1874        tracing::debug!(
1875            "sort_and_join_texts: processing {} text regions",
1876            texts.len()
1877        );
1878
1879        for (region, text) in texts.iter() {
1880            if text.is_empty() {
1881                continue;
1882            }
1883
1884            if let Some(last_region) = prev_region {
1885                if !Self::is_same_text_line_bbox(
1886                    &last_region.bounding_box,
1887                    &region.bounding_box,
1888                    cfg,
1889                ) {
1890                    // New visual line detected.
1891                    // Decide whether to insert '\n' (hard break) or ' ' (soft break/wrap).
1892                    let mut add_newline = false;
1893                    let mut is_line_wrap = false;
1894
1895                    if let Some(container) = container_bbox {
1896                        let container_width = container.x_max() - container.x_min();
1897                        let right_gap = container.x_max() - last_region.bounding_box.x_max();
1898                        let tail_char = last_non_whitespace_char(&result);
1899                        let ends_with_non_break_punct =
1900                            tail_char.is_some_and(is_non_break_line_end_punctuation);
1901                        // PaddleX: English lines use a larger right-gap threshold.
1902                        let paragraph_gap_ratio =
1903                            if tail_char.is_some_and(|c| c.is_ascii_alphabetic()) {
1904                                0.5
1905                            } else {
1906                                0.3
1907                            };
1908
1909                        if !ends_with_non_break_punct
1910                            && right_gap > container_width * paragraph_gap_ratio
1911                        {
1912                            // Previous line ended far from the right edge → paragraph break.
1913                            add_newline = true;
1914                        } else {
1915                            // Previous line extends close to the right edge → line wrap.
1916                            is_line_wrap = true;
1917                        }
1918                    }
1919
1920                    // Dehyphenation: only strip trailing hyphen when the previous line
1921                    // is a wrapped line (extends close to container right edge).
1922                    // This preserves hyphens in compound words like "real-time",
1923                    // "end-to-end", "one-to-many" that end short lines.
1924                    // Matches PaddleX format_line behavior where hyphens are stripped
1925                    // at line-wrap boundaries.
1926                    let prev_ends_hyphen = result.ends_with('-');
1927                    if prev_ends_hyphen && is_line_wrap {
1928                        // Line wraps at hyphen → word-break hyphen, remove it
1929                        result.pop();
1930                        // Don't add any separator - words should be joined
1931                    } else if add_newline {
1932                        if !result.ends_with('\n') {
1933                            result.push('\n');
1934                        }
1935                    } else {
1936                        // Soft wrap - treat as space if needed (English) or join (CJK)
1937                        if let Some(last_char) = result.chars().last()
1938                            && last_char != '\n'
1939                            && needs_space_after(last_char)
1940                        {
1941                            result.push(' ');
1942                        }
1943                    }
1944                } else {
1945                    // Same visual line - join with smart spacing
1946                    // PaddleX format_line: add space after English letters OR after formulas
1947                    let needs_spacing = if let Some(last_char) = result.chars().last()
1948                        && last_char != '\n'
1949                        && needs_space_after(last_char)
1950                    {
1951                        true
1952                    } else {
1953                        // PaddleX: add space after formula when next content is on same line
1954                        last_region.is_formula()
1955                    };
1956
1957                    if needs_spacing {
1958                        result.push(' ');
1959                    }
1960                }
1961            }
1962
1963            // PaddleX: formula spans are wrapped with $...$ delimiters
1964            // Inline formulas (mixed with text on same line): $formula$
1965            // Display formulas (standalone line): $$formula$$ (display math)
1966            let is_formula = region.is_formula();
1967            let text_to_add = if is_formula {
1968                // Don't double-wrap if formula model already added delimiters
1969                let already_wrapped =
1970                    text.starts_with('$') || text.starts_with("\\(") || text.starts_with("\\[");
1971                if already_wrapped {
1972                    text.to_string()
1973                } else {
1974                    // Check if this is a display formula (starts a new line with no other content yet on this line)
1975                    // Display formulas typically appear at the start of a line after a newline
1976                    let is_display = result.is_empty() || result.ends_with('\n');
1977
1978                    if is_display {
1979                        // Display formula: $$...$$
1980                        format!("$${}$$", text)
1981                    } else {
1982                        // Inline formula: $...$
1983                        format!("${}$", text)
1984                    }
1985                }
1986            } else {
1987                text.to_string()
1988            };
1989
1990            result.push_str(&text_to_add);
1991            prev_region = Some(region);
1992        }
1993
1994        // Trim trailing whitespace
1995        let joined = result.trim_end().to_string();
1996        update_fn(joined);
1997    }
1998
1999    /// Sorts layout elements using the enhanced xycut_enhanced algorithm.
2000    ///
2001    /// Uses cross-layout detection, direction-aware XY-cut, overlapping box shrinking,
2002    /// weighted distance insertion, and child block association for accurate reading order.
2003    fn sort_layout_elements_enhanced(
2004        elements: &mut Vec<LayoutElement>,
2005        page_width: f32,
2006        page_height: f32,
2007    ) {
2008        use oar_ocr_core::processors::layout_sorting::{SortableElement, sort_layout_enhanced};
2009
2010        if elements.is_empty() {
2011            return;
2012        }
2013
2014        let sortable_elements: Vec<_> = elements
2015            .iter()
2016            .map(|e| SortableElement {
2017                bbox: e.bbox.clone(),
2018                element_type: e.element_type,
2019                num_lines: e.num_lines,
2020            })
2021            .collect();
2022
2023        let sorted_indices = sort_layout_enhanced(&sortable_elements, page_width, page_height);
2024        if sorted_indices.len() != elements.len() {
2025            return;
2026        }
2027
2028        let sorted_elements: Vec<_> = sorted_indices
2029            .into_iter()
2030            .map(|idx| elements[idx].clone())
2031            .collect();
2032        *elements = sorted_elements;
2033    }
2034
2035    /// Sorts layout elements using the XY-cut algorithm (legacy fallback).
2036    #[allow(dead_code)]
2037    fn sort_layout_elements(elements: &mut Vec<LayoutElement>, _width: f32, _cfg: &StitchConfig) {
2038        if elements.len() <= 1 {
2039            return;
2040        }
2041
2042        // Use shared XY-cut implementation from processors module.
2043        let bboxes: Vec<BoundingBox> = elements.iter().map(|e| e.bbox.clone()).collect();
2044        let order = crate::processors::sort_by_xycut(
2045            &bboxes,
2046            crate::processors::SortDirection::Vertical,
2047            1,
2048        );
2049
2050        if order.len() != elements.len() {
2051            return;
2052        }
2053
2054        let mut reordered = Vec::with_capacity(elements.len());
2055        for idx in order {
2056            reordered.push(elements[idx].clone());
2057        }
2058
2059        *elements = reordered;
2060    }
2061}
2062
/// Reports whether a space should follow the given character when joining spans.
/// Based on format_line logic: only ASCII letters (`a`-`z`, `A`-`Z`) qualify.
fn needs_space_after(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z')
}
2068
/// Returns the last character of `text` that is not whitespace, or `None`
/// when `text` is empty or consists only of whitespace.
fn last_non_whitespace_char(text: &str) -> Option<char> {
    text.trim_end().chars().next_back()
}
2072
/// Punctuation that should not trigger hard paragraph breaks across line wraps.
///
/// Covers the comma, semicolon and colon in both their ASCII and fullwidth
/// (CJK) forms, plus the ideographic comma:
/// - `,` and U+FF0C (fullwidth comma)
/// - U+3001 (ideographic comma)
/// - `;` and U+FF1B (fullwidth semicolon)
/// - `:` and U+FF1A (fullwidth colon)
///
/// The fullwidth forms are written as escapes so they cannot be silently
/// collapsed into their ASCII look-alikes by text normalization, which would
/// leave duplicate (unreachable) patterns and drop CJK punctuation support.
fn is_non_break_line_end_punctuation(c: char) -> bool {
    matches!(
        c,
        ',' | '\u{FF0C}' | '\u{3001}' | ';' | '\u{FF1B}' | ':' | '\u{FF1A}'
    )
}
2077
2078#[cfg(test)]
2079mod tests {
2080    use super::*;
2081    use crate::oarocr::TextRegion;
2082    use oar_ocr_core::processors::BoundingBox;
2083
2084    fn make_region(bbox: BoundingBox, text: &str) -> TextRegion {
2085        TextRegion {
2086            bounding_box: bbox.clone(),
2087            dt_poly: Some(bbox.clone()),
2088            rec_poly: Some(bbox),
2089            text: Some(text.into()),
2090            confidence: Some(0.9),
2091            orientation_angle: None,
2092            word_boxes: None,
2093            label: None,
2094        }
2095    }
2096
/// A 10x9 box reading "=" at confidence 0.33 is expected to be rewritten to "-".
#[test]
fn test_normalize_tiny_symbol_for_paddlex_dash() {
    let tiny_box = BoundingBox::from_coords(0.0, 0.0, 10.0, 9.0);
    let mut region = make_region(tiny_box, "=");
    region.confidence = Some(0.33);

    ResultStitcher::normalize_tiny_symbol_for_paddlex(&mut region);

    assert_eq!(region.text.as_deref(), Some("-"));
}
2104
/// A 7x6 box reading "=" at confidence 0.40 is expected to be rewritten to ",".
#[test]
fn test_normalize_tiny_symbol_for_paddlex_comma() {
    let tiny_box = BoundingBox::from_coords(0.0, 0.0, 7.0, 6.0);
    let mut region = make_region(tiny_box, "=");
    region.confidence = Some(0.40);

    ResultStitcher::normalize_tiny_symbol_for_paddlex(&mut region);

    assert_eq!(region.text.as_deref(), Some(","));
}
2112
/// A 12x13 box reading "0" at confidence 0.13 is expected to be rewritten to ";".
#[test]
fn test_normalize_tiny_symbol_for_paddlex_semicolon() {
    let tiny_box = BoundingBox::from_coords(0.0, 0.0, 12.0, 13.0);
    let mut region = make_region(tiny_box, "0");
    region.confidence = Some(0.13);

    ResultStitcher::normalize_tiny_symbol_for_paddlex(&mut region);

    assert_eq!(region.text.as_deref(), Some(";"));
}
2120
/// The two boxes share a 5x5 region: that passes the default threshold but
/// fails a threshold of exactly 5.0 because the comparison is strict.
#[test]
fn test_is_overlapping_threshold() {
    let first = BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0);
    let second = BoundingBox::from_coords(5.0, 5.0, 20.0, 20.0);

    let default_cfg = StitchConfig::default();
    assert!(ResultStitcher::is_overlapping(&first, &second, &default_cfg));

    let strict_cfg = StitchConfig {
        overlap_min_pixels: 5.0,
        ..StitchConfig::default()
    };
    assert!(!ResultStitcher::is_overlapping(&first, &second, &strict_cfg));
}
2133
/// Two side-by-side spans offset by one pixel vertically are joined with a
/// single space after the ASCII letter; no container box is supplied.
#[test]
fn test_sort_and_join_texts_tolerance() {
    let left = make_region(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), "A");
    let right = make_region(BoundingBox::from_coords(12.0, 1.0, 20.0, 11.0), "B");
    let mut texts = vec![(&left, "A"), (&right, "B")];
    let cfg = StitchConfig::default();

    let mut joined = String::new();
    ResultStitcher::sort_and_join_texts(&mut texts, None, &cfg, |text| {
        joined = text;
    });

    assert_eq!(joined, "A B");
}
2166
/// The first line ends 40px short of a 100px-wide container and ends in an
/// ASCII letter, so the 0.5 gap ratio applies (40 is not > 50): the break is
/// treated as a wrap and joined with a space.
#[test]
fn test_sort_and_join_texts_english_line_uses_larger_paragraph_gap_threshold() {
    let upper = make_region(BoundingBox::from_coords(0.0, 0.0, 60.0, 10.0), "Line");
    let lower = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next");
    let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0);
    let cfg = StitchConfig::default();
    let mut texts = vec![(&upper, "Line"), (&lower, "next")];

    let mut joined: Option<String> = None;
    ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |text| {
        joined = Some(text);
    });

    assert_eq!(joined.as_deref(), Some("Line next"));
}
2178
/// Same geometry as the English case, but the first line ends in a digit, so
/// the 0.3 gap ratio applies (40 > 30) and a paragraph break is emitted.
#[test]
fn test_sort_and_join_texts_non_english_tail_keeps_original_paragraph_gap_threshold() {
    let upper = make_region(BoundingBox::from_coords(0.0, 0.0, 60.0, 10.0), "2024");
    let lower = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next");
    let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0);
    let cfg = StitchConfig::default();
    let mut texts = vec![(&upper, "2024"), (&lower, "next")];

    let mut joined: Option<String> = None;
    ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |text| {
        joined = Some(text);
    });

    assert_eq!(joined.as_deref(), Some("2024\nnext"));
}
2190
2191    #[test]
2192    fn test_sort_and_join_texts_non_break_punctuation_suppresses_newline() {
2193        let r1 = make_region(BoundingBox::from_coords(0.0, 0.0, 20.0, 10.0), "Note:");
2194        let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next");
2195        let mut texts = vec![(&r1, "Note:"), (&r2, "next")];
2196        let cfg = StitchConfig::default();
2197        let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0);
2198        let mut joined = String::new();
2199        ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j);
2200        assert_eq!(joined, "Note:next");
2201    }
2202
2203    #[test]
2204    fn test_normalize_checkbox_symbols_in_table_checkbox_like() {
2205        let mut cells = vec![
2206            TableCell::new(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), 1.0).with_text("ü"),
2207            TableCell::new(BoundingBox::from_coords(10.0, 0.0, 20.0, 10.0), 1.0).with_text("X"),
2208            TableCell::new(BoundingBox::from_coords(20.0, 0.0, 30.0, 10.0), 1.0).with_text("L"),
2209        ];
2210
2211        ResultStitcher::normalize_checkbox_symbols_in_table(&mut cells);
2212
2213        assert_eq!(cells[0].text.as_deref(), Some("✓"));
2214        assert_eq!(cells[1].text.as_deref(), Some("✗"));
2215        assert_eq!(cells[2].text.as_deref(), Some("✓"));
2216    }
2217
2218    #[test]
2219    fn test_normalize_checkbox_symbols_in_table_keeps_ambiguous_when_not_checkbox_like() {
2220        let mut cells = vec![
2221            TableCell::new(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), 1.0).with_text("L"),
2222            TableCell::new(BoundingBox::from_coords(10.0, 0.0, 20.0, 10.0), 1.0).with_text("A"),
2223        ];
2224
2225        ResultStitcher::normalize_checkbox_symbols_in_table(&mut cells);
2226
2227        assert_eq!(cells[0].text.as_deref(), Some("L"));
2228        assert_eq!(cells[1].text.as_deref(), Some("A"));
2229    }
2230
2231    #[test]
2232    fn test_find_row_start_index_with_compact_td_tokens() {
2233        let tokens = vec![
2234            "<table>".to_string(),
2235            "<tbody>".to_string(),
2236            "<tr>".to_string(),
2237            "<td></td>".to_string(),
2238            "<td></td>".to_string(),
2239            "</tr>".to_string(),
2240            "<tr>".to_string(),
2241            "<td rowspan=\"2\"></td>".to_string(),
2242            "<td></td>".to_string(),
2243            "</tr>".to_string(),
2244            "</tbody>".to_string(),
2245            "</table>".to_string(),
2246        ];
2247
2248        let row_start = ResultStitcher::find_row_start_index(&tokens);
2249        assert_eq!(row_start, vec![0, 2]);
2250    }
2251
2252    #[test]
2253    fn test_match_table_cells_with_structure_rows() {
2254        let mut cells = vec![
2255            TableCell::new(BoundingBox::from_coords(50.0, 0.0, 100.0, 20.0), 1.0), // row0 col1
2256            TableCell::new(BoundingBox::from_coords(0.0, 0.0, 50.0, 20.0), 1.0),   // row0 col0
2257            TableCell::new(BoundingBox::from_coords(0.0, 20.0, 50.0, 40.0), 1.0),  // row1 col0
2258            TableCell::new(BoundingBox::from_coords(50.0, 20.0, 100.0, 40.0), 1.0), // row1 col1
2259        ];
2260
2261        let structure_tokens = vec![
2262            "<table>".to_string(),
2263            "<tbody>".to_string(),
2264            "<tr>".to_string(),
2265            "<td></td>".to_string(),
2266            "<td></td>".to_string(),
2267            "</tr>".to_string(),
2268            "<tr>".to_string(),
2269            "<td></td>".to_string(),
2270            "<td></td>".to_string(),
2271            "</tr>".to_string(),
2272            "</tbody>".to_string(),
2273            "</table>".to_string(),
2274        ];
2275
2276        let ocr_candidates = vec![
2277            (
2278                OcrSource::Original(0),
2279                make_region(BoundingBox::from_coords(2.0, 2.0, 48.0, 18.0), "A"),
2280            ),
2281            (
2282                OcrSource::Original(1),
2283                make_region(BoundingBox::from_coords(52.0, 2.0, 98.0, 18.0), "B"),
2284            ),
2285            (
2286                OcrSource::Original(2),
2287                make_region(BoundingBox::from_coords(2.0, 22.0, 48.0, 38.0), "C"),
2288            ),
2289            (
2290                OcrSource::Original(3),
2291                make_region(BoundingBox::from_coords(52.0, 22.0, 98.0, 38.0), "D"),
2292            ),
2293        ];
2294
2295        let (mapping, matched) = ResultStitcher::match_table_cells_with_structure_rows(
2296            &mut cells,
2297            &structure_tokens,
2298            &ocr_candidates,
2299            10.0,
2300            None,
2301        )
2302        .expect("expected row-aware matching result");
2303
2304        assert_eq!(mapping, vec![Some(1), Some(0), Some(2), Some(3)]);
2305        assert_eq!(matched.len(), 4);
2306
2307        assert_eq!(cells[1].text.as_deref(), Some("A"));
2308        assert_eq!(cells[0].text.as_deref(), Some("B"));
2309        assert_eq!(cells[2].text.as_deref(), Some("C"));
2310        assert_eq!(cells[3].text.as_deref(), Some("D"));
2311    }
2312
2313    #[test]
2314    fn test_match_table_and_ocr_by_iou_distance_prefers_first_cell_on_exact_tie() {
2315        let cells = vec![
2316            TableCell::new(BoundingBox::from_coords(0.0, 0.0, 20.0, 20.0), 1.0),
2317            TableCell::new(BoundingBox::from_coords(0.0, 0.0, 20.0, 20.0), 1.0),
2318        ];
2319        let ocr_candidates = vec![(
2320            OcrSource::Original(0),
2321            make_region(BoundingBox::from_coords(2.0, 2.0, 18.0, 18.0), "X"),
2322        )];
2323
2324        let (mapping, matched) = ResultStitcher::match_table_and_ocr_by_iou_distance(
2325            &cells,
2326            &ocr_candidates,
2327            false,
2328            true,
2329        );
2330
2331        assert_eq!(matched.len(), 1);
2332        assert_eq!(mapping.get(&0), Some(&vec![0]));
2333        assert!(!mapping.contains_key(&1));
2334    }
2335
2336    #[test]
2337    fn test_match_table_and_ocr_by_iou_distance_boundary_near_tie_stays_stable() {
2338        // Near a row boundary, tiny float jitter should not flip assignment order.
2339        let cells = vec![
2340            TableCell::new(BoundingBox::from_coords(0.0, 0.0, 20.0, 20.0), 1.0),
2341            TableCell::new(BoundingBox::from_coords(0.0, 9.99995, 20.0, 29.99995), 1.0),
2342        ];
2343        let ocr_candidates = vec![(
2344            OcrSource::Original(0),
2345            make_region(BoundingBox::from_coords(0.0, 10.0, 20.0, 20.0), "Y"),
2346        )];
2347
2348        let (mapping, _) = ResultStitcher::match_table_and_ocr_by_iou_distance(
2349            &cells,
2350            &ocr_candidates,
2351            false,
2352            true,
2353        );
2354
2355        // PaddleX-style tie break keeps the first cell index.
2356        assert_eq!(mapping.get(&0), Some(&vec![0]));
2357        assert!(!mapping.contains_key(&1));
2358    }
2359
2360    #[test]
2361    fn test_match_table_and_ocr_by_iou_distance_boundary_straddle_prefers_upper_row() {
2362        // Mirrors the remaining PaddleX mismatch case where a tiny OCR fragment straddles
2363        // two adjacent rows in the same column.
2364        let cells = vec![
2365            TableCell::new(
2366                BoundingBox::from_coords(564.6841, 142.27391, 584.9476, 157.74164),
2367                1.0,
2368            )
2369            .with_position(2, 2),
2370            TableCell::new(
2371                BoundingBox::from_coords(565.3968, 158.34259, 584.0292, 171.04494),
2372                1.0,
2373            )
2374            .with_position(3, 2),
2375        ];
2376        let ocr_candidates = vec![(
2377            OcrSource::Original(0),
2378            make_region(BoundingBox::from_coords(567.0, 151.0, 583.0, 166.0), "84"),
2379        )];
2380
2381        let (mapping, matched) = ResultStitcher::match_table_and_ocr_by_iou_distance(
2382            &cells,
2383            &ocr_candidates,
2384            false,
2385            true,
2386        );
2387
2388        assert_eq!(matched.len(), 1);
2389        assert_eq!(mapping.get(&0), Some(&vec![0]));
2390        assert!(!mapping.contains_key(&1));
2391    }
2392}