oar_ocr_core/domain/
structure.rs

//! Document structure analysis result types.
//!
//! This module defines the result types for document structure analysis,
//! including layout detection, table recognition, and formula recognition.
5
6use super::text_region::TextRegion;
7use crate::processors::BoundingBox;
8use image::RgbImage;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::path::Path;
13use std::sync::Arc;
14
15/// Title numbering pattern for detecting section numbers like 1, 1.2, 1.2.3, (1), 一、etc.
16/// This follows standard title numbering pattern.
17static TITLE_NUMBERING_REGEX: Lazy<Regex> = Lazy::new(|| {
18    Regex::new(
19        r"(?x)
20        ^\s*
21        (
22            # Arabic numerals: 1, 1.2, 1.2.3, etc.
23            [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?
24            |
25            # Parenthesized Arabic numerals: (1), (1.2), etc.
26            [(（][1-9][0-9]*(?:\.[1-9][0-9]*)*[)）]
27            |
28            # Chinese numerals with punctuation: 一、 二、
29            [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾][、.]?
30            |
31            # Parenthesized Chinese numerals: （一）
32            [(（][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[)）]
33            |
34            # Roman numerals with delimiter (period or followed by space)
35            (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b)
36        )
37        (\s+)
38        (.*)
39        $
40    ",
41    )
42    .unwrap_or_else(|e| panic!("Invalid title numbering regex: {e}"))
43});
44
45/// Format a paragraph title with automatic level detection based on numbering.
46///
47/// Following PaddleX's title formatting logic:
48/// - Extracts numbering prefix (1.2.3, etc.)
49/// - Determines heading level from number of dots
50/// - Returns (level, formatted_title) where level starts from 2 (## for paragraph titles)
51///
52/// PaddleX logic: `level = dots + 1`, then uses `#{'#' * level}` which means:
53/// - "1 Introduction" (0 dots) -> level=1 -> `## 1 Introduction`
54/// - "2.1 Method" (1 dot) -> level=2 -> `### 2.1 Method`
55/// - "2.1.1 Details" (2 dots) -> level=3 -> `#### 2.1.1 Details`
56///
57/// To align with PaddleX, we return level+1 to account for the extra `#`:
58/// - "1 Introduction" -> (2, "1 Introduction") -> `## 1 Introduction`
59/// - "2.1 Method" -> (3, "2.1 Method") -> `### 2.1 Method`
60/// - "2.1.1 Details" -> (4, "2.1.1 Details") -> `#### 2.1.1 Details`
61fn format_title_with_level(title: &str) -> (usize, String) {
62    // Clean up line breaks
63    let cleaned = title.replace("-\n", "").replace('\n', " ");
64
65    if let Some(captures) = TITLE_NUMBERING_REGEX.captures(&cleaned) {
66        let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or("");
67        let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or("");
68
69        // Determine level from dots in numbering (PaddleX: dots + 1, then +1 for base ##)
70        // 1 -> 2 (##), 1.2 -> 3 (###), 1.2.3 -> 4 (####)
71        let dot_count = numbering.matches('.').count();
72        let level = dot_count + 2; // +1 for PaddleX logic, +1 for base ## level
73
74        // Reconstruct title: numbering + space + content
75        let formatted = if title_content.is_empty() {
76            numbering.trim_end_matches('.').to_string()
77        } else {
78            format!(
79                "{} {}",
80                numbering.trim_end_matches('.'),
81                title_content.trim_start()
82            )
83        };
84
85        // Clamp level to reasonable range (2-6 for markdown, since # is for doc_title)
86        let level = level.clamp(2, 6);
87
88        (level, formatted)
89    } else {
90        // No numbering detected, default to level 2 (## heading)
91        (2, cleaned)
92    }
93}
94
95/// A detected document region block (from PP-DocBlockLayout).
96///
97/// Region blocks represent hierarchical groupings of layout elements,
98/// typically columns or logical sections of a document. They are used
99/// for hierarchical reading order determination.
100///
101/// # PP-StructureV3 Alignment
102///
103/// PP-DocBlockLayout detects "region" type blocks that group related
104/// layout elements together. Elements within the same region should
105/// be read together before moving to the next region.
106#[derive(Debug, Clone, Serialize, Deserialize)]
107pub struct RegionBlock {
108    /// Bounding box of the region
109    pub bbox: BoundingBox,
110    /// Confidence score of the detection
111    pub confidence: f32,
112    /// Index of this region in the reading order
113    pub order_index: Option<u32>,
114    /// Indices of layout elements that belong to this region
115    pub element_indices: Vec<usize>,
116}
117
118/// Page continuation flags for multi-page document processing.
119///
120/// These flags indicate whether the page starts or ends in the middle of
121/// a semantic paragraph, which is crucial for properly concatenating
122/// markdown output from multiple pages.
123///
124/// - `paragraph_start`: `false` means this page continues a paragraph from previous page
125/// - `paragraph_end`: `false` means this page's content continues to next page
126#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct PageContinuationFlags {
128    /// Whether the first element on this page is a paragraph continuation
129    pub paragraph_start: bool,
130    /// Whether the last element on this page continues to the next page
131    pub paragraph_end: bool,
132}
133
134impl PageContinuationFlags {
135    pub fn new(paragraph_start: bool, paragraph_end: bool) -> Self {
136        Self {
137            paragraph_start,
138            paragraph_end,
139        }
140    }
141
142    /// Returns the tuple format (is_start, is_end) for compatibility
143    pub fn as_tuple(&self) -> (bool, bool) {
144        (self.paragraph_start, self.paragraph_end)
145    }
146}
147
148/// Result of document structure analysis.
149///
150/// This struct contains all the results from analyzing a document's structure,
151/// including layout elements, tables, formulas, and OCR results.
152///
153/// # Coordinate System
154///
155/// The coordinate system of bounding boxes depends on which preprocessing was applied:
156///
157/// - **No preprocessing**: Boxes are in the original input image's coordinate system.
158///
159/// - **Orientation correction only** (`orientation_angle` set, `rectified_img` is None):
160///   Boxes are transformed back to the original input image's coordinate system.
161///
162/// - **Rectification applied** (`rectified_img` is Some):
163///   Boxes remain in the **rectified image's coordinate system**. Neural network-based
164///   rectification (UVDoc) warps cannot be precisely inverted, so use `rectified_img`
165///   for visualization instead of the original image.
166///
167/// - **Both orientation and rectification**: Boxes are in the rectified coordinate system
168///   (rectification takes precedence since it's applied after orientation correction).
169#[derive(Debug, Clone, Serialize, Deserialize)]
170pub struct StructureResult {
171    /// Path to the input image file
172    pub input_path: Arc<str>,
173    /// Index of the image in a batch (0 for single image processing)
174    pub index: usize,
175    /// Detected layout elements (text regions, tables, figures, etc.)
176    pub layout_elements: Vec<LayoutElement>,
177    /// Recognized tables with their structure and content
178    pub tables: Vec<TableResult>,
179    /// Recognized mathematical formulas
180    pub formulas: Vec<FormulaResult>,
181    /// OCR text regions (if OCR was integrated)
182    pub text_regions: Option<Vec<TextRegion>>,
183    /// Document orientation angle (if orientation correction was used)
184    pub orientation_angle: Option<f32>,
185    /// Detected region blocks for hierarchical ordering (PP-DocBlockLayout)
186    /// When present, layout_elements are already sorted by region hierarchy
187    pub region_blocks: Option<Vec<RegionBlock>>,
188    /// Rectified image (if document rectification was used)
189    /// Note: Bounding boxes are already transformed back to original coordinates for rotation,
190    /// but for rectification (UVDoc), boxes are in the rectified image's coordinate system.
191    /// Use this image for visualization when rectification was applied.
192    #[serde(skip)]
193    pub rectified_img: Option<Arc<RgbImage>>,
194    /// Page continuation flags for multi-page document processing.
195    /// This indicates whether this page continues a paragraph from the previous page
196    /// or continues to the next page, which is crucial for proper markdown concatenation.
197    pub page_continuation_flags: Option<PageContinuationFlags>,
198}
199
200impl StructureResult {
201    /// Creates a new structure result.
202    pub fn new(input_path: impl Into<Arc<str>>, index: usize) -> Self {
203        Self {
204            input_path: input_path.into(),
205            index,
206            layout_elements: Vec::new(),
207            tables: Vec::new(),
208            formulas: Vec::new(),
209            text_regions: None,
210            orientation_angle: None,
211            region_blocks: None,
212            rectified_img: None,
213            page_continuation_flags: None,
214        }
215    }
216
217    /// Adds layout elements to the result.
218    pub fn with_layout_elements(mut self, elements: Vec<LayoutElement>) -> Self {
219        self.layout_elements = elements;
220        self
221    }
222
223    /// Adds tables to the result.
224    pub fn with_tables(mut self, tables: Vec<TableResult>) -> Self {
225        self.tables = tables;
226        self
227    }
228
229    /// Adds formulas to the result.
230    pub fn with_formulas(mut self, formulas: Vec<FormulaResult>) -> Self {
231        self.formulas = formulas;
232        self
233    }
234
235    /// Adds OCR text regions to the result.
236    pub fn with_text_regions(mut self, regions: Vec<TextRegion>) -> Self {
237        self.text_regions = Some(regions);
238        self
239    }
240
241    /// Adds region blocks to the result (PP-DocBlockLayout).
242    ///
243    /// Region blocks represent hierarchical groupings of layout elements.
244    /// When set, layout_elements should already be sorted by region hierarchy.
245    pub fn with_region_blocks(mut self, blocks: Vec<RegionBlock>) -> Self {
246        self.region_blocks = Some(blocks);
247        self
248    }
249
250    /// Sets page continuation flags for multi-page document processing.
251    pub fn with_page_continuation_flags(mut self, flags: PageContinuationFlags) -> Self {
252        self.page_continuation_flags = Some(flags);
253        self
254    }
255
256    /// Converts the result to a Markdown string.
257    ///
258    /// Follows PP-StructureV3's formatting rules:
259    /// - DocTitle: `# title`
260    /// - ParagraphTitle: Auto-detect numbering (1.2.3 -> ###)
261    /// - Formula: `$$latex$$`
262    /// - Table: HTML with border
263    /// - Images: `![Figure](caption)`
264    ///
265    /// Note: Low-confidence text elements that overlap with table regions are filtered out
266    /// to avoid duplicate content from table OCR.
267    pub fn to_markdown(&self) -> String {
268        // Collect table bboxes for overlap filtering
269        let table_bboxes: Vec<&BoundingBox> = self
270            .layout_elements
271            .iter()
272            .filter(|e| e.element_type == LayoutElementType::Table)
273            .map(|e| &e.bbox)
274            .collect();
275
276        let mut md = String::new();
277        let elements = &self.layout_elements;
278
279        for (idx, element) in elements.iter().enumerate() {
280            // PP-StructureV3 markdown ignores auxiliary labels.
281            if matches!(
282                element.element_type,
283                LayoutElementType::Number
284                    | LayoutElementType::Footnote
285                    | LayoutElementType::Header
286                    | LayoutElementType::HeaderImage
287                    | LayoutElementType::Footer
288                    | LayoutElementType::FooterImage
289                    | LayoutElementType::AsideText
290            ) {
291                continue;
292            }
293
294            // Filter out low-confidence text elements that overlap with tables
295            // These are typically OCR artifacts from table cell text that shouldn't be
296            // output separately in markdown
297            if element.element_type == LayoutElementType::Text {
298                let overlaps_table = table_bboxes.iter().any(|table_bbox| {
299                    element.bbox.ioa(table_bbox) > 0.3 // >30% of text is inside table
300                });
301
302                // Skip low-confidence text that overlaps with table regions
303                // Standard logic filters these in the stitching phase
304                if overlaps_table && element.confidence < 0.7 {
305                    continue;
306                }
307            }
308
309            match element.element_type {
310                // Document title
311                LayoutElementType::DocTitle => {
312                    md.push_str("\n# ");
313                    if let Some(text) = &element.text {
314                        let cleaned = clean_ocr_text(text);
315                        md.push_str(&cleaned);
316                    }
317                    md.push_str("\n\n");
318                }
319                // Paragraph/section title - auto-detect numbering for level
320                LayoutElementType::ParagraphTitle => {
321                    if let Some(text) = &element.text {
322                        let cleaned = clean_ocr_text(text);
323                        let (level, formatted_title) = format_title_with_level(&cleaned);
324                        md.push('\n');
325                        for _ in 0..level {
326                            md.push('#');
327                        }
328                        md.push(' ');
329                        md.push_str(&formatted_title);
330                        md.push_str("\n\n");
331                    } else {
332                        md.push_str("\n## \n\n");
333                    }
334                }
335                // Table - preserve HTML structure with border and center alignment
336                // Following PaddleX's format with <div style="text-align: center;"> wrapper
337                LayoutElementType::Table => {
338                    if let Some(table) =
339                        self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
340                    {
341                        if let Some(html) = &table.html_structure {
342                            // Simplify table HTML (remove html/body wrappers) and add border
343                            let simplified = simplify_table_html(html);
344                            let table_with_border =
345                                simplified.replacen("<table>", "<table border=\"1\">", 1);
346                            // Wrap with center-aligned div for better markdown rendering
347                            md.push_str("\n<div style=\"text-align: center;\">");
348                            md.push_str(&table_with_border);
349                            md.push_str("</div>\n\n");
350                        } else {
351                            md.push_str("\n[Table]\n\n");
352                        }
353                    } else {
354                        md.push_str("\n[Table]\n\n");
355                    }
356                }
357                // Formula - detect inline vs display formula based on context
358                LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
359                    // Check if this formula is on the same line as adjacent text elements
360                    // to determine if it's an inline formula or display formula
361                    let is_inline = {
362                        // Look for previous non-formula text element on the same line
363                        let has_prev_text = (0..idx).rev().any(|i| {
364                            let prev = &elements[i];
365                            !prev.element_type.is_formula()
366                                && (prev.element_type == LayoutElementType::Text
367                                    || prev.element_type == LayoutElementType::ReferenceContent)
368                                && is_same_line(&element.bbox, &prev.bbox)
369                        });
370
371                        // Look for next non-formula text element on the same line
372                        let has_next_text = ((idx + 1)..elements.len()).any(|i| {
373                            let next = &elements[i];
374                            !next.element_type.is_formula()
375                                && (next.element_type == LayoutElementType::Text
376                                    || next.element_type == LayoutElementType::ReferenceContent)
377                                && is_same_line(&element.bbox, &next.bbox)
378                        });
379
380                        has_prev_text || has_next_text
381                    };
382
383                    if is_inline {
384                        // Inline formula: use $...$
385                        md.push('$');
386                        if let Some(latex) = &element.text {
387                            md.push_str(latex);
388                        }
389                        md.push_str("$ ");
390                    } else {
391                        // Display formula: use $$...$$
392                        md.push_str("\n$$");
393                        if let Some(latex) = &element.text {
394                            md.push_str(latex);
395                        }
396                        md.push_str("$$\n\n");
397                    }
398                }
399                // Image/Chart - figure format with center alignment
400                LayoutElementType::Image | LayoutElementType::Chart => {
401                    // Use HTML img tag with center alignment for better rendering
402                    md.push_str("\n<div style=\"text-align: center;\"><img src=\"");
403                    // Generate a placeholder image name based on element bbox
404                    let img_name = format!(
405                        "imgs/img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg",
406                        if element.element_type == LayoutElementType::Chart {
407                            "chart"
408                        } else {
409                            "image"
410                        },
411                        element.bbox.x_min(),
412                        element.bbox.y_min(),
413                        element.bbox.x_max(),
414                        element.bbox.y_max()
415                    );
416                    md.push_str(&img_name);
417                    md.push_str("\" alt=\"Image\" width=\"");
418                    // Calculate width percentage based on element size
419                    let width_pct =
420                        ((element.bbox.x_max() - element.bbox.x_min()) / 12.0).clamp(20.0, 100.0);
421                    md.push_str(&format!("{:.0}%", width_pct));
422                    md.push_str("\" /></div>\n\n");
423                }
424                // Seal - show as image with text
425                LayoutElementType::Seal => {
426                    md.push_str("\n![Seal]");
427                    if let Some(text) = &element.text {
428                        md.push_str("\n> ");
429                        md.push_str(text);
430                    }
431                    md.push_str("\n\n");
432                }
433                // Captions - with center alignment following PaddleX
434                _ if element.element_type.is_caption() => {
435                    if let Some(text) = &element.text {
436                        md.push_str("\n<div style=\"text-align: center;\">");
437                        md.push_str(text);
438                        md.push_str(" </div>\n\n");
439                    }
440                }
441                // Abstract - following PaddleX format with proper text handling
442                LayoutElementType::Abstract => {
443                    if let Some(text) = &element.text {
444                        // Check for "Abstract" or "摘要" heading
445                        let lower = text.to_lowercase();
446                        if lower.contains("abstract") || lower.contains("摘要") {
447                            md.push_str("\n## **Abstract**\n\n");
448                        }
449                        let formatted = format_text_block(text);
450                        md.push_str(&formatted);
451                        md.push_str("\n\n");
452                    }
453                }
454                // Reference - following PaddleX's format_reference_block
455                LayoutElementType::Reference => {
456                    if let Some(text) = &element.text {
457                        let formatted = format_reference_block(text);
458                        md.push('\n');
459                        md.push_str(&formatted);
460                        md.push_str("\n\n");
461                    }
462                }
463                // Content (table of contents) - following PaddleX's soft breaks
464                LayoutElementType::Content => {
465                    if let Some(text) = &element.text {
466                        let formatted = format_content_block(text);
467                        md.push('\n');
468                        md.push_str(&formatted);
469                        md.push_str("\n\n");
470                    }
471                }
472                // Footnote - following PaddleX's vision_footnote handling
473                LayoutElementType::Footnote => {
474                    if let Some(text) = &element.text {
475                        let formatted = format_vision_footnote_block(text);
476                        md.push('\n');
477                        md.push_str(&formatted);
478                        md.push_str("\n\n");
479                    }
480                }
481                // List
482                LayoutElementType::List => {
483                    if let Some(text) = &element.text {
484                        let cleaned = format_text_block(text);
485                        // Split by newlines and format as list items
486                        for line in cleaned.lines() {
487                            let line = line.trim();
488                            if !line.is_empty() {
489                                md.push_str("- ");
490                                md.push_str(line);
491                                md.push('\n');
492                            }
493                        }
494                        md.push('\n');
495                    }
496                }
497                // Header/Footer - smaller text (typically excluded from markdown)
498                _ if element.element_type.is_header() || element.element_type.is_footer() => {
499                    // Skip headers and footers in markdown output
500                    // They typically contain page numbers and repeating info
501                    continue;
502                }
503                // Default text elements - following PaddleX's text handling
504                _ => {
505                    if let Some(text) = &element.text {
506                        let formatted = format_text_block(text);
507                        md.push_str(&formatted);
508                        md.push_str("\n\n");
509                    }
510                }
511            }
512        }
513        md.trim().to_string()
514    }
515
516    /// Calculates the page continuation flags for this result.
517    ///
518    /// This follows PaddleX's `get_seg_flag` logic to determine whether
519    /// the page starts/ends in the middle of a semantic paragraph.
520    ///
521    /// Returns (paragraph_start, paragraph_end) where:
522    /// - `paragraph_start`: false means page continues from previous
523    /// - `paragraph_end`: false means content continues to next page
524    pub fn calculate_continuation_flags(&self) -> PageContinuationFlags {
525        let elements = &self.layout_elements;
526
527        if elements.is_empty() {
528            return PageContinuationFlags::new(true, true);
529        }
530
531        // Estimate page width from rectified image or element bboxes
532        let page_width = self
533            .rectified_img
534            .as_ref()
535            .map(|img| img.width() as f32)
536            .or_else(|| {
537                elements
538                    .iter()
539                    .map(|e| e.bbox.x_max())
540                    .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
541            });
542
543        // Filter to only text elements for continuation analysis
544        let text_elements: Vec<_> = elements
545            .iter()
546            .filter(|e| {
547                matches!(
548                    e.element_type,
549                    LayoutElementType::Text
550                        | LayoutElementType::DocTitle
551                        | LayoutElementType::ParagraphTitle
552                        | LayoutElementType::Abstract
553                        | LayoutElementType::Reference
554                )
555            })
556            .collect();
557
558        if text_elements.is_empty() {
559            return PageContinuationFlags::new(true, true);
560        }
561
562        // Calculate paragraph start flag
563        let first = &text_elements[0];
564        let paragraph_start = is_new_paragraph_start(first, page_width);
565
566        // Calculate paragraph end flag
567        let last = &text_elements[text_elements.len() - 1];
568        let paragraph_end = is_paragraph_complete(last, page_width);
569
570        PageContinuationFlags::new(paragraph_start, paragraph_end)
571    }
572
573    /// Converts the result to an HTML string.
574    ///
575    /// Follows PP-StructureV3's formatting rules with semantic HTML tags.
576    pub fn to_html(&self) -> String {
577        let mut html = String::from(
578            "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n",
579        );
580
581        for element in &self.layout_elements {
582            match element.element_type {
583                // Document title
584                LayoutElementType::DocTitle => {
585                    html.push_str("<h1>");
586                    if let Some(text) = &element.text {
587                        html.push_str(&Self::escape_html(text));
588                    }
589                    html.push_str("</h1>\n");
590                }
591                // Paragraph/section title
592                LayoutElementType::ParagraphTitle => {
593                    html.push_str("<h2>");
594                    if let Some(text) = &element.text {
595                        html.push_str(&Self::escape_html(text));
596                    }
597                    html.push_str("</h2>\n");
598                }
599                // Table - embed HTML structure with simplified markup
600                LayoutElementType::Table => {
601                    if let Some(table) =
602                        self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
603                    {
604                        if let Some(table_html) = &table.html_structure {
605                            // Simplify table HTML (remove html/body wrappers) and add border styling
606                            let simplified = simplify_table_html(table_html);
607                            let styled = simplified.replacen(
608                                "<table>",
609                                "<table border=\"1\" style=\"border-collapse: collapse;\">",
610                                1,
611                            );
612                            html.push_str(&styled);
613                            html.push('\n');
614                        } else {
615                            html.push_str("<p>[Table]</p>\n");
616                        }
617                    } else {
618                        html.push_str("<p>[Table]</p>\n");
619                    }
620                }
621                // Formula - use math tags
622                LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
623                    html.push_str("<p class=\"formula\">$$");
624                    if let Some(latex) = &element.text {
625                        html.push_str(&Self::escape_html(latex));
626                    }
627                    html.push_str("$$</p>\n");
628                }
629                // Image/Chart
630                LayoutElementType::Image | LayoutElementType::Chart => {
631                    html.push_str("<figure>\n<img alt=\"Figure\" />\n");
632                    if let Some(caption) = &element.text {
633                        html.push_str("<figcaption>");
634                        html.push_str(&Self::escape_html(caption));
635                        html.push_str("</figcaption>\n");
636                    }
637                    html.push_str("</figure>\n");
638                }
639                // Seal
640                LayoutElementType::Seal => {
641                    html.push_str("<figure class=\"seal\">\n<img alt=\"Seal\" />\n");
642                    if let Some(text) = &element.text {
643                        html.push_str("<figcaption>");
644                        html.push_str(&Self::escape_html(text));
645                        html.push_str("</figcaption>\n");
646                    }
647                    html.push_str("</figure>\n");
648                }
649                // Captions
650                _ if element.element_type.is_caption() => {
651                    if let Some(text) = &element.text {
652                        html.push_str("<figcaption>");
653                        html.push_str(&Self::escape_html(text));
654                        html.push_str("</figcaption>\n");
655                    }
656                }
657                // Abstract
658                LayoutElementType::Abstract => {
659                    html.push_str("<section class=\"abstract\">\n<h3>Abstract</h3>\n<p>");
660                    if let Some(text) = &element.text {
661                        html.push_str(&Self::escape_html(text));
662                    }
663                    html.push_str("</p>\n</section>\n");
664                }
665                // Reference
666                LayoutElementType::Reference | LayoutElementType::ReferenceContent => {
667                    html.push_str("<section class=\"references\">\n<p>");
668                    if let Some(text) = &element.text {
669                        html.push_str(&Self::escape_html(text));
670                    }
671                    html.push_str("</p>\n</section>\n");
672                }
673                // List
674                LayoutElementType::List => {
675                    html.push_str("<ul>\n");
676                    if let Some(text) = &element.text {
677                        for line in text.lines() {
678                            html.push_str("<li>");
679                            html.push_str(&Self::escape_html(line));
680                            html.push_str("</li>\n");
681                        }
682                    }
683                    html.push_str("</ul>\n");
684                }
685                // Header
686                _ if element.element_type.is_header() => {
687                    html.push_str("<header>");
688                    if let Some(text) = &element.text {
689                        html.push_str(&Self::escape_html(text));
690                    }
691                    html.push_str("</header>\n");
692                }
693                // Footer
694                _ if element.element_type.is_footer() => {
695                    html.push_str("<footer>");
696                    if let Some(text) = &element.text {
697                        html.push_str(&Self::escape_html(text));
698                    }
699                    html.push_str("</footer>\n");
700                }
701                // Default text
702                _ => {
703                    if let Some(text) = &element.text {
704                        html.push_str("<p>");
705                        html.push_str(&Self::escape_html(text));
706                        html.push_str("</p>\n");
707                    }
708                }
709            }
710        }
711        html.push_str("</body>\n</html>");
712        html
713    }
714
/// Escapes the five HTML-significant characters in `text`.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&#39;`; every other character is copied through unchanged.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
723
724    /// Converts the result to a JSON Value.
725    pub fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
726        serde_json::to_value(self)
727    }
728
729    /// Saves the analysis results to the specified directory.
730    ///
731    /// This generates:
732    /// - `*_res.json`: The full structured result
733    /// - `*_res.html`: An HTML representation
734    ///
735    /// Note: Markdown export with image extraction should use the example utilities
736    /// (`examples/utils/markdown.rs`) instead, as that requires I/O operations
737    /// that belong in the application layer. Use `StructureResult::to_markdown()`
738    /// for pure markdown generation without side effects.
739    ///
740    /// # Arguments
741    ///
742    /// * `output_dir` - Directory to save the output files
743    /// * `to_json` - If true, save a JSON representation
744    /// * `to_html` - If true, save an HTML representation
745    pub fn save_results(
746        &self,
747        output_dir: impl AsRef<Path>,
748        to_json: bool,
749        to_html: bool,
750    ) -> std::io::Result<()> {
751        let output_dir = output_dir.as_ref();
752        if !output_dir.exists() {
753            std::fs::create_dir_all(output_dir)?;
754        }
755
756        let input_path = Path::new(self.input_path.as_ref());
757        // Extract file stem, handling PDF page suffix (e.g., "file.pdf#3" -> "file_003")
758        let stem = if let Some(path_str) = input_path.to_str() {
759            if let Some(hash_idx) = path_str.rfind('#') {
760                // This is a PDF page reference like "file.pdf#3"
761                let base = &path_str[..hash_idx];
762                let page_num = &path_str[hash_idx + 1..];
763                let base_stem = Path::new(base)
764                    .file_stem()
765                    .and_then(|s| s.to_str())
766                    .unwrap_or("result");
767                format!("{}_{}", base_stem, page_num)
768            } else {
769                input_path
770                    .file_stem()
771                    .and_then(|s| s.to_str())
772                    .unwrap_or("result")
773                    .to_string()
774            }
775        } else {
776            "result".to_string()
777        };
778
779        // Save JSON
780        if to_json {
781            let json_path = output_dir.join(format!("{}.json", stem));
782            let json_file = std::fs::File::create(json_path)?;
783            serde_json::to_writer_pretty(json_file, self)?;
784        }
785
786        // Save HTML
787        if to_html {
788            let html_path = output_dir.join(format!("{}.html", stem));
789            std::fs::write(html_path, self.to_html())?;
790        }
791
792        Ok(())
793    }
794}
795
796/// Checks if a text element appears to start a new paragraph.
797///
798/// Following PaddleX's logic: if the text starts near the left edge of the page
799/// (within 5% of page width), it's likely the start of a new paragraph.
800fn is_new_paragraph_start(element: &LayoutElement, page_width: Option<f32>) -> bool {
801    let left = element.bbox.x_min();
802    let threshold = page_width.map_or(50.0, |w| w * 0.05); // 5% of page width
803    left <= threshold
804}
805
806/// Checks if a text element appears to complete its paragraph on this page.
807///
808/// Following PaddleX's logic: if the text ends before the right edge of the page
809/// (not within 10% of right margin), the paragraph likely ends here.
810fn is_paragraph_complete(element: &LayoutElement, page_width: Option<f32>) -> bool {
811    let right = element.bbox.x_max();
812
813    // If we have page width info, check if element ends before the right edge
814    if let Some(width) = page_width {
815        let right_margin = width * 0.1;
816        return right <= (width - right_margin);
817    }
818
819    // Conservative default: assume paragraphs end
820    true
821}
822
823/// Concatenates markdown content from multiple pages into a single document.
824///
825/// This follows PaddleX's `concatenate_markdown_pages` logic to intelligently
826/// merge pages while preserving paragraph continuity.
827///
828/// # Arguments
829///
830/// * `results` - Slice of structure results from multiple pages (in order)
831///
832/// # Returns
833///
834/// A single markdown string with all pages properly concatenated
835pub fn concatenate_markdown_pages(results: &[StructureResult]) -> String {
836    if results.is_empty() {
837        return String::new();
838    }
839
840    if results.len() == 1 {
841        return results[0].to_markdown();
842    }
843
844    let mut markdown = String::new();
845    let mut prev_page_end_flag = true; // First page is treated as starting fresh
846
847    for result in results.iter() {
848        let flags = result
849            .page_continuation_flags
850            .as_ref()
851            .cloned()
852            .unwrap_or_else(|| result.calculate_continuation_flags());
853
854        let page_markdown = result.to_markdown();
855
856        // Skip empty pages
857        if page_markdown.trim().is_empty() {
858            prev_page_end_flag = flags.paragraph_end;
859            continue;
860        }
861
862        let page_first_continues = !flags.paragraph_start;
863        let _page_last_continues = !flags.paragraph_end;
864
865        // Determine how to join this page
866        if page_first_continues && !prev_page_end_flag {
867            // Both pages are in the middle of the same paragraph
868            // Check for Chinese characters to decide spacing
869            let last_char = markdown.chars().last();
870            let first_char = page_markdown.chars().next();
871
872            let last_is_chinese = last_char.is_some_and(is_chinese_char);
873            let first_is_chinese = first_char.is_some_and(is_chinese_char);
874
875            if !last_is_chinese && !first_is_chinese {
876                // Non-Chinese text: add space
877                markdown.push(' ');
878                markdown.push_str(page_markdown.trim_start());
879            } else {
880                // Chinese or mixed: direct concatenation
881                markdown.push_str(page_markdown.trim_start());
882            }
883        } else {
884            // New paragraph or section
885            if !markdown.is_empty() {
886                markdown.push_str("\n\n");
887            }
888            markdown.push_str(&page_markdown);
889        }
890
891        prev_page_end_flag = flags.paragraph_end;
892    }
893
894    markdown.trim().to_string()
895}
896
/// Flattens raw OCR text into a single line.
///
/// Intended for raw OCR text only, not for formatted markdown or HTML.
/// Following PaddleX, two passes are applied in order:
/// 1. `-\n` is removed, joining words hyphenated across a line break
/// 2. every remaining `\n` becomes a single space
fn clean_ocr_text(text: &str) -> String {
    // Dropping the separators while re-joining removes each `-\n`.
    let joined: String = text.split("-\n").collect();
    joined
        .chars()
        .map(|ch| if ch == '\n' { ' ' } else { ch })
        .collect()
}
910
/// Turns OCR line breaks in a text block into markdown paragraph breaks.
///
/// Following PaddleX's text handling, three passes run in order:
/// 1. `-\n` is removed (joins words hyphenated across a line break)
/// 2. each `\n\n` pair is collapsed to a single `\n`
/// 3. every remaining `\n` is expanded to `\n\n`
fn format_text_block(text: &str) -> String {
    text.split("-\n")
        .collect::<String>()
        .replace("\n\n", "\n")
        .split('\n')
        .collect::<Vec<_>>()
        .join("\n\n")
}
924
/// Formats a table-of-contents block using markdown hard line breaks.
///
/// Following PaddleX (`.replace("-\n", "  \n").replace("\n", "  \n")`), a
/// line break is written as two trailing spaces plus newline. Because the
/// second pass also rewrites the newline produced by the first, a `-\n`
/// sequence ends up as four spaces followed by a newline.
fn format_content_block(text: &str) -> String {
    const LINE_BREAK: &str = "  \n";
    text.replace("-\n", LINE_BREAK).replace('\n', LINE_BREAK)
}
935
/// Formats reference blocks, following PaddleX's `format_first_line_func`.
///
/// Steps:
/// 1. Remove hyphenation: `-\n` -> ``
/// 2. Trim every line and drop empty ones
/// 3. If the FIRST non-empty line contains `References` or `参考文献`, replace
///    that line with the markdown heading `## **References**`
/// 4. Join the remaining lines with `\n`
///
/// Only the first non-empty line is examined for the keyword. Later lines are
/// always kept, so a reference entry that merely mentions "References" is not
/// mistaken for the heading and lost.
///
/// If no non-empty line exists, the dehyphenated input is returned unchanged.
fn format_reference_block(text: &str) -> String {
    let dehyphenated = text.replace("-\n", "");
    let mut result = String::with_capacity(dehyphenated.len() + 24);
    let mut first_line_seen = false;

    for line in dehyphenated.lines().map(str::trim).filter(|l| !l.is_empty()) {
        let is_first = !first_line_seen;
        first_line_seen = true;

        if is_first && (line.contains("References") || line.contains("参考文献")) {
            // The heading replaces the keyword line itself.
            result.push_str("## **References**\n\n");
            continue;
        }

        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }

    if result.is_empty() {
        dehyphenated
    } else {
        result
    }
}
978
/// Formats a vision footnote block the way PaddleX does.
///
/// 1. `-\n` is removed (joins words hyphenated across a line break)
/// 2. each `\n\n` pair is collapsed to a single `\n`
/// 3. every remaining `\n` is doubled to form a paragraph break
fn format_vision_footnote_block(text: &str) -> String {
    let collapsed = text.replace("-\n", "").replace("\n\n", "\n");
    let mut formatted = String::with_capacity(collapsed.len());
    for ch in collapsed.chars() {
        formatted.push(ch);
        if ch == '\n' {
            formatted.push('\n');
        }
    }
    formatted
}
987
/// Reports whether `c` falls in one of the CJK Unified Ideographs blocks.
///
/// Used to pick the spacing rule when pages are concatenated. The covered
/// ranges are the base block plus Extensions A through F.
fn is_chinese_char(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'        // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}'  // Extension A
            | '\u{20000}'..='\u{2A6DF}' // Extension B
            | '\u{2A700}'..='\u{2B73F}' // Extension C
            | '\u{2B740}'..='\u{2B81F}' // Extension D
            | '\u{2B820}'..='\u{2CEAF}' // Extension E
            | '\u{2CEB0}'..='\u{2EBEF}' // Extension F
    )
}
1004
/// Reports whether `c` is an ASCII lowercase letter (`a` through `z`).
fn is_lowercase(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1009
/// Reports whether `c` is an ASCII uppercase letter (`A` through `Z`).
fn is_uppercase(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1014
/// Reports whether `c` is an ASCII decimal digit (`0` through `9`).
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1019
/// Removes PDF hyphenation artifacts from text.
///
/// PDFs often break words at line ends with hyphens like "frame-work",
/// "com-pared", etc. This function removes a hyphen when it looks like a
/// line-break hyphen rather than an intentional one.
///
/// A hyphen is removed when it is not in a URL context and one of these holds
/// (all letter checks are ASCII-only):
/// 1. it is followed by a newline and then a lowercase letter (the newline is
///    removed as well)
/// 2. it sits directly between two lowercase letters
/// 3. it is preceded by a lowercase letter and followed by one whitespace
///    character and then a lowercase letter (the whitespace is removed too)
///
/// Every other hyphen is kept, e.g. in `RT-DETR` or at the end of the text.
///
/// URL context: the hyphen is kept when its whitespace-delimited token
/// contains `http`, `www` or `://`, or when one of those markers appears in
/// the window of 10 characters before to 4 characters after the hyphen.
/// Checking the whole token keeps hyphens deep inside long URLs intact.
///
/// Limitation: rule 2 cannot tell a broken word from a genuine lowercase
/// compound, so `well-known` is joined just like `com-puted`.
fn dehyphenate(text: &str) -> String {
    const URL_MARKERS: [&str; 3] = ["http", "www", "://"];

    let chars: Vec<char> = text.chars().collect();
    let len = chars.len();

    let has_url_marker = |span: &[char]| -> bool {
        let span: String = span.iter().collect();
        URL_MARKERS.iter().any(|marker| span.contains(*marker))
    };

    // Flag every character of a hyphenated token that carries a URL marker.
    // One pass over the tokens keeps this linear in the text length.
    let mut in_url_token = vec![false; len];
    let mut start = 0;
    while start < len {
        if chars[start].is_whitespace() {
            start += 1;
            continue;
        }
        let end = chars[start..]
            .iter()
            .position(|c| c.is_whitespace())
            .map_or(len, |offset| start + offset);
        let token = &chars[start..end];
        if token.contains(&'-') && has_url_marker(token) {
            in_url_token[start..end].fill(true);
        }
        start = end;
    }

    let is_lower = |idx: usize| chars.get(idx).is_some_and(|c| c.is_ascii_lowercase());

    let mut result = String::with_capacity(text.len());
    let mut i = 0;
    while i < len {
        let current = chars[i];
        if current != '-' {
            result.push(current);
            i += 1;
            continue;
        }

        let url_context = in_url_token[i]
            || has_url_marker(&chars[i.saturating_sub(10)..(i + 5).min(len)]);
        let prev_lower = i > 0 && is_lower(i - 1);
        let next = chars.get(i + 1).copied();

        let is_artifact = !url_context
            && match next {
                // Line-break hyphen: decided by what follows the newline.
                Some('\n') => is_lower(i + 2),
                // "com-puted"
                Some(c) if c.is_ascii_lowercase() => prev_lower,
                // "com- puted"
                Some(c) if c.is_whitespace() => prev_lower && is_lower(i + 2),
                _ => false,
            };

        if is_artifact {
            // Drop the hyphen, plus the newline/space that separated the halves.
            if next.is_some_and(char::is_whitespace) {
                i += 1;
            }
        } else {
            result.push('-');
        }
        i += 1;
    }

    result
}
1103
/// Fixes missing spaces between merged words.
///
/// OCR and PDF extraction can result in merged words like
/// "encoder'sfeature" or "48.1%AP". A space is inserted before a character
/// in the following situations (all letter/digit checks are ASCII-only):
///
/// 1. After a contraction or possessive suffix that runs into the next word:
///    a lowercase letter following `'s`, `'d`, `'m` (apostrophe preceded by a
///    letter) or `n't`, e.g. `encoder'sfeature` -> `encoder's feature`.
///    Two-letter suffixes and other apostrophe words (`they're`, `we've`,
///    `you'll`, `o'clock`) are left untouched.
/// 2. Before an uppercase letter that follows a lowercase letter and is itself
///    followed by a lowercase letter, e.g. `theModel` -> `the Model`.
/// 3. Before an uppercase letter that follows a digit or `%`,
///    e.g. `48.1%AP` -> `48.1% AP`.
/// 4. Before a digit that sits between two letters, e.g. `x2y` -> `x 2y`.
///    A trailing digit as in `RT-DETRv3` is not split off.
///
/// These are heuristics; text matching none of the patterns is returned
/// unchanged.
fn fix_merged_words(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut result = String::with_capacity(text.len());

    for (i, &current) in chars.iter().enumerate() {
        if i > 0 {
            let prev = chars[i - 1];
            let next = chars.get(i + 1).copied();

            let insert_space = if prev.is_ascii_lowercase() && current.is_ascii_lowercase() {
                // Rule 1: `prev` must be a complete one-letter suffix right
                // after an apostrophe. Splitting after an arbitrary letter
                // would tear words such as "they're" apart.
                i >= 3
                    && chars[i - 2] == '\''
                    && match prev {
                        't' => chars[i - 3] == 'n',
                        's' | 'd' | 'm' => chars[i - 3].is_ascii_alphabetic(),
                        _ => false,
                    }
            } else if prev.is_ascii_lowercase() && current.is_ascii_uppercase() {
                // Rule 2: an uppercase letter followed by lowercase starts a word.
                next.is_some_and(|c| c.is_ascii_lowercase())
            } else if (prev.is_ascii_digit() || prev == '%') && current.is_ascii_uppercase() {
                // Rule 3
                true
            } else {
                // Rule 4
                prev.is_ascii_alphabetic()
                    && current.is_ascii_digit()
                    && next.is_some_and(|c| c.is_ascii_alphabetic())
            };

            if insert_space {
                result.push(' ');
            }
        }

        result.push(current);
    }

    result
}
1156
/// Reports whether `c` is an ASCII letter, lowercase or uppercase.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic()
}
1161
/// Strips document wrapper tags from table HTML, following PaddleX's
/// `simplify_table_func`.
///
/// The literal tags `<html>`, `</html>`, `<body>` and `</body>` are removed,
/// in that order, so that only the table markup remains in the markdown
/// output. Matching is exact and case-sensitive.
fn simplify_table_html(html: &str) -> String {
    const WRAPPER_TAGS: [&str; 4] = ["<html>", "</html>", "<body>", "</body>"];

    let mut simplified = html.to_owned();
    for tag in WRAPPER_TAGS {
        simplified = simplified.replace(tag, "");
    }
    simplified
}
1172
1173/// Post-processes text content to fix common OCR/PDF artifacts.
1174///
1175/// This applies multiple cleanup steps:
1176/// 1. Dehyphenation - removes line-break hyphens
1177/// 2. Word merging fixes - adds missing spaces
1178/// 3. Spacing normalization - fixes multiple spaces
1179pub fn postprocess_text(text: &str) -> String {
1180    let text = dehyphenate(text);
1181    let text = fix_merged_words(&text);
1182
1183    // Normalize whitespace (collapse multiple spaces, fix spacing after punctuation)
1184    let mut result = String::new();
1185    let mut in_space = false;
1186
1187    for c in text.chars() {
1188        if c.is_whitespace() {
1189            if !in_space && !result.is_empty() {
1190                result.push(' ');
1191                in_space = true;
1192            }
1193        } else {
1194            // Fix missing space after period (when followed by letter)
1195            if c == '.' && !result.is_empty() {
1196                let last = result.chars().last().unwrap();
1197                if is_letter(last) || is_digit(last) {
1198                    result.push('.');
1199                    in_space = true;
1200                    continue;
1201                }
1202            }
1203            // Fix spacing after punctuation
1204            if in_space && matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')' | ']' | '}') {
1205                result.pop(); // Remove the space before punctuation
1206                result.push(c);
1207                continue;
1208            }
1209            result.push(c);
1210            in_space = false;
1211        }
1212    }
1213
1214    result
1215}
1216
/// Drops repeated bold section headers from concatenated markdown.
///
/// A line counts as a section header when, after trimming, it is longer than
/// four bytes and both starts and ends with `**` (for example `**Abstract**`
/// or `**References**`). The first occurrence of each header text is kept;
/// later lines with the same text are removed. All other lines are copied
/// through and joined with `\n`.
fn deduplicate_sections(markdown: &str) -> String {
    let mut seen_headers: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let mut output = String::new();

    for line in markdown.lines() {
        let candidate = line.trim();
        let header_name = candidate
            .strip_prefix("**")
            .and_then(|rest| rest.strip_suffix("**"))
            .filter(|_| candidate.len() > 4);

        if let Some(name) = header_name {
            // `insert` returns false when the header was already recorded.
            if !seen_headers.insert(name) {
                continue;
            }
        }

        if !output.is_empty() {
            output.push('\n');
        }
        output.push_str(line);
    }

    output
}
1255
1256/// Checks if two bounding boxes are on the same line (have significant vertical overlap).
1257///
1258/// Two boxes are considered on the same line if their vertical overlap is greater than
1259/// 50% of the smaller box's height.
1260fn is_same_line(bbox1: &BoundingBox, bbox2: &BoundingBox) -> bool {
1261    let y1_min = bbox1.y_min();
1262    let y1_max = bbox1.y_max();
1263    let y2_min = bbox2.y_min();
1264    let y2_max = bbox2.y_max();
1265
1266    // Calculate vertical overlap
1267    let overlap_start = y1_min.max(y2_min);
1268    let overlap_end = y1_max.min(y2_max);
1269    let overlap = (overlap_end - overlap_start).max(0.0);
1270
1271    // Calculate minimum height
1272    let height1 = y1_max - y1_min;
1273    let height2 = y2_max - y2_min;
1274    let min_height = height1.min(height2);
1275
1276    // Consider same line if overlap > 50% of min height
1277    min_height > 0.0 && overlap / min_height > 0.5
1278}
1279
/// Filters empty formula blocks from markdown.
///
/// A formula block is a pair of lines that each trim to `$$`, together with
/// the lines between them. Delimiters are paired in order of appearance, so
/// the closing `$$` of one block is never mistaken for the start of the next.
///
/// - A block whose body is empty or whitespace-only (e.g. `$$\n$$`) is
///   removed entirely, along with one blank line directly after it.
/// - A block with content is copied through unchanged, including its closing
///   delimiter, even at the very end of the document.
/// - A `$$` without a matching closing delimiter is kept verbatim together
///   with everything after it.
///
/// Lines are re-joined with `\n`.
fn filter_empty_formulas(markdown: &str) -> String {
    fn is_delimiter(line: &str) -> bool {
        line.trim() == "$$"
    }

    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    fn push_line(out: &mut String, line: &str) {
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(line);
    }

    let lines: Vec<&str> = markdown.lines().collect();
    let mut result = String::with_capacity(markdown.len());
    let mut i = 0;

    while i < lines.len() {
        if !is_delimiter(lines[i]) {
            push_line(&mut result, lines[i]);
            i += 1;
            continue;
        }

        // `lines[i]` opens a block; look for the delimiter that closes it.
        let Some(offset) = lines[i + 1..].iter().position(|line| is_delimiter(line)) else {
            // Unterminated block: nothing to pair with, keep the rest as is.
            for &line in &lines[i..] {
                push_line(&mut result, line);
            }
            break;
        };
        let close = i + 1 + offset;

        if lines[i + 1..close].iter().all(|line| is_blank(line)) {
            // Empty formula: drop the block and one blank line after it.
            i = close + 1;
            if lines.get(i).is_some_and(|line| is_blank(line)) {
                i += 1;
            }
        } else {
            for &line in &lines[i..=close] {
                push_line(&mut result, line);
            }
            i = close + 1;
        }
    }

    result
}
1343
1344/// Applies all post-processing steps to concatenated markdown.
1345///
1346/// This is the main entry point for cleaning up concatenated markdown output.
1347pub fn postprocess_markdown(markdown: &str) -> String {
1348    let markdown = filter_empty_formulas(markdown);
1349    let markdown = deduplicate_sections(&markdown);
1350
1351    // Apply text post-processing line by line for text content
1352    let mut result = String::new();
1353    let mut in_code_block = false;
1354    let mut in_formula = false;
1355
1356    for line in markdown.lines() {
1357        let trimmed = line.trim();
1358
1359        // Detect code blocks
1360        if trimmed.starts_with("```") {
1361            in_code_block = !in_code_block;
1362            result.push_str(line);
1363            result.push('\n');
1364            continue;
1365        }
1366
1367        // Detect formula blocks
1368        if trimmed == "$$" {
1369            in_formula = !in_formula;
1370            result.push_str(line);
1371            result.push('\n');
1372            continue;
1373        }
1374
1375        // Skip processing inside code/formula blocks
1376        if in_code_block || in_formula {
1377            result.push_str(line);
1378            result.push('\n');
1379            continue;
1380        }
1381
1382        // Process text content (skip headers, lists, etc.)
1383        if trimmed.starts_with('#')
1384            || trimmed.starts_with('*')
1385            || trimmed.starts_with('>')
1386            || trimmed.starts_with('|')
1387            || trimmed.starts_with('-')
1388            || trimmed.starts_with('+')
1389        {
1390            result.push_str(line);
1391        } else {
1392            result.push_str(&postprocess_text(line));
1393        }
1394        result.push('\n');
1395    }
1396
1397    result
1398}
1399
/// Extension trait for convenient multi-page processing.
///
/// Both items are associated functions without a receiver: they operate on a
/// slice holding one result per page.
pub trait StructureResultExt {
    /// Converts multiple results to a single concatenated markdown.
    ///
    /// `results` holds the per-page results. The implementation for
    /// `StructureResult` in this file delegates to
    /// `concatenate_markdown_pages`, which expects the pages in order.
    fn to_concatenated_markdown(results: &[Self]) -> String
    where
        Self: Sized;

    /// Saves multiple results with concatenated markdown.
    ///
    /// # Arguments
    ///
    /// * `results` - Per-page results to save
    /// * `output_dir` - Directory that receives all output
    /// * `base_name` - File name (without extension) for the combined outputs
    /// * `to_json` - If true, save JSON output
    /// * `to_markdown` - If true, save the concatenated markdown
    /// * `to_html` - If true, save HTML output
    ///
    /// # Errors
    ///
    /// Returns an `std::io::Error` when saving fails.
    fn save_multi_page_results(
        results: &[Self],
        output_dir: impl AsRef<std::path::Path>,
        base_name: &str,
        to_json: bool,
        to_markdown: bool,
        to_html: bool,
    ) -> std::io::Result<()>
    where
        Self: Sized;
}
1419
1420impl StructureResultExt for StructureResult {
1421    fn to_concatenated_markdown(results: &[Self]) -> String {
1422        concatenate_markdown_pages(results)
1423    }
1424
1425    fn save_multi_page_results(
1426        results: &[Self],
1427        output_dir: impl AsRef<std::path::Path>,
1428        base_name: &str,
1429        to_json: bool,
1430        to_markdown: bool,
1431        to_html: bool,
1432    ) -> std::io::Result<()>
1433    where
1434        Self: Sized,
1435    {
1436        let output_dir = output_dir.as_ref();
1437        if !output_dir.exists() {
1438            std::fs::create_dir_all(output_dir)?;
1439        }
1440
1441        // Save individual page results
1442        for (idx, result) in results.iter().enumerate() {
1443            let page_dir = output_dir.join(format!("page_{:03}", idx));
1444            std::fs::create_dir_all(&page_dir)?;
1445            result.save_results(&page_dir, to_json, to_html)?;
1446        }
1447
1448        // Save concatenated markdown
1449        if to_markdown {
1450            let concat_md_path = output_dir.join(format!("{}.md", base_name));
1451            std::fs::write(concat_md_path, Self::to_concatenated_markdown(results))?;
1452        }
1453
1454        // Save concatenated JSON (array of results)
1455        if to_json {
1456            let concat_json_path = output_dir.join(format!("{}.json", base_name));
1457            let json_file = std::fs::File::create(concat_json_path)?;
1458            serde_json::to_writer_pretty(json_file, &results)?;
1459        }
1460
1461        Ok(())
1462    }
1463}
1464
/// A layout element detected in the document.
///
/// Create one with `LayoutElement::new`, which leaves `label`, `text` and
/// `order_index` unset; `with_label` and `with_text` fill in the optional
/// label and text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutElement {
    /// Bounding box of the element
    pub bbox: BoundingBox,
    /// Type of the layout element
    pub element_type: LayoutElementType,
    /// Confidence score for the detection
    pub confidence: f32,
    /// Optional label for the element (original model label)
    pub label: Option<String>,
    /// Optional text content for the element
    pub text: Option<String>,
    /// Reading order index (1-based, assigned during stitching)
    ///
    /// This index represents the element's position in the reading order.
    /// Only elements that should be included in reading flow (text, tables,
    /// formulas, images, etc.) will have an order index assigned.
    /// Headers, footers, and other auxiliary elements may have `None`.
    pub order_index: Option<u32>,
}
1486
1487impl LayoutElement {
1488    /// Creates a new layout element.
1489    pub fn new(bbox: BoundingBox, element_type: LayoutElementType, confidence: f32) -> Self {
1490        Self {
1491            bbox,
1492            element_type,
1493            confidence,
1494            label: None,
1495            text: None,
1496            order_index: None,
1497        }
1498    }
1499
1500    /// Sets the label for the element.
1501    pub fn with_label(mut self, label: impl Into<String>) -> Self {
1502        self.label = Some(label.into());
1503        self
1504    }
1505
1506    /// Sets the text content for the element.
1507    pub fn with_text(mut self, text: impl Into<String>) -> Self {
1508        self.text = Some(text.into());
1509        self
1510    }
1511}
1512
1513/// Layout element type supporting PP-StructureV3's full label set.
1514///
1515/// This enum represents both **semantic categories** and **fine-grained labels** for layout elements.
1516/// PP-StructureV3 models output 20 or 23 class labels depending on the model variant.
1517///
1518/// The original model-specific label is preserved in `LayoutElement.label` field.
1519///
1520/// # PP-StructureV3 Label Categories
1521///
1522/// **Document structure:**
1523/// - `DocTitle` - Document title (doc_title)
1524/// - `ParagraphTitle` - Section/paragraph title (paragraph_title)
1525/// - `Text` - General text content
1526/// - `Content` - Table of contents (content)
1527/// - `Abstract` - Abstract section
1528///
1529/// **Visual elements:**
1530/// - `Image` - Images/figures (image, figure)
1531/// - `Table` - Tables
1532/// - `Chart` - Charts/graphs
1533/// - `Formula` - Mathematical formulas
1534///
1535/// **Captions and titles:**
1536/// - `FigureTitle` - Figure caption (figure_title)
1537/// - `TableTitle` - Table caption (table_title)
1538/// - `ChartTitle` - Chart caption (chart_title)
1539/// - `FigureTableChartTitle` - Combined caption type
1540///
1541/// **Page structure:**
1542/// - `Header` - Page header
1543/// - `HeaderImage` - Header image
1544/// - `Footer` - Page footer
1545/// - `FooterImage` - Footer image
1546/// - `Footnote` - Footnotes
1547///
1548/// **Special elements:**
1549/// - `Seal` - Stamps/official seals
1550/// - `Number` - Page numbers
1551/// - `Reference` - References section
1552/// - `ReferenceContent` - Reference content
1553/// - `Algorithm` - Algorithm blocks
1554/// - `FormulaNumber` - Formula numbers
1555/// - `AsideText` - Marginal/aside text
1556/// - `List` - List items
1557///
1558/// - `Other` - Unknown/unmapped labels
1559#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
1560pub enum LayoutElementType {
1561    /// Document title
1562    DocTitle,
1563    /// Paragraph/section title
1564    ParagraphTitle,
1565    /// General text content
1566    Text,
1567    /// Table of contents
1568    Content,
1569    /// Abstract section
1570    Abstract,
1571
1572    /// Image or figure
1573    Image,
1574    /// Table
1575    Table,
1576    /// Chart or graph
1577    Chart,
1578    /// Mathematical formula
1579    Formula,
1580
1581    /// Figure caption/title
1582    FigureTitle,
1583    /// Table caption/title
1584    TableTitle,
1585    /// Chart caption/title
1586    ChartTitle,
1587    /// Combined figure/table/chart title (PP-DocLayout)
1588    FigureTableChartTitle,
1589
1590    /// Page header
1591    Header,
1592    /// Header image
1593    HeaderImage,
1594    /// Page footer
1595    Footer,
1596    /// Footer image
1597    FooterImage,
1598    /// Footnote
1599    Footnote,
1600
1601    /// Stamp or official seal
1602    Seal,
1603    /// Page number
1604    Number,
1605    /// Reference section
1606    Reference,
1607    /// Reference content (PP-DocLayout_plus-L)
1608    ReferenceContent,
1609    /// Algorithm block
1610    Algorithm,
1611    /// Formula number
1612    FormulaNumber,
1613    /// Marginal/aside text
1614    AsideText,
1615    /// List items
1616    List,
1617
1618    /// Generic document region block (PP-DocBlockLayout)
1619    /// Used for hierarchical layout ordering and block grouping
1620    Region,
1621
1622    /// Other/unknown (original label preserved in LayoutElement.label)
1623    Other,
1624}
1625
1626impl LayoutElementType {
1627    /// Returns the string representation of the element type.
1628    ///
1629    /// This returns the PP-StructureV3 compatible label string.
1630    pub fn as_str(&self) -> &'static str {
1631        match self {
1632            // Document Structure
1633            LayoutElementType::DocTitle => "doc_title",
1634            LayoutElementType::ParagraphTitle => "paragraph_title",
1635            LayoutElementType::Text => "text",
1636            LayoutElementType::Content => "content",
1637            LayoutElementType::Abstract => "abstract",
1638
1639            // Visual Elements
1640            LayoutElementType::Image => "image",
1641            LayoutElementType::Table => "table",
1642            LayoutElementType::Chart => "chart",
1643            LayoutElementType::Formula => "formula",
1644
1645            // Captions
1646            LayoutElementType::FigureTitle => "figure_title",
1647            LayoutElementType::TableTitle => "table_title",
1648            LayoutElementType::ChartTitle => "chart_title",
1649            LayoutElementType::FigureTableChartTitle => "figure_table_chart_title",
1650
1651            // Page Structure
1652            LayoutElementType::Header => "header",
1653            LayoutElementType::HeaderImage => "header_image",
1654            LayoutElementType::Footer => "footer",
1655            LayoutElementType::FooterImage => "footer_image",
1656            LayoutElementType::Footnote => "footnote",
1657
1658            // Special Elements
1659            LayoutElementType::Seal => "seal",
1660            LayoutElementType::Number => "number",
1661            LayoutElementType::Reference => "reference",
1662            LayoutElementType::ReferenceContent => "reference_content",
1663            LayoutElementType::Algorithm => "algorithm",
1664            LayoutElementType::FormulaNumber => "formula_number",
1665            LayoutElementType::AsideText => "aside_text",
1666            LayoutElementType::List => "list",
1667
1668            // Region (PP-DocBlockLayout)
1669            LayoutElementType::Region => "region",
1670
1671            // Fallback
1672            LayoutElementType::Other => "other",
1673        }
1674    }
1675
1676    /// Creates a LayoutElementType from a string label with fine-grained mapping.
1677    ///
1678    /// This method maps model output labels to their corresponding fine-grained types,
1679    /// preserving the full PP-StructureV3 label set (20/23 classes).
1680    pub fn from_label(label: &str) -> Self {
1681        match label.to_lowercase().as_str() {
1682            // Document Structure
1683            "doc_title" => LayoutElementType::DocTitle,
1684            "paragraph_title" | "title" => LayoutElementType::ParagraphTitle,
1685            "text" | "paragraph" => LayoutElementType::Text,
1686            "content" => LayoutElementType::Content,
1687            "abstract" => LayoutElementType::Abstract,
1688
1689            // Visual Elements
1690            "image" | "figure" => LayoutElementType::Image,
1691            "table" => LayoutElementType::Table,
1692            "chart" | "flowchart" => LayoutElementType::Chart,
1693            "formula" | "equation" | "display_formula" | "inline_formula" => {
1694                LayoutElementType::Formula
1695            }
1696
1697            // Captions
1698            "figure_title" => LayoutElementType::FigureTitle,
1699            "table_title" => LayoutElementType::TableTitle,
1700            "chart_title" => LayoutElementType::ChartTitle,
1701            "figure_table_chart_title" | "caption" => LayoutElementType::FigureTableChartTitle,
1702
1703            // Page Structure
1704            "header" => LayoutElementType::Header,
1705            "header_image" => LayoutElementType::HeaderImage,
1706            "footer" => LayoutElementType::Footer,
1707            "footer_image" => LayoutElementType::FooterImage,
1708            "footnote" | "vision_footnote" => LayoutElementType::Footnote,
1709
1710            // Special Elements
1711            "seal" => LayoutElementType::Seal,
1712            "number" => LayoutElementType::Number,
1713            "reference" => LayoutElementType::Reference,
1714            "reference_content" => LayoutElementType::ReferenceContent,
1715            "algorithm" => LayoutElementType::Algorithm,
1716            "formula_number" => LayoutElementType::FormulaNumber,
1717            "aside_text" => LayoutElementType::AsideText,
1718            "list" => LayoutElementType::List,
1719            "vertical_text" => LayoutElementType::Text,
1720
1721            // Region (PP-DocBlockLayout)
1722            "region" => LayoutElementType::Region,
1723
1724            // Everything else maps to Other
1725            // The original label is preserved in LayoutElement.label
1726            _ => LayoutElementType::Other,
1727        }
1728    }
1729
1730    /// Returns the semantic category for this element type.
1731    ///
1732    /// This method groups fine-grained types into broader semantic categories,
1733    /// useful for processing logic that doesn't need fine-grained distinctions.
1734    ///
1735    /// # Categories
1736    ///
1737    /// - **Title**: DocTitle, ParagraphTitle
1738    /// - **Text**: Text, Content, Abstract
1739    /// - **Visual**: Image, Chart
1740    /// - **Table**: Table
1741    /// - **Caption**: FigureTitle, TableTitle, ChartTitle, FigureTableChartTitle
1742    /// - **Header**: Header, HeaderImage
1743    /// - **Footer**: Footer, FooterImage, Footnote
1744    /// - **Formula**: Formula, FormulaNumber
1745    /// - **Special**: Seal, Number, Reference, ReferenceContent, Algorithm, AsideText
1746    /// - **List**: List
1747    /// - **Other**: Other
1748    pub fn semantic_category(&self) -> &'static str {
1749        match self {
1750            // Title category
1751            LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle => "title",
1752
1753            // Text category
1754            LayoutElementType::Text | LayoutElementType::Content | LayoutElementType::Abstract => {
1755                "text"
1756            }
1757
1758            // Visual category
1759            LayoutElementType::Image | LayoutElementType::Chart => "visual",
1760
1761            // Table category
1762            LayoutElementType::Table => "table",
1763
1764            // Caption category
1765            LayoutElementType::FigureTitle
1766            | LayoutElementType::TableTitle
1767            | LayoutElementType::ChartTitle
1768            | LayoutElementType::FigureTableChartTitle => "caption",
1769
1770            // Header category
1771            LayoutElementType::Header | LayoutElementType::HeaderImage => "header",
1772
1773            // Footer category
1774            LayoutElementType::Footer
1775            | LayoutElementType::FooterImage
1776            | LayoutElementType::Footnote => "footer",
1777
1778            // Formula category
1779            LayoutElementType::Formula | LayoutElementType::FormulaNumber => "formula",
1780
1781            // Special category
1782            LayoutElementType::Seal
1783            | LayoutElementType::Number
1784            | LayoutElementType::Reference
1785            | LayoutElementType::ReferenceContent
1786            | LayoutElementType::Algorithm
1787            | LayoutElementType::AsideText => "special",
1788
1789            // List category
1790            LayoutElementType::List => "list",
1791
1792            // Region category (PP-DocBlockLayout)
1793            LayoutElementType::Region => "region",
1794
1795            // Other
1796            LayoutElementType::Other => "other",
1797        }
1798    }
1799
1800    /// Returns whether this element type is a title variant.
1801    pub fn is_title(&self) -> bool {
1802        matches!(
1803            self,
1804            LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle
1805        )
1806    }
1807
1808    /// Returns whether this element type is a visual element (image, chart, figure).
1809    pub fn is_visual(&self) -> bool {
1810        matches!(self, LayoutElementType::Image | LayoutElementType::Chart)
1811    }
1812
1813    /// Returns whether this element type is a caption variant.
1814    pub fn is_caption(&self) -> bool {
1815        matches!(
1816            self,
1817            LayoutElementType::FigureTitle
1818                | LayoutElementType::TableTitle
1819                | LayoutElementType::ChartTitle
1820                | LayoutElementType::FigureTableChartTitle
1821        )
1822    }
1823
1824    /// Returns whether this element type is a header variant.
1825    pub fn is_header(&self) -> bool {
1826        matches!(
1827            self,
1828            LayoutElementType::Header | LayoutElementType::HeaderImage
1829        )
1830    }
1831
1832    /// Returns whether this element type is a footer variant.
1833    pub fn is_footer(&self) -> bool {
1834        matches!(
1835            self,
1836            LayoutElementType::Footer
1837                | LayoutElementType::FooterImage
1838                | LayoutElementType::Footnote
1839        )
1840    }
1841
1842    /// Returns whether this element type is a formula variant.
1843    pub fn is_formula(&self) -> bool {
1844        matches!(
1845            self,
1846            LayoutElementType::Formula | LayoutElementType::FormulaNumber
1847        )
1848    }
1849
1850    /// Returns whether this element type contains text content that should be OCR'd.
1851    pub fn should_ocr(&self) -> bool {
1852        matches!(
1853            self,
1854            LayoutElementType::Text
1855                | LayoutElementType::Content
1856                | LayoutElementType::Abstract
1857                | LayoutElementType::DocTitle
1858                | LayoutElementType::ParagraphTitle
1859                | LayoutElementType::FigureTitle
1860                | LayoutElementType::TableTitle
1861                | LayoutElementType::ChartTitle
1862                | LayoutElementType::FigureTableChartTitle
1863                | LayoutElementType::Header
1864                | LayoutElementType::HeaderImage
1865                | LayoutElementType::Footer
1866                | LayoutElementType::FooterImage
1867                | LayoutElementType::Footnote
1868                | LayoutElementType::Reference
1869                | LayoutElementType::ReferenceContent
1870                | LayoutElementType::Algorithm
1871                | LayoutElementType::AsideText
1872                | LayoutElementType::List
1873                | LayoutElementType::Number
1874        )
1875    }
1876}
1877
1878/// Removes heavily-overlapping layout elements in-place.
1879///
1880/// This mirrors PP-Structure-style overlap suppression where text takes priority over images.
1881/// Returns the number of elements removed.
1882pub fn remove_overlapping_layout_elements(
1883    layout_elements: &mut Vec<LayoutElement>,
1884    overlap_threshold: f32,
1885) -> usize {
1886    use std::collections::HashSet;
1887
1888    if layout_elements.len() <= 1 {
1889        return 0;
1890    }
1891
1892    let bboxes: Vec<_> = layout_elements.iter().map(|e| e.bbox.clone()).collect();
1893    let labels: Vec<&str> = layout_elements
1894        .iter()
1895        .map(|e| e.element_type.as_str())
1896        .collect();
1897
1898    let remove_indices =
1899        crate::processors::get_overlap_removal_indices(&bboxes, &labels, overlap_threshold);
1900    if remove_indices.is_empty() {
1901        return 0;
1902    }
1903
1904    let remove_set: HashSet<usize> = remove_indices.into_iter().collect();
1905    let before = layout_elements.len();
1906
1907    let mut idx = 0;
1908    layout_elements.retain(|_| {
1909        let keep = !remove_set.contains(&idx);
1910        idx += 1;
1911        keep
1912    });
1913
1914    before.saturating_sub(layout_elements.len())
1915}
1916
1917/// Applies small, PP-Structure-style label fixes to layout elements.
1918///
1919/// This is intended to capture lightweight "glue" heuristics that shouldn't live in `predict`.
1920pub fn apply_standardized_layout_label_fixes(layout_elements: &mut [LayoutElement]) {
1921    if layout_elements.is_empty() {
1922        return;
1923    }
1924
1925    let mut footnote_indices: Vec<usize> = Vec::new();
1926    let mut paragraph_title_indices: Vec<usize> = Vec::new();
1927    let mut bottom_text_y_max: f32 = 0.0;
1928    let mut max_block_area: f32 = 0.0;
1929    let mut doc_title_num: usize = 0;
1930
1931    for (idx, elem) in layout_elements.iter().enumerate() {
1932        let area =
1933            (elem.bbox.x_max() - elem.bbox.x_min()) * (elem.bbox.y_max() - elem.bbox.y_min());
1934        max_block_area = max_block_area.max(area);
1935
1936        match elem.element_type {
1937            LayoutElementType::Footnote => footnote_indices.push(idx),
1938            LayoutElementType::ParagraphTitle => paragraph_title_indices.push(idx),
1939            LayoutElementType::Text => {
1940                bottom_text_y_max = bottom_text_y_max.max(elem.bbox.y_max());
1941            }
1942            LayoutElementType::DocTitle => doc_title_num += 1,
1943            _ => {}
1944        }
1945    }
1946
1947    for idx in footnote_indices {
1948        if layout_elements[idx].bbox.y_max() < bottom_text_y_max {
1949            layout_elements[idx].element_type = LayoutElementType::Text;
1950            layout_elements[idx].label = Some("text".to_string());
1951        }
1952    }
1953
1954    let only_one_paragraph_title = paragraph_title_indices.len() == 1 && doc_title_num == 0;
1955    if only_one_paragraph_title {
1956        let idx = paragraph_title_indices[0];
1957        let area = (layout_elements[idx].bbox.x_max() - layout_elements[idx].bbox.x_min())
1958            * (layout_elements[idx].bbox.y_max() - layout_elements[idx].bbox.y_min());
1959
1960        let title_area_ratio_threshold = 0.3f32;
1961        if area > max_block_area * title_area_ratio_threshold {
1962            layout_elements[idx].element_type = LayoutElementType::DocTitle;
1963            layout_elements[idx].label = Some("doc_title".to_string());
1964        }
1965    }
1966}
1967
1968/// Result of table recognition.
1969#[derive(Debug, Clone, Serialize, Deserialize)]
1970pub struct TableResult {
1971    /// Bounding box of the table in the original image
1972    pub bbox: BoundingBox,
1973    /// Table type (wired or wireless)
1974    pub table_type: TableType,
1975    /// Confidence score for table type classification (None if classifier wasn't configured/run)
1976    pub classification_confidence: Option<f32>,
1977    /// Confidence score for table structure recognition (None if structure recognition failed)
1978    pub structure_confidence: Option<f32>,
1979    /// Detected table cells
1980    pub cells: Vec<TableCell>,
1981    /// HTML structure of the table (if available)
1982    pub html_structure: Option<String>,
1983    /// OCR text content for each cell (if OCR was integrated)
1984    pub cell_texts: Option<Vec<Option<String>>>,
1985    /// Structure tokens from table structure recognition (used for HTML generation after stitching)
1986    #[serde(skip)]
1987    pub structure_tokens: Option<Vec<String>>,
1988}
1989
1990impl TableResult {
1991    /// Creates a new table result.
1992    pub fn new(bbox: BoundingBox, table_type: TableType) -> Self {
1993        Self {
1994            bbox,
1995            table_type,
1996            classification_confidence: None,
1997            structure_confidence: None,
1998            cells: Vec::new(),
1999            html_structure: None,
2000            cell_texts: None,
2001            structure_tokens: None,
2002        }
2003    }
2004
2005    /// Sets the classification confidence.
2006    pub fn with_classification_confidence(mut self, confidence: f32) -> Self {
2007        self.classification_confidence = Some(confidence);
2008        self
2009    }
2010
2011    /// Sets the structure recognition confidence.
2012    pub fn with_structure_confidence(mut self, confidence: f32) -> Self {
2013        self.structure_confidence = Some(confidence);
2014        self
2015    }
2016
2017    /// Sets the table cells.
2018    pub fn with_cells(mut self, cells: Vec<TableCell>) -> Self {
2019        self.cells = cells;
2020        self
2021    }
2022
2023    /// Sets the HTML structure.
2024    pub fn with_html_structure(mut self, html: impl Into<String>) -> Self {
2025        self.html_structure = Some(html.into());
2026        self
2027    }
2028
2029    /// Sets the cell texts from OCR.
2030    pub fn with_cell_texts(mut self, texts: Vec<Option<String>>) -> Self {
2031        self.cell_texts = Some(texts);
2032        self
2033    }
2034
2035    /// Sets the structure tokens for later HTML generation.
2036    pub fn with_structure_tokens(mut self, tokens: Vec<String>) -> Self {
2037        self.structure_tokens = Some(tokens);
2038        self
2039    }
2040
2041    /// Returns the best available confidence score for this table.
2042    ///
2043    /// This method provides a unified confidence API for callers who want to filter
2044    /// tables by confidence without caring whether classification or structure
2045    /// recognition was used. Priority:
2046    /// 1. If both classification and structure confidence are available, returns
2047    ///    the minimum (most conservative estimate)
2048    /// 2. If only structure confidence is available (common when classifier isn't
2049    ///    configured), returns that
2050    /// 3. If only classification confidence is available, returns that
2051    /// 4. Returns `None` only if neither confidence is available (stub result)
2052    pub fn confidence(&self) -> Option<f32> {
2053        match (self.classification_confidence, self.structure_confidence) {
2054            (Some(cls), Some(str)) => Some(cls.min(str)),
2055            (None, Some(str)) => Some(str),
2056            (Some(cls), None) => Some(cls),
2057            (None, None) => None,
2058        }
2059    }
2060
2061    /// Returns true if this table has valid structure data.
2062    ///
2063    /// A table is considered valid if it has either cells or an HTML structure.
2064    /// Stub results (created when structure recognition fails) will return false.
2065    pub fn has_structure(&self) -> bool {
2066        !self.cells.is_empty() || self.html_structure.is_some()
2067    }
2068}
2069
2070/// Type of table.
2071#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
2072pub enum TableType {
2073    /// Table with visible borders
2074    Wired,
2075    /// Table without visible borders
2076    Wireless,
2077    /// Unknown table type
2078    Unknown,
2079}
2080
2081/// A cell in a table.
2082#[derive(Debug, Clone, Serialize, Deserialize)]
2083pub struct TableCell {
2084    /// Bounding box of the cell
2085    pub bbox: BoundingBox,
2086    /// Row index (0-based)
2087    pub row: Option<usize>,
2088    /// Column index (0-based)
2089    pub col: Option<usize>,
2090    /// Row span
2091    pub row_span: Option<usize>,
2092    /// Column span
2093    pub col_span: Option<usize>,
2094    /// Confidence score for the cell detection
2095    pub confidence: f32,
2096    /// Text content of the cell (if available)
2097    pub text: Option<String>,
2098}
2099
2100impl TableCell {
2101    /// Creates a new table cell.
2102    pub fn new(bbox: BoundingBox, confidence: f32) -> Self {
2103        Self {
2104            bbox,
2105            row: None,
2106            col: None,
2107            row_span: None,
2108            col_span: None,
2109            confidence,
2110            text: None,
2111        }
2112    }
2113
2114    /// Sets the row and column indices.
2115    pub fn with_position(mut self, row: usize, col: usize) -> Self {
2116        self.row = Some(row);
2117        self.col = Some(col);
2118        self
2119    }
2120
2121    /// Sets the row and column spans.
2122    pub fn with_span(mut self, row_span: usize, col_span: usize) -> Self {
2123        self.row_span = Some(row_span);
2124        self.col_span = Some(col_span);
2125        self
2126    }
2127
2128    /// Sets the text content.
2129    pub fn with_text(mut self, text: impl Into<String>) -> Self {
2130        self.text = Some(text.into());
2131        self
2132    }
2133}
2134
2135/// Result of formula recognition.
2136#[derive(Debug, Clone, Serialize, Deserialize)]
2137pub struct FormulaResult {
2138    /// Bounding box of the formula in the original image
2139    pub bbox: BoundingBox,
2140    /// LaTeX representation of the formula
2141    pub latex: String,
2142    /// Confidence score for the recognition
2143    pub confidence: f32,
2144}
2145
2146impl FormulaResult {
2147    /// Creates a new formula result.
2148    pub fn new(bbox: BoundingBox, latex: impl Into<String>, confidence: f32) -> Self {
2149        Self {
2150            bbox,
2151            latex: latex.into(),
2152            confidence,
2153        }
2154    }
2155}
2156
#[cfg(test)]
mod tests {
    use super::*;

    /// A new `StructureResult` keeps its path and index and starts with no content.
    #[test]
    fn test_structure_result_creation() {
        let fresh = StructureResult::new("test.jpg", 0);

        assert_eq!(fresh.input_path.as_ref(), "test.jpg");
        assert_eq!(fresh.index, 0);

        assert!(fresh.layout_elements.is_empty());
        assert!(fresh.tables.is_empty());
        assert!(fresh.formulas.is_empty());
        assert!(fresh.text_regions.is_none());
    }

    /// Spot-checks the canonical labels of a few element types.
    #[test]
    fn test_layout_element_type_as_str() {
        let expectations = [
            (LayoutElementType::Text, "text"),
            (LayoutElementType::Table, "table"),
            (LayoutElementType::Formula, "formula"),
        ];

        for (element_type, expected_label) in expectations {
            assert_eq!(element_type.as_str(), expected_label);
        }
    }

    /// A new `TableResult` keeps its type and has neither cells nor HTML.
    #[test]
    fn test_table_result_creation() {
        let table = TableResult::new(
            BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0),
            TableType::Wired,
        );

        assert_eq!(table.table_type, TableType::Wired);
        assert!(table.cells.is_empty());
        assert!(table.html_structure.is_none());
    }

    /// A title plus a paragraph show up in both the Markdown and the HTML export.
    #[test]
    fn test_structure_result_export() {
        let bbox = BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0);

        let elements = vec![
            LayoutElement::new(bbox.clone(), LayoutElementType::DocTitle, 1.0)
                .with_text("Test Document"),
            LayoutElement::new(bbox, LayoutElementType::Text, 1.0).with_text("Hello world"),
        ];
        let result = StructureResult::new("test.jpg", 0).with_layout_elements(elements);

        let markdown = result.to_markdown();
        assert!(markdown.contains("# Test Document"));
        assert!(markdown.contains("Hello world"));

        let html = result.to_html();
        assert!(html.contains("<h1>Test Document</h1>"));
        assert!(html.contains("<p>Hello world</p>"));
    }
}
// Source file: oar_ocr_core/domain/structure.rs