oar_ocr_core/domain/
structure.rs

1//! Document structure analysis result types.
2//!
3//! This module defines the result types for document structure analysis,
4//! including layout detection, table recognition, and formula recognition.
5
6use super::text_region::TextRegion;
7use crate::processors::BoundingBox;
8use image::RgbImage;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::path::Path;
13use std::sync::Arc;
14
15/// Title numbering pattern for detecting section numbers like 1, 1.2, 1.2.3, (1), 一、etc.
16/// This follows standard title numbering pattern.
17static TITLE_NUMBERING_REGEX: Lazy<Regex> = Lazy::new(|| {
18    Regex::new(
19        r"(?x)
20        ^\s*
21        (
22            # Arabic numerals: 1, 1.2, 1.2.3, etc.
23            [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?
24            |
25            # Parenthesized Arabic numerals: (1), (1.2), etc.
26            [((][1-9][0-9]*(?:\.[1-9][0-9]*)*[))]
27            |
28            # Chinese numerals with punctuation: 一、 二、
29            [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾][、.]?
30            |
31            # Parenthesized Chinese numerals: (一)
32            [((][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[))]
33            |
34            # Roman numerals with delimiter (period or followed by space)
35            (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b)
36        )
37        (\s+)
38        (.*)
39        $
40    ",
41    )
42    .expect("Invalid title numbering regex")
43});
44
45/// Format a paragraph title with automatic level detection based on numbering.
46///
47/// Following standard title formatting logic:
48/// - Extracts numbering prefix (1.2.3, etc.)
49/// - Determines heading level from number of dots (1.2.3 -> level 3)
50/// - Returns (level, formatted_title) where level is 1-based
51///
52/// # Examples
53///
54/// - "1 Introduction" -> (1, "1 Introduction")
55/// - "1.2 Methods" -> (2, "1.2 Methods")
56/// - "1.2.3 Details" -> (3, "1.2.3 Details")
57/// - "一、绪论" -> (1, "一、绪论")
58/// - "Just text" -> (2, "Just text") (default level 2 for no numbering)
59fn format_title_with_level(title: &str) -> (usize, String) {
60    // Clean up line breaks
61    let cleaned = title.replace("-\n", "").replace('\n', " ");
62
63    if let Some(captures) = TITLE_NUMBERING_REGEX.captures(&cleaned) {
64        let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or("");
65        let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or("");
66
67        // Determine level from dots in numbering
68        // 1 -> level 1, 1.2 -> level 2, 1.2.3 -> level 3
69        let level = if numbering.contains('.') {
70            numbering.matches('.').count() + 1
71        } else {
72            1
73        };
74
75        // Reconstruct title: numbering + space + content
76        let formatted = if title_content.is_empty() {
77            numbering.trim_end_matches('.').to_string()
78        } else {
79            format!(
80                "{} {}",
81                numbering.trim_end_matches('.'),
82                title_content.trim_start()
83            )
84        };
85
86        // Clamp level to reasonable range (1-6 for markdown)
87        let level = level.clamp(1, 6);
88
89        (level, formatted)
90    } else {
91        // No numbering detected, default to level 2 (## heading)
92        (2, cleaned)
93    }
94}
95
96/// A detected document region block (from PP-DocBlockLayout).
97///
98/// Region blocks represent hierarchical groupings of layout elements,
99/// typically columns or logical sections of a document. They are used
100/// for hierarchical reading order determination.
101///
102/// # PP-StructureV3 Alignment
103///
104/// PP-DocBlockLayout detects "region" type blocks that group related
105/// layout elements together. Elements within the same region should
106/// be read together before moving to the next region.
107#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct RegionBlock {
109    /// Bounding box of the region
110    pub bbox: BoundingBox,
111    /// Confidence score of the detection
112    pub confidence: f32,
113    /// Index of this region in the reading order
114    pub order_index: Option<u32>,
115    /// Indices of layout elements that belong to this region
116    pub element_indices: Vec<usize>,
117}
118
119/// Result of document structure analysis.
120///
121/// This struct contains all the results from analyzing a document's structure,
122/// including layout elements, tables, formulas, and OCR results.
123///
124/// # Coordinate System
125///
126/// The coordinate system of bounding boxes depends on which preprocessing was applied:
127///
128/// - **No preprocessing**: Boxes are in the original input image's coordinate system.
129///
130/// - **Orientation correction only** (`orientation_angle` set, `rectified_img` is None):
131///   Boxes are transformed back to the original input image's coordinate system.
132///
133/// - **Rectification applied** (`rectified_img` is Some):
134///   Boxes remain in the **rectified image's coordinate system**. Neural network-based
135///   rectification (UVDoc) warps cannot be precisely inverted, so use `rectified_img`
136///   for visualization instead of the original image.
137///
138/// - **Both orientation and rectification**: Boxes are in the rectified coordinate system
139///   (rectification takes precedence since it's applied after orientation correction).
140#[derive(Debug, Clone, Serialize, Deserialize)]
141pub struct StructureResult {
142    /// Path to the input image file
143    pub input_path: Arc<str>,
144    /// Index of the image in a batch (0 for single image processing)
145    pub index: usize,
146    /// Detected layout elements (text regions, tables, figures, etc.)
147    pub layout_elements: Vec<LayoutElement>,
148    /// Recognized tables with their structure and content
149    pub tables: Vec<TableResult>,
150    /// Recognized mathematical formulas
151    pub formulas: Vec<FormulaResult>,
152    /// OCR text regions (if OCR was integrated)
153    pub text_regions: Option<Vec<TextRegion>>,
154    /// Document orientation angle (if orientation correction was used)
155    pub orientation_angle: Option<f32>,
156    /// Detected region blocks for hierarchical ordering (PP-DocBlockLayout)
157    /// When present, layout_elements are already sorted by region hierarchy
158    pub region_blocks: Option<Vec<RegionBlock>>,
159    /// Rectified image (if document rectification was used)
160    /// Note: Bounding boxes are already transformed back to original coordinates for rotation,
161    /// but for rectification (UVDoc), boxes are in the rectified image's coordinate system.
162    /// Use this image for visualization when rectification was applied.
163    #[serde(skip)]
164    pub rectified_img: Option<Arc<RgbImage>>,
165}
166
167impl StructureResult {
168    /// Creates a new structure result.
169    pub fn new(input_path: impl Into<Arc<str>>, index: usize) -> Self {
170        Self {
171            input_path: input_path.into(),
172            index,
173            layout_elements: Vec::new(),
174            tables: Vec::new(),
175            formulas: Vec::new(),
176            text_regions: None,
177            orientation_angle: None,
178            region_blocks: None,
179            rectified_img: None,
180        }
181    }
182
183    /// Adds layout elements to the result.
184    pub fn with_layout_elements(mut self, elements: Vec<LayoutElement>) -> Self {
185        self.layout_elements = elements;
186        self
187    }
188
189    /// Adds tables to the result.
190    pub fn with_tables(mut self, tables: Vec<TableResult>) -> Self {
191        self.tables = tables;
192        self
193    }
194
195    /// Adds formulas to the result.
196    pub fn with_formulas(mut self, formulas: Vec<FormulaResult>) -> Self {
197        self.formulas = formulas;
198        self
199    }
200
201    /// Adds OCR text regions to the result.
202    pub fn with_text_regions(mut self, regions: Vec<TextRegion>) -> Self {
203        self.text_regions = Some(regions);
204        self
205    }
206
207    /// Adds region blocks to the result (PP-DocBlockLayout).
208    ///
209    /// Region blocks represent hierarchical groupings of layout elements.
210    /// When set, layout_elements should already be sorted by region hierarchy.
211    pub fn with_region_blocks(mut self, blocks: Vec<RegionBlock>) -> Self {
212        self.region_blocks = Some(blocks);
213        self
214    }
215
216    /// Converts the result to a Markdown string.
217    ///
218    /// Follows PP-StructureV3's formatting rules:
219    /// - DocTitle: `# title`
220    /// - ParagraphTitle: Auto-detect numbering (1.2.3 -> ###)
221    /// - Formula: `$$latex$$`
222    /// - Table: HTML with border
223    /// - Images: `![Figure](caption)`
224    ///
225    /// Note: Low-confidence text elements that overlap with table regions are filtered out
226    /// to avoid duplicate content from table OCR.
227    pub fn to_markdown(&self) -> String {
228        // Collect table bboxes for overlap filtering
229        let table_bboxes: Vec<&BoundingBox> = self
230            .layout_elements
231            .iter()
232            .filter(|e| e.element_type == LayoutElementType::Table)
233            .map(|e| &e.bbox)
234            .collect();
235
236        let mut md = String::new();
237        for element in &self.layout_elements {
238            // PP-StructureV3 markdown ignores auxiliary labels.
239            if matches!(
240                element.element_type,
241                LayoutElementType::Number
242                    | LayoutElementType::Footnote
243                    | LayoutElementType::Header
244                    | LayoutElementType::HeaderImage
245                    | LayoutElementType::Footer
246                    | LayoutElementType::FooterImage
247                    | LayoutElementType::AsideText
248            ) {
249                continue;
250            }
251
252            // Filter out low-confidence text elements that overlap with tables
253            // These are typically OCR artifacts from table cell text that shouldn't be
254            // output separately in markdown
255            if element.element_type == LayoutElementType::Text {
256                let overlaps_table = table_bboxes.iter().any(|table_bbox| {
257                    element.bbox.ioa(table_bbox) > 0.3 // >30% of text is inside table
258                });
259
260                // Skip low-confidence text that overlaps with table regions
261                // Standard logic filters these in the stitching phase
262                if overlaps_table && element.confidence < 0.7 {
263                    continue;
264                }
265            }
266
267            match element.element_type {
268                // Document title
269                LayoutElementType::DocTitle => {
270                    md.push_str("\n# ");
271                    if let Some(text) = &element.text {
272                        md.push_str(&text.replace("-\n", "").replace('\n', " "));
273                    }
274                    md.push_str("\n\n");
275                }
276                // Paragraph/section title - auto-detect numbering for level
277                LayoutElementType::ParagraphTitle => {
278                    if let Some(text) = &element.text {
279                        let (level, formatted_title) = format_title_with_level(text);
280                        md.push('\n');
281                        for _ in 0..level {
282                            md.push('#');
283                        }
284                        md.push(' ');
285                        md.push_str(&formatted_title);
286                        md.push_str("\n\n");
287                    } else {
288                        md.push_str("\n## \n\n");
289                    }
290                }
291                // Table - preserve HTML structure with border
292                LayoutElementType::Table => {
293                    if let Some(table) =
294                        self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
295                    {
296                        if let Some(html) = &table.html_structure {
297                            // Add border to table for better visibility
298                            let table_with_border = html.replace("<table>", "<table border=\"1\">");
299                            md.push('\n');
300                            md.push_str(&table_with_border);
301                            md.push_str("\n\n");
302                        } else {
303                            md.push_str("\n[Table]\n\n");
304                        }
305                    } else {
306                        md.push_str("\n[Table]\n\n");
307                    }
308                }
309                // Formula - wrap with $$
310                LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
311                    md.push_str("\n$$");
312                    if let Some(latex) = &element.text {
313                        md.push_str(latex);
314                    }
315                    md.push_str("$$\n\n");
316                }
317                // Image/Chart - figure format
318                LayoutElementType::Image | LayoutElementType::Chart => {
319                    md.push_str("\n![Figure]");
320                    if let Some(caption) = &element.text {
321                        md.push('(');
322                        md.push_str(caption);
323                        md.push(')');
324                    }
325                    md.push_str("\n\n");
326                }
327                // Seal - show as image with text
328                LayoutElementType::Seal => {
329                    md.push_str("\n![Seal]");
330                    if let Some(text) = &element.text {
331                        md.push_str("\n> ");
332                        md.push_str(text);
333                    }
334                    md.push_str("\n\n");
335                }
336                // Captions
337                _ if element.element_type.is_caption() => {
338                    if let Some(text) = &element.text {
339                        md.push('*');
340                        md.push_str(text);
341                        md.push_str("*\n\n");
342                    }
343                }
344                // Abstract
345                LayoutElementType::Abstract => {
346                    md.push_str("\n**Abstract**\n\n");
347                    if let Some(text) = &element.text {
348                        md.push_str(text);
349                        md.push_str("\n\n");
350                    }
351                }
352                // Reference
353                LayoutElementType::Reference => {
354                    md.push_str("\n**References**\n\n");
355                    if let Some(text) = &element.text {
356                        md.push_str(text);
357                        md.push_str("\n\n");
358                    }
359                }
360                // List
361                LayoutElementType::List => {
362                    if let Some(text) = &element.text {
363                        // Split by newlines and format as list items
364                        for line in text.lines() {
365                            md.push_str("- ");
366                            md.push_str(line);
367                            md.push('\n');
368                        }
369                        md.push('\n');
370                    }
371                }
372                // Header/Footer - smaller text
373                _ if element.element_type.is_header() || element.element_type.is_footer() => {
374                    if let Some(text) = &element.text {
375                        md.push_str("<small>");
376                        md.push_str(text);
377                        md.push_str("</small>\n\n");
378                    }
379                }
380                // Default text elements
381                _ => {
382                    if let Some(text) = &element.text {
383                        // Convert double newlines to paragraph breaks
384                        let formatted = text.replace("\n\n", "\n").replace('\n', "\n\n");
385                        md.push_str(&formatted);
386                        md.push_str("\n\n");
387                    }
388                }
389            }
390        }
391        md.trim().to_string()
392    }
393
394    /// Converts the result to an HTML string.
395    ///
396    /// Follows PP-StructureV3's formatting rules with semantic HTML tags.
397    pub fn to_html(&self) -> String {
398        let mut html = String::from(
399            "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n",
400        );
401
402        for element in &self.layout_elements {
403            match element.element_type {
404                // Document title
405                LayoutElementType::DocTitle => {
406                    html.push_str("<h1>");
407                    if let Some(text) = &element.text {
408                        html.push_str(&Self::escape_html(text));
409                    }
410                    html.push_str("</h1>\n");
411                }
412                // Paragraph/section title
413                LayoutElementType::ParagraphTitle => {
414                    html.push_str("<h2>");
415                    if let Some(text) = &element.text {
416                        html.push_str(&Self::escape_html(text));
417                    }
418                    html.push_str("</h2>\n");
419                }
420                // Table - embed HTML structure
421                LayoutElementType::Table => {
422                    if let Some(table) =
423                        self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
424                    {
425                        if let Some(table_html) = &table.html_structure {
426                            // Add border styling
427                            let styled = table_html.replace(
428                                "<table>",
429                                "<table border=\"1\" style=\"border-collapse: collapse;\">",
430                            );
431                            html.push_str(&styled);
432                            html.push('\n');
433                        } else {
434                            html.push_str("<p>[Table]</p>\n");
435                        }
436                    } else {
437                        html.push_str("<p>[Table]</p>\n");
438                    }
439                }
440                // Formula - use math tags
441                LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
442                    html.push_str("<p class=\"formula\">$$");
443                    if let Some(latex) = &element.text {
444                        html.push_str(&Self::escape_html(latex));
445                    }
446                    html.push_str("$$</p>\n");
447                }
448                // Image/Chart
449                LayoutElementType::Image | LayoutElementType::Chart => {
450                    html.push_str("<figure>\n<img alt=\"Figure\" />\n");
451                    if let Some(caption) = &element.text {
452                        html.push_str("<figcaption>");
453                        html.push_str(&Self::escape_html(caption));
454                        html.push_str("</figcaption>\n");
455                    }
456                    html.push_str("</figure>\n");
457                }
458                // Seal
459                LayoutElementType::Seal => {
460                    html.push_str("<figure class=\"seal\">\n<img alt=\"Seal\" />\n");
461                    if let Some(text) = &element.text {
462                        html.push_str("<figcaption>");
463                        html.push_str(&Self::escape_html(text));
464                        html.push_str("</figcaption>\n");
465                    }
466                    html.push_str("</figure>\n");
467                }
468                // Captions
469                _ if element.element_type.is_caption() => {
470                    if let Some(text) = &element.text {
471                        html.push_str("<figcaption>");
472                        html.push_str(&Self::escape_html(text));
473                        html.push_str("</figcaption>\n");
474                    }
475                }
476                // Abstract
477                LayoutElementType::Abstract => {
478                    html.push_str("<section class=\"abstract\">\n<h3>Abstract</h3>\n<p>");
479                    if let Some(text) = &element.text {
480                        html.push_str(&Self::escape_html(text));
481                    }
482                    html.push_str("</p>\n</section>\n");
483                }
484                // Reference
485                LayoutElementType::Reference | LayoutElementType::ReferenceContent => {
486                    html.push_str("<section class=\"references\">\n<p>");
487                    if let Some(text) = &element.text {
488                        html.push_str(&Self::escape_html(text));
489                    }
490                    html.push_str("</p>\n</section>\n");
491                }
492                // List
493                LayoutElementType::List => {
494                    html.push_str("<ul>\n");
495                    if let Some(text) = &element.text {
496                        for line in text.lines() {
497                            html.push_str("<li>");
498                            html.push_str(&Self::escape_html(line));
499                            html.push_str("</li>\n");
500                        }
501                    }
502                    html.push_str("</ul>\n");
503                }
504                // Header
505                _ if element.element_type.is_header() => {
506                    html.push_str("<header>");
507                    if let Some(text) = &element.text {
508                        html.push_str(&Self::escape_html(text));
509                    }
510                    html.push_str("</header>\n");
511                }
512                // Footer
513                _ if element.element_type.is_footer() => {
514                    html.push_str("<footer>");
515                    if let Some(text) = &element.text {
516                        html.push_str(&Self::escape_html(text));
517                    }
518                    html.push_str("</footer>\n");
519                }
520                // Default text
521                _ => {
522                    if let Some(text) = &element.text {
523                        html.push_str("<p>");
524                        html.push_str(&Self::escape_html(text));
525                        html.push_str("</p>\n");
526                    }
527                }
528            }
529        }
530        html.push_str("</body>\n</html>");
531        html
532    }
533
534    /// Escapes HTML special characters.
535    fn escape_html(text: &str) -> String {
536        text.replace('&', "&amp;")
537            .replace('<', "&lt;")
538            .replace('>', "&gt;")
539            .replace('"', "&quot;")
540            .replace('\'', "&#39;")
541    }
542
543    /// Converts the result to a JSON Value.
544    pub fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
545        serde_json::to_value(self)
546    }
547
548    /// Saves the analysis results to the specified directory.
549    ///
550    /// This generates:
551    /// - `*_res.json`: The full structured result
552    /// - `*_res.md`: A Markdown representation
553    /// - `*_res.html`: An HTML representation
554    ///
555    /// # Arguments
556    ///
557    /// * `to_html` - If true, save an HTML representation.
558    pub fn save_results(
559        &self,
560        output_dir: impl AsRef<Path>,
561        to_json: bool,
562        to_markdown: bool,
563        to_html: bool,
564    ) -> std::io::Result<()> {
565        let output_dir = output_dir.as_ref();
566        if !output_dir.exists() {
567            std::fs::create_dir_all(output_dir)?;
568        }
569
570        let input_path = Path::new(self.input_path.as_ref());
571        let stem = input_path
572            .file_stem()
573            .and_then(|s| s.to_str())
574            .unwrap_or("result");
575
576        // Save JSON
577        if to_json {
578            let json_path = output_dir.join(format!("{}.json", stem));
579            let json_file = std::fs::File::create(json_path)?;
580            serde_json::to_writer_pretty(json_file, self)?;
581        }
582
583        // Save Markdown
584        if to_markdown {
585            let md_path = output_dir.join(format!("{}.md", stem));
586            std::fs::write(md_path, self.to_markdown())?;
587        }
588
589        // Save HTML
590        if to_html {
591            let html_path = output_dir.join(format!("{}.html", stem));
592            std::fs::write(html_path, self.to_html())?;
593        }
594
595        Ok(())
596    }
597}
598
599/// A layout element detected in the document.
600#[derive(Debug, Clone, Serialize, Deserialize)]
601pub struct LayoutElement {
602    /// Bounding box of the element
603    pub bbox: BoundingBox,
604    /// Type of the layout element
605    pub element_type: LayoutElementType,
606    /// Confidence score for the detection
607    pub confidence: f32,
608    /// Optional label for the element (original model label)
609    pub label: Option<String>,
610    /// Optional text content for the element
611    pub text: Option<String>,
612    /// Reading order index (1-based, assigned during stitching)
613    ///
614    /// This index represents the element's position in the reading order.
615    /// Only elements that should be included in reading flow (text, tables,
616    /// formulas, images, etc.) will have an order index assigned.
617    /// Headers, footers, and other auxiliary elements may have `None`.
618    pub order_index: Option<u32>,
619}
620
621impl LayoutElement {
622    /// Creates a new layout element.
623    pub fn new(bbox: BoundingBox, element_type: LayoutElementType, confidence: f32) -> Self {
624        Self {
625            bbox,
626            element_type,
627            confidence,
628            label: None,
629            text: None,
630            order_index: None,
631        }
632    }
633
634    /// Sets the label for the element.
635    pub fn with_label(mut self, label: impl Into<String>) -> Self {
636        self.label = Some(label.into());
637        self
638    }
639
640    /// Sets the text content for the element.
641    pub fn with_text(mut self, text: impl Into<String>) -> Self {
642        self.text = Some(text.into());
643        self
644    }
645}
646
647/// Layout element type supporting PP-StructureV3's full label set.
648///
649/// This enum represents both **semantic categories** and **fine-grained labels** for layout elements.
650/// PP-StructureV3 models output 20 or 23 class labels depending on the model variant.
651///
652/// The original model-specific label is preserved in `LayoutElement.label` field.
653///
654/// # PP-StructureV3 Label Categories
655///
656/// **Document structure:**
657/// - `DocTitle` - Document title (doc_title)
658/// - `ParagraphTitle` - Section/paragraph title (paragraph_title)
659/// - `Text` - General text content
660/// - `Content` - Table of contents (content)
661/// - `Abstract` - Abstract section
662///
663/// **Visual elements:**
664/// - `Image` - Images/figures (image, figure)
665/// - `Table` - Tables
666/// - `Chart` - Charts/graphs
667/// - `Formula` - Mathematical formulas
668///
669/// **Captions and titles:**
670/// - `FigureTitle` - Figure caption (figure_title)
671/// - `TableTitle` - Table caption (table_title)
672/// - `ChartTitle` - Chart caption (chart_title)
673/// - `FigureTableChartTitle` - Combined caption type
674///
675/// **Page structure:**
676/// - `Header` - Page header
677/// - `HeaderImage` - Header image
678/// - `Footer` - Page footer
679/// - `FooterImage` - Footer image
680/// - `Footnote` - Footnotes
681///
682/// **Special elements:**
683/// - `Seal` - Stamps/official seals
684/// - `Number` - Page numbers
685/// - `Reference` - References section
686/// - `ReferenceContent` - Reference content
687/// - `Algorithm` - Algorithm blocks
688/// - `FormulaNumber` - Formula numbers
689/// - `AsideText` - Marginal/aside text
690/// - `List` - List items
691///
692/// - `Other` - Unknown/unmapped labels
693#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
694pub enum LayoutElementType {
695    /// Document title
696    DocTitle,
697    /// Paragraph/section title
698    ParagraphTitle,
699    /// General text content
700    Text,
701    /// Table of contents
702    Content,
703    /// Abstract section
704    Abstract,
705
706    /// Image or figure
707    Image,
708    /// Table
709    Table,
710    /// Chart or graph
711    Chart,
712    /// Mathematical formula
713    Formula,
714
715    /// Figure caption/title
716    FigureTitle,
717    /// Table caption/title
718    TableTitle,
719    /// Chart caption/title
720    ChartTitle,
721    /// Combined figure/table/chart title (PP-DocLayout)
722    FigureTableChartTitle,
723
724    /// Page header
725    Header,
726    /// Header image
727    HeaderImage,
728    /// Page footer
729    Footer,
730    /// Footer image
731    FooterImage,
732    /// Footnote
733    Footnote,
734
735    /// Stamp or official seal
736    Seal,
737    /// Page number
738    Number,
739    /// Reference section
740    Reference,
741    /// Reference content (PP-DocLayout_plus-L)
742    ReferenceContent,
743    /// Algorithm block
744    Algorithm,
745    /// Formula number
746    FormulaNumber,
747    /// Marginal/aside text
748    AsideText,
749    /// List items
750    List,
751
752    /// Generic document region block (PP-DocBlockLayout)
753    /// Used for hierarchical layout ordering and block grouping
754    Region,
755
756    /// Other/unknown (original label preserved in LayoutElement.label)
757    Other,
758}
759
760impl LayoutElementType {
761    /// Returns the string representation of the element type.
762    ///
763    /// This returns the PP-StructureV3 compatible label string.
764    pub fn as_str(&self) -> &'static str {
765        match self {
766            // Document Structure
767            LayoutElementType::DocTitle => "doc_title",
768            LayoutElementType::ParagraphTitle => "paragraph_title",
769            LayoutElementType::Text => "text",
770            LayoutElementType::Content => "content",
771            LayoutElementType::Abstract => "abstract",
772
773            // Visual Elements
774            LayoutElementType::Image => "image",
775            LayoutElementType::Table => "table",
776            LayoutElementType::Chart => "chart",
777            LayoutElementType::Formula => "formula",
778
779            // Captions
780            LayoutElementType::FigureTitle => "figure_title",
781            LayoutElementType::TableTitle => "table_title",
782            LayoutElementType::ChartTitle => "chart_title",
783            LayoutElementType::FigureTableChartTitle => "figure_table_chart_title",
784
785            // Page Structure
786            LayoutElementType::Header => "header",
787            LayoutElementType::HeaderImage => "header_image",
788            LayoutElementType::Footer => "footer",
789            LayoutElementType::FooterImage => "footer_image",
790            LayoutElementType::Footnote => "footnote",
791
792            // Special Elements
793            LayoutElementType::Seal => "seal",
794            LayoutElementType::Number => "number",
795            LayoutElementType::Reference => "reference",
796            LayoutElementType::ReferenceContent => "reference_content",
797            LayoutElementType::Algorithm => "algorithm",
798            LayoutElementType::FormulaNumber => "formula_number",
799            LayoutElementType::AsideText => "aside_text",
800            LayoutElementType::List => "list",
801
802            // Region (PP-DocBlockLayout)
803            LayoutElementType::Region => "region",
804
805            // Fallback
806            LayoutElementType::Other => "other",
807        }
808    }
809
810    /// Creates a LayoutElementType from a string label with fine-grained mapping.
811    ///
812    /// This method maps model output labels to their corresponding fine-grained types,
813    /// preserving the full PP-StructureV3 label set (20/23 classes).
814    pub fn from_label(label: &str) -> Self {
815        match label.to_lowercase().as_str() {
816            // Document Structure
817            "doc_title" => LayoutElementType::DocTitle,
818            "paragraph_title" | "title" => LayoutElementType::ParagraphTitle,
819            "text" | "paragraph" => LayoutElementType::Text,
820            "content" => LayoutElementType::Content,
821            "abstract" => LayoutElementType::Abstract,
822
823            // Visual Elements
824            "image" | "figure" => LayoutElementType::Image,
825            "table" => LayoutElementType::Table,
826            "chart" | "flowchart" => LayoutElementType::Chart,
827            "formula" | "equation" | "display_formula" | "inline_formula" => {
828                LayoutElementType::Formula
829            }
830
831            // Captions
832            "figure_title" => LayoutElementType::FigureTitle,
833            "table_title" => LayoutElementType::TableTitle,
834            "chart_title" => LayoutElementType::ChartTitle,
835            "figure_table_chart_title" | "caption" => LayoutElementType::FigureTableChartTitle,
836
837            // Page Structure
838            "header" => LayoutElementType::Header,
839            "header_image" => LayoutElementType::HeaderImage,
840            "footer" => LayoutElementType::Footer,
841            "footer_image" => LayoutElementType::FooterImage,
842            "footnote" | "vision_footnote" => LayoutElementType::Footnote,
843
844            // Special Elements
845            "seal" => LayoutElementType::Seal,
846            "number" => LayoutElementType::Number,
847            "reference" => LayoutElementType::Reference,
848            "reference_content" => LayoutElementType::ReferenceContent,
849            "algorithm" => LayoutElementType::Algorithm,
850            "formula_number" => LayoutElementType::FormulaNumber,
851            "aside_text" => LayoutElementType::AsideText,
852            "list" => LayoutElementType::List,
853            "vertical_text" => LayoutElementType::Text,
854
855            // Region (PP-DocBlockLayout)
856            "region" => LayoutElementType::Region,
857
858            // Everything else maps to Other
859            // The original label is preserved in LayoutElement.label
860            _ => LayoutElementType::Other,
861        }
862    }
863
864    /// Returns the semantic category for this element type.
865    ///
866    /// This method groups fine-grained types into broader semantic categories,
867    /// useful for processing logic that doesn't need fine-grained distinctions.
868    ///
869    /// # Categories
870    ///
871    /// - **Title**: DocTitle, ParagraphTitle
872    /// - **Text**: Text, Content, Abstract
873    /// - **Visual**: Image, Chart
874    /// - **Table**: Table
875    /// - **Caption**: FigureTitle, TableTitle, ChartTitle, FigureTableChartTitle
876    /// - **Header**: Header, HeaderImage
877    /// - **Footer**: Footer, FooterImage, Footnote
878    /// - **Formula**: Formula, FormulaNumber
879    /// - **Special**: Seal, Number, Reference, ReferenceContent, Algorithm, AsideText
880    /// - **List**: List
881    /// - **Other**: Other
882    pub fn semantic_category(&self) -> &'static str {
883        match self {
884            // Title category
885            LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle => "title",
886
887            // Text category
888            LayoutElementType::Text | LayoutElementType::Content | LayoutElementType::Abstract => {
889                "text"
890            }
891
892            // Visual category
893            LayoutElementType::Image | LayoutElementType::Chart => "visual",
894
895            // Table category
896            LayoutElementType::Table => "table",
897
898            // Caption category
899            LayoutElementType::FigureTitle
900            | LayoutElementType::TableTitle
901            | LayoutElementType::ChartTitle
902            | LayoutElementType::FigureTableChartTitle => "caption",
903
904            // Header category
905            LayoutElementType::Header | LayoutElementType::HeaderImage => "header",
906
907            // Footer category
908            LayoutElementType::Footer
909            | LayoutElementType::FooterImage
910            | LayoutElementType::Footnote => "footer",
911
912            // Formula category
913            LayoutElementType::Formula | LayoutElementType::FormulaNumber => "formula",
914
915            // Special category
916            LayoutElementType::Seal
917            | LayoutElementType::Number
918            | LayoutElementType::Reference
919            | LayoutElementType::ReferenceContent
920            | LayoutElementType::Algorithm
921            | LayoutElementType::AsideText => "special",
922
923            // List category
924            LayoutElementType::List => "list",
925
926            // Region category (PP-DocBlockLayout)
927            LayoutElementType::Region => "region",
928
929            // Other
930            LayoutElementType::Other => "other",
931        }
932    }
933
934    /// Returns whether this element type is a title variant.
935    pub fn is_title(&self) -> bool {
936        matches!(
937            self,
938            LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle
939        )
940    }
941
942    /// Returns whether this element type is a visual element (image, chart, figure).
943    pub fn is_visual(&self) -> bool {
944        matches!(self, LayoutElementType::Image | LayoutElementType::Chart)
945    }
946
947    /// Returns whether this element type is a caption variant.
948    pub fn is_caption(&self) -> bool {
949        matches!(
950            self,
951            LayoutElementType::FigureTitle
952                | LayoutElementType::TableTitle
953                | LayoutElementType::ChartTitle
954                | LayoutElementType::FigureTableChartTitle
955        )
956    }
957
958    /// Returns whether this element type is a header variant.
959    pub fn is_header(&self) -> bool {
960        matches!(
961            self,
962            LayoutElementType::Header | LayoutElementType::HeaderImage
963        )
964    }
965
966    /// Returns whether this element type is a footer variant.
967    pub fn is_footer(&self) -> bool {
968        matches!(
969            self,
970            LayoutElementType::Footer
971                | LayoutElementType::FooterImage
972                | LayoutElementType::Footnote
973        )
974    }
975
976    /// Returns whether this element type is a formula variant.
977    pub fn is_formula(&self) -> bool {
978        matches!(
979            self,
980            LayoutElementType::Formula | LayoutElementType::FormulaNumber
981        )
982    }
983
984    /// Returns whether this element type contains text content that should be OCR'd.
985    pub fn should_ocr(&self) -> bool {
986        matches!(
987            self,
988            LayoutElementType::Text
989                | LayoutElementType::Content
990                | LayoutElementType::Abstract
991                | LayoutElementType::DocTitle
992                | LayoutElementType::ParagraphTitle
993                | LayoutElementType::FigureTitle
994                | LayoutElementType::TableTitle
995                | LayoutElementType::ChartTitle
996                | LayoutElementType::FigureTableChartTitle
997                | LayoutElementType::Header
998                | LayoutElementType::HeaderImage
999                | LayoutElementType::Footer
1000                | LayoutElementType::FooterImage
1001                | LayoutElementType::Footnote
1002                | LayoutElementType::Reference
1003                | LayoutElementType::ReferenceContent
1004                | LayoutElementType::Algorithm
1005                | LayoutElementType::AsideText
1006                | LayoutElementType::List
1007                | LayoutElementType::Number
1008        )
1009    }
1010}
1011
1012/// Removes heavily-overlapping layout elements in-place.
1013///
1014/// This mirrors PP-Structure-style overlap suppression where text takes priority over images.
1015/// Returns the number of elements removed.
1016pub fn remove_overlapping_layout_elements(
1017    layout_elements: &mut Vec<LayoutElement>,
1018    overlap_threshold: f32,
1019) -> usize {
1020    use std::collections::HashSet;
1021
1022    if layout_elements.len() <= 1 {
1023        return 0;
1024    }
1025
1026    let bboxes: Vec<_> = layout_elements.iter().map(|e| e.bbox.clone()).collect();
1027    let labels: Vec<&str> = layout_elements
1028        .iter()
1029        .map(|e| e.element_type.as_str())
1030        .collect();
1031
1032    let remove_indices =
1033        crate::processors::get_overlap_removal_indices(&bboxes, &labels, overlap_threshold);
1034    if remove_indices.is_empty() {
1035        return 0;
1036    }
1037
1038    let remove_set: HashSet<usize> = remove_indices.into_iter().collect();
1039    let before = layout_elements.len();
1040
1041    let mut idx = 0;
1042    layout_elements.retain(|_| {
1043        let keep = !remove_set.contains(&idx);
1044        idx += 1;
1045        keep
1046    });
1047
1048    before.saturating_sub(layout_elements.len())
1049}
1050
1051/// Applies small, PP-Structure-style label fixes to layout elements.
1052///
1053/// This is intended to capture lightweight "glue" heuristics that shouldn't live in `predict`.
1054pub fn apply_standardized_layout_label_fixes(layout_elements: &mut [LayoutElement]) {
1055    if layout_elements.is_empty() {
1056        return;
1057    }
1058
1059    let mut footnote_indices: Vec<usize> = Vec::new();
1060    let mut paragraph_title_indices: Vec<usize> = Vec::new();
1061    let mut bottom_text_y_max: f32 = 0.0;
1062    let mut max_block_area: f32 = 0.0;
1063    let mut doc_title_num: usize = 0;
1064
1065    for (idx, elem) in layout_elements.iter().enumerate() {
1066        let area =
1067            (elem.bbox.x_max() - elem.bbox.x_min()) * (elem.bbox.y_max() - elem.bbox.y_min());
1068        max_block_area = max_block_area.max(area);
1069
1070        match elem.element_type {
1071            LayoutElementType::Footnote => footnote_indices.push(idx),
1072            LayoutElementType::ParagraphTitle => paragraph_title_indices.push(idx),
1073            LayoutElementType::Text => {
1074                bottom_text_y_max = bottom_text_y_max.max(elem.bbox.y_max());
1075            }
1076            LayoutElementType::DocTitle => doc_title_num += 1,
1077            _ => {}
1078        }
1079    }
1080
1081    for idx in footnote_indices {
1082        if layout_elements[idx].bbox.y_max() < bottom_text_y_max {
1083            layout_elements[idx].element_type = LayoutElementType::Text;
1084            layout_elements[idx].label = Some("text".to_string());
1085        }
1086    }
1087
1088    let only_one_paragraph_title = paragraph_title_indices.len() == 1 && doc_title_num == 0;
1089    if only_one_paragraph_title {
1090        let idx = paragraph_title_indices[0];
1091        let area = (layout_elements[idx].bbox.x_max() - layout_elements[idx].bbox.x_min())
1092            * (layout_elements[idx].bbox.y_max() - layout_elements[idx].bbox.y_min());
1093
1094        let title_area_ratio_threshold = 0.3f32;
1095        if area > max_block_area * title_area_ratio_threshold {
1096            layout_elements[idx].element_type = LayoutElementType::DocTitle;
1097            layout_elements[idx].label = Some("doc_title".to_string());
1098        }
1099    }
1100}
1101
1102/// Result of table recognition.
1103#[derive(Debug, Clone, Serialize, Deserialize)]
1104pub struct TableResult {
1105    /// Bounding box of the table in the original image
1106    pub bbox: BoundingBox,
1107    /// Table type (wired or wireless)
1108    pub table_type: TableType,
1109    /// Confidence score for table type classification (None if classifier wasn't configured/run)
1110    pub classification_confidence: Option<f32>,
1111    /// Confidence score for table structure recognition (None if structure recognition failed)
1112    pub structure_confidence: Option<f32>,
1113    /// Detected table cells
1114    pub cells: Vec<TableCell>,
1115    /// HTML structure of the table (if available)
1116    pub html_structure: Option<String>,
1117    /// OCR text content for each cell (if OCR was integrated)
1118    pub cell_texts: Option<Vec<Option<String>>>,
1119    /// Structure tokens from table structure recognition (used for HTML generation after stitching)
1120    #[serde(skip)]
1121    pub structure_tokens: Option<Vec<String>>,
1122}
1123
1124impl TableResult {
1125    /// Creates a new table result.
1126    pub fn new(bbox: BoundingBox, table_type: TableType) -> Self {
1127        Self {
1128            bbox,
1129            table_type,
1130            classification_confidence: None,
1131            structure_confidence: None,
1132            cells: Vec::new(),
1133            html_structure: None,
1134            cell_texts: None,
1135            structure_tokens: None,
1136        }
1137    }
1138
1139    /// Sets the classification confidence.
1140    pub fn with_classification_confidence(mut self, confidence: f32) -> Self {
1141        self.classification_confidence = Some(confidence);
1142        self
1143    }
1144
1145    /// Sets the structure recognition confidence.
1146    pub fn with_structure_confidence(mut self, confidence: f32) -> Self {
1147        self.structure_confidence = Some(confidence);
1148        self
1149    }
1150
1151    /// Sets the table cells.
1152    pub fn with_cells(mut self, cells: Vec<TableCell>) -> Self {
1153        self.cells = cells;
1154        self
1155    }
1156
1157    /// Sets the HTML structure.
1158    pub fn with_html_structure(mut self, html: impl Into<String>) -> Self {
1159        self.html_structure = Some(html.into());
1160        self
1161    }
1162
1163    /// Sets the cell texts from OCR.
1164    pub fn with_cell_texts(mut self, texts: Vec<Option<String>>) -> Self {
1165        self.cell_texts = Some(texts);
1166        self
1167    }
1168
1169    /// Sets the structure tokens for later HTML generation.
1170    pub fn with_structure_tokens(mut self, tokens: Vec<String>) -> Self {
1171        self.structure_tokens = Some(tokens);
1172        self
1173    }
1174
1175    /// Returns the best available confidence score for this table.
1176    ///
1177    /// This method provides a unified confidence API for callers who want to filter
1178    /// tables by confidence without caring whether classification or structure
1179    /// recognition was used. Priority:
1180    /// 1. If both classification and structure confidence are available, returns
1181    ///    the minimum (most conservative estimate)
1182    /// 2. If only structure confidence is available (common when classifier isn't
1183    ///    configured), returns that
1184    /// 3. If only classification confidence is available, returns that
1185    /// 4. Returns `None` only if neither confidence is available (stub result)
1186    pub fn confidence(&self) -> Option<f32> {
1187        match (self.classification_confidence, self.structure_confidence) {
1188            (Some(cls), Some(str)) => Some(cls.min(str)),
1189            (None, Some(str)) => Some(str),
1190            (Some(cls), None) => Some(cls),
1191            (None, None) => None,
1192        }
1193    }
1194
1195    /// Returns true if this table has valid structure data.
1196    ///
1197    /// A table is considered valid if it has either cells or an HTML structure.
1198    /// Stub results (created when structure recognition fails) will return false.
1199    pub fn has_structure(&self) -> bool {
1200        !self.cells.is_empty() || self.html_structure.is_some()
1201    }
1202}
1203
1204/// Type of table.
1205#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
1206pub enum TableType {
1207    /// Table with visible borders
1208    Wired,
1209    /// Table without visible borders
1210    Wireless,
1211    /// Unknown table type
1212    Unknown,
1213}
1214
1215/// A cell in a table.
1216#[derive(Debug, Clone, Serialize, Deserialize)]
1217pub struct TableCell {
1218    /// Bounding box of the cell
1219    pub bbox: BoundingBox,
1220    /// Row index (0-based)
1221    pub row: Option<usize>,
1222    /// Column index (0-based)
1223    pub col: Option<usize>,
1224    /// Row span
1225    pub row_span: Option<usize>,
1226    /// Column span
1227    pub col_span: Option<usize>,
1228    /// Confidence score for the cell detection
1229    pub confidence: f32,
1230    /// Text content of the cell (if available)
1231    pub text: Option<String>,
1232}
1233
1234impl TableCell {
1235    /// Creates a new table cell.
1236    pub fn new(bbox: BoundingBox, confidence: f32) -> Self {
1237        Self {
1238            bbox,
1239            row: None,
1240            col: None,
1241            row_span: None,
1242            col_span: None,
1243            confidence,
1244            text: None,
1245        }
1246    }
1247
1248    /// Sets the row and column indices.
1249    pub fn with_position(mut self, row: usize, col: usize) -> Self {
1250        self.row = Some(row);
1251        self.col = Some(col);
1252        self
1253    }
1254
1255    /// Sets the row and column spans.
1256    pub fn with_span(mut self, row_span: usize, col_span: usize) -> Self {
1257        self.row_span = Some(row_span);
1258        self.col_span = Some(col_span);
1259        self
1260    }
1261
1262    /// Sets the text content.
1263    pub fn with_text(mut self, text: impl Into<String>) -> Self {
1264        self.text = Some(text.into());
1265        self
1266    }
1267}
1268
1269/// Result of formula recognition.
1270#[derive(Debug, Clone, Serialize, Deserialize)]
1271pub struct FormulaResult {
1272    /// Bounding box of the formula in the original image
1273    pub bbox: BoundingBox,
1274    /// LaTeX representation of the formula
1275    pub latex: String,
1276    /// Confidence score for the recognition
1277    pub confidence: f32,
1278}
1279
1280impl FormulaResult {
1281    /// Creates a new formula result.
1282    pub fn new(bbox: BoundingBox, latex: impl Into<String>, confidence: f32) -> Self {
1283        Self {
1284            bbox,
1285            latex: latex.into(),
1286            confidence,
1287        }
1288    }
1289}
1290
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed result keeps its path and index and holds no content.
    #[test]
    fn test_structure_result_creation() {
        let fresh = StructureResult::new("test.jpg", 0);

        assert_eq!(fresh.input_path.as_ref(), "test.jpg");
        assert_eq!(fresh.index, 0);

        assert!(fresh.layout_elements.is_empty());
        assert!(fresh.tables.is_empty());
        assert!(fresh.formulas.is_empty());
        assert!(fresh.text_regions.is_none());
    }

    /// Spot-checks the canonical labels of a few element types.
    #[test]
    fn test_layout_element_type_as_str() {
        assert_eq!(LayoutElementType::Text.as_str(), "text");
        assert_eq!(LayoutElementType::Table.as_str(), "table");
        assert_eq!(LayoutElementType::Formula.as_str(), "formula");
    }

    /// A new table result keeps its type and starts without cells or HTML.
    #[test]
    fn test_table_result_creation() {
        let table = TableResult::new(
            BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0),
            TableType::Wired,
        );

        assert_eq!(table.table_type, TableType::Wired);
        assert!(table.cells.is_empty());
        assert!(table.html_structure.is_none());
    }

    /// A document title and a text paragraph show up in both export formats.
    #[test]
    fn test_structure_result_export() {
        let page_box = BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0);

        let elements = vec![
            LayoutElement::new(page_box.clone(), LayoutElementType::DocTitle, 1.0)
                .with_text("Test Document"),
            LayoutElement::new(page_box.clone(), LayoutElementType::Text, 1.0)
                .with_text("Hello world"),
        ];
        let result = StructureResult::new("test.jpg", 0).with_layout_elements(elements);

        // Markdown export
        let markdown = result.to_markdown();
        assert!(markdown.contains("# Test Document"));
        assert!(markdown.contains("Hello world"));

        // HTML export
        let page_html = result.to_html();
        assert!(page_html.contains("<h1>Test Document</h1>"));
        assert!(page_html.contains("<p>Hello world</p>"));
    }
}