oar_ocr_core/domain/
structure.rs

1//! Document structure analysis result types.
2//!
3//! This module defines the result types for document structure analysis,
4//! including layout detection, table recognition, and formula recognition.
5
6use super::text_region::TextRegion;
7use crate::processors::BoundingBox;
8use image::RgbImage;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use std::path::Path;
14use std::sync::Arc;
15
16/// Title numbering pattern for detecting section numbers like 1, 1.2, 1.2.3, (1), 一、etc.
17/// This follows standard title numbering pattern.
18static TITLE_NUMBERING_REGEX: Lazy<Regex> = Lazy::new(|| {
19    Regex::new(
20        r"(?x)
21        ^\s*
22        (
23            # Arabic numerals: 1, 1.2, 1.2.3, etc.
24            [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?
25            |
26            # Parenthesized Arabic numerals: (1), (1.2), etc.
27            [(（][1-9][0-9]*(?:\.[1-9][0-9]*)*[)）]
28            |
29            # Chinese numerals with punctuation: 一、 二、
30            [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾][、.]?
31            |
32            # Parenthesized Chinese numerals: （一）
33            [(（][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[)）]
34            |
35            # Roman numerals with delimiter (period or followed by space)
36            (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b)
37        )
38        (\s+)
39        (.*)
40        $
41    ",
42    )
43    .unwrap_or_else(|e| panic!("Invalid title numbering regex: {e}"))
44});
45
46/// Format a paragraph title with automatic level detection based on numbering.
47///
48/// Following PaddleX's title formatting logic:
49/// - Extracts numbering prefix (1.2.3, etc.)
50/// - Determines heading level from number of dots
51/// - Returns (level, formatted_title) where level starts from 2 (## for paragraph titles)
52///
53/// PaddleX logic: `level = dots + 1`, then uses `#{'#' * level}` which means:
54/// - "1 Introduction" (0 dots) -> level=1 -> `## 1 Introduction`
55/// - "2.1 Method" (1 dot) -> level=2 -> `### 2.1 Method`
56/// - "2.1.1 Details" (2 dots) -> level=3 -> `#### 2.1.1 Details`
57///
58/// To align with PaddleX, we return level+1 to account for the extra `#`:
59/// - "1 Introduction" -> (2, "1 Introduction") -> `## 1 Introduction`
60/// - "2.1 Method" -> (3, "2.1 Method") -> `### 2.1 Method`
61/// - "2.1.1 Details" -> (4, "2.1.1 Details") -> `#### 2.1.1 Details`
62fn semantic_title_level_and_format(cleaned: &str) -> Option<(usize, String)> {
63    let trimmed = cleaned.trim();
64
65    // Common unnumbered top-level section headers.
66    let keyword = trimmed.trim_end_matches(':').to_ascii_uppercase();
67    if matches!(
68        keyword.as_str(),
69        "ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
70    ) {
71        return Some((2, trimmed.to_string()));
72    }
73
74    if let Some(captures) = TITLE_NUMBERING_REGEX.captures(cleaned) {
75        let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or("");
76        let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or("");
77
78        let dot_count = numbering.matches('.').count();
79        let level = (dot_count + 2).clamp(2, 6);
80
81        let formatted = if title_content.is_empty() {
82            numbering.trim_end_matches('.').to_string()
83        } else {
84            format!(
85                "{} {}",
86                numbering.trim_end_matches('.'),
87                title_content.trim_start()
88            )
89        };
90        return Some((level, formatted));
91    }
92
93    None
94}
95
96fn semantic_title_level(text: &str) -> Option<usize> {
97    let cleaned = text.replace("-\n", "").replace('\n', " ");
98    semantic_title_level_and_format(&cleaned).map(|(level, _)| level)
99}
100
101fn format_title_with_level(title: &str, clustered_level: Option<usize>) -> (usize, String) {
102    // Clean up line breaks
103    let cleaned = title.replace("-\n", "").replace('\n', " ");
104    if let Some((level, formatted)) = semantic_title_level_and_format(&cleaned) {
105        return (level, formatted);
106    }
107
108    // No semantic signal: use voting hint from relative/font-size signals.
109    let level = clustered_level.unwrap_or(2).clamp(2, 6);
110    (level, cleaned)
111}
112
113/// Estimate per-title heading levels using three-signal voting:
114/// 1) semantic numbering/keyword level
115/// 2) relative indentation order
116/// 3) font-size k-means (k<=4)
117///
118fn infer_paragraph_title_levels(elements: &[LayoutElement]) -> HashMap<usize, usize> {
119    let title_indices: Vec<usize> = elements
120        .iter()
121        .enumerate()
122        .filter(|(_, e)| e.element_type == LayoutElementType::ParagraphTitle)
123        .map(|(idx, _)| idx)
124        .collect();
125    if title_indices.is_empty() {
126        return HashMap::new();
127    }
128
129    let height_samples: Vec<(usize, f32)> = title_indices
130        .iter()
131        .filter_map(|&idx| {
132            let e = &elements[idx];
133            let height = (e.bbox.y_max() - e.bbox.y_min()).max(1.0);
134            let line_h = height / e.num_lines.unwrap_or(1).max(1) as f32;
135            let v = line_h.max(1.0);
136            if v.is_finite() { Some((idx, v)) } else { None }
137        })
138        .collect();
139
140    let indent_samples: Vec<(usize, f32)> = title_indices
141        .iter()
142        .filter_map(|&idx| {
143            let x = elements[idx].bbox.x_min();
144            if x.is_finite() { Some((idx, x)) } else { None }
145        })
146        .collect();
147    let semantic_levels: HashMap<usize, usize> = title_indices
148        .iter()
149        .filter_map(|&idx| {
150            elements[idx]
151                .text
152                .as_deref()
153                .and_then(semantic_title_level)
154                .map(|level| (idx, level))
155        })
156        .collect();
157
158    let font_levels = infer_levels_by_kmeans_feature(&height_samples, true);
159    // Smaller x_min (less indent) -> higher-level heading.
160    let relative_levels = infer_levels_by_kmeans_feature(&indent_samples, false);
161
162    let mut voted = HashMap::new();
163    for idx in title_indices {
164        let semantic_level = semantic_levels.get(&idx).copied();
165        let font_level = font_levels.get(&idx).copied();
166        let relative_level = relative_levels.get(&idx).copied();
167
168        let mut score = [0u8; 7];
169        if let Some(level) = semantic_level {
170            score[level.clamp(1, 6)] += 2;
171        }
172        if let Some(level) = font_level {
173            score[level.clamp(1, 6)] += 1;
174        }
175        if let Some(level) = relative_level {
176            score[level.clamp(1, 6)] += 1;
177        }
178
179        let mut best_level = semantic_level.unwrap_or(2);
180        let mut best_score = 0u8;
181        for (level, &s) in score.iter().enumerate().skip(1) {
182            if s > best_score {
183                best_score = s;
184                best_level = level;
185            } else if s == best_score && s > 0 {
186                let is_semantic = semantic_level == Some(level);
187                let best_is_semantic = semantic_level == Some(best_level);
188                if (is_semantic && !best_is_semantic)
189                    || (is_semantic == best_is_semantic && level < best_level)
190                {
191                    best_level = level;
192                }
193            }
194        }
195
196        if best_score == 0 {
197            best_level = semantic_level
198                .or(font_level)
199                .or(relative_level)
200                .unwrap_or(2);
201        }
202
203        voted.insert(idx, best_level.clamp(1, 6));
204    }
205
206    voted
207}
208
/// Turn one scalar feature per element into heading levels via 1D k-means.
///
/// `samples` pairs an element index with its feature value; non-finite values
/// are dropped. The number of clusters is the count of distinct values (values
/// closer than `1e-3` are treated as equal), capped at 4 and at the sample
/// count. Fewer than two usable samples or a single cluster yields an empty map.
///
/// `descending=true` ranks the cluster with the largest centroid first (larger
/// feature -> higher-level heading); `descending=false` ranks the smallest
/// first. The cluster of rank `r` maps to level `r + 2`, clamped to `2..=6`.
fn infer_levels_by_kmeans_feature(
    samples: &[(usize, f32)],
    descending: bool,
) -> HashMap<usize, usize> {
    /// Index of the centroid closest to `value`; the earliest centroid wins ties.
    fn nearest_centroid(centroids: &[f32], value: f32) -> usize {
        let mut winner = 0usize;
        let mut shortest = f32::INFINITY;
        for (i, &centroid) in centroids.iter().enumerate() {
            let distance = (value - centroid).abs();
            if distance < shortest {
                shortest = distance;
                winner = i;
            }
        }
        winner
    }

    let finite: Vec<(usize, f32)> = samples
        .iter()
        .filter(|sample| sample.1.is_finite())
        .copied()
        .collect();
    if finite.len() < 2 {
        return HashMap::new();
    }

    let mut sorted: Vec<f32> = finite.iter().map(|&(_, v)| v).collect();
    sorted.sort_by(f32::total_cmp);

    // Count runs of (approximately) distinct values to choose k.
    let distinct = 1 + sorted
        .windows(2)
        .filter(|pair| (pair[1] - pair[0]).abs() > 1e-3)
        .count();
    let k = distinct.clamp(1, 4).min(finite.len());
    if k <= 1 {
        return HashMap::new();
    }

    // Seed centroids from evenly spaced positions in the sorted values.
    let last = sorted.len() - 1;
    let mut centroids: Vec<f32> = (0..k)
        .map(|i| {
            let pos = ((i as f32 + 0.5) / k as f32 * sorted.len() as f32).floor() as usize;
            sorted[pos.min(last)]
        })
        .collect();

    // Fixed number of Lloyd iterations; empty clusters keep their centroid.
    for _ in 0..16 {
        let mut sums = vec![0.0f32; k];
        let mut counts = vec![0usize; k];
        for &(_, v) in &finite {
            let cluster = nearest_centroid(&centroids, v);
            sums[cluster] += v;
            counts[cluster] += 1;
        }
        for ((centroid, &sum), &count) in centroids.iter_mut().zip(&sums).zip(&counts) {
            if count > 0 {
                *centroid = sum / count as f32;
            }
        }
    }

    // Rank clusters by centroid value in the requested direction.
    let mut order: Vec<usize> = (0..k).collect();
    order.sort_by(|&a, &b| {
        let ascending = centroids[a].total_cmp(&centroids[b]);
        if descending {
            ascending.reverse()
        } else {
            ascending
        }
    });
    let mut rank_of = vec![0usize; k];
    for (rank, &cluster) in order.iter().enumerate() {
        rank_of[cluster] = rank;
    }

    finite
        .iter()
        .map(|&(element_idx, v)| {
            let rank = rank_of[nearest_centroid(&centroids, v)];
            (element_idx, (rank + 2).clamp(2, 6))
        })
        .collect()
}
298
299/// A detected document region block (from PP-DocBlockLayout).
300///
301/// Region blocks represent hierarchical groupings of layout elements,
302/// typically columns or logical sections of a document. They are used
303/// for hierarchical reading order determination.
304///
305/// # PP-StructureV3 Alignment
306///
307/// PP-DocBlockLayout detects "region" type blocks that group related
308/// layout elements together. Elements within the same region should
309/// be read together before moving to the next region.
310#[derive(Debug, Clone, Serialize, Deserialize)]
311pub struct RegionBlock {
312    /// Bounding box of the region
313    pub bbox: BoundingBox,
314    /// Confidence score of the detection
315    pub confidence: f32,
316    /// Index of this region in the reading order
317    pub order_index: Option<u32>,
318    /// Indices of layout elements that belong to this region
319    pub element_indices: Vec<usize>,
320}
321
322/// Page continuation flags for multi-page document processing.
323///
324/// These flags indicate whether the page starts or ends in the middle of
325/// a semantic paragraph, which is crucial for properly concatenating
326/// markdown output from multiple pages.
327///
328/// - `paragraph_start`: `false` means this page continues a paragraph from previous page
329/// - `paragraph_end`: `false` means this page's content continues to next page
330#[derive(Debug, Clone, Serialize, Deserialize)]
331pub struct PageContinuationFlags {
332    /// Whether the first element on this page is a paragraph continuation
333    pub paragraph_start: bool,
334    /// Whether the last element on this page continues to the next page
335    pub paragraph_end: bool,
336}
337
338impl PageContinuationFlags {
339    pub fn new(paragraph_start: bool, paragraph_end: bool) -> Self {
340        Self {
341            paragraph_start,
342            paragraph_end,
343        }
344    }
345
346    /// Returns the tuple format (is_start, is_end) for compatibility
347    pub fn as_tuple(&self) -> (bool, bool) {
348        (self.paragraph_start, self.paragraph_end)
349    }
350}
351
352/// Result of document structure analysis.
353///
354/// This struct contains all the results from analyzing a document's structure,
355/// including layout elements, tables, formulas, and OCR results.
356///
357/// # Coordinate System
358///
359/// The coordinate system of bounding boxes depends on which preprocessing was applied:
360///
361/// - **No preprocessing**: Boxes are in the original input image's coordinate system.
362///
363/// - **Orientation correction only** (`orientation_angle` set, `rectified_img` is None):
364///   Boxes are transformed back to the original input image's coordinate system.
365///
366/// - **Rectification applied** (`rectified_img` is Some):
367///   Boxes remain in the **rectified image's coordinate system**. Neural network-based
368///   rectification (UVDoc) warps cannot be precisely inverted, so use `rectified_img`
369///   for visualization instead of the original image.
370///
371/// - **Both orientation and rectification**: Boxes are in the rectified coordinate system
372///   (rectification takes precedence since it's applied after orientation correction).
373#[derive(Debug, Clone, Serialize, Deserialize)]
374pub struct StructureResult {
375    /// Path to the input image file
376    pub input_path: Arc<str>,
377    /// Index of the image in a batch (0 for single image processing)
378    pub index: usize,
379    /// Detected layout elements (text regions, tables, figures, etc.)
380    pub layout_elements: Vec<LayoutElement>,
381    /// Recognized tables with their structure and content
382    pub tables: Vec<TableResult>,
383    /// Recognized mathematical formulas
384    pub formulas: Vec<FormulaResult>,
385    /// OCR text regions (if OCR was integrated)
386    pub text_regions: Option<Vec<TextRegion>>,
387    /// Document orientation angle (if orientation correction was used)
388    pub orientation_angle: Option<f32>,
389    /// Detected region blocks for hierarchical ordering (PP-DocBlockLayout)
390    /// When present, layout_elements are already sorted by region hierarchy
391    pub region_blocks: Option<Vec<RegionBlock>>,
392    /// Rectified image (if document rectification was used)
393    /// Note: Bounding boxes are already transformed back to original coordinates for rotation,
394    /// but for rectification (UVDoc), boxes are in the rectified image's coordinate system.
395    /// Use this image for visualization when rectification was applied.
396    #[serde(skip)]
397    pub rectified_img: Option<Arc<RgbImage>>,
398    /// Page continuation flags for multi-page document processing.
399    /// This indicates whether this page continues a paragraph from the previous page
400    /// or continues to the next page, which is crucial for proper markdown concatenation.
401    pub page_continuation_flags: Option<PageContinuationFlags>,
402}
403
404impl StructureResult {
405    /// Creates a new structure result.
406    pub fn new(input_path: impl Into<Arc<str>>, index: usize) -> Self {
407        Self {
408            input_path: input_path.into(),
409            index,
410            layout_elements: Vec::new(),
411            tables: Vec::new(),
412            formulas: Vec::new(),
413            text_regions: None,
414            orientation_angle: None,
415            region_blocks: None,
416            rectified_img: None,
417            page_continuation_flags: None,
418        }
419    }
420
421    /// Adds layout elements to the result.
422    pub fn with_layout_elements(mut self, elements: Vec<LayoutElement>) -> Self {
423        self.layout_elements = elements;
424        self
425    }
426
427    /// Adds tables to the result.
428    pub fn with_tables(mut self, tables: Vec<TableResult>) -> Self {
429        self.tables = tables;
430        self
431    }
432
433    /// Adds formulas to the result.
434    pub fn with_formulas(mut self, formulas: Vec<FormulaResult>) -> Self {
435        self.formulas = formulas;
436        self
437    }
438
439    /// Adds OCR text regions to the result.
440    pub fn with_text_regions(mut self, regions: Vec<TextRegion>) -> Self {
441        self.text_regions = Some(regions);
442        self
443    }
444
445    /// Adds region blocks to the result (PP-DocBlockLayout).
446    ///
447    /// Region blocks represent hierarchical groupings of layout elements.
448    /// When set, layout_elements should already be sorted by region hierarchy.
449    pub fn with_region_blocks(mut self, blocks: Vec<RegionBlock>) -> Self {
450        self.region_blocks = Some(blocks);
451        self
452    }
453
454    /// Sets page continuation flags for multi-page document processing.
455    pub fn with_page_continuation_flags(mut self, flags: PageContinuationFlags) -> Self {
456        self.page_continuation_flags = Some(flags);
457        self
458    }
459
460    /// Converts the result to a Markdown string.
461    ///
462    /// Follows PP-StructureV3's formatting rules:
463    /// - DocTitle: `# title`
464    /// - ParagraphTitle: Auto-detect numbering (1.2.3 -> ###)
465    /// - Formula: `$$latex$$`
466    /// - Table: HTML with border
467    /// - Images: `![Figure](caption)`
468    ///
469    /// Note: Low-confidence text elements that overlap with table regions are filtered out
470    /// to avoid duplicate content from table OCR.
471    pub fn to_markdown(&self) -> String {
472        // Collect table bboxes for overlap filtering
473        let table_bboxes: Vec<&BoundingBox> = self
474            .layout_elements
475            .iter()
476            .filter(|e| e.element_type == LayoutElementType::Table)
477            .map(|e| &e.bbox)
478            .collect();
479
480        // Compute original image width for image scaling (PaddleX: original_image_width)
481        let original_image_width = self
482            .rectified_img
483            .as_ref()
484            .map(|img| img.width() as f32)
485            .or_else(|| {
486                // Estimate from max element x-coordinate
487                self.layout_elements
488                    .iter()
489                    .map(|e| e.bbox.x_max())
490                    .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
491            })
492            .unwrap_or(1.0);
493
494        let mut md = String::new();
495        let elements = &self.layout_elements;
496        let paragraph_title_levels = infer_paragraph_title_levels(elements);
497        // Track the most recent Text/ReferenceContent element so paragraph
498        // continuation works across intervening figures/tables.
499        let mut prev_text_element: Option<&LayoutElement> = None;
500
501        for (idx, element) in elements.iter().enumerate() {
502            // PP-StructureV3 markdown ignores auxiliary labels.
503            if matches!(
504                element.element_type,
505                LayoutElementType::Number
506                    | LayoutElementType::Footnote
507                    | LayoutElementType::Header
508                    | LayoutElementType::HeaderImage
509                    | LayoutElementType::Footer
510                    | LayoutElementType::FooterImage
511                    | LayoutElementType::AsideText
512            ) {
513                continue;
514            }
515
516            // Filter out low-confidence text elements that overlap with tables
517            // These are typically OCR artifacts from table cell text that shouldn't be
518            // output separately in markdown
519            if element.element_type == LayoutElementType::Text {
520                let overlaps_table = table_bboxes.iter().any(|table_bbox| {
521                    element.bbox.ioa(table_bbox) > 0.3 // >30% of text is inside table
522                });
523
524                // Skip low-confidence text that overlaps with table regions
525                // Standard logic filters these in the stitching phase
526                if overlaps_table && element.confidence < 0.7 {
527                    continue;
528                }
529            }
530
531            // Determine seg_start_flag for paragraph continuity (PaddleX get_seg_flag).
532            // When both current and previous are "text" and seg_start_flag is false,
533            // they belong to the same paragraph — join without \n\n separator.
534            let seg_start_flag = get_seg_flag(element, prev_text_element);
535
536            let is_continuation = element.element_type == LayoutElementType::Text
537                && prev_text_element.is_some()
538                && !seg_start_flag;
539
540            // Add separator between elements
541            if !is_continuation {
542                // Normal case: separate elements with blank line
543            }
544
545            match element.element_type {
546                // Document title
547                LayoutElementType::DocTitle => {
548                    if !md.is_empty() {
549                        md.push_str("\n\n");
550                    }
551                    if let Some(text) = &element.text {
552                        let cleaned = clean_ocr_text(text);
553                        // Downgrade section-level keywords to ## when misclassified as DocTitle
554                        let keyword = cleaned.trim().trim_end_matches(':').to_ascii_uppercase();
555                        if matches!(
556                            keyword.as_str(),
557                            "ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
558                        ) {
559                            md.push_str("## ");
560                        } else {
561                            md.push_str("# ");
562                        }
563                        md.push_str(&cleaned);
564                    }
565                }
566                // Paragraph/section title - auto-detect numbering for level
567                LayoutElementType::ParagraphTitle => {
568                    if !md.is_empty() {
569                        md.push_str("\n\n");
570                    }
571                    if let Some(text) = &element.text {
572                        let cleaned = clean_ocr_text(text);
573                        let clustered = paragraph_title_levels.get(&idx).copied();
574                        let (level, formatted_title) = format_title_with_level(&cleaned, clustered);
575                        for _ in 0..level {
576                            md.push('#');
577                        }
578                        md.push(' ');
579                        md.push_str(&formatted_title);
580                    } else {
581                        md.push_str("## ");
582                    }
583                }
584                // Table - preserve HTML structure with border and center alignment
585                // Following PaddleX's format with <div style="text-align: center;"> wrapper
586                LayoutElementType::Table => {
587                    if !md.is_empty() {
588                        md.push_str("\n\n");
589                    }
590                    if let Some(table) =
591                        self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
592                    {
593                        if let Some(html) = &table.html_structure {
594                            // Simplify table HTML (remove html/body wrappers) and add border
595                            let simplified = simplify_table_html(html);
596                            let table_with_border =
597                                simplified.replacen("<table>", "<table border=\"1\">", 1);
598                            // PaddleX format_centered_by_html: clean newlines then wrap
599                            let cleaned = clean_ocr_text(&table_with_border);
600                            md.push_str("<div style=\"text-align: center;\">");
601                            md.push_str(&cleaned);
602                            md.push_str("</div>");
603                        } else {
604                            md.push_str("[Table]");
605                        }
606                    } else {
607                        md.push_str("[Table]");
608                    }
609                }
610                // FormulaNumber - equation labels like "(1)", "Eq. 1" etc.
611                // PaddleX does NOT include formula_number in handle_funcs_dict,
612                // so these are silently skipped in markdown output.
613                LayoutElementType::FormulaNumber => {
614                    continue;
615                }
616                // Formula - detect inline vs display formula based on context
617                LayoutElementType::Formula => {
618                    // Extract and clean LaTeX content; skip if empty
619                    let raw_content = element.text.as_deref().map(|s| s.trim()).unwrap_or("");
620                    if raw_content.is_empty() {
621                        continue;
622                    }
623                    // Remove only outer $$ or $ wrappers if present (from table cell injection)
624                    let latex_content =
625                        if raw_content.starts_with("$$") && raw_content.ends_with("$$") {
626                            &raw_content[2..raw_content.len() - 2]
627                        } else if raw_content.starts_with('$') && raw_content.ends_with('$') {
628                            &raw_content[1..raw_content.len() - 1]
629                        } else {
630                            raw_content
631                        };
632
633                    // Check if this formula is on the same line as adjacent text elements
634                    // to determine if it's an inline formula or display formula.
635                    // Only consider the nearest non-formula/non-formula-number neighbor
636                    // on each side, and require BOTH sides to have text on the same line.
637                    // This prevents display formulas from being misclassified as inline
638                    // when they happen to be vertically aligned with a distant text block.
639                    let is_inline = {
640                        let has_prev_text = (0..idx)
641                            .rev()
642                            .find(|&i| {
643                                let t = elements[i].element_type;
644                                !t.is_formula() && t != LayoutElementType::FormulaNumber
645                            })
646                            .is_some_and(|i| {
647                                let prev = &elements[i];
648                                (prev.element_type == LayoutElementType::Text
649                                    || prev.element_type == LayoutElementType::ReferenceContent)
650                                    && is_same_line(&element.bbox, &prev.bbox)
651                            });
652
653                        let has_next_text = ((idx + 1)..elements.len())
654                            .find(|&i| {
655                                let t = elements[i].element_type;
656                                !t.is_formula() && t != LayoutElementType::FormulaNumber
657                            })
658                            .is_some_and(|i| {
659                                let next = &elements[i];
660                                (next.element_type == LayoutElementType::Text
661                                    || next.element_type == LayoutElementType::ReferenceContent)
662                                    && is_same_line(&element.bbox, &next.bbox)
663                            });
664
665                        // Require text on BOTH sides for inline — a formula with text
666                        // only on one side is almost always a display equation.
667                        has_prev_text && has_next_text
668                    };
669
670                    if is_inline {
671                        // Inline formula: use $...$
672                        md.push('$');
673                        md.push_str(latex_content);
674                        md.push_str("$ ");
675                    } else {
676                        // Display formula: use $$...$$
677                        if !md.is_empty() {
678                            md.push_str("\n\n");
679                        }
680                        md.push_str("$$");
681                        md.push_str(latex_content);
682                        md.push_str("$$");
683                    }
684                }
685                // Image/Chart - figure format with center alignment
686                LayoutElementType::Image | LayoutElementType::Chart => {
687                    if !md.is_empty() {
688                        md.push_str("\n\n");
689                    }
690                    // Use HTML img tag with center alignment for better rendering
691                    md.push_str("<div style=\"text-align: center;\"><img src=\"");
692                    // Generate a placeholder image name based on element bbox
693                    let img_name = format!(
694                        "imgs/img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg",
695                        if element.element_type == LayoutElementType::Chart {
696                            "chart"
697                        } else {
698                            "image"
699                        },
700                        element.bbox.x_min(),
701                        element.bbox.y_min(),
702                        element.bbox.x_max(),
703                        element.bbox.y_max()
704                    );
705                    md.push_str(&img_name);
706                    md.push_str("\" alt=\"Image\" width=\"");
707                    // Calculate width percentage relative to original image width (PaddleX logic)
708                    let image_width = element.bbox.x_max() - element.bbox.x_min();
709                    let width_pct = (image_width / original_image_width * 100.0) as u32;
710                    let width_pct = width_pct.clamp(1, 100);
711                    md.push_str(&format!("{}%", width_pct));
712                    md.push_str("\" /></div>");
713                }
714                // Seal - show as image with text
715                LayoutElementType::Seal => {
716                    if !md.is_empty() {
717                        md.push_str("\n\n");
718                    }
719                    md.push_str("![Seal]");
720                    if let Some(text) = &element.text {
721                        md.push_str("\n> ");
722                        md.push_str(text);
723                    }
724                }
725                // Captions - with center alignment following PaddleX
726                _ if element.element_type.is_caption() => {
727                    if let Some(text) = &element.text {
728                        if !md.is_empty() {
729                            md.push_str("\n\n");
730                        }
731                        let cleaned = clean_ocr_text(text);
732                        md.push_str("<div style=\"text-align: center;\">");
733                        md.push_str(&cleaned);
734                        md.push_str(" </div>");
735                    }
736                }
737                // Abstract - following PaddleX's format_first_line_func with spliter=" "
738                LayoutElementType::Abstract => {
739                    if let Some(text) = &element.text {
740                        if !md.is_empty() {
741                            md.push_str("\n\n");
742                        }
743                        let formatted = format_first_line(text, " ", &["abstract", "摘要"], "## ");
744                        md.push_str(&formatted);
745                    }
746                }
747                // Reference - following PaddleX's format_first_line_func with spliter="\n"
748                LayoutElementType::Reference => {
749                    if let Some(text) = &element.text {
750                        if !md.is_empty() {
751                            md.push_str("\n\n");
752                        }
753                        let formatted =
754                            format_first_line(text, "\n", &["references", "参考文献"], "## ");
755                        md.push_str(&formatted);
756                    }
757                }
758                // Content (table of contents) - following PaddleX's soft breaks
759                LayoutElementType::Content => {
760                    if let Some(text) = &element.text {
761                        if !md.is_empty() {
762                            md.push_str("\n\n");
763                        }
764                        let formatted = format_content_block(text);
765                        md.push_str(&formatted);
766                    }
767                }
768                // Footnote - following PaddleX's vision_footnote handling
769                LayoutElementType::Footnote => {
770                    if let Some(text) = &element.text {
771                        if !md.is_empty() {
772                            md.push_str("\n\n");
773                        }
774                        let formatted = format_vision_footnote_block(text);
775                        md.push_str(&formatted);
776                    }
777                }
778                // List
779                LayoutElementType::List => {
780                    if let Some(text) = &element.text {
781                        if !md.is_empty() {
782                            md.push_str("\n\n");
783                        }
784                        let cleaned = format_text_block(text);
785                        // Split by newlines and format as list items
786                        for line in cleaned.lines() {
787                            let line = line.trim();
788                            if !line.is_empty() {
789                                md.push_str("- ");
790                                md.push_str(line);
791                                md.push('\n');
792                            }
793                        }
794                    }
795                }
796                // Algorithm block - PaddleX: block.content.strip("\n")
797                LayoutElementType::Algorithm => {
798                    if let Some(text) = &element.text {
799                        if !md.is_empty() {
800                            md.push_str("\n\n");
801                        }
802                        md.push_str(text.trim_matches('\n'));
803                    }
804                }
805                // Header/Footer - smaller text (typically excluded from markdown)
806                _ if element.element_type.is_header() || element.element_type.is_footer() => {
807                    // Skip headers and footers in markdown output
808                    // They typically contain page numbers and repeating info
809                    continue;
810                }
811                // Default text elements - following PaddleX's text handling
812                _ => {
813                    if let Some(text) = &element.text {
814                        let cleaned = clean_ocr_text(text);
815                        if has_bullet_markers(&cleaned) {
816                            if !md.is_empty() {
817                                md.push_str("\n\n");
818                            }
819                            format_as_bullet_list(&cleaned, &mut md);
820                        } else if is_continuation {
821                            let formatted = format_text_block(text);
822                            md.push_str(&formatted);
823                        } else {
824                            if !md.is_empty() {
825                                md.push_str("\n\n");
826                            }
827                            let formatted = format_text_block(text);
828                            md.push_str(&formatted);
829                        }
830                    }
831                }
832            }
833
834            if element.element_type == LayoutElementType::Text
835                || element.element_type == LayoutElementType::ReferenceContent
836            {
837                prev_text_element = Some(element);
838            }
839        }
840        md.trim().to_string()
841    }
842
843    /// Calculates the page continuation flags for this result.
844    ///
845    /// This follows PaddleX's `get_seg_flag` logic to determine whether
846    /// the page starts/ends in the middle of a semantic paragraph.
847    ///
848    /// Returns (paragraph_start, paragraph_end) where:
849    /// - `paragraph_start`: false means page continues from previous
850    /// - `paragraph_end`: false means content continues to next page
851    pub fn calculate_continuation_flags(&self) -> PageContinuationFlags {
852        let elements = &self.layout_elements;
853
854        if elements.is_empty() {
855            return PageContinuationFlags::new(true, true);
856        }
857
858        // Estimate page width from rectified image or element bboxes
859        let page_width = self
860            .rectified_img
861            .as_ref()
862            .map(|img| img.width() as f32)
863            .or_else(|| {
864                elements
865                    .iter()
866                    .map(|e| e.bbox.x_max())
867                    .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
868            });
869
870        // Filter to only text elements for continuation analysis
871        let text_elements: Vec<_> = elements
872            .iter()
873            .filter(|e| {
874                matches!(
875                    e.element_type,
876                    LayoutElementType::Text
877                        | LayoutElementType::DocTitle
878                        | LayoutElementType::ParagraphTitle
879                        | LayoutElementType::Abstract
880                        | LayoutElementType::Reference
881                )
882            })
883            .collect();
884
885        if text_elements.is_empty() {
886            return PageContinuationFlags::new(true, true);
887        }
888
889        // Calculate paragraph start flag
890        let first = &text_elements[0];
891        let paragraph_start = is_new_paragraph_start(first, page_width);
892
893        // Calculate paragraph end flag
894        let last = &text_elements[text_elements.len() - 1];
895        let paragraph_end = is_paragraph_complete(last, page_width);
896
897        PageContinuationFlags::new(paragraph_start, paragraph_end)
898    }
899
900    /// Converts the result to an HTML string.
901    ///
902    /// Follows PP-StructureV3's formatting rules with semantic HTML tags.
903    pub fn to_html(&self) -> String {
904        let mut html = String::from(
905            "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n",
906        );
907
908        for element in &self.layout_elements {
909            match element.element_type {
910                // Document title
911                LayoutElementType::DocTitle => {
912                    html.push_str("<h1>");
913                    if let Some(text) = &element.text {
914                        html.push_str(&Self::escape_html(text));
915                    }
916                    html.push_str("</h1>\n");
917                }
918                // Paragraph/section title
919                LayoutElementType::ParagraphTitle => {
920                    html.push_str("<h2>");
921                    if let Some(text) = &element.text {
922                        html.push_str(&Self::escape_html(text));
923                    }
924                    html.push_str("</h2>\n");
925                }
926                // Table - embed HTML structure with simplified markup
927                LayoutElementType::Table => {
928                    if let Some(table) =
929                        self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
930                    {
931                        if let Some(table_html) = &table.html_structure {
932                            // Simplify table HTML (remove html/body wrappers) and add border styling
933                            let simplified = simplify_table_html(table_html);
934                            let styled = simplified.replacen(
935                                "<table>",
936                                "<table border=\"1\" style=\"border-collapse: collapse;\">",
937                                1,
938                            );
939                            html.push_str(&styled);
940                            html.push('\n');
941                        } else {
942                            html.push_str("<p>[Table]</p>\n");
943                        }
944                    } else {
945                        html.push_str("<p>[Table]</p>\n");
946                    }
947                }
948                // Formula - use math tags
949                LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
950                    html.push_str("<p class=\"formula\">$$");
951                    if let Some(latex) = &element.text {
952                        html.push_str(&Self::escape_html(latex));
953                    }
954                    html.push_str("$$</p>\n");
955                }
956                // Image/Chart
957                LayoutElementType::Image | LayoutElementType::Chart => {
958                    html.push_str("<figure>\n<img alt=\"Figure\" />\n");
959                    if let Some(caption) = &element.text {
960                        html.push_str("<figcaption>");
961                        html.push_str(&Self::escape_html(caption));
962                        html.push_str("</figcaption>\n");
963                    }
964                    html.push_str("</figure>\n");
965                }
966                // Seal
967                LayoutElementType::Seal => {
968                    html.push_str("<figure class=\"seal\">\n<img alt=\"Seal\" />\n");
969                    if let Some(text) = &element.text {
970                        html.push_str("<figcaption>");
971                        html.push_str(&Self::escape_html(text));
972                        html.push_str("</figcaption>\n");
973                    }
974                    html.push_str("</figure>\n");
975                }
976                // Captions
977                _ if element.element_type.is_caption() => {
978                    if let Some(text) = &element.text {
979                        html.push_str("<figcaption>");
980                        html.push_str(&Self::escape_html(text));
981                        html.push_str("</figcaption>\n");
982                    }
983                }
984                // Abstract
985                LayoutElementType::Abstract => {
986                    html.push_str("<section class=\"abstract\">\n<h3>Abstract</h3>\n<p>");
987                    if let Some(text) = &element.text {
988                        html.push_str(&Self::escape_html(text));
989                    }
990                    html.push_str("</p>\n</section>\n");
991                }
992                // Reference
993                LayoutElementType::Reference | LayoutElementType::ReferenceContent => {
994                    html.push_str("<section class=\"references\">\n<p>");
995                    if let Some(text) = &element.text {
996                        html.push_str(&Self::escape_html(text));
997                    }
998                    html.push_str("</p>\n</section>\n");
999                }
1000                // List
1001                LayoutElementType::List => {
1002                    html.push_str("<ul>\n");
1003                    if let Some(text) = &element.text {
1004                        for line in text.lines() {
1005                            html.push_str("<li>");
1006                            html.push_str(&Self::escape_html(line));
1007                            html.push_str("</li>\n");
1008                        }
1009                    }
1010                    html.push_str("</ul>\n");
1011                }
1012                // Header
1013                _ if element.element_type.is_header() => {
1014                    html.push_str("<header>");
1015                    if let Some(text) = &element.text {
1016                        html.push_str(&Self::escape_html(text));
1017                    }
1018                    html.push_str("</header>\n");
1019                }
1020                // Footer
1021                _ if element.element_type.is_footer() => {
1022                    html.push_str("<footer>");
1023                    if let Some(text) = &element.text {
1024                        html.push_str(&Self::escape_html(text));
1025                    }
1026                    html.push_str("</footer>\n");
1027                }
1028                // Default text
1029                _ => {
1030                    if let Some(text) = &element.text {
1031                        html.push_str("<p>");
1032                        html.push_str(&Self::escape_html(text));
1033                        html.push_str("</p>\n");
1034                    }
1035                }
1036            }
1037        }
1038        html.push_str("</body>\n</html>");
1039        html
1040    }
1041
1042    /// Escapes HTML special characters.
1043    fn escape_html(text: &str) -> String {
1044        text.replace('&', "&amp;")
1045            .replace('<', "&lt;")
1046            .replace('>', "&gt;")
1047            .replace('"', "&quot;")
1048            .replace('\'', "&#39;")
1049    }
1050
1051    /// Converts the result to a JSON Value.
1052    pub fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
1053        serde_json::to_value(self)
1054    }
1055
1056    /// Saves the analysis results to the specified directory.
1057    ///
1058    /// This generates:
1059    /// - `*_res.json`: The full structured result
1060    /// - `*_res.html`: An HTML representation
1061    ///
1062    /// Note: Markdown export with image extraction should use the example utilities
1063    /// (`examples/utils/markdown.rs`) instead, as that requires I/O operations
1064    /// that belong in the application layer. Use `StructureResult::to_markdown()`
1065    /// for pure markdown generation without side effects.
1066    ///
1067    /// # Arguments
1068    ///
1069    /// * `output_dir` - Directory to save the output files
1070    /// * `to_json` - If true, save a JSON representation
1071    /// * `to_html` - If true, save an HTML representation
1072    pub fn save_results(
1073        &self,
1074        output_dir: impl AsRef<Path>,
1075        to_json: bool,
1076        to_html: bool,
1077    ) -> std::io::Result<()> {
1078        let output_dir = output_dir.as_ref();
1079        if !output_dir.exists() {
1080            std::fs::create_dir_all(output_dir)?;
1081        }
1082
1083        let input_path = Path::new(self.input_path.as_ref());
1084        // Extract file stem, handling PDF page suffix (e.g., "file.pdf#3" -> "file_003")
1085        let stem = if let Some(path_str) = input_path.to_str() {
1086            if let Some(hash_idx) = path_str.rfind('#') {
1087                // This is a PDF page reference like "file.pdf#3"
1088                let base = &path_str[..hash_idx];
1089                let page_num = &path_str[hash_idx + 1..];
1090                let base_stem = Path::new(base)
1091                    .file_stem()
1092                    .and_then(|s| s.to_str())
1093                    .unwrap_or("result");
1094                format!("{}_{}", base_stem, page_num)
1095            } else {
1096                input_path
1097                    .file_stem()
1098                    .and_then(|s| s.to_str())
1099                    .unwrap_or("result")
1100                    .to_string()
1101            }
1102        } else {
1103            "result".to_string()
1104        };
1105
1106        // Save JSON
1107        if to_json {
1108            let json_path = output_dir.join(format!("{}.json", stem));
1109            let json_file = std::fs::File::create(json_path)?;
1110            serde_json::to_writer_pretty(json_file, self)?;
1111        }
1112
1113        // Save HTML
1114        if to_html {
1115            let html_path = output_dir.join(format!("{}.html", stem));
1116            std::fs::write(html_path, self.to_html())?;
1117        }
1118
1119        Ok(())
1120    }
1121}
1122
1123/// Determines paragraph continuity flags for the current element relative to the previous.
1124///
1125/// This implements PaddleX's `get_seg_flag` logic from `layout_parsing/utils.py`:
1126/// - `seg_start_flag = true` means this element starts a NEW paragraph
1127/// - `seg_start_flag = false` means this element CONTINUES the previous paragraph
1128///
1129/// The logic checks whether:
1130/// 1. Previous block's last line ends near the right edge (text fills to right)
1131/// 2. Current block's first line starts near the left edge (no indentation)
1132/// 3. Previous block has more than one line
1133/// 4. The two blocks are horizontally close enough
1134///
1135/// Returns `seg_start_flag` (true = new paragraph, false = continuation).
1136fn get_seg_flag(current: &LayoutElement, prev: Option<&LayoutElement>) -> bool {
1137    const COORD_THRESHOLD: f32 = 10.0;
1138
1139    let seg_start = current.seg_start_x.unwrap_or(current.bbox.x_min());
1140    let mut context_left = current.bbox.x_min();
1141    let mut context_right = current.bbox.x_max();
1142
1143    if let Some(prev) = prev {
1144        let prev_seg_end = prev.seg_end_x.unwrap_or(prev.bbox.x_max());
1145        let prev_num_lines = prev.num_lines.unwrap_or(1);
1146
1147        // Check if blocks overlap horizontally
1148        let overlap_blocks = context_left < prev.bbox.x_max() && context_right > prev.bbox.x_min();
1149
1150        let edge_distance;
1151        if overlap_blocks {
1152            context_left = context_left.min(prev.bbox.x_min());
1153            context_right = context_right.max(prev.bbox.x_max());
1154            edge_distance = 0.0;
1155        } else {
1156            edge_distance = (current.bbox.x_min() - prev.bbox.x_max()).abs();
1157        }
1158
1159        let prev_end_space_small = (context_right - prev_seg_end).abs() < COORD_THRESHOLD;
1160        let current_start_space_small = seg_start - context_left < COORD_THRESHOLD;
1161        let prev_lines_more_than_one = prev_num_lines > 1;
1162        let blocks_close = edge_distance
1163            < (prev.bbox.x_max() - prev.bbox.x_min())
1164                .max(current.bbox.x_max() - current.bbox.x_min());
1165
1166        if prev_end_space_small
1167            && current_start_space_small
1168            && prev_lines_more_than_one
1169            && blocks_close
1170        {
1171            return false; // continuation
1172        }
1173
1174        true // new paragraph
1175    } else {
1176        // First element: check if text starts near the left edge
1177        if seg_start - context_left < COORD_THRESHOLD {
1178            return false; // continuation from previous page (no indentation)
1179        }
1180        true
1181    }
1182}
1183
1184/// Checks if a text element appears to start a new paragraph.
1185///
1186/// Following PaddleX's logic: if the text starts near the left edge of the page
1187/// (within 5% of page width), it's likely the start of a new paragraph.
1188fn is_new_paragraph_start(element: &LayoutElement, page_width: Option<f32>) -> bool {
1189    let left = element.bbox.x_min();
1190    let threshold = page_width.map_or(50.0, |w| w * 0.05); // 5% of page width
1191    left <= threshold
1192}
1193
1194/// Checks if a text element appears to complete its paragraph on this page.
1195///
1196/// Following PaddleX's logic: if the text ends before the right edge of the page
1197/// (not within 10% of right margin), the paragraph likely ends here.
1198fn is_paragraph_complete(element: &LayoutElement, page_width: Option<f32>) -> bool {
1199    let right = element.bbox.x_max();
1200
1201    // If we have page width info, check if element ends before the right edge
1202    if let Some(width) = page_width {
1203        let right_margin = width * 0.1;
1204        return right <= (width - right_margin);
1205    }
1206
1207    // Conservative default: assume paragraphs end
1208    true
1209}
1210
1211/// Concatenates markdown content from multiple pages into a single document.
1212///
1213/// This follows PaddleX's `concatenate_markdown_pages` logic to intelligently
1214/// merge pages while preserving paragraph continuity.
1215///
1216/// # Arguments
1217///
1218/// * `results` - Slice of structure results from multiple pages (in order)
1219///
1220/// # Returns
1221///
1222/// A single markdown string with all pages properly concatenated
1223pub fn concatenate_markdown_pages(results: &[StructureResult]) -> String {
1224    if results.is_empty() {
1225        return String::new();
1226    }
1227
1228    if results.len() == 1 {
1229        return results[0].to_markdown();
1230    }
1231
1232    let mut markdown = String::new();
1233    let mut prev_page_end_flag = true; // First page is treated as starting fresh
1234
1235    for result in results.iter() {
1236        let flags = result
1237            .page_continuation_flags
1238            .as_ref()
1239            .cloned()
1240            .unwrap_or_else(|| result.calculate_continuation_flags());
1241
1242        let page_markdown = result.to_markdown();
1243
1244        // Skip empty pages
1245        if page_markdown.trim().is_empty() {
1246            prev_page_end_flag = flags.paragraph_end;
1247            continue;
1248        }
1249
1250        let page_first_continues = !flags.paragraph_start;
1251        let _page_last_continues = !flags.paragraph_end;
1252
1253        // Determine how to join this page
1254        if page_first_continues && !prev_page_end_flag {
1255            // Both pages are in the middle of the same paragraph
1256            // Check for Chinese characters to decide spacing
1257            let last_char = markdown.chars().last();
1258            let first_char = page_markdown.chars().next();
1259
1260            let last_is_chinese = last_char.is_some_and(is_chinese_char);
1261            let first_is_chinese = first_char.is_some_and(is_chinese_char);
1262
1263            if !last_is_chinese && !first_is_chinese {
1264                // Non-Chinese text: add space
1265                markdown.push(' ');
1266                markdown.push_str(page_markdown.trim_start());
1267            } else {
1268                // Chinese or mixed: direct concatenation
1269                markdown.push_str(page_markdown.trim_start());
1270            }
1271        } else {
1272            // New paragraph or section
1273            if !markdown.is_empty() {
1274                markdown.push_str("\n\n");
1275            }
1276            markdown.push_str(&page_markdown);
1277        }
1278
1279        prev_page_end_flag = flags.paragraph_end;
1280    }
1281
1282    markdown.trim().to_string()
1283}
1284
/// Flattens raw OCR text onto a single line.
///
/// Intended for raw OCR text only, not for already formatted markdown or HTML.
/// Following PaddleX's approach, two steps are applied in order:
/// 1. Line-break hyphenation is undone: every `-\n` is removed so the word halves join
/// 2. Every remaining `\n` becomes a single space
fn clean_ocr_text(text: &str) -> String {
    // Dropping each "-\n" separator glues hyphen-broken words back together.
    let rejoined: String = text.split("-\n").collect();
    // Whatever line breaks are left separate words, so they turn into spaces.
    rejoined.split('\n').collect::<Vec<_>>().join(" ")
}
1298
/// Turns the leading keyword of a block into a markdown heading.
///
/// Rust counterpart of PaddleX's `format_first_line_func`:
/// 1. `text` is split on `spliter`
/// 2. The first token that is not blank (after trimming) is located
/// 3. If that trimmed token equals one of `templates` (ASCII case-insensitive), the
///    token is replaced by `heading_prefix` + trimmed token + `\n`
/// 4. All tokens are joined again with `spliter`
///
/// Only the first non-blank token is ever considered; if it does not match, the text
/// is returned unchanged.
///
/// Used with `spliter=" "` and templates `["abstract", "摘要"]` for abstracts, and
/// with `spliter="\n"` and templates `["references", "参考文献"]` for references.
fn format_first_line(
    text: &str,
    spliter: &str,
    templates: &[&str],
    heading_prefix: &str,
) -> String {
    let tokens: Vec<&str> = text.split(spliter).collect();

    // Position and replacement of the first non-blank token, if it is a known keyword.
    let heading = tokens
        .iter()
        .enumerate()
        .find(|(_, token)| !token.trim().is_empty())
        .and_then(|(idx, token)| {
            let keyword = token.trim();
            templates
                .iter()
                .any(|t| keyword.eq_ignore_ascii_case(t))
                .then(|| (idx, format!("{heading_prefix}{keyword}\n")))
        });

    match heading {
        Some((idx, line)) => {
            let mut pieces = tokens;
            pieces[idx] = &line;
            pieces.join(spliter)
        }
        None => tokens.join(spliter),
    }
}
1341
/// Formats a plain text block following PaddleX's text handling.
///
/// Three replacements run in sequence:
/// 1. `-\n` is removed, joining words that were hyphenated across a line break
/// 2. `\n\n` collapses to `\n`, so existing blank lines are not doubled
/// 3. each `\n` expands to `\n\n`, making every OCR line break a markdown
///    paragraph break
fn format_text_block(text: &str) -> String {
    text.replace("-\n", "")
        .replace("\n\n", "\n")
        .replace('\n', "\n\n")
}
1355
/// Formats content blocks (table of contents) following PaddleX:
/// `.replace("-\n", "  \n").replace("\n", "  \n")`
///
/// Every line break ends up preceded by trailing spaces, which markdown renders as a
/// hard line break inside the same paragraph. Because the second replacement also
/// touches the output of the first, a `-\n` becomes four spaces plus `\n`.
fn format_content_block(text: &str) -> String {
    // PDF hyphen line breaks first ...
    let hyphen_breaks = text.split("-\n").collect::<Vec<_>>().join("  \n");
    // ... then every newline, including the ones just produced.
    hyphen_breaks.split('\n').collect::<Vec<_>>().join("  \n")
}
1366
/// Formats vision footnote blocks following PaddleX.
///
/// 1. Hyphenation at line ends is undone: `-\n` is removed
/// 2. `\n\n` collapses to `\n`
/// 3. each remaining `\n` expands to `\n\n` (one paragraph per footnote line)
fn format_vision_footnote_block(text: &str) -> String {
    let joined: String = text.split("-\n").collect();
    let single_breaks = joined.split("\n\n").collect::<Vec<_>>().join("\n");
    single_breaks.split('\n').collect::<Vec<_>>().join("\n\n")
}
1375
/// Characters that OCR output commonly uses as bullet glyphs.
const BULLET_MARKERS: &[char] = &['•', '●', '◦', '▪', '◆'];

/// Returns `true` when `text` contains at least one of the `BULLET_MARKERS`, i.e. when
/// it should be rendered as a list.
fn has_bullet_markers(text: &str) -> bool {
    text.chars().any(|ch| BULLET_MARKERS.contains(&ch))
}
1383
1384/// Formats text with bullet markers as a markdown list.
1385///
1386/// Splits on any bullet marker character so mixed markers (e.g. `• item1 ▪ item2`)
1387/// are all handled correctly.
1388fn format_as_bullet_list(text: &str, md: &mut String) {
1389    for item in text.split(|c: char| BULLET_MARKERS.contains(&c)) {
1390        let item = item.trim();
1391        if !item.is_empty() {
1392            md.push_str("- ");
1393            md.push_str(item);
1394            md.push('\n');
1395        }
1396    }
1397}
1398
/// Reports whether `c` is a CJK ideograph.
///
/// Covers the CJK Unified Ideographs block and its Extensions A through F. Used to
/// decide whether a space is needed when joining text across pages.
fn is_chinese_char(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'         // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}'   // Extension A
            | '\u{20000}'..='\u{2A6DF}' // Extension B
            | '\u{2A700}'..='\u{2B73F}' // Extension C
            | '\u{2B740}'..='\u{2B81F}' // Extension D
            | '\u{2B820}'..='\u{2CEAF}' // Extension E
            | '\u{2CEB0}'..='\u{2EBEF}' // Extension F
    )
}
1415
/// Reports whether `c` is an ASCII lowercase letter (`a` through `z`).
fn is_lowercase(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1420
/// Reports whether `c` is an ASCII uppercase letter (`A` through `Z`).
fn is_uppercase(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1425
/// Reports whether `c` is an ASCII decimal digit (`0` through `9`).
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1430
1431/// Removes PDF hyphenation artifacts from text.
1432/// Dehyphenation: only handles hyphen-newline patterns (word breaks across lines).
1433///
1434/// Matches PaddleX's behavior where hyphens are only stripped at line boundaries
1435/// (hyphen immediately followed by newline). Mid-word hyphens in compound words
1436/// like "real-time", "end-to-end", "one-to-many" are preserved.
1437fn dehyphenate(text: &str) -> String {
1438    let mut result = String::with_capacity(text.len());
1439    let chars: Vec<char> = text.chars().collect();
1440    let len = chars.len();
1441    let mut i = 0;
1442
1443    // Helper to check if we're in a URL-like pattern
1444    let is_url_context = |pos: usize| -> bool {
1445        let start = pos.saturating_sub(10);
1446        let end = (pos + 5).min(len);
1447        let window: String = chars[start..end].iter().collect();
1448        window.contains("http") || window.contains("www") || window.contains("://")
1449    };
1450
1451    while i < len {
1452        if chars[i] == '-' {
1453            if is_url_context(i) {
1454                result.push('-');
1455                i += 1;
1456                continue;
1457            }
1458
1459            // Only dehyphenate when hyphen is followed by newline (line-break hyphenation).
1460            // Pattern: "word-\nletter" → "wordletter"
1461            let is_artifact = if i + 1 < len && chars[i + 1] == '\n' {
1462                // Hyphen followed by newline — check if next line starts with lowercase
1463                if i + 2 < len {
1464                    is_lowercase(chars[i + 2])
1465                } else {
1466                    false
1467                }
1468            } else {
1469                false
1470            };
1471
1472            if is_artifact {
1473                // Skip the hyphen and the following newline
1474                i += 1; // skip newline (will be incremented again at end of loop)
1475            } else {
1476                result.push('-');
1477            }
1478        } else {
1479            result.push(chars[i]);
1480        }
1481        i += 1;
1482    }
1483
1484    result
1485}
1486
1487/// Fixes missing spaces between merged words.
1488///
1489/// OCR and PDF extraction can result in merged words like
1490/// "enhancetheencoder'sfeaturerepresentation" or "48.1%AP".
1491/// This function detects and fixes common patterns.
1492fn fix_merged_words(text: &str) -> String {
1493    let mut result = String::with_capacity(text.len());
1494    let chars: Vec<char> = text.chars().collect();
1495    let mut i = 0;
1496
1497    while i < chars.len() {
1498        let current = chars[i];
1499
1500        if i > 0 {
1501            let prev = chars[i - 1];
1502
1503            // Detect missing space between lowercase and lowercase (after apostrophe or consonant)
1504            // e.g., "encoder'sfeature" -> "encoder's feature"
1505            if is_lowercase(prev) && is_lowercase(current) {
1506                // Only add space if previous was apostrophe or word boundary context
1507                // This is a heuristic - in practice you'd want more sophisticated NLP
1508                if i > 1 && chars[i - 2] == '\'' {
1509                    result.push(' ');
1510                }
1511                // Also detect lowercase followed by uppercase
1512                // e.g., "RT-DETRis" -> "RT-DETR is"
1513            } else if is_lowercase(prev) && is_uppercase(current) {
1514                // Check if the uppercase starts a new word (not an acronym)
1515                // If next char is lowercase, it's likely a new word
1516                if i + 1 < chars.len() && is_lowercase(chars[i + 1]) {
1517                    result.push(' ');
1518                }
1519            }
1520            // Detect digit/percent followed by letter, or letter-digit-letter pattern
1521            // e.g., "48.1%AP" -> "48.1% AP"
1522            // e.g., "RT-DETRv3" shouldn't be split, but "model 100instances" -> "model 100 instances"
1523            else if ((is_digit(prev) || prev == '%') && is_uppercase(current))
1524                || (is_letter(prev)
1525                    && is_digit(current)
1526                    && i + 1 < chars.len()
1527                    && is_letter(chars[i + 1]))
1528            {
1529                result.push(' ');
1530            }
1531        }
1532
1533        result.push(current);
1534        i += 1;
1535    }
1536
1537    result
1538}
1539
/// Returns `true` when `c` is an ASCII letter of either case (`a-z` or `A-Z`).
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic()
}
1544
/// Strips the document wrapper tags from table HTML, following PaddleX's
/// `simplify_table_func`.
///
/// Every literal occurrence of `<html>`, `</html>`, `<body>` and `</body>` is
/// removed, in that order, so the table markup can be embedded directly in
/// markdown. Matching is exact and case-sensitive.
fn simplify_table_html(html: &str) -> String {
    const WRAPPER_TAGS: [&str; 4] = ["<html>", "</html>", "<body>", "</body>"];

    WRAPPER_TAGS
        .iter()
        .fold(html.to_owned(), |stripped, tag| stripped.replace(*tag, ""))
}
1555
1556/// Post-processes text content to fix common OCR/PDF artifacts.
1557///
1558/// This applies multiple cleanup steps:
1559/// 1. Dehyphenation - removes line-break hyphens
1560/// 2. Word merging fixes - adds missing spaces
1561/// 3. Spacing normalization - fixes multiple spaces
1562pub fn postprocess_text(text: &str) -> String {
1563    let text = dehyphenate(text);
1564    let text = fix_merged_words(&text);
1565
1566    // Normalize whitespace (collapse multiple spaces, fix spacing after punctuation)
1567    let mut result = String::new();
1568    let mut in_space = false;
1569
1570    for c in text.chars() {
1571        if c.is_whitespace() {
1572            if !in_space && !result.is_empty() {
1573                result.push(' ');
1574                in_space = true;
1575            }
1576        } else {
1577            // Fix missing space after period (when followed by letter)
1578            if c == '.' && !result.is_empty() {
1579                let last = result.chars().last().unwrap();
1580                if is_letter(last) || is_digit(last) {
1581                    result.push('.');
1582                    in_space = true;
1583                    continue;
1584                }
1585            }
1586            // Fix spacing after punctuation
1587            if in_space && matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')' | ']' | '}') {
1588                result.pop(); // Remove the space before punctuation
1589                result.push(c);
1590                continue;
1591            }
1592            result.push(c);
1593            in_space = false;
1594        }
1595    }
1596
1597    result
1598}
1599
/// Drops repeated bold section headers from concatenated markdown.
///
/// A line counts as a section header when, after trimming, it is wrapped in
/// `**...**` with at least one character in between (for example
/// `**Abstract**` or `**References**`). The first line with a given header
/// text is kept; later lines with the same text are removed. All other lines
/// pass through unchanged and are re-joined with `\n`.
fn deduplicate_sections(markdown: &str) -> String {
    let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let mut output = String::new();

    for line in markdown.lines() {
        // `Some(name)` only for a non-empty name wrapped in a `**` pair.
        let header_name = line
            .trim()
            .strip_prefix("**")
            .and_then(|rest| rest.strip_suffix("**"))
            .filter(|name| !name.is_empty());

        if let Some(name) = header_name {
            // `insert` returns false when the header was already recorded.
            if !seen.insert(name) {
                continue;
            }
        }

        if !output.is_empty() {
            output.push('\n');
        }
        output.push_str(line);
    }

    output
}
1638
1639/// Checks if two bounding boxes are on the same line (have significant vertical overlap).
1640///
1641/// Two boxes are considered on the same line if their vertical overlap is greater than
1642/// 50% of the smaller box's height.
1643fn is_same_line(bbox1: &BoundingBox, bbox2: &BoundingBox) -> bool {
1644    let y1_min = bbox1.y_min();
1645    let y1_max = bbox1.y_max();
1646    let y2_min = bbox2.y_min();
1647    let y2_max = bbox2.y_max();
1648
1649    // Calculate vertical overlap
1650    let overlap_start = y1_min.max(y2_min);
1651    let overlap_end = y1_max.min(y2_max);
1652    let overlap = (overlap_end - overlap_start).max(0.0);
1653
1654    // Calculate minimum height
1655    let height1 = y1_max - y1_min;
1656    let height2 = y2_max - y2_min;
1657    let min_height = height1.min(height2);
1658
1659    // Consider same line if overlap > 50% of min height
1660    min_height > 0.0 && overlap / min_height > 0.5
1661}
1662
/// Filters empty formula blocks from markdown.
///
/// A formula block is a `$$` line (ignoring surrounding whitespace), the lines
/// after it, and the next `$$` line that closes it. Blocks whose interior has
/// no non-blank line, such as `$$\n$$` or `$$\n\n$$`, are removed together
/// with one blank line that directly follows them. Blocks with content are
/// copied verbatim, including both delimiters, so a closing `$$` is never
/// mistaken for the start of a new block. A `$$` line with no closing partner
/// is kept as-is.
///
/// Lines are re-joined with `\n`; the output has no trailing newline.
fn filter_empty_formulas(markdown: &str) -> String {
    /// True for a line that consists solely of the `$$` delimiter.
    fn is_delimiter(line: &str) -> bool {
        line.trim() == "$$"
    }

    /// Appends `line`, separating it from earlier output with a newline.
    fn append_line(out: &mut String, line: &str) {
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(line);
    }

    let lines: Vec<&str> = markdown.lines().collect();
    let mut result = String::with_capacity(markdown.len());
    let mut i = 0;

    while i < lines.len() {
        if is_delimiter(lines[i]) {
            // Pair this opening delimiter with the next delimiter line.
            let closing = lines[i + 1..]
                .iter()
                .position(|line| is_delimiter(line))
                .map(|offset| i + 1 + offset);

            if let Some(close) = closing {
                let is_empty_block = lines[i + 1..close]
                    .iter()
                    .all(|line| line.trim().is_empty());

                if is_empty_block {
                    // Drop the whole block plus one blank line right after it.
                    i = close + 1;
                    if i < lines.len() && lines[i].trim().is_empty() {
                        i += 1;
                    }
                } else {
                    // Emit the block untouched, consuming the closing delimiter
                    // so it cannot be read as a new opening delimiter.
                    for line in &lines[i..=close] {
                        append_line(&mut result, line);
                    }
                    i = close + 1;
                }
                continue;
            }
        }

        append_line(&mut result, lines[i]);
        i += 1;
    }

    result
}
1726
1727/// Applies all post-processing steps to concatenated markdown.
1728///
1729/// This is the main entry point for cleaning up concatenated markdown output.
1730pub fn postprocess_markdown(markdown: &str) -> String {
1731    let markdown = filter_empty_formulas(markdown);
1732    let markdown = deduplicate_sections(&markdown);
1733
1734    // Apply text post-processing line by line for text content
1735    let mut result = String::new();
1736    let mut in_code_block = false;
1737    let mut in_formula = false;
1738
1739    for line in markdown.lines() {
1740        let trimmed = line.trim();
1741
1742        // Detect code blocks
1743        if trimmed.starts_with("```") {
1744            in_code_block = !in_code_block;
1745            result.push_str(line);
1746            result.push('\n');
1747            continue;
1748        }
1749
1750        // Detect formula blocks
1751        if trimmed == "$$" {
1752            in_formula = !in_formula;
1753            result.push_str(line);
1754            result.push('\n');
1755            continue;
1756        }
1757
1758        // Skip processing inside code blocks
1759        if in_code_block {
1760            result.push_str(line);
1761            result.push('\n');
1762            continue;
1763        }
1764
1765        // If inside a formula block, ensure it doesn't contain unescaped dollar signs
1766        // which cause KaTeX "Can't use function '$' in math mode" errors.
1767        if in_formula {
1768            // If the formula content looks like regular text (many spaces, few backslashes)
1769            // and contains a $, KaTeX will fail. We escape the $ inside the math block.
1770            let contains_dollar = line.contains('$');
1771            let is_plain_text = line.split_whitespace().count() > 3 && !line.contains('\\');
1772
1773            if contains_dollar && is_plain_text {
1774                result.push_str(&line.replace('$', "\\$"));
1775            } else if contains_dollar {
1776                // Escape bare dollar signs inside the math block to avoid
1777                // "Can't use function '$' in math mode" KaTeX errors while
1778                // preserving literal dollars (e.g. \text{$10}).
1779                result.push_str(&line.replace('$', "\\$"));
1780            } else {
1781                result.push_str(line);
1782            }
1783            result.push('\n');
1784            continue;
1785        }
1786
1787        // Process text content (skip headers, lists, etc.)
1788        if trimmed.starts_with('#')
1789            || trimmed.starts_with('*')
1790            || trimmed.starts_with('>')
1791            || trimmed.starts_with('|')
1792            || trimmed.starts_with('-')
1793            || trimmed.starts_with('+')
1794        {
1795            result.push_str(line);
1796        } else {
1797            result.push_str(&postprocess_text(line));
1798        }
1799        result.push('\n');
1800    }
1801
1802    result
1803}
1804
/// Extension trait for convenient multi-page processing.
///
/// Both items are associated functions without a `self` receiver: they operate
/// on a slice of results, one entry per page.
pub trait StructureResultExt {
    /// Converts multiple results to a single concatenated markdown.
    ///
    /// `results` is the slice of per-page results to merge; the merged
    /// markdown is returned as one `String`.
    fn to_concatenated_markdown(results: &[Self]) -> String
    where
        Self: Sized;

    /// Saves multiple results with concatenated markdown.
    ///
    /// The notes below describe the `StructureResult` implementation in this
    /// file; other implementors may behave differently.
    ///
    /// # Arguments
    ///
    /// * `results` - Results to save. Each one is written to its own
    ///   `page_NNN` subdirectory of `output_dir`, numbered from zero by its
    ///   position in the slice.
    /// * `output_dir` - Destination directory; created when it does not exist.
    /// * `base_name` - File stem of the combined outputs (`{base_name}.md`
    ///   and `{base_name}.json`).
    /// * `to_json` - Forwarded to the per-page save, and also enables the
    ///   combined JSON file containing all results.
    /// * `to_markdown` - Enables the combined markdown file.
    /// * `to_html` - Forwarded to the per-page save.
    ///
    /// # Errors
    ///
    /// Returns the first error hit while creating directories or writing
    /// files.
    fn save_multi_page_results(
        results: &[Self],
        output_dir: impl AsRef<std::path::Path>,
        base_name: &str,
        to_json: bool,
        to_markdown: bool,
        to_html: bool,
    ) -> std::io::Result<()>
    where
        Self: Sized;
}
1824
1825impl StructureResultExt for StructureResult {
1826    fn to_concatenated_markdown(results: &[Self]) -> String {
1827        concatenate_markdown_pages(results)
1828    }
1829
1830    fn save_multi_page_results(
1831        results: &[Self],
1832        output_dir: impl AsRef<std::path::Path>,
1833        base_name: &str,
1834        to_json: bool,
1835        to_markdown: bool,
1836        to_html: bool,
1837    ) -> std::io::Result<()>
1838    where
1839        Self: Sized,
1840    {
1841        let output_dir = output_dir.as_ref();
1842        if !output_dir.exists() {
1843            std::fs::create_dir_all(output_dir)?;
1844        }
1845
1846        // Save individual page results
1847        for (idx, result) in results.iter().enumerate() {
1848            let page_dir = output_dir.join(format!("page_{:03}", idx));
1849            std::fs::create_dir_all(&page_dir)?;
1850            result.save_results(&page_dir, to_json, to_html)?;
1851        }
1852
1853        // Save concatenated markdown
1854        if to_markdown {
1855            let concat_md_path = output_dir.join(format!("{}.md", base_name));
1856            std::fs::write(concat_md_path, Self::to_concatenated_markdown(results))?;
1857        }
1858
1859        // Save concatenated JSON (array of results)
1860        if to_json {
1861            let concat_json_path = output_dir.join(format!("{}.json", base_name));
1862            let json_file = std::fs::File::create(concat_json_path)?;
1863            serde_json::to_writer_pretty(json_file, &results)?;
1864        }
1865
1866        Ok(())
1867    }
1868}
1869
/// A layout element detected in the document.
///
/// `bbox`, `element_type` and `confidence` are always present; every other
/// field is optional. The three segment fields (`seg_start_x`, `seg_end_x`,
/// `num_lines`) are left out of the serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutElement {
    /// Bounding box of the element
    pub bbox: BoundingBox,
    /// Type of the layout element
    pub element_type: LayoutElementType,
    /// Confidence score for the detection
    pub confidence: f32,
    /// Optional label for the element (original model label)
    pub label: Option<String>,
    /// Optional text content for the element
    pub text: Option<String>,
    /// Reading order index (1-based, assigned during stitching)
    ///
    /// This index represents the element's position in the reading order.
    /// Only elements that should be included in reading flow (text, tables,
    /// formulas, images, etc.) will have an order index assigned.
    /// Headers, footers, and other auxiliary elements may have `None`.
    pub order_index: Option<u32>,
    /// X-coordinate of the first text span's left edge within this element.
    /// Used by `get_seg_flag` to detect paragraph continuity across blocks.
    /// Computed during stitching from the first OCR region (after spatial sort).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seg_start_x: Option<f32>,
    /// X-coordinate of the last text span's right edge within this element.
    /// Used by `get_seg_flag` to detect paragraph continuity across blocks.
    /// Computed during stitching from the last OCR region (after spatial sort).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seg_end_x: Option<f32>,
    /// Number of text lines within this element.
    /// Used by `get_seg_flag` to detect paragraph continuity across blocks.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_lines: Option<u32>,
}
1905
1906impl LayoutElement {
1907    /// Creates a new layout element.
1908    pub fn new(bbox: BoundingBox, element_type: LayoutElementType, confidence: f32) -> Self {
1909        Self {
1910            bbox,
1911            element_type,
1912            confidence,
1913            label: None,
1914            text: None,
1915            order_index: None,
1916            seg_start_x: None,
1917            seg_end_x: None,
1918            num_lines: None,
1919        }
1920    }
1921
1922    /// Sets the label for the element.
1923    pub fn with_label(mut self, label: impl Into<String>) -> Self {
1924        self.label = Some(label.into());
1925        self
1926    }
1927
1928    /// Sets the text content for the element.
1929    pub fn with_text(mut self, text: impl Into<String>) -> Self {
1930        self.text = Some(text.into());
1931        self
1932    }
1933}
1934
/// Layout element type supporting PP-StructureV3's full label set.
///
/// This enum represents both **semantic categories** and **fine-grained labels** for layout elements.
/// PP-StructureV3 models output 20 or 23 class labels depending on the model variant.
///
/// The original model-specific label is preserved in `LayoutElement.label` field.
///
/// # PP-StructureV3 Label Categories
///
/// **Document structure:**
/// - `DocTitle` - Document title (doc_title)
/// - `ParagraphTitle` - Section/paragraph title (paragraph_title)
/// - `Text` - General text content
/// - `Content` - Table of contents (content)
/// - `Abstract` - Abstract section
///
/// **Visual elements:**
/// - `Image` - Images/figures (image, figure)
/// - `Table` - Tables
/// - `Chart` - Charts/graphs
/// - `Formula` - Mathematical formulas
///
/// **Captions and titles:**
/// - `FigureTitle` - Figure caption (figure_title)
/// - `TableTitle` - Table caption (table_title)
/// - `ChartTitle` - Chart caption (chart_title)
/// - `FigureTableChartTitle` - Combined caption type
///
/// **Page structure:**
/// - `Header` - Page header
/// - `HeaderImage` - Header image
/// - `Footer` - Page footer
/// - `FooterImage` - Footer image
/// - `Footnote` - Footnotes
///
/// **Special elements:**
/// - `Seal` - Stamps/official seals
/// - `Number` - Page numbers
/// - `Reference` - References section
/// - `ReferenceContent` - Reference content
/// - `Algorithm` - Algorithm blocks
/// - `FormulaNumber` - Formula numbers
/// - `AsideText` - Marginal/aside text
/// - `List` - List items
///
/// **Block grouping:**
/// - `Region` - Generic document region block (PP-DocBlockLayout)
///
/// - `Other` - Unknown/unmapped labels
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LayoutElementType {
    /// Document title
    DocTitle,
    /// Paragraph/section title
    ParagraphTitle,
    /// General text content
    Text,
    /// Table of contents
    Content,
    /// Abstract section
    Abstract,

    /// Image or figure
    Image,
    /// Table
    Table,
    /// Chart or graph
    Chart,
    /// Mathematical formula
    Formula,

    /// Figure caption/title
    FigureTitle,
    /// Table caption/title
    TableTitle,
    /// Chart caption/title
    ChartTitle,
    /// Combined figure/table/chart title (PP-DocLayout)
    FigureTableChartTitle,

    /// Page header
    Header,
    /// Header image
    HeaderImage,
    /// Page footer
    Footer,
    /// Footer image
    FooterImage,
    /// Footnote
    Footnote,

    /// Stamp or official seal
    Seal,
    /// Page number
    Number,
    /// Reference section
    Reference,
    /// Reference content (PP-DocLayout_plus-L)
    ReferenceContent,
    /// Algorithm block
    Algorithm,
    /// Formula number
    FormulaNumber,
    /// Marginal/aside text
    AsideText,
    /// List items
    List,

    /// Generic document region block (PP-DocBlockLayout)
    /// Used for hierarchical layout ordering and block grouping
    Region,

    /// Other/unknown (original label preserved in LayoutElement.label)
    Other,
}
2047
2048impl LayoutElementType {
2049    /// Returns the string representation of the element type.
2050    ///
2051    /// This returns the PP-StructureV3 compatible label string.
2052    pub fn as_str(&self) -> &'static str {
2053        match self {
2054            // Document Structure
2055            LayoutElementType::DocTitle => "doc_title",
2056            LayoutElementType::ParagraphTitle => "paragraph_title",
2057            LayoutElementType::Text => "text",
2058            LayoutElementType::Content => "content",
2059            LayoutElementType::Abstract => "abstract",
2060
2061            // Visual Elements
2062            LayoutElementType::Image => "image",
2063            LayoutElementType::Table => "table",
2064            LayoutElementType::Chart => "chart",
2065            LayoutElementType::Formula => "formula",
2066
2067            // Captions
2068            LayoutElementType::FigureTitle => "figure_title",
2069            LayoutElementType::TableTitle => "table_title",
2070            LayoutElementType::ChartTitle => "chart_title",
2071            LayoutElementType::FigureTableChartTitle => "figure_table_chart_title",
2072
2073            // Page Structure
2074            LayoutElementType::Header => "header",
2075            LayoutElementType::HeaderImage => "header_image",
2076            LayoutElementType::Footer => "footer",
2077            LayoutElementType::FooterImage => "footer_image",
2078            LayoutElementType::Footnote => "footnote",
2079
2080            // Special Elements
2081            LayoutElementType::Seal => "seal",
2082            LayoutElementType::Number => "number",
2083            LayoutElementType::Reference => "reference",
2084            LayoutElementType::ReferenceContent => "reference_content",
2085            LayoutElementType::Algorithm => "algorithm",
2086            LayoutElementType::FormulaNumber => "formula_number",
2087            LayoutElementType::AsideText => "aside_text",
2088            LayoutElementType::List => "list",
2089
2090            // Region (PP-DocBlockLayout)
2091            LayoutElementType::Region => "region",
2092
2093            // Fallback
2094            LayoutElementType::Other => "other",
2095        }
2096    }
2097
2098    /// Creates a LayoutElementType from a string label with fine-grained mapping.
2099    ///
2100    /// This method maps model output labels to their corresponding fine-grained types,
2101    /// preserving the full PP-StructureV3 label set (20/23 classes).
2102    pub fn from_label(label: &str) -> Self {
2103        match label.to_lowercase().as_str() {
2104            // Document Structure
2105            "doc_title" => LayoutElementType::DocTitle,
2106            "paragraph_title" | "title" => LayoutElementType::ParagraphTitle,
2107            "text" | "paragraph" => LayoutElementType::Text,
2108            "content" => LayoutElementType::Content,
2109            "abstract" => LayoutElementType::Abstract,
2110
2111            // Visual Elements
2112            "image" | "figure" => LayoutElementType::Image,
2113            "table" => LayoutElementType::Table,
2114            "chart" | "flowchart" => LayoutElementType::Chart,
2115            "formula" | "equation" | "display_formula" | "inline_formula" => {
2116                LayoutElementType::Formula
2117            }
2118
2119            // Captions
2120            "figure_title" => LayoutElementType::FigureTitle,
2121            "table_title" => LayoutElementType::TableTitle,
2122            "chart_title" => LayoutElementType::ChartTitle,
2123            "figure_table_chart_title" | "caption" => LayoutElementType::FigureTableChartTitle,
2124
2125            // Page Structure
2126            "header" => LayoutElementType::Header,
2127            "header_image" => LayoutElementType::HeaderImage,
2128            "footer" => LayoutElementType::Footer,
2129            "footer_image" => LayoutElementType::FooterImage,
2130            "footnote" | "vision_footnote" => LayoutElementType::Footnote,
2131
2132            // Special Elements
2133            "seal" => LayoutElementType::Seal,
2134            "number" => LayoutElementType::Number,
2135            "reference" => LayoutElementType::Reference,
2136            "reference_content" => LayoutElementType::ReferenceContent,
2137            "algorithm" => LayoutElementType::Algorithm,
2138            "formula_number" => LayoutElementType::FormulaNumber,
2139            "aside_text" => LayoutElementType::AsideText,
2140            "list" => LayoutElementType::List,
2141            "vertical_text" => LayoutElementType::Text,
2142
2143            // Region (PP-DocBlockLayout)
2144            "region" => LayoutElementType::Region,
2145
2146            // Everything else maps to Other
2147            // The original label is preserved in LayoutElement.label
2148            _ => LayoutElementType::Other,
2149        }
2150    }
2151
2152    /// Returns the semantic category for this element type.
2153    ///
2154    /// This method groups fine-grained types into broader semantic categories,
2155    /// useful for processing logic that doesn't need fine-grained distinctions.
2156    ///
2157    /// # Categories
2158    ///
2159    /// - **Title**: DocTitle, ParagraphTitle
2160    /// - **Text**: Text, Content, Abstract
2161    /// - **Visual**: Image, Chart
2162    /// - **Table**: Table
2163    /// - **Caption**: FigureTitle, TableTitle, ChartTitle, FigureTableChartTitle
2164    /// - **Header**: Header, HeaderImage
2165    /// - **Footer**: Footer, FooterImage, Footnote
2166    /// - **Formula**: Formula, FormulaNumber
2167    /// - **Special**: Seal, Number, Reference, ReferenceContent, Algorithm, AsideText
2168    /// - **List**: List
2169    /// - **Other**: Other
2170    pub fn semantic_category(&self) -> &'static str {
2171        match self {
2172            // Title category
2173            LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle => "title",
2174
2175            // Text category
2176            LayoutElementType::Text | LayoutElementType::Content | LayoutElementType::Abstract => {
2177                "text"
2178            }
2179
2180            // Visual category
2181            LayoutElementType::Image | LayoutElementType::Chart => "visual",
2182
2183            // Table category
2184            LayoutElementType::Table => "table",
2185
2186            // Caption category
2187            LayoutElementType::FigureTitle
2188            | LayoutElementType::TableTitle
2189            | LayoutElementType::ChartTitle
2190            | LayoutElementType::FigureTableChartTitle => "caption",
2191
2192            // Header category
2193            LayoutElementType::Header | LayoutElementType::HeaderImage => "header",
2194
2195            // Footer category
2196            LayoutElementType::Footer
2197            | LayoutElementType::FooterImage
2198            | LayoutElementType::Footnote => "footer",
2199
2200            // Formula category
2201            LayoutElementType::Formula | LayoutElementType::FormulaNumber => "formula",
2202
2203            // Special category
2204            LayoutElementType::Seal
2205            | LayoutElementType::Number
2206            | LayoutElementType::Reference
2207            | LayoutElementType::ReferenceContent
2208            | LayoutElementType::Algorithm
2209            | LayoutElementType::AsideText => "special",
2210
2211            // List category
2212            LayoutElementType::List => "list",
2213
2214            // Region category (PP-DocBlockLayout)
2215            LayoutElementType::Region => "region",
2216
2217            // Other
2218            LayoutElementType::Other => "other",
2219        }
2220    }
2221
2222    /// Returns whether this element type is a title variant.
2223    pub fn is_title(&self) -> bool {
2224        matches!(
2225            self,
2226            LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle
2227        )
2228    }
2229
2230    /// Returns whether this element type is a visual element (image, chart, figure).
2231    pub fn is_visual(&self) -> bool {
2232        matches!(self, LayoutElementType::Image | LayoutElementType::Chart)
2233    }
2234
2235    /// Returns whether this element type is a caption variant.
2236    pub fn is_caption(&self) -> bool {
2237        matches!(
2238            self,
2239            LayoutElementType::FigureTitle
2240                | LayoutElementType::TableTitle
2241                | LayoutElementType::ChartTitle
2242                | LayoutElementType::FigureTableChartTitle
2243        )
2244    }
2245
2246    /// Returns whether this element type is a header variant.
2247    pub fn is_header(&self) -> bool {
2248        matches!(
2249            self,
2250            LayoutElementType::Header | LayoutElementType::HeaderImage
2251        )
2252    }
2253
2254    /// Returns whether this element type is a footer variant.
2255    pub fn is_footer(&self) -> bool {
2256        matches!(
2257            self,
2258            LayoutElementType::Footer
2259                | LayoutElementType::FooterImage
2260                | LayoutElementType::Footnote
2261        )
2262    }
2263
2264    /// Returns whether this element type is a formula variant.
2265    pub fn is_formula(&self) -> bool {
2266        matches!(
2267            self,
2268            LayoutElementType::Formula | LayoutElementType::FormulaNumber
2269        )
2270    }
2271
2272    /// Returns whether this element type contains text content that should be OCR'd.
2273    pub fn should_ocr(&self) -> bool {
2274        matches!(
2275            self,
2276            LayoutElementType::Text
2277                | LayoutElementType::Content
2278                | LayoutElementType::Abstract
2279                | LayoutElementType::DocTitle
2280                | LayoutElementType::ParagraphTitle
2281                | LayoutElementType::FigureTitle
2282                | LayoutElementType::TableTitle
2283                | LayoutElementType::ChartTitle
2284                | LayoutElementType::FigureTableChartTitle
2285                | LayoutElementType::Header
2286                | LayoutElementType::HeaderImage
2287                | LayoutElementType::Footer
2288                | LayoutElementType::FooterImage
2289                | LayoutElementType::Footnote
2290                | LayoutElementType::Reference
2291                | LayoutElementType::ReferenceContent
2292                | LayoutElementType::Algorithm
2293                | LayoutElementType::AsideText
2294                | LayoutElementType::List
2295                | LayoutElementType::Number
2296        )
2297    }
2298}
2299
2300/// Removes heavily-overlapping layout elements in-place.
2301///
2302/// This mirrors PP-Structure-style overlap suppression where text takes priority over images.
2303/// Returns the number of elements removed.
2304pub fn remove_overlapping_layout_elements(
2305    layout_elements: &mut Vec<LayoutElement>,
2306    overlap_threshold: f32,
2307) -> usize {
2308    use std::collections::HashSet;
2309
2310    if layout_elements.len() <= 1 {
2311        return 0;
2312    }
2313
2314    let bboxes: Vec<_> = layout_elements.iter().map(|e| e.bbox.clone()).collect();
2315    let labels: Vec<&str> = layout_elements
2316        .iter()
2317        .map(|e| e.element_type.as_str())
2318        .collect();
2319
2320    let remove_indices =
2321        crate::processors::get_overlap_removal_indices(&bboxes, &labels, overlap_threshold);
2322    if remove_indices.is_empty() {
2323        return 0;
2324    }
2325
2326    let remove_set: HashSet<usize> = remove_indices.into_iter().collect();
2327    let before = layout_elements.len();
2328
2329    let mut idx = 0;
2330    layout_elements.retain(|_| {
2331        let keep = !remove_set.contains(&idx);
2332        idx += 1;
2333        keep
2334    });
2335
2336    before.saturating_sub(layout_elements.len())
2337}
2338
2339/// Applies small, PP-Structure-style label fixes to layout elements.
2340///
2341/// This is intended to capture lightweight "glue" heuristics that shouldn't live in `predict`.
2342pub fn apply_standardized_layout_label_fixes(layout_elements: &mut [LayoutElement]) {
2343    if layout_elements.is_empty() {
2344        return;
2345    }
2346
2347    let mut footnote_indices: Vec<usize> = Vec::new();
2348    let mut paragraph_title_indices: Vec<usize> = Vec::new();
2349    let mut bottom_text_y_max: f32 = 0.0;
2350    let mut max_block_area: f32 = 0.0;
2351    let mut doc_title_num: usize = 0;
2352
2353    for (idx, elem) in layout_elements.iter().enumerate() {
2354        let area =
2355            (elem.bbox.x_max() - elem.bbox.x_min()) * (elem.bbox.y_max() - elem.bbox.y_min());
2356        max_block_area = max_block_area.max(area);
2357
2358        match elem.element_type {
2359            LayoutElementType::Footnote => footnote_indices.push(idx),
2360            LayoutElementType::ParagraphTitle => paragraph_title_indices.push(idx),
2361            LayoutElementType::Text => {
2362                bottom_text_y_max = bottom_text_y_max.max(elem.bbox.y_max());
2363            }
2364            LayoutElementType::DocTitle => doc_title_num += 1,
2365            _ => {}
2366        }
2367    }
2368
2369    for idx in footnote_indices {
2370        if layout_elements[idx].bbox.y_max() < bottom_text_y_max {
2371            layout_elements[idx].element_type = LayoutElementType::Text;
2372            layout_elements[idx].label = Some("text".to_string());
2373        }
2374    }
2375
2376    let only_one_paragraph_title = paragraph_title_indices.len() == 1 && doc_title_num == 0;
2377    if only_one_paragraph_title {
2378        let idx = paragraph_title_indices[0];
2379        let area = (layout_elements[idx].bbox.x_max() - layout_elements[idx].bbox.x_min())
2380            * (layout_elements[idx].bbox.y_max() - layout_elements[idx].bbox.y_min());
2381
2382        let title_area_ratio_threshold = 0.3f32;
2383        if area > max_block_area * title_area_ratio_threshold {
2384            layout_elements[idx].element_type = LayoutElementType::DocTitle;
2385            layout_elements[idx].label = Some("doc_title".to_string());
2386        }
2387    }
2388}
2389
/// Result of table recognition.
///
/// Bundles a table's location and classified type with the recognized cells,
/// optional confidence scores, and optional HTML output. Fields marked
/// `#[serde(skip)]` are omitted from serialized output; per serde's `skip`
/// semantics they are filled with `Default::default()` on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableResult {
    /// Bounding box of the table in the original image
    pub bbox: BoundingBox,
    /// Table type (wired, wireless, or unknown)
    pub table_type: TableType,
    /// Confidence score for table type classification (None if classifier wasn't configured/run)
    pub classification_confidence: Option<f32>,
    /// Confidence score for table structure recognition (None if structure recognition failed)
    pub structure_confidence: Option<f32>,
    /// Detected table cells (empty until populated)
    pub cells: Vec<TableCell>,
    /// HTML structure of the table (if available)
    pub html_structure: Option<String>,
    /// OCR text content for each cell (if OCR was integrated); an individual
    /// entry may itself be `None`
    pub cell_texts: Option<Vec<Option<String>>>,
    /// Structure tokens from table structure recognition (used for HTML generation after stitching).
    /// Not serialized.
    #[serde(skip)]
    pub structure_tokens: Option<Vec<String>>,
    /// Detected cell bounding boxes from the cell detection model (in page coordinates).
    /// Stored separately from `cells` (which carry structure/grid metadata from the structure model)
    /// and used by the stitcher for row-aware IoA-based OCR matching.
    /// Not serialized.
    #[serde(skip)]
    pub detected_cell_bboxes: Option<Vec<BoundingBox>>,
    /// Whether the table was processed in end-to-end (E2E) mode.
    /// When true, cells come from the structure model only (no separate cell detection).
    /// Used by the stitcher to select the appropriate OCR matching strategy.
    /// Not serialized.
    #[serde(skip)]
    pub is_e2e: bool,
}
2421
2422impl TableResult {
2423    /// Creates a new table result.
2424    pub fn new(bbox: BoundingBox, table_type: TableType) -> Self {
2425        Self {
2426            bbox,
2427            table_type,
2428            classification_confidence: None,
2429            structure_confidence: None,
2430            cells: Vec::new(),
2431            html_structure: None,
2432            cell_texts: None,
2433            structure_tokens: None,
2434            detected_cell_bboxes: None,
2435            is_e2e: false,
2436        }
2437    }
2438
2439    /// Sets the classification confidence.
2440    pub fn with_classification_confidence(mut self, confidence: f32) -> Self {
2441        self.classification_confidence = Some(confidence);
2442        self
2443    }
2444
2445    /// Sets the structure recognition confidence.
2446    pub fn with_structure_confidence(mut self, confidence: f32) -> Self {
2447        self.structure_confidence = Some(confidence);
2448        self
2449    }
2450
2451    /// Sets the table cells.
2452    pub fn with_cells(mut self, cells: Vec<TableCell>) -> Self {
2453        self.cells = cells;
2454        self
2455    }
2456
2457    /// Sets the HTML structure.
2458    pub fn with_html_structure(mut self, html: impl Into<String>) -> Self {
2459        self.html_structure = Some(html.into());
2460        self
2461    }
2462
2463    /// Sets the cell texts from OCR.
2464    pub fn with_cell_texts(mut self, texts: Vec<Option<String>>) -> Self {
2465        self.cell_texts = Some(texts);
2466        self
2467    }
2468
2469    /// Sets the structure tokens for later HTML generation.
2470    pub fn with_structure_tokens(mut self, tokens: Vec<String>) -> Self {
2471        self.structure_tokens = Some(tokens);
2472        self
2473    }
2474
2475    /// Stores detected cell bounding boxes for the stitcher's row-aware IoA matcher.
2476    pub fn with_detected_cell_bboxes(mut self, bboxes: Vec<BoundingBox>) -> Self {
2477        self.detected_cell_bboxes = Some(bboxes);
2478        self
2479    }
2480
2481    /// Marks this table as processed in end-to-end (E2E) mode.
2482    pub fn with_e2e(mut self, is_e2e: bool) -> Self {
2483        self.is_e2e = is_e2e;
2484        self
2485    }
2486
2487    /// Returns the best available confidence score for this table.
2488    ///
2489    /// This method provides a unified confidence API for callers who want to filter
2490    /// tables by confidence without caring whether classification or structure
2491    /// recognition was used. Priority:
2492    /// 1. If both classification and structure confidence are available, returns
2493    ///    the minimum (most conservative estimate)
2494    /// 2. If only structure confidence is available (common when classifier isn't
2495    ///    configured), returns that
2496    /// 3. If only classification confidence is available, returns that
2497    /// 4. Returns `None` only if neither confidence is available (stub result)
2498    pub fn confidence(&self) -> Option<f32> {
2499        match (self.classification_confidence, self.structure_confidence) {
2500            (Some(cls), Some(str)) => Some(cls.min(str)),
2501            (None, Some(str)) => Some(str),
2502            (Some(cls), None) => Some(cls),
2503            (None, None) => None,
2504        }
2505    }
2506
2507    /// Returns true if this table has valid structure data.
2508    ///
2509    /// A table is considered valid if it has either cells or an HTML structure.
2510    /// Stub results (created when structure recognition fails) will return false.
2511    pub fn has_structure(&self) -> bool {
2512        !self.cells.is_empty() || self.html_structure.is_some()
2513    }
2514}
2515
/// Type of table.
///
/// Distinguishes tables by whether their borders are visible, with a
/// fallback variant for tables whose type is not known.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TableType {
    /// Table with visible borders
    Wired,
    /// Table without visible borders
    Wireless,
    /// Unknown table type
    Unknown,
}
2526
/// A cell in a table.
///
/// Always carries a bounding box and a detection confidence; grid position,
/// spans, and text are optional and may be absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableCell {
    /// Bounding box of the cell
    pub bbox: BoundingBox,
    /// Row index (0-based), if assigned
    pub row: Option<usize>,
    /// Column index (0-based), if assigned
    pub col: Option<usize>,
    /// Row span, if assigned
    pub row_span: Option<usize>,
    /// Column span, if assigned
    pub col_span: Option<usize>,
    /// Confidence score for the cell detection
    pub confidence: f32,
    /// Text content of the cell (if available)
    pub text: Option<String>,
}
2545
2546impl TableCell {
2547    /// Creates a new table cell.
2548    pub fn new(bbox: BoundingBox, confidence: f32) -> Self {
2549        Self {
2550            bbox,
2551            row: None,
2552            col: None,
2553            row_span: None,
2554            col_span: None,
2555            confidence,
2556            text: None,
2557        }
2558    }
2559
2560    /// Sets the row and column indices.
2561    pub fn with_position(mut self, row: usize, col: usize) -> Self {
2562        self.row = Some(row);
2563        self.col = Some(col);
2564        self
2565    }
2566
2567    /// Sets the row and column spans.
2568    pub fn with_span(mut self, row_span: usize, col_span: usize) -> Self {
2569        self.row_span = Some(row_span);
2570        self.col_span = Some(col_span);
2571        self
2572    }
2573
2574    /// Sets the text content.
2575    pub fn with_text(mut self, text: impl Into<String>) -> Self {
2576        self.text = Some(text.into());
2577        self
2578    }
2579}
2580
/// Result of formula recognition.
///
/// Pairs a formula's location with its recognized LaTeX source and the
/// confidence of that recognition.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormulaResult {
    /// Bounding box of the formula in the original image
    pub bbox: BoundingBox,
    /// LaTeX representation of the formula
    pub latex: String,
    /// Confidence score for the recognition
    pub confidence: f32,
}
2591
2592impl FormulaResult {
2593    /// Creates a new formula result.
2594    pub fn new(bbox: BoundingBox, latex: impl Into<String>, confidence: f32) -> Self {
2595        Self {
2596            bbox,
2597            latex: latex.into(),
2598            confidence,
2599        }
2600    }
2601}
2602
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a fully-confident layout element carrying `text`.
    fn element(bbox: BoundingBox, kind: LayoutElementType, text: &'static str) -> LayoutElement {
        LayoutElement::new(bbox, kind, 1.0).with_text(text)
    }

    /// Shorthand for a fully-confident paragraph title carrying `text`.
    fn paragraph_title(bbox: BoundingBox, text: &'static str) -> LayoutElement {
        element(bbox, LayoutElementType::ParagraphTitle, text)
    }

    // A fresh structure result starts with no content of any kind.
    #[test]
    fn test_structure_result_creation() {
        let fresh = StructureResult::new("test.jpg", 0);

        assert_eq!(fresh.input_path.as_ref(), "test.jpg");
        assert_eq!(fresh.index, 0);

        assert!(fresh.layout_elements.is_empty());
        assert!(fresh.tables.is_empty());
        assert!(fresh.formulas.is_empty());
        assert!(fresh.text_regions.is_none());
    }

    // Spot-check the string labels of a few element types.
    #[test]
    fn test_layout_element_type_as_str() {
        let expected = [
            (LayoutElementType::Text, "text"),
            (LayoutElementType::Table, "table"),
            (LayoutElementType::Formula, "formula"),
        ];
        for (kind, name) in expected {
            assert_eq!(kind.as_str(), name);
        }
    }

    // A new table result keeps its type and has no structure yet.
    #[test]
    fn test_table_result_creation() {
        let table = TableResult::new(
            BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0),
            TableType::Wired,
        );

        assert_eq!(table.table_type, TableType::Wired);
        assert!(table.cells.is_empty());
        assert!(table.html_structure.is_none());
    }

    // Titles and body text show up in both the Markdown and HTML exports.
    #[test]
    fn test_structure_result_export() {
        let bbox = BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0);
        let result = StructureResult::new("test.jpg", 0).with_layout_elements(vec![
            element(bbox.clone(), LayoutElementType::DocTitle, "Test Document"),
            element(bbox, LayoutElementType::Text, "Hello world"),
        ]);

        let markdown = result.to_markdown();
        assert!(markdown.contains("# Test Document"));
        assert!(markdown.contains("Hello world"));

        let html = result.to_html();
        assert!(html.contains("<h1>Test Document</h1>"));
        assert!(html.contains("<p>Hello world</p>"));
    }

    // Keyword titles get level 2 and keep their text unchanged.
    #[test]
    fn test_format_title_with_level_keywords() {
        for title in ["Abstract", "References:"] {
            let (level, text) = format_title_with_level(title, None);
            assert_eq!(level, 2);
            assert_eq!(text, title);
        }
    }

    // Without numbering, the supplied cluster level is used.
    #[test]
    fn test_format_title_with_level_cluster_fallback() {
        let (level, text) = format_title_with_level("Unnumbered Heading", Some(4));

        assert_eq!(level, 4);
        assert_eq!(text, "Unnumbered Heading");
    }

    // Footnotes are left out of the Markdown export.
    #[test]
    fn test_to_markdown_skips_footnote() {
        let result = StructureResult::new("test.jpg", 0).with_layout_elements(vec![
            element(
                BoundingBox::from_coords(0.0, 0.0, 100.0, 30.0),
                LayoutElementType::Text,
                "Body",
            ),
            element(
                BoundingBox::from_coords(0.0, 40.0, 100.0, 60.0),
                LayoutElementType::Footnote,
                "Footnote text",
            ),
        ]);

        let markdown = result.to_markdown();
        assert!(markdown.contains("Body"));
        assert!(!markdown.contains("Footnote text"));
    }

    // A multi-line document title is joined into a single heading line.
    #[test]
    fn test_to_markdown_doc_title_joins_lines_with_space() {
        let result = StructureResult::new("test.jpg", 0).with_layout_elements(vec![element(
            BoundingBox::from_coords(0.0, 0.0, 100.0, 20.0),
            LayoutElementType::DocTitle,
            "Main\nTitle",
        )]);

        let markdown = result.to_markdown();
        assert!(markdown.contains("# Main Title"));
    }

    // Line breaks inside a content block become Markdown soft breaks.
    #[test]
    fn test_to_markdown_content_uses_soft_breaks() {
        let result = StructureResult::new("test.jpg", 0).with_layout_elements(vec![element(
            BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0),
            LayoutElementType::Content,
            "1 Intro\n2 Method",
        )]);

        let markdown = result.to_markdown();
        assert!(markdown.contains("1 Intro  \n2 Method"));
    }

    // Taller titles must not end up at a deeper level than shorter ones.
    #[test]
    fn test_infer_paragraph_title_levels_by_height() {
        let titles = vec![
            paragraph_title(BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0), "Large"),
            paragraph_title(BoundingBox::from_coords(0.0, 50.0, 100.0, 74.0), "Medium"),
            paragraph_title(BoundingBox::from_coords(0.0, 80.0, 100.0, 98.0), "Small"),
        ];

        let levels = infer_paragraph_title_levels(&titles);
        let large = levels.get(&0).copied().unwrap_or(2);
        let medium = levels.get(&1).copied().unwrap_or(2);
        let small = levels.get(&2).copied().unwrap_or(2);
        assert!(large <= medium && medium <= small);
    }

    // Numbering decides the level even when the deeper title is the taller one.
    #[test]
    fn test_infer_paragraph_title_levels_semantic_vote_wins_tie() {
        let titles = vec![
            paragraph_title(
                BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0),
                "1.1 Detail",
            ),
            paragraph_title(BoundingBox::from_coords(0.0, 50.0, 100.0, 70.0), "2 Intro"),
        ];

        let levels = infer_paragraph_title_levels(&titles);
        assert_eq!(levels.get(&0).copied(), Some(3));
        assert_eq!(levels.get(&1).copied(), Some(2));
    }

    // With equal heights and no numbering, the indented title ranks deeper.
    #[test]
    fn test_infer_paragraph_title_levels_uses_relative_indent_signal() {
        let titles = vec![
            paragraph_title(BoundingBox::from_coords(0.0, 0.0, 100.0, 24.0), "Heading A"),
            paragraph_title(
                BoundingBox::from_coords(40.0, 40.0, 140.0, 64.0),
                "Heading B",
            ),
        ];

        let levels = infer_paragraph_title_levels(&titles);
        let flush_left = levels.get(&0).copied().unwrap_or(2);
        let indented = levels.get(&1).copied().unwrap_or(2);
        assert!(flush_left < indented);
    }
}
oar_ocr_core/domain/structure.rs

oar_ocr_core/domain/
structure.rs