oar_ocr_core/domain/structure.rs
1//! Document structure analysis result types.
2//!
3//! This module defines the result types for document structure analysis,
4//! including layout detection, table recognition, and formula recognition.
5
6use super::text_region::TextRegion;
7use crate::processors::BoundingBox;
8use image::RgbImage;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::path::Path;
13use std::sync::Arc;
14
15/// Title numbering pattern for detecting section numbers like 1, 1.2, 1.2.3, (1), 一、etc.
16/// This follows standard title numbering pattern.
17static TITLE_NUMBERING_REGEX: Lazy<Regex> = Lazy::new(|| {
18 Regex::new(
19 r"(?x)
20 ^\s*
21 (
22 # Arabic numerals: 1, 1.2, 1.2.3, etc.
23 [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?
24 |
25 # Parenthesized Arabic numerals: (1), (1.2), etc.
26 [((][1-9][0-9]*(?:\.[1-9][0-9]*)*[))]
27 |
28 # Chinese numerals with punctuation: 一、 二、
29 [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾][、.]?
30 |
31 # Parenthesized Chinese numerals: (一)
32 [((][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[))]
33 |
34 # Roman numerals with delimiter (period or followed by space)
35 (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b)
36 )
37 (\s+)
38 (.*)
39 $
40 ",
41 )
42 .unwrap_or_else(|e| panic!("Invalid title numbering regex: {e}"))
43});
44
45/// Format a paragraph title with automatic level detection based on numbering.
46///
47/// Following PaddleX's title formatting logic:
48/// - Extracts numbering prefix (1.2.3, etc.)
49/// - Determines heading level from number of dots
50/// - Returns (level, formatted_title) where level starts from 2 (## for paragraph titles)
51///
52/// PaddleX logic: `level = dots + 1`, then uses `#{'#' * level}` which means:
53/// - "1 Introduction" (0 dots) -> level=1 -> `## 1 Introduction`
54/// - "2.1 Method" (1 dot) -> level=2 -> `### 2.1 Method`
55/// - "2.1.1 Details" (2 dots) -> level=3 -> `#### 2.1.1 Details`
56///
57/// To align with PaddleX, we return level+1 to account for the extra `#`:
58/// - "1 Introduction" -> (2, "1 Introduction") -> `## 1 Introduction`
59/// - "2.1 Method" -> (3, "2.1 Method") -> `### 2.1 Method`
60/// - "2.1.1 Details" -> (4, "2.1.1 Details") -> `#### 2.1.1 Details`
61fn format_title_with_level(title: &str) -> (usize, String) {
62 // Clean up line breaks
63 let cleaned = title.replace("-\n", "").replace('\n', " ");
64
65 if let Some(captures) = TITLE_NUMBERING_REGEX.captures(&cleaned) {
66 let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or("");
67 let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or("");
68
69 // Determine level from dots in numbering (PaddleX: dots + 1, then +1 for base ##)
70 // 1 -> 2 (##), 1.2 -> 3 (###), 1.2.3 -> 4 (####)
71 let dot_count = numbering.matches('.').count();
72 let level = dot_count + 2; // +1 for PaddleX logic, +1 for base ## level
73
74 // Reconstruct title: numbering + space + content
75 let formatted = if title_content.is_empty() {
76 numbering.trim_end_matches('.').to_string()
77 } else {
78 format!(
79 "{} {}",
80 numbering.trim_end_matches('.'),
81 title_content.trim_start()
82 )
83 };
84
85 // Clamp level to reasonable range (2-6 for markdown, since # is for doc_title)
86 let level = level.clamp(2, 6);
87
88 (level, formatted)
89 } else {
90 // No numbering detected, default to level 2 (## heading)
91 (2, cleaned)
92 }
93}
94
/// A detected document region block (from PP-DocBlockLayout).
///
/// Region blocks represent hierarchical groupings of layout elements,
/// typically columns or logical sections of a document. They are used
/// for hierarchical reading order determination.
///
/// # PP-StructureV3 Alignment
///
/// PP-DocBlockLayout detects "region" type blocks that group related
/// layout elements together. Elements within the same region should
/// be read together before moving to the next region.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegionBlock {
    /// Bounding box of the region.
    pub bbox: BoundingBox,
    /// Confidence score of the detection.
    pub confidence: f32,
    /// Index of this region in the reading order; `None` when no index is set.
    pub order_index: Option<u32>,
    /// Indices of layout elements that belong to this region.
    /// NOTE(review): presumably indices into `StructureResult::layout_elements`;
    /// confirm against the code that fills this field.
    pub element_indices: Vec<usize>,
}
117
/// Page continuation flags for multi-page document processing.
///
/// These flags indicate whether the page starts or ends in the middle of
/// a semantic paragraph, which is crucial for properly concatenating
/// markdown output from multiple pages.
///
/// - `paragraph_start`: `false` means this page continues a paragraph from the previous page
/// - `paragraph_end`: `false` means this page's content continues on the next page
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContinuationFlags {
    /// `true` when the first text element on this page starts a new paragraph;
    /// `false` when it continues a paragraph from the previous page.
    pub paragraph_start: bool,
    /// `true` when the last text element on this page completes its paragraph;
    /// `false` when the paragraph continues on the next page.
    pub paragraph_end: bool,
}
133
134impl PageContinuationFlags {
135 pub fn new(paragraph_start: bool, paragraph_end: bool) -> Self {
136 Self {
137 paragraph_start,
138 paragraph_end,
139 }
140 }
141
142 /// Returns the tuple format (is_start, is_end) for compatibility
143 pub fn as_tuple(&self) -> (bool, bool) {
144 (self.paragraph_start, self.paragraph_end)
145 }
146}
147
/// Result of document structure analysis.
///
/// This struct contains all the results from analyzing a document's structure,
/// including layout elements, tables, formulas, and OCR results.
///
/// # Coordinate System
///
/// The coordinate system of bounding boxes depends on which preprocessing was applied:
///
/// - **No preprocessing**: Boxes are in the original input image's coordinate system.
///
/// - **Orientation correction only** (`orientation_angle` set, `rectified_img` is None):
///   Boxes are transformed back to the original input image's coordinate system.
///
/// - **Rectification applied** (`rectified_img` is Some):
///   Boxes remain in the **rectified image's coordinate system**. Neural network-based
///   rectification (UVDoc) warps cannot be precisely inverted, so use `rectified_img`
///   for visualization instead of the original image.
///
/// - **Both orientation and rectification**: Boxes are in the rectified coordinate system
///   (rectification takes precedence since it's applied after orientation correction).
///
/// # Serialization
///
/// All fields are serialized except `rectified_img`, which is marked `#[serde(skip)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructureResult {
    /// Path to the input image file.
    pub input_path: Arc<str>,
    /// Index of the image in a batch (0 for single image processing).
    pub index: usize,
    /// Detected layout elements (text regions, tables, figures, etc.).
    pub layout_elements: Vec<LayoutElement>,
    /// Recognized tables with their structure and content.
    pub tables: Vec<TableResult>,
    /// Recognized mathematical formulas.
    pub formulas: Vec<FormulaResult>,
    /// OCR text regions (if OCR was integrated).
    pub text_regions: Option<Vec<TextRegion>>,
    /// Document orientation angle (if orientation correction was used).
    pub orientation_angle: Option<f32>,
    /// Detected region blocks for hierarchical ordering (PP-DocBlockLayout).
    /// When present, `layout_elements` are already sorted by region hierarchy.
    pub region_blocks: Option<Vec<RegionBlock>>,
    /// Rectified image (if document rectification was used).
    ///
    /// Note: Bounding boxes are already transformed back to original coordinates for rotation,
    /// but for rectification (UVDoc), boxes are in the rectified image's coordinate system.
    /// Use this image for visualization when rectification was applied.
    /// This field is excluded from serialization.
    #[serde(skip)]
    pub rectified_img: Option<Arc<RgbImage>>,
    /// Page continuation flags for multi-page document processing.
    ///
    /// This indicates whether this page continues a paragraph from the previous page
    /// or continues to the next page, which is crucial for proper markdown concatenation.
    pub page_continuation_flags: Option<PageContinuationFlags>,
}
199
200impl StructureResult {
201 /// Creates a new structure result.
202 pub fn new(input_path: impl Into<Arc<str>>, index: usize) -> Self {
203 Self {
204 input_path: input_path.into(),
205 index,
206 layout_elements: Vec::new(),
207 tables: Vec::new(),
208 formulas: Vec::new(),
209 text_regions: None,
210 orientation_angle: None,
211 region_blocks: None,
212 rectified_img: None,
213 page_continuation_flags: None,
214 }
215 }
216
217 /// Adds layout elements to the result.
218 pub fn with_layout_elements(mut self, elements: Vec<LayoutElement>) -> Self {
219 self.layout_elements = elements;
220 self
221 }
222
223 /// Adds tables to the result.
224 pub fn with_tables(mut self, tables: Vec<TableResult>) -> Self {
225 self.tables = tables;
226 self
227 }
228
229 /// Adds formulas to the result.
230 pub fn with_formulas(mut self, formulas: Vec<FormulaResult>) -> Self {
231 self.formulas = formulas;
232 self
233 }
234
235 /// Adds OCR text regions to the result.
236 pub fn with_text_regions(mut self, regions: Vec<TextRegion>) -> Self {
237 self.text_regions = Some(regions);
238 self
239 }
240
241 /// Adds region blocks to the result (PP-DocBlockLayout).
242 ///
243 /// Region blocks represent hierarchical groupings of layout elements.
244 /// When set, layout_elements should already be sorted by region hierarchy.
245 pub fn with_region_blocks(mut self, blocks: Vec<RegionBlock>) -> Self {
246 self.region_blocks = Some(blocks);
247 self
248 }
249
250 /// Sets page continuation flags for multi-page document processing.
251 pub fn with_page_continuation_flags(mut self, flags: PageContinuationFlags) -> Self {
252 self.page_continuation_flags = Some(flags);
253 self
254 }
255
/// Converts the result to a Markdown string.
///
/// Layout elements are rendered in the order they are stored in
/// `layout_elements`. Follows PP-StructureV3's formatting rules:
/// - DocTitle: `# title`
/// - ParagraphTitle: heading level auto-detected from numbering (`1.2.3` -> `####`)
/// - Formula: `$latex$` when a text element shares its line, otherwise `$$latex$$`
/// - Table: HTML with `border="1"`, wrapped in a centered `<div>`
/// - Images/Charts: a centered `<div>` containing an `<img>` tag whose `src` is a
///   placeholder name derived from the element's bounding box
///
/// Number, Footnote, Header, HeaderImage, Footer, FooterImage and AsideText
/// elements are not rendered.
///
/// Note: Low-confidence text elements that overlap with table regions are filtered out
/// to avoid duplicate content from table OCR.
///
/// The returned string has leading and trailing whitespace trimmed.
pub fn to_markdown(&self) -> String {
    // Collect table bboxes for overlap filtering
    let table_bboxes: Vec<&BoundingBox> = self
        .layout_elements
        .iter()
        .filter(|e| e.element_type == LayoutElementType::Table)
        .map(|e| &e.bbox)
        .collect();

    let mut md = String::new();
    let elements = &self.layout_elements;

    for (idx, element) in elements.iter().enumerate() {
        // PP-StructureV3 markdown ignores auxiliary labels.
        if matches!(
            element.element_type,
            LayoutElementType::Number
                | LayoutElementType::Footnote
                | LayoutElementType::Header
                | LayoutElementType::HeaderImage
                | LayoutElementType::Footer
                | LayoutElementType::FooterImage
                | LayoutElementType::AsideText
        ) {
            continue;
        }

        // Filter out low-confidence text elements that overlap with tables.
        // These are typically OCR artifacts from table cell text that shouldn't be
        // output separately in markdown.
        if element.element_type == LayoutElementType::Text {
            let overlaps_table = table_bboxes.iter().any(|table_bbox| {
                element.bbox.ioa(table_bbox) > 0.3 // >30% of text is inside table
            });

            // Skip low-confidence text that overlaps with table regions.
            // Standard logic filters these in the stitching phase.
            if overlaps_table && element.confidence < 0.7 {
                continue;
            }
        }

        match element.element_type {
            // Document title: always emits the `# ` marker, even without text
            LayoutElementType::DocTitle => {
                md.push_str("\n# ");
                if let Some(text) = &element.text {
                    let cleaned = clean_ocr_text(text);
                    md.push_str(&cleaned);
                }
                md.push_str("\n\n");
            }
            // Paragraph/section title - auto-detect numbering for level
            LayoutElementType::ParagraphTitle => {
                if let Some(text) = &element.text {
                    let cleaned = clean_ocr_text(text);
                    let (level, formatted_title) = format_title_with_level(&cleaned);
                    md.push('\n');
                    for _ in 0..level {
                        md.push('#');
                    }
                    md.push(' ');
                    md.push_str(&formatted_title);
                    md.push_str("\n\n");
                } else {
                    // No text recognised: emit an empty level-2 heading
                    md.push_str("\n## \n\n");
                }
            }
            // Table - preserve HTML structure with border and center alignment
            // Following PaddleX's format with <div style="text-align: center;"> wrapper
            LayoutElementType::Table => {
                // The table result is matched to the layout element by bbox IoU > 0.5;
                // the first matching table wins.
                if let Some(table) =
                    self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
                {
                    if let Some(html) = &table.html_structure {
                        // Simplify table HTML (remove html/body wrappers) and add border
                        let simplified = simplify_table_html(html);
                        let table_with_border =
                            simplified.replacen("<table>", "<table border=\"1\">", 1);
                        // Wrap with center-aligned div for better markdown rendering
                        md.push_str("\n<div style=\"text-align: center;\">");
                        md.push_str(&table_with_border);
                        md.push_str("</div>\n\n");
                    } else {
                        md.push_str("\n[Table]\n\n");
                    }
                } else {
                    md.push_str("\n[Table]\n\n");
                }
            }
            // Formula - detect inline vs display formula based on context
            LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
                // Check if this formula is on the same line as any text element
                // to determine if it's an inline formula or display formula.
                // All earlier and later elements are scanned, not just the neighbours.
                let is_inline = {
                    // Look for previous non-formula text element on the same line
                    let has_prev_text = (0..idx).rev().any(|i| {
                        let prev = &elements[i];
                        !prev.element_type.is_formula()
                            && (prev.element_type == LayoutElementType::Text
                                || prev.element_type == LayoutElementType::ReferenceContent)
                            && is_same_line(&element.bbox, &prev.bbox)
                    });

                    // Look for next non-formula text element on the same line
                    let has_next_text = ((idx + 1)..elements.len()).any(|i| {
                        let next = &elements[i];
                        !next.element_type.is_formula()
                            && (next.element_type == LayoutElementType::Text
                                || next.element_type == LayoutElementType::ReferenceContent)
                            && is_same_line(&element.bbox, &next.bbox)
                    });

                    has_prev_text || has_next_text
                };

                if is_inline {
                    // Inline formula: use $...$
                    md.push('$');
                    if let Some(latex) = &element.text {
                        md.push_str(latex);
                    }
                    md.push_str("$ ");
                } else {
                    // Display formula: use $$...$$
                    md.push_str("\n$$");
                    if let Some(latex) = &element.text {
                        md.push_str(latex);
                    }
                    md.push_str("$$\n\n");
                }
            }
            // Image/Chart - figure format with center alignment
            LayoutElementType::Image | LayoutElementType::Chart => {
                // Use HTML img tag with center alignment for better rendering
                md.push_str("\n<div style=\"text-align: center;\"><img src=\"");
                // Generate a placeholder image name based on element bbox;
                // no image file is written by this method.
                let img_name = format!(
                    "imgs/img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg",
                    if element.element_type == LayoutElementType::Chart {
                        "chart"
                    } else {
                        "image"
                    },
                    element.bbox.x_min(),
                    element.bbox.y_min(),
                    element.bbox.x_max(),
                    element.bbox.y_max()
                );
                md.push_str(&img_name);
                md.push_str("\" alt=\"Image\" width=\"");
                // Width percentage: element width divided by 12, clamped to 20..=100
                let width_pct =
                    ((element.bbox.x_max() - element.bbox.x_min()) / 12.0).clamp(20.0, 100.0);
                md.push_str(&format!("{:.0}%", width_pct));
                md.push_str("\" /></div>\n\n");
            }
            // Seal - show as image placeholder, with the recognised text as a blockquote
            LayoutElementType::Seal => {
                md.push_str("\n![Seal]");
                if let Some(text) = &element.text {
                    md.push_str("\n> ");
                    md.push_str(text);
                }
                md.push_str("\n\n");
            }
            // Captions - with center alignment following PaddleX
            _ if element.element_type.is_caption() => {
                if let Some(text) = &element.text {
                    md.push_str("\n<div style=\"text-align: center;\">");
                    md.push_str(text);
                    md.push_str(" </div>\n\n");
                }
            }
            // Abstract - following PaddleX format with proper text handling
            LayoutElementType::Abstract => {
                if let Some(text) = &element.text {
                    // Add a heading when the text mentions "abstract" (any case) or "摘要";
                    // the text itself is still emitted in full below.
                    let lower = text.to_lowercase();
                    if lower.contains("abstract") || lower.contains("摘要") {
                        md.push_str("\n## **Abstract**\n\n");
                    }
                    let formatted = format_text_block(text);
                    md.push_str(&formatted);
                    md.push_str("\n\n");
                }
            }
            // Reference - following PaddleX's format_reference_block
            LayoutElementType::Reference => {
                if let Some(text) = &element.text {
                    let formatted = format_reference_block(text);
                    md.push('\n');
                    md.push_str(&formatted);
                    md.push_str("\n\n");
                }
            }
            // Content (table of contents) - following PaddleX's soft breaks
            LayoutElementType::Content => {
                if let Some(text) = &element.text {
                    let formatted = format_content_block(text);
                    md.push('\n');
                    md.push_str(&formatted);
                    md.push_str("\n\n");
                }
            }
            // Footnote - following PaddleX's vision_footnote handling
            // NOTE(review): `Footnote` elements are already skipped by the filter at the
            // top of this loop, so this arm cannot be reached as written. Confirm whether
            // a different element type (e.g. a vision footnote) was intended here.
            LayoutElementType::Footnote => {
                if let Some(text) = &element.text {
                    let formatted = format_vision_footnote_block(text);
                    md.push('\n');
                    md.push_str(&formatted);
                    md.push_str("\n\n");
                }
            }
            // List - one `- ` item per non-empty line
            LayoutElementType::List => {
                if let Some(text) = &element.text {
                    let cleaned = format_text_block(text);
                    // Split by newlines and format as list items
                    for line in cleaned.lines() {
                        let line = line.trim();
                        if !line.is_empty() {
                            md.push_str("- ");
                            md.push_str(line);
                            md.push('\n');
                        }
                    }
                    md.push('\n');
                }
            }
            // Header/Footer - excluded from markdown.
            // Header, HeaderImage, Footer and FooterImage are already skipped above;
            // this guard covers anything else `is_header()` / `is_footer()` accept.
            _ if element.element_type.is_header() || element.element_type.is_footer() => {
                // Skip headers and footers in markdown output
                // They typically contain page numbers and repeating info
                continue;
            }
            // Default text elements - following PaddleX's text handling
            _ => {
                if let Some(text) = &element.text {
                    let formatted = format_text_block(text);
                    md.push_str(&formatted);
                    md.push_str("\n\n");
                }
            }
        }
    }
    md.trim().to_string()
}
515
516 /// Calculates the page continuation flags for this result.
517 ///
518 /// This follows PaddleX's `get_seg_flag` logic to determine whether
519 /// the page starts/ends in the middle of a semantic paragraph.
520 ///
521 /// Returns (paragraph_start, paragraph_end) where:
522 /// - `paragraph_start`: false means page continues from previous
523 /// - `paragraph_end`: false means content continues to next page
524 pub fn calculate_continuation_flags(&self) -> PageContinuationFlags {
525 let elements = &self.layout_elements;
526
527 if elements.is_empty() {
528 return PageContinuationFlags::new(true, true);
529 }
530
531 // Estimate page width from rectified image or element bboxes
532 let page_width = self
533 .rectified_img
534 .as_ref()
535 .map(|img| img.width() as f32)
536 .or_else(|| {
537 elements
538 .iter()
539 .map(|e| e.bbox.x_max())
540 .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
541 });
542
543 // Filter to only text elements for continuation analysis
544 let text_elements: Vec<_> = elements
545 .iter()
546 .filter(|e| {
547 matches!(
548 e.element_type,
549 LayoutElementType::Text
550 | LayoutElementType::DocTitle
551 | LayoutElementType::ParagraphTitle
552 | LayoutElementType::Abstract
553 | LayoutElementType::Reference
554 )
555 })
556 .collect();
557
558 if text_elements.is_empty() {
559 return PageContinuationFlags::new(true, true);
560 }
561
562 // Calculate paragraph start flag
563 let first = &text_elements[0];
564 let paragraph_start = is_new_paragraph_start(first, page_width);
565
566 // Calculate paragraph end flag
567 let last = &text_elements[text_elements.len() - 1];
568 let paragraph_end = is_paragraph_complete(last, page_width);
569
570 PageContinuationFlags::new(paragraph_start, paragraph_end)
571 }
572
573 /// Converts the result to an HTML string.
574 ///
575 /// Follows PP-StructureV3's formatting rules with semantic HTML tags.
576 pub fn to_html(&self) -> String {
577 let mut html = String::from(
578 "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n",
579 );
580
581 for element in &self.layout_elements {
582 match element.element_type {
583 // Document title
584 LayoutElementType::DocTitle => {
585 html.push_str("<h1>");
586 if let Some(text) = &element.text {
587 html.push_str(&Self::escape_html(text));
588 }
589 html.push_str("</h1>\n");
590 }
591 // Paragraph/section title
592 LayoutElementType::ParagraphTitle => {
593 html.push_str("<h2>");
594 if let Some(text) = &element.text {
595 html.push_str(&Self::escape_html(text));
596 }
597 html.push_str("</h2>\n");
598 }
599 // Table - embed HTML structure with simplified markup
600 LayoutElementType::Table => {
601 if let Some(table) =
602 self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
603 {
604 if let Some(table_html) = &table.html_structure {
605 // Simplify table HTML (remove html/body wrappers) and add border styling
606 let simplified = simplify_table_html(table_html);
607 let styled = simplified.replacen(
608 "<table>",
609 "<table border=\"1\" style=\"border-collapse: collapse;\">",
610 1,
611 );
612 html.push_str(&styled);
613 html.push('\n');
614 } else {
615 html.push_str("<p>[Table]</p>\n");
616 }
617 } else {
618 html.push_str("<p>[Table]</p>\n");
619 }
620 }
621 // Formula - use math tags
622 LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
623 html.push_str("<p class=\"formula\">$$");
624 if let Some(latex) = &element.text {
625 html.push_str(&Self::escape_html(latex));
626 }
627 html.push_str("$$</p>\n");
628 }
629 // Image/Chart
630 LayoutElementType::Image | LayoutElementType::Chart => {
631 html.push_str("<figure>\n<img alt=\"Figure\" />\n");
632 if let Some(caption) = &element.text {
633 html.push_str("<figcaption>");
634 html.push_str(&Self::escape_html(caption));
635 html.push_str("</figcaption>\n");
636 }
637 html.push_str("</figure>\n");
638 }
639 // Seal
640 LayoutElementType::Seal => {
641 html.push_str("<figure class=\"seal\">\n<img alt=\"Seal\" />\n");
642 if let Some(text) = &element.text {
643 html.push_str("<figcaption>");
644 html.push_str(&Self::escape_html(text));
645 html.push_str("</figcaption>\n");
646 }
647 html.push_str("</figure>\n");
648 }
649 // Captions
650 _ if element.element_type.is_caption() => {
651 if let Some(text) = &element.text {
652 html.push_str("<figcaption>");
653 html.push_str(&Self::escape_html(text));
654 html.push_str("</figcaption>\n");
655 }
656 }
657 // Abstract
658 LayoutElementType::Abstract => {
659 html.push_str("<section class=\"abstract\">\n<h3>Abstract</h3>\n<p>");
660 if let Some(text) = &element.text {
661 html.push_str(&Self::escape_html(text));
662 }
663 html.push_str("</p>\n</section>\n");
664 }
665 // Reference
666 LayoutElementType::Reference | LayoutElementType::ReferenceContent => {
667 html.push_str("<section class=\"references\">\n<p>");
668 if let Some(text) = &element.text {
669 html.push_str(&Self::escape_html(text));
670 }
671 html.push_str("</p>\n</section>\n");
672 }
673 // List
674 LayoutElementType::List => {
675 html.push_str("<ul>\n");
676 if let Some(text) = &element.text {
677 for line in text.lines() {
678 html.push_str("<li>");
679 html.push_str(&Self::escape_html(line));
680 html.push_str("</li>\n");
681 }
682 }
683 html.push_str("</ul>\n");
684 }
685 // Header
686 _ if element.element_type.is_header() => {
687 html.push_str("<header>");
688 if let Some(text) = &element.text {
689 html.push_str(&Self::escape_html(text));
690 }
691 html.push_str("</header>\n");
692 }
693 // Footer
694 _ if element.element_type.is_footer() => {
695 html.push_str("<footer>");
696 if let Some(text) = &element.text {
697 html.push_str(&Self::escape_html(text));
698 }
699 html.push_str("</footer>\n");
700 }
701 // Default text
702 _ => {
703 if let Some(text) = &element.text {
704 html.push_str("<p>");
705 html.push_str(&Self::escape_html(text));
706 html.push_str("</p>\n");
707 }
708 }
709 }
710 }
711 html.push_str("</body>\n</html>");
712 html
713 }
714
/// Escapes the HTML special characters in `text`.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`, `&quot;`
/// and `&#39;` respectively, so the result can be placed in element content or
/// in a quoted attribute value. All other characters are copied unchanged.
///
/// The text is processed in a single pass, so an ampersand that is produced by
/// escaping is never escaped a second time.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
723
/// Converts the result to a JSON [`serde_json::Value`].
///
/// The conversion goes through this type's `Serialize` implementation.
///
/// # Errors
///
/// Returns the error reported by `serde_json::to_value` if serialization fails.
pub fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
    serde_json::to_value(self)
}
728
729 /// Saves the analysis results to the specified directory.
730 ///
731 /// This generates:
732 /// - `*_res.json`: The full structured result
733 /// - `*_res.html`: An HTML representation
734 ///
735 /// Note: Markdown export with image extraction should use the example utilities
736 /// (`examples/utils/markdown.rs`) instead, as that requires I/O operations
737 /// that belong in the application layer. Use `StructureResult::to_markdown()`
738 /// for pure markdown generation without side effects.
739 ///
740 /// # Arguments
741 ///
742 /// * `output_dir` - Directory to save the output files
743 /// * `to_json` - If true, save a JSON representation
744 /// * `to_html` - If true, save an HTML representation
745 pub fn save_results(
746 &self,
747 output_dir: impl AsRef<Path>,
748 to_json: bool,
749 to_html: bool,
750 ) -> std::io::Result<()> {
751 let output_dir = output_dir.as_ref();
752 if !output_dir.exists() {
753 std::fs::create_dir_all(output_dir)?;
754 }
755
756 let input_path = Path::new(self.input_path.as_ref());
757 // Extract file stem, handling PDF page suffix (e.g., "file.pdf#3" -> "file_003")
758 let stem = if let Some(path_str) = input_path.to_str() {
759 if let Some(hash_idx) = path_str.rfind('#') {
760 // This is a PDF page reference like "file.pdf#3"
761 let base = &path_str[..hash_idx];
762 let page_num = &path_str[hash_idx + 1..];
763 let base_stem = Path::new(base)
764 .file_stem()
765 .and_then(|s| s.to_str())
766 .unwrap_or("result");
767 format!("{}_{}", base_stem, page_num)
768 } else {
769 input_path
770 .file_stem()
771 .and_then(|s| s.to_str())
772 .unwrap_or("result")
773 .to_string()
774 }
775 } else {
776 "result".to_string()
777 };
778
779 // Save JSON
780 if to_json {
781 let json_path = output_dir.join(format!("{}.json", stem));
782 let json_file = std::fs::File::create(json_path)?;
783 serde_json::to_writer_pretty(json_file, self)?;
784 }
785
786 // Save HTML
787 if to_html {
788 let html_path = output_dir.join(format!("{}.html", stem));
789 std::fs::write(html_path, self.to_html())?;
790 }
791
792 Ok(())
793 }
794}
795
796/// Checks if a text element appears to start a new paragraph.
797///
798/// Following PaddleX's logic: if the text starts near the left edge of the page
799/// (within 5% of page width), it's likely the start of a new paragraph.
800fn is_new_paragraph_start(element: &LayoutElement, page_width: Option<f32>) -> bool {
801 let left = element.bbox.x_min();
802 let threshold = page_width.map_or(50.0, |w| w * 0.05); // 5% of page width
803 left <= threshold
804}
805
806/// Checks if a text element appears to complete its paragraph on this page.
807///
808/// Following PaddleX's logic: if the text ends before the right edge of the page
809/// (not within 10% of right margin), the paragraph likely ends here.
810fn is_paragraph_complete(element: &LayoutElement, page_width: Option<f32>) -> bool {
811 let right = element.bbox.x_max();
812
813 // If we have page width info, check if element ends before the right edge
814 if let Some(width) = page_width {
815 let right_margin = width * 0.1;
816 return right <= (width - right_margin);
817 }
818
819 // Conservative default: assume paragraphs end
820 true
821}
822
823/// Concatenates markdown content from multiple pages into a single document.
824///
825/// This follows PaddleX's `concatenate_markdown_pages` logic to intelligently
826/// merge pages while preserving paragraph continuity.
827///
828/// # Arguments
829///
830/// * `results` - Slice of structure results from multiple pages (in order)
831///
832/// # Returns
833///
834/// A single markdown string with all pages properly concatenated
835pub fn concatenate_markdown_pages(results: &[StructureResult]) -> String {
836 if results.is_empty() {
837 return String::new();
838 }
839
840 if results.len() == 1 {
841 return results[0].to_markdown();
842 }
843
844 let mut markdown = String::new();
845 let mut prev_page_end_flag = true; // First page is treated as starting fresh
846
847 for result in results.iter() {
848 let flags = result
849 .page_continuation_flags
850 .as_ref()
851 .cloned()
852 .unwrap_or_else(|| result.calculate_continuation_flags());
853
854 let page_markdown = result.to_markdown();
855
856 // Skip empty pages
857 if page_markdown.trim().is_empty() {
858 prev_page_end_flag = flags.paragraph_end;
859 continue;
860 }
861
862 let page_first_continues = !flags.paragraph_start;
863 let _page_last_continues = !flags.paragraph_end;
864
865 // Determine how to join this page
866 if page_first_continues && !prev_page_end_flag {
867 // Both pages are in the middle of the same paragraph
868 // Check for Chinese characters to decide spacing
869 let last_char = markdown.chars().last();
870 let first_char = page_markdown.chars().next();
871
872 let last_is_chinese = last_char.is_some_and(is_chinese_char);
873 let first_is_chinese = first_char.is_some_and(is_chinese_char);
874
875 if !last_is_chinese && !first_is_chinese {
876 // Non-Chinese text: add space
877 markdown.push(' ');
878 markdown.push_str(page_markdown.trim_start());
879 } else {
880 // Chinese or mixed: direct concatenation
881 markdown.push_str(page_markdown.trim_start());
882 }
883 } else {
884 // New paragraph or section
885 if !markdown.is_empty() {
886 markdown.push_str("\n\n");
887 }
888 markdown.push_str(&page_markdown);
889 }
890
891 prev_page_end_flag = flags.paragraph_end;
892 }
893
894 markdown.trim().to_string()
895}
896
/// Cleans raw OCR text by undoing line-break artifacts.
///
/// Two rewrites are applied, following PaddleX's approach:
/// 1. A hyphen directly followed by a line break is removed, re-joining the
///    word that was split across lines (`-\n` -> ``).
/// 2. Every remaining line break becomes a single space (`\n` -> ` `).
///
/// This should only be applied to raw OCR text, not to already formatted
/// markdown or HTML.
fn clean_ocr_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            // Hyphen at a line end: drop it together with the line break.
            '-' if chars.peek() == Some(&'\n') => {
                chars.next();
            }
            '\n' => cleaned.push(' '),
            other => cleaned.push(other),
        }
    }

    cleaned
}
910
/// Formats a plain text block following PaddleX's text handling.
///
/// Three rewrites are applied in order:
/// 1. `-\n` is removed, re-joining words hyphenated across a line break.
/// 2. Each `\n\n` is collapsed to a single `\n`.
/// 3. Each remaining `\n` is expanded to `\n\n`.
///
/// The net effect is that OCR line breaks become markdown paragraph breaks.
fn format_text_block(text: &str) -> String {
    // Step 1: dropping the separator while concatenating removes every `-\n`.
    let dehyphenated: String = text.split("-\n").collect();
    // Step 2: collapse double line breaks.
    let single_breaks = dehyphenated.split("\n\n").collect::<Vec<_>>().join("\n");
    // Step 3: turn every line break into a paragraph break.
    single_breaks.split('\n').collect::<Vec<_>>().join("\n\n")
}
924
/// Formats content blocks (table of contents) following PaddleX.
///
/// Every line break is turned into a markdown hard line break, i.e. two
/// trailing spaces followed by the newline, so each table-of-contents entry
/// stays on its own line when rendered. A hyphen directly before a line break
/// is dropped first; its break is kept and also becomes a hard line break.
///
/// A single trailing space is not enough: markdown requires at least two
/// spaces before the newline for a hard break.
fn format_content_block(text: &str) -> String {
    // Two spaces + newline. Written with escapes so that tools which trim or
    // collapse whitespace cannot silently shorten the marker.
    const HARD_BREAK: &str = "\x20\x20\n";

    // Handle PDF hyphen line breaks first, then convert all line breaks.
    text.replace("-\n", HARD_BREAK).replace('\n', HARD_BREAK)
}
935
/// Formats reference blocks, following PaddleX's `format_first_line_func`.
///
/// Steps:
/// - `-\n` is removed, re-joining words hyphenated across a line break.
/// - Lines are trimmed and empty lines are dropped.
/// - If the **first** non-empty line is a reference heading, it is replaced by
///   the markdown heading `## **References**`. A heading line is the bare
///   keyword "References", "Reference" (ASCII case-insensitive) or "参考文献",
///   optionally followed by a colon.
/// - All other lines are kept, separated by single newlines.
///
/// Only the first non-empty line can become the heading, and only when it
/// consists of the keyword alone. A citation that merely mentions
/// "References" is therefore kept as ordinary content.
///
/// If no line survives (empty or whitespace-only input), the de-hyphenated
/// input is returned unchanged.
fn format_reference_block(text: &str) -> String {
    // First remove hyphenation
    let dehyphenated = text.replace("-\n", "");

    // A heading is the keyword on its own, optionally followed by an ASCII or
    // fullwidth (U+FF1A) colon.
    let is_heading = |line: &str| {
        let keyword = line
            .trim_end_matches(|c: char| c == ':' || c == '\u{FF1A}')
            .trim_end();
        keyword.eq_ignore_ascii_case("references")
            || keyword.eq_ignore_ascii_case("reference")
            || keyword == "参考文献"
    };

    let mut result = String::new();
    let mut first_line_seen = false;

    for line in dehyphenated
        .lines()
        .map(str::trim)
        .filter(|l| !l.is_empty())
    {
        if !first_line_seen {
            first_line_seen = true;
            if is_heading(line) {
                // Replace the heading line itself; content follows below.
                result.push_str("## **References**\n\n");
                continue;
            }
        }

        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }

    if result.is_empty() {
        dehyphenated
    } else {
        result
    }
}
978
/// Formats vision footnote blocks following PaddleX.
///
/// Three literal replacements are applied in sequence:
/// 1. `-\n` -> `` (rejoin words hyphenated at a line end)
/// 2. `\n\n` -> `\n` (collapse existing blank-line separators)
/// 3. `\n` -> `\n\n` (turn every line break into a paragraph break)
fn format_vision_footnote_block(text: &str) -> String {
    // Order matters: each step operates on the output of the previous one.
    const STEPS: [(&str, &str); 3] = [("-\n", ""), ("\n\n", "\n"), ("\n", "\n\n")];

    STEPS
        .iter()
        .fold(text.to_owned(), |acc, &(from, to)| acc.replace(from, to))
}
987
/// Returns `true` when `c` lies in one of the CJK Unified Ideographs blocks
/// listed below.
///
/// Used to determine spacing rules when concatenating pages.
fn is_chinese_char(c: char) -> bool {
    // Inclusive code-point ranges that count as Chinese characters here.
    const CJK_BLOCKS: [std::ops::RangeInclusive<char>; 7] = [
        '\u{4E00}'..='\u{9FFF}',   // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}',   // CJK Unified Ideographs Extension A
        '\u{20000}'..='\u{2A6DF}', // CJK Unified Ideographs Extension B
        '\u{2A700}'..='\u{2B73F}', // CJK Unified Ideographs Extension C
        '\u{2B740}'..='\u{2B81F}', // CJK Unified Ideographs Extension D
        '\u{2B820}'..='\u{2CEAF}', // CJK Unified Ideographs Extension E
        '\u{2CEB0}'..='\u{2EBEF}', // CJK Unified Ideographs Extension F
    ];

    CJK_BLOCKS.iter().any(|block| block.contains(&c))
}
1004
/// Returns `true` for the ASCII lowercase letters `a`-`z` only.
fn is_lowercase(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1009
/// Returns `true` for the ASCII uppercase letters `A`-`Z` only.
fn is_uppercase(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1014
/// Returns `true` for the ASCII decimal digits `0`-`9` only.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1019
1020/// Removes PDF hyphenation artifacts from text.
1021///
1022/// PDFs often break words at line ends with hyphens like "frame-work",
1023/// "com-pared", etc. This function detects and removes these hyphens
1024/// when they appear to be line-break hyphens rather than intentional hyphens.
1025///
1026/// Rules:
1027/// 1. Hyphen followed by lowercase letter is likely a hyphenation artifact
1028/// 2. Hyphen followed by space and lowercase letter is also artifact
1029/// 3. Hyphen followed by newline and lowercase letter is artifact
1030/// 4. Preserve intentional hyphens (compound words, hyphenated phrases)
1031/// 5. Preserve hyphens in URLs and technical patterns
1032fn dehyphenate(text: &str) -> String {
1033 let mut result = String::with_capacity(text.len());
1034 let chars: Vec<char> = text.chars().collect();
1035 let len = chars.len();
1036 let mut i = 0;
1037
1038 // Helper to check if we're in a URL-like pattern
1039 let is_url_context = |pos: usize| -> bool {
1040 // Look at a window around the hyphen for URL patterns
1041 let start = pos.saturating_sub(10);
1042 let end = (pos + 5).min(len);
1043 let window: String = chars[start..end].iter().collect();
1044 window.contains("http") || window.contains("www") || window.contains("://")
1045 };
1046
1047 while i < len {
1048 if chars[i] == '-' {
1049 // Skip dehyphenation for URL contexts
1050 if is_url_context(i) {
1051 result.push('-');
1052 i += 1;
1053 continue;
1054 }
1055
1056 // Check if this is a hyphenation artifact
1057 let is_artifact = if i + 1 < len {
1058 let next = chars[i + 1];
1059 if next == '\n' {
1060 // Hyphen followed by newline - check what's after the newline
1061 if i + 2 < len {
1062 let after_newline = chars[i + 2];
1063 is_lowercase(after_newline)
1064 } else {
1065 false
1066 }
1067 } else if is_lowercase(next) {
1068 // Hyphen followed directly by lowercase letter (e.g., "com-puted")
1069 // But check if preceded by lowercase to avoid removing intentional hyphens
1070 // like in "RT-DETR" or "one-to-many"
1071 i > 0 && is_lowercase(chars[i - 1])
1072 } else if next.is_whitespace() && i + 2 < len {
1073 let after_space = chars[i + 2];
1074 // Hyphen + space + lowercase letter (e.g., "com- puted")
1075 is_lowercase(after_space) && i > 0 && is_lowercase(chars[i - 1])
1076 } else {
1077 false
1078 }
1079 } else {
1080 false
1081 };
1082
1083 if is_artifact {
1084 // Skip the hyphen
1085 // Also skip following newline/space if present
1086 if i + 1 < len {
1087 let next = chars[i + 1];
1088 if next == '\n' || next.is_whitespace() {
1089 i += 1;
1090 }
1091 }
1092 } else {
1093 result.push('-');
1094 }
1095 } else {
1096 result.push(chars[i]);
1097 }
1098 i += 1;
1099 }
1100
1101 result
1102}
1103
1104/// Fixes missing spaces between merged words.
1105///
1106/// OCR and PDF extraction can result in merged words like
1107/// "enhancetheencoder'sfeaturerepresentation" or "48.1%AP".
1108/// This function detects and fixes common patterns.
1109fn fix_merged_words(text: &str) -> String {
1110 let mut result = String::with_capacity(text.len());
1111 let chars: Vec<char> = text.chars().collect();
1112 let mut i = 0;
1113
1114 while i < chars.len() {
1115 let current = chars[i];
1116
1117 if i > 0 {
1118 let prev = chars[i - 1];
1119
1120 // Detect missing space between lowercase and lowercase (after apostrophe or consonant)
1121 // e.g., "encoder'sfeature" -> "encoder's feature"
1122 if is_lowercase(prev) && is_lowercase(current) {
1123 // Only add space if previous was apostrophe or word boundary context
1124 // This is a heuristic - in practice you'd want more sophisticated NLP
1125 if i > 1 && chars[i - 2] == '\'' {
1126 result.push(' ');
1127 }
1128 // Also detect lowercase followed by uppercase
1129 // e.g., "RT-DETRis" -> "RT-DETR is"
1130 } else if is_lowercase(prev) && is_uppercase(current) {
1131 // Check if the uppercase starts a new word (not an acronym)
1132 // If next char is lowercase, it's likely a new word
1133 if i + 1 < chars.len() && is_lowercase(chars[i + 1]) {
1134 result.push(' ');
1135 }
1136 }
1137 // Detect digit/percent followed by letter, or letter-digit-letter pattern
1138 // e.g., "48.1%AP" -> "48.1% AP"
1139 // e.g., "RT-DETRv3" shouldn't be split, but "model 100instances" -> "model 100 instances"
1140 else if ((is_digit(prev) || prev == '%') && is_uppercase(current))
1141 || (is_letter(prev)
1142 && is_digit(current)
1143 && i + 1 < chars.len()
1144 && is_letter(chars[i + 1]))
1145 {
1146 result.push(' ');
1147 }
1148 }
1149
1150 result.push(current);
1151 i += 1;
1152 }
1153
1154 result
1155}
1156
1157/// Checks if a character is a letter.
1158fn is_letter(c: char) -> bool {
1159 is_lowercase(c) || is_uppercase(c)
1160}
1161
/// Strips the document wrapper tags from table HTML, following PaddleX's
/// `simplify_table_func`.
///
/// The exact strings `<html>`, `</html>`, `<body>` and `</body>` are deleted,
/// in that order, wherever they occur; everything else is left as is.
fn simplify_table_html(html: &str) -> String {
    const WRAPPER_TAGS: [&str; 4] = ["<html>", "</html>", "<body>", "</body>"];

    WRAPPER_TAGS
        .iter()
        .fold(html.to_owned(), |stripped, tag| stripped.replace(*tag, ""))
}
1172
1173/// Post-processes text content to fix common OCR/PDF artifacts.
1174///
1175/// This applies multiple cleanup steps:
1176/// 1. Dehyphenation - removes line-break hyphens
1177/// 2. Word merging fixes - adds missing spaces
1178/// 3. Spacing normalization - fixes multiple spaces
1179pub fn postprocess_text(text: &str) -> String {
1180 let text = dehyphenate(text);
1181 let text = fix_merged_words(&text);
1182
1183 // Normalize whitespace (collapse multiple spaces, fix spacing after punctuation)
1184 let mut result = String::new();
1185 let mut in_space = false;
1186
1187 for c in text.chars() {
1188 if c.is_whitespace() {
1189 if !in_space && !result.is_empty() {
1190 result.push(' ');
1191 in_space = true;
1192 }
1193 } else {
1194 // Fix missing space after period (when followed by letter)
1195 if c == '.' && !result.is_empty() {
1196 let last = result.chars().last().unwrap();
1197 if is_letter(last) || is_digit(last) {
1198 result.push('.');
1199 in_space = true;
1200 continue;
1201 }
1202 }
1203 // Fix spacing after punctuation
1204 if in_space && matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')' | ']' | '}') {
1205 result.pop(); // Remove the space before punctuation
1206 result.push(c);
1207 continue;
1208 }
1209 result.push(c);
1210 in_space = false;
1211 }
1212 }
1213
1214 result
1215}
1216
/// Removes duplicate section headers from concatenated markdown.
///
/// When concatenating pages, section headers like `**Abstract**` or
/// `**References**` may appear multiple times. A line counts as a section
/// header when, after trimming, it starts and ends with `**` and has a
/// non-empty name in between. The first occurrence of each name is kept and
/// every later line with the same name is dropped.
///
/// All other lines are copied through unchanged and joined with `\n`; as a
/// consequence a trailing newline in the input is not reproduced.
fn deduplicate_sections(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    // Names borrow from `markdown`, so no per-line allocation is needed.
    let mut seen_sections: std::collections::HashSet<&str> = std::collections::HashSet::new();

    for line in markdown.lines() {
        // `Some(name)` only for `**name**` with a non-empty name.
        let section_name = line
            .trim()
            .strip_prefix("**")
            .and_then(|rest| rest.strip_suffix("**"))
            .filter(|name| !name.is_empty());

        if let Some(name) = section_name {
            // `insert` returns false when the name was already recorded.
            if !seen_sections.insert(name) {
                continue;
            }
        }

        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }

    result
}
1255
1256/// Checks if two bounding boxes are on the same line (have significant vertical overlap).
1257///
1258/// Two boxes are considered on the same line if their vertical overlap is greater than
1259/// 50% of the smaller box's height.
1260fn is_same_line(bbox1: &BoundingBox, bbox2: &BoundingBox) -> bool {
1261 let y1_min = bbox1.y_min();
1262 let y1_max = bbox1.y_max();
1263 let y2_min = bbox2.y_min();
1264 let y2_max = bbox2.y_max();
1265
1266 // Calculate vertical overlap
1267 let overlap_start = y1_min.max(y2_min);
1268 let overlap_end = y1_max.min(y2_max);
1269 let overlap = (overlap_end - overlap_start).max(0.0);
1270
1271 // Calculate minimum height
1272 let height1 = y1_max - y1_min;
1273 let height2 = y2_max - y2_min;
1274 let min_height = height1.min(height2);
1275
1276 // Consider same line if overlap > 50% of min height
1277 min_height > 0.0 && overlap / min_height > 0.5
1278}
1279
/// Filters empty formula blocks from markdown.
///
/// A formula block is delimited by two lines that consist solely of `$$`
/// (ignoring surrounding whitespace). Blocks whose body is missing or made up
/// only of blank lines - `$$\n$$`, `$$\n\n$$`, ... - are removed together with
/// both delimiters and at most one blank line that follows the block.
///
/// Delimiters are tracked as open/close pairs, so the closing `$$` of a
/// non-empty formula is never mistaken for the start of an empty one.
///
/// An opening `$$` that is followed only by blank lines up to the end of the
/// input is dropped on its own. Remaining lines are joined with `\n`, so a
/// trailing newline in the input is not reproduced.
fn filter_empty_formulas(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut result = String::with_capacity(markdown.len());
    let mut in_formula = false;
    let mut i = 0;

    while i < lines.len() {
        let line = lines[i];

        if line.trim() == "$$" {
            if in_formula {
                // Closing delimiter of a block that had content.
                in_formula = false;
            } else {
                // Opening delimiter: look at the first non-blank line after it.
                let next_non_blank =
                    (i + 1..lines.len()).find(|&j| !lines[j].trim().is_empty());

                match next_non_blank {
                    Some(closing) if lines[closing].trim() == "$$" => {
                        // Empty block: skip opening through closing delimiter...
                        i = closing + 1;
                        // ...and the blank separator line after it, if any.
                        if i < lines.len() && lines[i].trim().is_empty() {
                            i += 1;
                        }
                        continue;
                    }
                    Some(_) => in_formula = true,
                    None => {
                        // Dangling `$$` with nothing after it: drop just the delimiter.
                        i += 1;
                        continue;
                    }
                }
            }
        }

        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
        i += 1;
    }

    result
}
1343
1344/// Applies all post-processing steps to concatenated markdown.
1345///
1346/// This is the main entry point for cleaning up concatenated markdown output.
1347pub fn postprocess_markdown(markdown: &str) -> String {
1348 let markdown = filter_empty_formulas(markdown);
1349 let markdown = deduplicate_sections(&markdown);
1350
1351 // Apply text post-processing line by line for text content
1352 let mut result = String::new();
1353 let mut in_code_block = false;
1354 let mut in_formula = false;
1355
1356 for line in markdown.lines() {
1357 let trimmed = line.trim();
1358
1359 // Detect code blocks
1360 if trimmed.starts_with("```") {
1361 in_code_block = !in_code_block;
1362 result.push_str(line);
1363 result.push('\n');
1364 continue;
1365 }
1366
1367 // Detect formula blocks
1368 if trimmed == "$$" {
1369 in_formula = !in_formula;
1370 result.push_str(line);
1371 result.push('\n');
1372 continue;
1373 }
1374
1375 // Skip processing inside code/formula blocks
1376 if in_code_block || in_formula {
1377 result.push_str(line);
1378 result.push('\n');
1379 continue;
1380 }
1381
1382 // Process text content (skip headers, lists, etc.)
1383 if trimmed.starts_with('#')
1384 || trimmed.starts_with('*')
1385 || trimmed.starts_with('>')
1386 || trimmed.starts_with('|')
1387 || trimmed.starts_with('-')
1388 || trimmed.starts_with('+')
1389 {
1390 result.push_str(line);
1391 } else {
1392 result.push_str(&postprocess_text(line));
1393 }
1394 result.push('\n');
1395 }
1396
1397 result
1398}
1399
/// Extension trait for convenient multi-page processing.
///
/// Both items are associated functions (they take no `self`) that operate on a
/// slice of results. This file implements the trait for `StructureResult`.
pub trait StructureResultExt {
    /// Converts multiple results to a single concatenated markdown.
    ///
    /// # Arguments
    /// * `results` - the results to combine (presumably one per page, in page
    ///   order - confirm against callers).
    ///
    /// # Returns
    /// The combined markdown as one `String`.
    fn to_concatenated_markdown(results: &[Self]) -> String
    where
        Self: Sized;

    /// Saves multiple results with concatenated markdown.
    ///
    /// The `StructureResult` implementation creates `output_dir` if needed,
    /// writes each result into its own `page_NNN` sub-directory (zero-based,
    /// zero-padded to three digits), and then writes the combined
    /// `<base_name>.md` and/or `<base_name>.json` into `output_dir`.
    ///
    /// # Arguments
    /// * `results` - the results to save.
    /// * `output_dir` - directory that receives all output.
    /// * `base_name` - file stem used for the combined markdown / JSON files.
    /// * `to_json` - forwarded to the per-result save and also enables the
    ///   combined JSON file.
    /// * `to_markdown` - enables the combined markdown file.
    /// * `to_html` - forwarded to the per-result save.
    ///
    /// # Errors
    /// Returns the first I/O error encountered; the `StructureResult`
    /// implementation also surfaces JSON serialization failures through this
    /// `std::io::Result`.
    fn save_multi_page_results(
        results: &[Self],
        output_dir: impl AsRef<std::path::Path>,
        base_name: &str,
        to_json: bool,
        to_markdown: bool,
        to_html: bool,
    ) -> std::io::Result<()>
    where
        Self: Sized;
}
1419
1420impl StructureResultExt for StructureResult {
1421 fn to_concatenated_markdown(results: &[Self]) -> String {
1422 concatenate_markdown_pages(results)
1423 }
1424
1425 fn save_multi_page_results(
1426 results: &[Self],
1427 output_dir: impl AsRef<std::path::Path>,
1428 base_name: &str,
1429 to_json: bool,
1430 to_markdown: bool,
1431 to_html: bool,
1432 ) -> std::io::Result<()>
1433 where
1434 Self: Sized,
1435 {
1436 let output_dir = output_dir.as_ref();
1437 if !output_dir.exists() {
1438 std::fs::create_dir_all(output_dir)?;
1439 }
1440
1441 // Save individual page results
1442 for (idx, result) in results.iter().enumerate() {
1443 let page_dir = output_dir.join(format!("page_{:03}", idx));
1444 std::fs::create_dir_all(&page_dir)?;
1445 result.save_results(&page_dir, to_json, to_html)?;
1446 }
1447
1448 // Save concatenated markdown
1449 if to_markdown {
1450 let concat_md_path = output_dir.join(format!("{}.md", base_name));
1451 std::fs::write(concat_md_path, Self::to_concatenated_markdown(results))?;
1452 }
1453
1454 // Save concatenated JSON (array of results)
1455 if to_json {
1456 let concat_json_path = output_dir.join(format!("{}.json", base_name));
1457 let json_file = std::fs::File::create(concat_json_path)?;
1458 serde_json::to_writer_pretty(json_file, &results)?;
1459 }
1460
1461 Ok(())
1462 }
1463}
1464
/// A layout element detected in the document.
///
/// Build one with [`LayoutElement::new`], which leaves `label`, `text` and
/// `order_index` unset, and fill in the optional parts with `with_label` /
/// `with_text`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutElement {
    /// Bounding box of the element
    pub bbox: BoundingBox,
    /// Type of the layout element
    pub element_type: LayoutElementType,
    /// Confidence score for the detection
    pub confidence: f32,
    /// Optional label for the element (original model label)
    ///
    /// `apply_standardized_layout_label_fixes` overwrites this together with
    /// `element_type` when it reclassifies an element.
    pub label: Option<String>,
    /// Optional text content for the element
    pub text: Option<String>,
    /// Reading order index (1-based, assigned during stitching)
    ///
    /// This index represents the element's position in the reading order.
    /// Only elements that should be included in reading flow (text, tables,
    /// formulas, images, etc.) will have an order index assigned.
    /// Headers, footers, and other auxiliary elements may have `None`.
    pub order_index: Option<u32>,
}
1486
1487impl LayoutElement {
1488 /// Creates a new layout element.
1489 pub fn new(bbox: BoundingBox, element_type: LayoutElementType, confidence: f32) -> Self {
1490 Self {
1491 bbox,
1492 element_type,
1493 confidence,
1494 label: None,
1495 text: None,
1496 order_index: None,
1497 }
1498 }
1499
1500 /// Sets the label for the element.
1501 pub fn with_label(mut self, label: impl Into<String>) -> Self {
1502 self.label = Some(label.into());
1503 self
1504 }
1505
1506 /// Sets the text content for the element.
1507 pub fn with_text(mut self, text: impl Into<String>) -> Self {
1508 self.text = Some(text.into());
1509 self
1510 }
1511}
1512
/// Layout element type supporting PP-StructureV3's full label set.
///
/// This enum represents both **semantic categories** and **fine-grained labels** for layout elements.
/// PP-StructureV3 models output 20 or 23 class labels depending on the model variant.
///
/// The original model-specific label is preserved in `LayoutElement.label` field.
///
/// # PP-StructureV3 Label Categories
///
/// **Document structure:**
/// - `DocTitle` - Document title (doc_title)
/// - `ParagraphTitle` - Section/paragraph title (paragraph_title)
/// - `Text` - General text content
/// - `Content` - Table of contents (content)
/// - `Abstract` - Abstract section
///
/// **Visual elements:**
/// - `Image` - Images/figures (image, figure)
/// - `Table` - Tables
/// - `Chart` - Charts/graphs
/// - `Formula` - Mathematical formulas
///
/// **Captions and titles:**
/// - `FigureTitle` - Figure caption (figure_title)
/// - `TableTitle` - Table caption (table_title)
/// - `ChartTitle` - Chart caption (chart_title)
/// - `FigureTableChartTitle` - Combined caption type
///
/// **Page structure:**
/// - `Header` - Page header
/// - `HeaderImage` - Header image
/// - `Footer` - Page footer
/// - `FooterImage` - Footer image
/// - `Footnote` - Footnotes
///
/// **Special elements:**
/// - `Seal` - Stamps/official seals
/// - `Number` - Page numbers
/// - `Reference` - References section
/// - `ReferenceContent` - Reference content
/// - `Algorithm` - Algorithm blocks
/// - `FormulaNumber` - Formula numbers
/// - `AsideText` - Marginal/aside text
/// - `List` - List items
///
/// **Block grouping:**
/// - `Region` - Generic document region block (PP-DocBlockLayout)
///
/// **Fallback:**
/// - `Other` - Unknown/unmapped labels
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LayoutElementType {
    /// Document title
    DocTitle,
    /// Paragraph/section title
    ParagraphTitle,
    /// General text content
    Text,
    /// Table of contents
    Content,
    /// Abstract section
    Abstract,

    /// Image or figure
    Image,
    /// Table
    Table,
    /// Chart or graph
    Chart,
    /// Mathematical formula
    Formula,

    /// Figure caption/title
    FigureTitle,
    /// Table caption/title
    TableTitle,
    /// Chart caption/title
    ChartTitle,
    /// Combined figure/table/chart title (PP-DocLayout)
    FigureTableChartTitle,

    /// Page header
    Header,
    /// Header image
    HeaderImage,
    /// Page footer
    Footer,
    /// Footer image
    FooterImage,
    /// Footnote
    Footnote,

    /// Stamp or official seal
    Seal,
    /// Page number
    Number,
    /// Reference section
    Reference,
    /// Reference content (PP-DocLayout_plus-L)
    ReferenceContent,
    /// Algorithm block
    Algorithm,
    /// Formula number
    FormulaNumber,
    /// Marginal/aside text
    AsideText,
    /// List items
    List,

    /// Generic document region block (PP-DocBlockLayout)
    /// Used for hierarchical layout ordering and block grouping
    Region,

    /// Other/unknown (original label preserved in LayoutElement.label)
    Other,
}
1625
1626impl LayoutElementType {
1627 /// Returns the string representation of the element type.
1628 ///
1629 /// This returns the PP-StructureV3 compatible label string.
1630 pub fn as_str(&self) -> &'static str {
1631 match self {
1632 // Document Structure
1633 LayoutElementType::DocTitle => "doc_title",
1634 LayoutElementType::ParagraphTitle => "paragraph_title",
1635 LayoutElementType::Text => "text",
1636 LayoutElementType::Content => "content",
1637 LayoutElementType::Abstract => "abstract",
1638
1639 // Visual Elements
1640 LayoutElementType::Image => "image",
1641 LayoutElementType::Table => "table",
1642 LayoutElementType::Chart => "chart",
1643 LayoutElementType::Formula => "formula",
1644
1645 // Captions
1646 LayoutElementType::FigureTitle => "figure_title",
1647 LayoutElementType::TableTitle => "table_title",
1648 LayoutElementType::ChartTitle => "chart_title",
1649 LayoutElementType::FigureTableChartTitle => "figure_table_chart_title",
1650
1651 // Page Structure
1652 LayoutElementType::Header => "header",
1653 LayoutElementType::HeaderImage => "header_image",
1654 LayoutElementType::Footer => "footer",
1655 LayoutElementType::FooterImage => "footer_image",
1656 LayoutElementType::Footnote => "footnote",
1657
1658 // Special Elements
1659 LayoutElementType::Seal => "seal",
1660 LayoutElementType::Number => "number",
1661 LayoutElementType::Reference => "reference",
1662 LayoutElementType::ReferenceContent => "reference_content",
1663 LayoutElementType::Algorithm => "algorithm",
1664 LayoutElementType::FormulaNumber => "formula_number",
1665 LayoutElementType::AsideText => "aside_text",
1666 LayoutElementType::List => "list",
1667
1668 // Region (PP-DocBlockLayout)
1669 LayoutElementType::Region => "region",
1670
1671 // Fallback
1672 LayoutElementType::Other => "other",
1673 }
1674 }
1675
1676 /// Creates a LayoutElementType from a string label with fine-grained mapping.
1677 ///
1678 /// This method maps model output labels to their corresponding fine-grained types,
1679 /// preserving the full PP-StructureV3 label set (20/23 classes).
1680 pub fn from_label(label: &str) -> Self {
1681 match label.to_lowercase().as_str() {
1682 // Document Structure
1683 "doc_title" => LayoutElementType::DocTitle,
1684 "paragraph_title" | "title" => LayoutElementType::ParagraphTitle,
1685 "text" | "paragraph" => LayoutElementType::Text,
1686 "content" => LayoutElementType::Content,
1687 "abstract" => LayoutElementType::Abstract,
1688
1689 // Visual Elements
1690 "image" | "figure" => LayoutElementType::Image,
1691 "table" => LayoutElementType::Table,
1692 "chart" | "flowchart" => LayoutElementType::Chart,
1693 "formula" | "equation" | "display_formula" | "inline_formula" => {
1694 LayoutElementType::Formula
1695 }
1696
1697 // Captions
1698 "figure_title" => LayoutElementType::FigureTitle,
1699 "table_title" => LayoutElementType::TableTitle,
1700 "chart_title" => LayoutElementType::ChartTitle,
1701 "figure_table_chart_title" | "caption" => LayoutElementType::FigureTableChartTitle,
1702
1703 // Page Structure
1704 "header" => LayoutElementType::Header,
1705 "header_image" => LayoutElementType::HeaderImage,
1706 "footer" => LayoutElementType::Footer,
1707 "footer_image" => LayoutElementType::FooterImage,
1708 "footnote" | "vision_footnote" => LayoutElementType::Footnote,
1709
1710 // Special Elements
1711 "seal" => LayoutElementType::Seal,
1712 "number" => LayoutElementType::Number,
1713 "reference" => LayoutElementType::Reference,
1714 "reference_content" => LayoutElementType::ReferenceContent,
1715 "algorithm" => LayoutElementType::Algorithm,
1716 "formula_number" => LayoutElementType::FormulaNumber,
1717 "aside_text" => LayoutElementType::AsideText,
1718 "list" => LayoutElementType::List,
1719 "vertical_text" => LayoutElementType::Text,
1720
1721 // Region (PP-DocBlockLayout)
1722 "region" => LayoutElementType::Region,
1723
1724 // Everything else maps to Other
1725 // The original label is preserved in LayoutElement.label
1726 _ => LayoutElementType::Other,
1727 }
1728 }
1729
1730 /// Returns the semantic category for this element type.
1731 ///
1732 /// This method groups fine-grained types into broader semantic categories,
1733 /// useful for processing logic that doesn't need fine-grained distinctions.
1734 ///
1735 /// # Categories
1736 ///
1737 /// - **Title**: DocTitle, ParagraphTitle
1738 /// - **Text**: Text, Content, Abstract
1739 /// - **Visual**: Image, Chart
1740 /// - **Table**: Table
1741 /// - **Caption**: FigureTitle, TableTitle, ChartTitle, FigureTableChartTitle
1742 /// - **Header**: Header, HeaderImage
1743 /// - **Footer**: Footer, FooterImage, Footnote
1744 /// - **Formula**: Formula, FormulaNumber
1745 /// - **Special**: Seal, Number, Reference, ReferenceContent, Algorithm, AsideText
1746 /// - **List**: List
1747 /// - **Other**: Other
1748 pub fn semantic_category(&self) -> &'static str {
1749 match self {
1750 // Title category
1751 LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle => "title",
1752
1753 // Text category
1754 LayoutElementType::Text | LayoutElementType::Content | LayoutElementType::Abstract => {
1755 "text"
1756 }
1757
1758 // Visual category
1759 LayoutElementType::Image | LayoutElementType::Chart => "visual",
1760
1761 // Table category
1762 LayoutElementType::Table => "table",
1763
1764 // Caption category
1765 LayoutElementType::FigureTitle
1766 | LayoutElementType::TableTitle
1767 | LayoutElementType::ChartTitle
1768 | LayoutElementType::FigureTableChartTitle => "caption",
1769
1770 // Header category
1771 LayoutElementType::Header | LayoutElementType::HeaderImage => "header",
1772
1773 // Footer category
1774 LayoutElementType::Footer
1775 | LayoutElementType::FooterImage
1776 | LayoutElementType::Footnote => "footer",
1777
1778 // Formula category
1779 LayoutElementType::Formula | LayoutElementType::FormulaNumber => "formula",
1780
1781 // Special category
1782 LayoutElementType::Seal
1783 | LayoutElementType::Number
1784 | LayoutElementType::Reference
1785 | LayoutElementType::ReferenceContent
1786 | LayoutElementType::Algorithm
1787 | LayoutElementType::AsideText => "special",
1788
1789 // List category
1790 LayoutElementType::List => "list",
1791
1792 // Region category (PP-DocBlockLayout)
1793 LayoutElementType::Region => "region",
1794
1795 // Other
1796 LayoutElementType::Other => "other",
1797 }
1798 }
1799
1800 /// Returns whether this element type is a title variant.
1801 pub fn is_title(&self) -> bool {
1802 matches!(
1803 self,
1804 LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle
1805 )
1806 }
1807
1808 /// Returns whether this element type is a visual element (image, chart, figure).
1809 pub fn is_visual(&self) -> bool {
1810 matches!(self, LayoutElementType::Image | LayoutElementType::Chart)
1811 }
1812
1813 /// Returns whether this element type is a caption variant.
1814 pub fn is_caption(&self) -> bool {
1815 matches!(
1816 self,
1817 LayoutElementType::FigureTitle
1818 | LayoutElementType::TableTitle
1819 | LayoutElementType::ChartTitle
1820 | LayoutElementType::FigureTableChartTitle
1821 )
1822 }
1823
1824 /// Returns whether this element type is a header variant.
1825 pub fn is_header(&self) -> bool {
1826 matches!(
1827 self,
1828 LayoutElementType::Header | LayoutElementType::HeaderImage
1829 )
1830 }
1831
1832 /// Returns whether this element type is a footer variant.
1833 pub fn is_footer(&self) -> bool {
1834 matches!(
1835 self,
1836 LayoutElementType::Footer
1837 | LayoutElementType::FooterImage
1838 | LayoutElementType::Footnote
1839 )
1840 }
1841
1842 /// Returns whether this element type is a formula variant.
1843 pub fn is_formula(&self) -> bool {
1844 matches!(
1845 self,
1846 LayoutElementType::Formula | LayoutElementType::FormulaNumber
1847 )
1848 }
1849
1850 /// Returns whether this element type contains text content that should be OCR'd.
1851 pub fn should_ocr(&self) -> bool {
1852 matches!(
1853 self,
1854 LayoutElementType::Text
1855 | LayoutElementType::Content
1856 | LayoutElementType::Abstract
1857 | LayoutElementType::DocTitle
1858 | LayoutElementType::ParagraphTitle
1859 | LayoutElementType::FigureTitle
1860 | LayoutElementType::TableTitle
1861 | LayoutElementType::ChartTitle
1862 | LayoutElementType::FigureTableChartTitle
1863 | LayoutElementType::Header
1864 | LayoutElementType::HeaderImage
1865 | LayoutElementType::Footer
1866 | LayoutElementType::FooterImage
1867 | LayoutElementType::Footnote
1868 | LayoutElementType::Reference
1869 | LayoutElementType::ReferenceContent
1870 | LayoutElementType::Algorithm
1871 | LayoutElementType::AsideText
1872 | LayoutElementType::List
1873 | LayoutElementType::Number
1874 )
1875 }
1876}
1877
1878/// Removes heavily-overlapping layout elements in-place.
1879///
1880/// This mirrors PP-Structure-style overlap suppression where text takes priority over images.
1881/// Returns the number of elements removed.
1882pub fn remove_overlapping_layout_elements(
1883 layout_elements: &mut Vec<LayoutElement>,
1884 overlap_threshold: f32,
1885) -> usize {
1886 use std::collections::HashSet;
1887
1888 if layout_elements.len() <= 1 {
1889 return 0;
1890 }
1891
1892 let bboxes: Vec<_> = layout_elements.iter().map(|e| e.bbox.clone()).collect();
1893 let labels: Vec<&str> = layout_elements
1894 .iter()
1895 .map(|e| e.element_type.as_str())
1896 .collect();
1897
1898 let remove_indices =
1899 crate::processors::get_overlap_removal_indices(&bboxes, &labels, overlap_threshold);
1900 if remove_indices.is_empty() {
1901 return 0;
1902 }
1903
1904 let remove_set: HashSet<usize> = remove_indices.into_iter().collect();
1905 let before = layout_elements.len();
1906
1907 let mut idx = 0;
1908 layout_elements.retain(|_| {
1909 let keep = !remove_set.contains(&idx);
1910 idx += 1;
1911 keep
1912 });
1913
1914 before.saturating_sub(layout_elements.len())
1915}
1916
1917/// Applies small, PP-Structure-style label fixes to layout elements.
1918///
1919/// This is intended to capture lightweight "glue" heuristics that shouldn't live in `predict`.
1920pub fn apply_standardized_layout_label_fixes(layout_elements: &mut [LayoutElement]) {
1921 if layout_elements.is_empty() {
1922 return;
1923 }
1924
1925 let mut footnote_indices: Vec<usize> = Vec::new();
1926 let mut paragraph_title_indices: Vec<usize> = Vec::new();
1927 let mut bottom_text_y_max: f32 = 0.0;
1928 let mut max_block_area: f32 = 0.0;
1929 let mut doc_title_num: usize = 0;
1930
1931 for (idx, elem) in layout_elements.iter().enumerate() {
1932 let area =
1933 (elem.bbox.x_max() - elem.bbox.x_min()) * (elem.bbox.y_max() - elem.bbox.y_min());
1934 max_block_area = max_block_area.max(area);
1935
1936 match elem.element_type {
1937 LayoutElementType::Footnote => footnote_indices.push(idx),
1938 LayoutElementType::ParagraphTitle => paragraph_title_indices.push(idx),
1939 LayoutElementType::Text => {
1940 bottom_text_y_max = bottom_text_y_max.max(elem.bbox.y_max());
1941 }
1942 LayoutElementType::DocTitle => doc_title_num += 1,
1943 _ => {}
1944 }
1945 }
1946
1947 for idx in footnote_indices {
1948 if layout_elements[idx].bbox.y_max() < bottom_text_y_max {
1949 layout_elements[idx].element_type = LayoutElementType::Text;
1950 layout_elements[idx].label = Some("text".to_string());
1951 }
1952 }
1953
1954 let only_one_paragraph_title = paragraph_title_indices.len() == 1 && doc_title_num == 0;
1955 if only_one_paragraph_title {
1956 let idx = paragraph_title_indices[0];
1957 let area = (layout_elements[idx].bbox.x_max() - layout_elements[idx].bbox.x_min())
1958 * (layout_elements[idx].bbox.y_max() - layout_elements[idx].bbox.y_min());
1959
1960 let title_area_ratio_threshold = 0.3f32;
1961 if area > max_block_area * title_area_ratio_threshold {
1962 layout_elements[idx].element_type = LayoutElementType::DocTitle;
1963 layout_elements[idx].label = Some("doc_title".to_string());
1964 }
1965 }
1966}
1967
/// Result of table recognition.
///
/// Describes a single table: its location, its type, and whatever structure and
/// OCR output is available for it. Every `Option` field is `None` when the
/// corresponding data is unavailable; `TableResult::new` starts with all of them
/// unset and an empty cell list.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableResult {
    /// Bounding box of the table in the original image
    pub bbox: BoundingBox,
    /// Table type (wired or wireless)
    pub table_type: TableType,
    /// Confidence score for table type classification (None if classifier wasn't configured/run)
    pub classification_confidence: Option<f32>,
    /// Confidence score for table structure recognition (None if structure recognition failed)
    pub structure_confidence: Option<f32>,
    /// Detected table cells (empty when no cells have been attached)
    pub cells: Vec<TableCell>,
    /// HTML structure of the table (if available)
    pub html_structure: Option<String>,
    /// OCR text content for each cell (if OCR was integrated)
    ///
    /// An inner `None` marks an entry without text.
    /// NOTE(review): presumably index-aligned with `cells` — confirm against the producer.
    pub cell_texts: Option<Vec<Option<String>>>,
    /// Structure tokens from table structure recognition (used for HTML generation after stitching)
    ///
    /// Marked `#[serde(skip)]`: the tokens are never written when serializing and are
    /// not read back when deserializing.
    #[serde(skip)]
    pub structure_tokens: Option<Vec<String>>,
}
1989
1990impl TableResult {
1991 /// Creates a new table result.
1992 pub fn new(bbox: BoundingBox, table_type: TableType) -> Self {
1993 Self {
1994 bbox,
1995 table_type,
1996 classification_confidence: None,
1997 structure_confidence: None,
1998 cells: Vec::new(),
1999 html_structure: None,
2000 cell_texts: None,
2001 structure_tokens: None,
2002 }
2003 }
2004
2005 /// Sets the classification confidence.
2006 pub fn with_classification_confidence(mut self, confidence: f32) -> Self {
2007 self.classification_confidence = Some(confidence);
2008 self
2009 }
2010
2011 /// Sets the structure recognition confidence.
2012 pub fn with_structure_confidence(mut self, confidence: f32) -> Self {
2013 self.structure_confidence = Some(confidence);
2014 self
2015 }
2016
2017 /// Sets the table cells.
2018 pub fn with_cells(mut self, cells: Vec<TableCell>) -> Self {
2019 self.cells = cells;
2020 self
2021 }
2022
2023 /// Sets the HTML structure.
2024 pub fn with_html_structure(mut self, html: impl Into<String>) -> Self {
2025 self.html_structure = Some(html.into());
2026 self
2027 }
2028
2029 /// Sets the cell texts from OCR.
2030 pub fn with_cell_texts(mut self, texts: Vec<Option<String>>) -> Self {
2031 self.cell_texts = Some(texts);
2032 self
2033 }
2034
2035 /// Sets the structure tokens for later HTML generation.
2036 pub fn with_structure_tokens(mut self, tokens: Vec<String>) -> Self {
2037 self.structure_tokens = Some(tokens);
2038 self
2039 }
2040
2041 /// Returns the best available confidence score for this table.
2042 ///
2043 /// This method provides a unified confidence API for callers who want to filter
2044 /// tables by confidence without caring whether classification or structure
2045 /// recognition was used. Priority:
2046 /// 1. If both classification and structure confidence are available, returns
2047 /// the minimum (most conservative estimate)
2048 /// 2. If only structure confidence is available (common when classifier isn't
2049 /// configured), returns that
2050 /// 3. If only classification confidence is available, returns that
2051 /// 4. Returns `None` only if neither confidence is available (stub result)
2052 pub fn confidence(&self) -> Option<f32> {
2053 match (self.classification_confidence, self.structure_confidence) {
2054 (Some(cls), Some(str)) => Some(cls.min(str)),
2055 (None, Some(str)) => Some(str),
2056 (Some(cls), None) => Some(cls),
2057 (None, None) => None,
2058 }
2059 }
2060
2061 /// Returns true if this table has valid structure data.
2062 ///
2063 /// A table is considered valid if it has either cells or an HTML structure.
2064 /// Stub results (created when structure recognition fails) will return false.
2065 pub fn has_structure(&self) -> bool {
2066 !self.cells.is_empty() || self.html_structure.is_some()
2067 }
2068}
2069
/// Type of table.
///
/// Distinguishes tables by whether their borders are visible. The type is `Copy`
/// and supports `==` comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TableType {
    /// Table with visible borders
    Wired,
    /// Table without visible borders
    Wireless,
    /// Unknown table type
    Unknown,
}
2080
/// A cell in a table.
///
/// Only the bounding box and the detection confidence are mandatory. Grid position,
/// spans and text are optional and stay `None` until they are set (see
/// `with_position`, `with_span` and `with_text`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableCell {
    /// Bounding box of the cell
    pub bbox: BoundingBox,
    /// Row index (0-based); `None` when the position has not been assigned
    pub row: Option<usize>,
    /// Column index (0-based); `None` when the position has not been assigned
    pub col: Option<usize>,
    /// Row span; `None` when no span has been assigned
    pub row_span: Option<usize>,
    /// Column span; `None` when no span has been assigned
    pub col_span: Option<usize>,
    /// Confidence score for the cell detection
    pub confidence: f32,
    /// Text content of the cell (if available)
    pub text: Option<String>,
}
2099
2100impl TableCell {
2101 /// Creates a new table cell.
2102 pub fn new(bbox: BoundingBox, confidence: f32) -> Self {
2103 Self {
2104 bbox,
2105 row: None,
2106 col: None,
2107 row_span: None,
2108 col_span: None,
2109 confidence,
2110 text: None,
2111 }
2112 }
2113
2114 /// Sets the row and column indices.
2115 pub fn with_position(mut self, row: usize, col: usize) -> Self {
2116 self.row = Some(row);
2117 self.col = Some(col);
2118 self
2119 }
2120
2121 /// Sets the row and column spans.
2122 pub fn with_span(mut self, row_span: usize, col_span: usize) -> Self {
2123 self.row_span = Some(row_span);
2124 self.col_span = Some(col_span);
2125 self
2126 }
2127
2128 /// Sets the text content.
2129 pub fn with_text(mut self, text: impl Into<String>) -> Self {
2130 self.text = Some(text.into());
2131 self
2132 }
2133}
2134
/// Result of formula recognition.
///
/// Pairs the location of a formula with its recognized LaTeX source and the
/// confidence score of that recognition.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormulaResult {
    /// Bounding box of the formula in the original image
    pub bbox: BoundingBox,
    /// LaTeX representation of the formula
    pub latex: String,
    /// Confidence score for the recognition
    pub confidence: f32,
}
2145
2146impl FormulaResult {
2147 /// Creates a new formula result.
2148 pub fn new(bbox: BoundingBox, latex: impl Into<String>, confidence: f32) -> Self {
2149 Self {
2150 bbox,
2151 latex: latex.into(),
2152 confidence,
2153 }
2154 }
2155}
2156
#[cfg(test)]
mod tests {
    use super::*;

    // A 100x100 box anchored at the origin, shared by the tests below.
    fn sample_bbox() -> BoundingBox {
        BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0)
    }

    // A freshly created structure result carries the given path and index and has
    // no analysis output attached yet.
    #[test]
    fn test_structure_result_creation() {
        let fresh = StructureResult::new("test.jpg", 0);

        assert_eq!(fresh.input_path.as_ref(), "test.jpg");
        assert_eq!(fresh.index, 0);
        assert!(fresh.layout_elements.is_empty());
        assert!(fresh.tables.is_empty());
        assert!(fresh.formulas.is_empty());
        assert!(fresh.text_regions.is_none());
    }

    // Spot-checks the string names of a few layout element types.
    #[test]
    fn test_layout_element_type_as_str() {
        let expectations = [
            (LayoutElementType::Text, "text"),
            (LayoutElementType::Table, "table"),
            (LayoutElementType::Formula, "formula"),
        ];

        for (element_type, expected) in expectations {
            assert_eq!(element_type.as_str(), expected);
        }
    }

    // A new table result keeps its type and starts without cells or HTML.
    #[test]
    fn test_table_result_creation() {
        let wired = TableResult::new(sample_bbox(), TableType::Wired);

        assert_eq!(wired.table_type, TableType::Wired);
        assert!(wired.cells.is_empty());
        assert!(wired.html_structure.is_none());
    }

    // A document title plus one paragraph must show up in both the Markdown and
    // the HTML export.
    #[test]
    fn test_structure_result_export() {
        let heading = LayoutElement::new(sample_bbox(), LayoutElementType::DocTitle, 1.0)
            .with_text("Test Document");
        let paragraph =
            LayoutElement::new(sample_bbox(), LayoutElementType::Text, 1.0).with_text("Hello world");

        let document =
            StructureResult::new("test.jpg", 0).with_layout_elements(vec![heading, paragraph]);

        let markdown = document.to_markdown();
        assert!(markdown.contains("# Test Document"));
        assert!(markdown.contains("Hello world"));

        let rendered_html = document.to_html();
        assert!(rendered_html.contains("<h1>Test Document</h1>"));
        assert!(rendered_html.contains("<p>Hello world</p>"));
    }
}