1use super::text_region::TextRegion;
7use crate::processors::BoundingBox;
8use image::RgbImage;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use std::path::Path;
14use std::sync::Arc;
15
16static TITLE_NUMBERING_REGEX: Lazy<Regex> = Lazy::new(|| {
19 Regex::new(
20 r"(?x)
21 ^\s*
22 (
23 # Arabic numerals: 1, 1.2, 1.2.3, etc.
24 [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?
25 |
26 # Parenthesized Arabic numerals: (1), (1.2), etc.
27 [((][1-9][0-9]*(?:\.[1-9][0-9]*)*[))]
28 |
29 # Chinese numerals with punctuation: 一、 二、
30 [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾][、.]?
31 |
32 # Parenthesized Chinese numerals: (一)
33 [((][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[))]
34 |
35 # Roman numerals with delimiter (period or followed by space)
36 (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b)
37 )
38 (\s+)
39 (.*)
40 $
41 ",
42 )
43 .unwrap_or_else(|e| panic!("Invalid title numbering regex: {e}"))
44});
45
46fn semantic_title_level_and_format(cleaned: &str) -> Option<(usize, String)> {
63 let trimmed = cleaned.trim();
64
65 let keyword = trimmed.trim_end_matches(':').to_ascii_uppercase();
67 if matches!(
68 keyword.as_str(),
69 "ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
70 ) {
71 return Some((2, trimmed.to_string()));
72 }
73
74 if let Some(captures) = TITLE_NUMBERING_REGEX.captures(cleaned) {
75 let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or("");
76 let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or("");
77
78 let dot_count = numbering.matches('.').count();
79 let level = (dot_count + 2).clamp(2, 6);
80
81 let formatted = if title_content.is_empty() {
82 numbering.trim_end_matches('.').to_string()
83 } else {
84 format!(
85 "{} {}",
86 numbering.trim_end_matches('.'),
87 title_content.trim_start()
88 )
89 };
90 return Some((level, formatted));
91 }
92
93 None
94}
95
96fn semantic_title_level(text: &str) -> Option<usize> {
97 let cleaned = text.replace("-\n", "").replace('\n', " ");
98 semantic_title_level_and_format(&cleaned).map(|(level, _)| level)
99}
100
101fn format_title_with_level(title: &str, clustered_level: Option<usize>) -> (usize, String) {
102 let cleaned = title.replace("-\n", "").replace('\n', " ");
104 if let Some((level, formatted)) = semantic_title_level_and_format(&cleaned) {
105 return (level, formatted);
106 }
107
108 let level = clustered_level.unwrap_or(2).clamp(2, 6);
110 (level, cleaned)
111}
112
/// Infers a heading level for every `ParagraphTitle` element by majority vote.
///
/// Three independent signals are combined per title:
/// * the level derived from the title text (`semantic_title_level`), worth 2 votes;
/// * a level from clustering the per-line height of the title box, worth 1 vote;
/// * a level from clustering the left edge (`x_min`) of the title box, worth 1 vote.
///
/// Returns a map from the element's index in `elements` to its level, which is
/// always within `1..=6`. The map is empty when there are no paragraph titles.
fn infer_paragraph_title_levels(elements: &[LayoutElement]) -> HashMap<usize, usize> {
    let title_indices: Vec<usize> = elements
        .iter()
        .enumerate()
        .filter(|(_, e)| e.element_type == LayoutElementType::ParagraphTitle)
        .map(|(idx, _)| idx)
        .collect();
    if title_indices.is_empty() {
        return HashMap::new();
    }

    // Per-line height of each title box: box height divided by its line count
    // (a missing or zero line count is treated as 1), floored at 1.0.
    let height_samples: Vec<(usize, f32)> = title_indices
        .iter()
        .filter_map(|&idx| {
            let e = &elements[idx];
            let height = (e.bbox.y_max() - e.bbox.y_min()).max(1.0);
            let line_h = height / e.num_lines.unwrap_or(1).max(1) as f32;
            let v = line_h.max(1.0);
            if v.is_finite() { Some((idx, v)) } else { None }
        })
        .collect();

    // Left edge of each title box, used as an indentation signal.
    let indent_samples: Vec<(usize, f32)> = title_indices
        .iter()
        .filter_map(|&idx| {
            let x = elements[idx].bbox.x_min();
            if x.is_finite() { Some((idx, x)) } else { None }
        })
        .collect();
    // Levels that can be read directly off the title text (keywords / numbering).
    let semantic_levels: HashMap<usize, usize> = title_indices
        .iter()
        .filter_map(|&idx| {
            elements[idx]
                .text
                .as_deref()
                .and_then(semantic_title_level)
                .map(|level| (idx, level))
        })
        .collect();

    // Taller lines rank first (descending); smaller left offsets rank first (ascending).
    let font_levels = infer_levels_by_kmeans_feature(&height_samples, true);
    let relative_levels = infer_levels_by_kmeans_feature(&indent_samples, false);

    let mut voted = HashMap::new();
    for idx in title_indices {
        let semantic_level = semantic_levels.get(&idx).copied();
        let font_level = font_levels.get(&idx).copied();
        let relative_level = relative_levels.get(&idx).copied();

        // score[level] accumulates votes; slot 0 is unused because levels start at 1.
        let mut score = [0u8; 7];
        if let Some(level) = semantic_level {
            score[level.clamp(1, 6)] += 2;
        }
        if let Some(level) = font_level {
            score[level.clamp(1, 6)] += 1;
        }
        if let Some(level) = relative_level {
            score[level.clamp(1, 6)] += 1;
        }

        let mut best_level = semantic_level.unwrap_or(2);
        let mut best_score = 0u8;
        for (level, &s) in score.iter().enumerate().skip(1) {
            if s > best_score {
                best_score = s;
                best_level = level;
            } else if s == best_score && s > 0 {
                // Tie-break: the semantic level wins; otherwise prefer the
                // shallower (numerically smaller) level.
                let is_semantic = semantic_level == Some(level);
                let best_is_semantic = semantic_level == Some(best_level);
                if (is_semantic && !best_is_semantic)
                    || (is_semantic == best_is_semantic && level < best_level)
                {
                    best_level = level;
                }
            }
        }

        // No signal voted at all: fall back to whichever signal exists, else level 2.
        if best_score == 0 {
            best_level = semantic_level
                .or(font_level)
                .or(relative_level)
                .unwrap_or(2);
        }

        voted.insert(idx, best_level.clamp(1, 6));
    }

    voted
}
208
/// Clusters a scalar feature with 1-D k-means and converts cluster rank into a
/// heading level.
///
/// `samples` pairs an element index with its feature value; non-finite values
/// are ignored. The number of clusters is the number of distinct values
/// (differences above `1e-3`), capped at 4. Clusters are ranked by centroid,
/// largest first when `descending` is true and smallest first otherwise; rank 0
/// becomes level 2, rank 1 level 3, and so on (never above 6).
///
/// Returns an empty map when fewer than two usable samples exist or when all
/// values fall into a single cluster.
fn infer_levels_by_kmeans_feature(
    samples: &[(usize, f32)],
    descending: bool,
) -> HashMap<usize, usize> {
    const MAX_CLUSTERS: usize = 4;
    const ITERATIONS: usize = 16;
    const DISTINCT_EPS: f32 = 1e-3;

    let usable: Vec<(usize, f32)> = samples
        .iter()
        .copied()
        .filter(|&(_, value)| value.is_finite())
        .collect();
    if usable.len() < 2 {
        return HashMap::new();
    }

    let mut sorted: Vec<f32> = usable.iter().map(|&(_, value)| value).collect();
    sorted.sort_by(|lhs, rhs| lhs.total_cmp(rhs));

    let distinct = 1 + sorted
        .windows(2)
        .filter(|pair| (pair[1] - pair[0]).abs() > DISTINCT_EPS)
        .count();
    let k = distinct.clamp(1, MAX_CLUSTERS).min(usable.len());
    if k <= 1 {
        return HashMap::new();
    }

    // Seed the centroids at evenly spaced quantiles of the sorted values.
    let total = sorted.len();
    let mut centroids: Vec<f32> = (0..k)
        .map(|cluster| {
            let pos = ((cluster as f32 + 0.5) / k as f32 * total as f32).floor() as usize;
            sorted[pos.min(total - 1)]
        })
        .collect();

    // Index of the closest centroid; the first one wins on equal distance.
    let nearest = |value: f32, centroids: &[f32]| -> usize {
        centroids
            .iter()
            .enumerate()
            .fold((0usize, f32::INFINITY), |(best, best_dist), (idx, c)| {
                let dist = (value - c).abs();
                if dist < best_dist {
                    (idx, dist)
                } else {
                    (best, best_dist)
                }
            })
            .0
    };

    for _ in 0..ITERATIONS {
        let mut sums = vec![0.0f32; k];
        let mut counts = vec![0usize; k];
        for &(_, value) in &usable {
            let cluster = nearest(value, &centroids);
            sums[cluster] += value;
            counts[cluster] += 1;
        }
        // Empty clusters keep their previous centroid.
        for (cluster, centroid) in centroids.iter_mut().enumerate() {
            if counts[cluster] > 0 {
                *centroid = sums[cluster] / counts[cluster] as f32;
            }
        }
    }

    // Rank clusters by centroid value (stable sort keeps index order on ties).
    let mut order: Vec<usize> = (0..k).collect();
    order.sort_by(|&a, &b| {
        if descending {
            centroids[b].total_cmp(&centroids[a])
        } else {
            centroids[a].total_cmp(&centroids[b])
        }
    });
    let mut rank_of = vec![0usize; k];
    for (rank, &cluster) in order.iter().enumerate() {
        rank_of[cluster] = rank;
    }

    usable
        .iter()
        .map(|&(element_idx, value)| {
            let rank = rank_of[nearest(value, &centroids)];
            (element_idx, (rank + 2).clamp(2, 6))
        })
        .collect()
}
298
/// A detected region of the page together with the layout elements assigned to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegionBlock {
    /// Bounding box of the region.
    pub bbox: BoundingBox,
    /// Confidence score attached to the region (range not visible here — TODO confirm).
    pub confidence: f32,
    /// Optional ordering index of the region; presumably its reading-order position — verify against the producer.
    pub order_index: Option<u32>,
    /// Indices of the elements belonging to this region; presumably indices into
    /// `StructureResult::layout_elements` — verify against the producer.
    pub element_indices: Vec<usize>,
}
321
/// Per-page flags describing whether the page's text begins and ends on a
/// paragraph boundary. `concatenate_markdown_pages` uses them to decide whether
/// consecutive pages are joined inline or separated by a blank line.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContinuationFlags {
    /// `true` when the first text element of the page starts a new paragraph
    /// (i.e. it does not continue text from the previous page).
    pub paragraph_start: bool,
    /// `true` when the last text element of the page completes its paragraph
    /// (i.e. it does not continue onto the next page).
    pub paragraph_end: bool,
}
337
338impl PageContinuationFlags {
339 pub fn new(paragraph_start: bool, paragraph_end: bool) -> Self {
340 Self {
341 paragraph_start,
342 paragraph_end,
343 }
344 }
345
346 pub fn as_tuple(&self) -> (bool, bool) {
348 (self.paragraph_start, self.paragraph_end)
349 }
350}
351
/// Structure-analysis result for a single input page or image.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructureResult {
    /// Path of the input this result was produced from. `save_results` derives
    /// the output file stem from it and treats a trailing `#<suffix>` as a page
    /// identifier.
    pub input_path: Arc<str>,
    /// Index of this result (presumably its position within a batch — TODO confirm).
    pub index: usize,
    /// Layout elements of the page, in the order used for Markdown/HTML export.
    pub layout_elements: Vec<LayoutElement>,
    /// Recognized tables; export matches them to `Table` layout elements by
    /// bounding-box IoU greater than 0.5.
    pub tables: Vec<TableResult>,
    /// Recognized formulas.
    pub formulas: Vec<FormulaResult>,
    /// Optional text regions for the page.
    pub text_regions: Option<Vec<TextRegion>>,
    /// Optional orientation angle of the page (units not visible here — TODO confirm).
    pub orientation_angle: Option<f32>,
    /// Optional region blocks grouping layout elements.
    pub region_blocks: Option<Vec<RegionBlock>>,
    /// Optional rectified page image; excluded from (de)serialization. When
    /// present, its width is used as the page width by `to_markdown` and
    /// `calculate_continuation_flags`.
    #[serde(skip)]
    pub rectified_img: Option<Arc<RgbImage>>,
    /// Optional precomputed continuation flags; `concatenate_markdown_pages`
    /// falls back to `calculate_continuation_flags` when this is `None`.
    pub page_continuation_flags: Option<PageContinuationFlags>,
}
403
404impl StructureResult {
405 pub fn new(input_path: impl Into<Arc<str>>, index: usize) -> Self {
407 Self {
408 input_path: input_path.into(),
409 index,
410 layout_elements: Vec::new(),
411 tables: Vec::new(),
412 formulas: Vec::new(),
413 text_regions: None,
414 orientation_angle: None,
415 region_blocks: None,
416 rectified_img: None,
417 page_continuation_flags: None,
418 }
419 }
420
421 pub fn with_layout_elements(mut self, elements: Vec<LayoutElement>) -> Self {
423 self.layout_elements = elements;
424 self
425 }
426
427 pub fn with_tables(mut self, tables: Vec<TableResult>) -> Self {
429 self.tables = tables;
430 self
431 }
432
433 pub fn with_formulas(mut self, formulas: Vec<FormulaResult>) -> Self {
435 self.formulas = formulas;
436 self
437 }
438
439 pub fn with_text_regions(mut self, regions: Vec<TextRegion>) -> Self {
441 self.text_regions = Some(regions);
442 self
443 }
444
445 pub fn with_region_blocks(mut self, blocks: Vec<RegionBlock>) -> Self {
450 self.region_blocks = Some(blocks);
451 self
452 }
453
454 pub fn with_page_continuation_flags(mut self, flags: PageContinuationFlags) -> Self {
456 self.page_continuation_flags = Some(flags);
457 self
458 }
459
460 pub fn to_markdown(&self) -> String {
472 let table_bboxes: Vec<&BoundingBox> = self
474 .layout_elements
475 .iter()
476 .filter(|e| e.element_type == LayoutElementType::Table)
477 .map(|e| &e.bbox)
478 .collect();
479
480 let original_image_width = self
482 .rectified_img
483 .as_ref()
484 .map(|img| img.width() as f32)
485 .or_else(|| {
486 self.layout_elements
488 .iter()
489 .map(|e| e.bbox.x_max())
490 .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
491 })
492 .unwrap_or(1.0);
493
494 let mut md = String::new();
495 let elements = &self.layout_elements;
496 let paragraph_title_levels = infer_paragraph_title_levels(elements);
497 let mut prev_text_element: Option<&LayoutElement> = None;
500
501 for (idx, element) in elements.iter().enumerate() {
502 if matches!(
504 element.element_type,
505 LayoutElementType::Number
506 | LayoutElementType::Footnote
507 | LayoutElementType::Header
508 | LayoutElementType::HeaderImage
509 | LayoutElementType::Footer
510 | LayoutElementType::FooterImage
511 | LayoutElementType::AsideText
512 ) {
513 continue;
514 }
515
516 if element.element_type == LayoutElementType::Text {
520 let overlaps_table = table_bboxes.iter().any(|table_bbox| {
521 element.bbox.ioa(table_bbox) > 0.3 });
523
524 if overlaps_table && element.confidence < 0.7 {
527 continue;
528 }
529 }
530
531 let seg_start_flag = get_seg_flag(element, prev_text_element);
535
536 let is_continuation = element.element_type == LayoutElementType::Text
537 && prev_text_element.is_some()
538 && !seg_start_flag;
539
540 if !is_continuation {
542 }
544
545 match element.element_type {
546 LayoutElementType::DocTitle => {
548 if !md.is_empty() {
549 md.push_str("\n\n");
550 }
551 if let Some(text) = &element.text {
552 let cleaned = clean_ocr_text(text);
553 let keyword = cleaned.trim().trim_end_matches(':').to_ascii_uppercase();
555 if matches!(
556 keyword.as_str(),
557 "ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
558 ) {
559 md.push_str("## ");
560 } else {
561 md.push_str("# ");
562 }
563 md.push_str(&cleaned);
564 }
565 }
566 LayoutElementType::ParagraphTitle => {
568 if !md.is_empty() {
569 md.push_str("\n\n");
570 }
571 if let Some(text) = &element.text {
572 let cleaned = clean_ocr_text(text);
573 let clustered = paragraph_title_levels.get(&idx).copied();
574 let (level, formatted_title) = format_title_with_level(&cleaned, clustered);
575 for _ in 0..level {
576 md.push('#');
577 }
578 md.push(' ');
579 md.push_str(&formatted_title);
580 } else {
581 md.push_str("## ");
582 }
583 }
584 LayoutElementType::Table => {
587 if !md.is_empty() {
588 md.push_str("\n\n");
589 }
590 if let Some(table) =
591 self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
592 {
593 if let Some(html) = &table.html_structure {
594 let simplified = simplify_table_html(html);
596 let table_with_border =
597 simplified.replacen("<table>", "<table border=\"1\">", 1);
598 let cleaned = clean_ocr_text(&table_with_border);
600 md.push_str("<div style=\"text-align: center;\">");
601 md.push_str(&cleaned);
602 md.push_str("</div>");
603 } else {
604 md.push_str("[Table]");
605 }
606 } else {
607 md.push_str("[Table]");
608 }
609 }
610 LayoutElementType::FormulaNumber => {
614 continue;
615 }
616 LayoutElementType::Formula => {
618 let raw_content = element.text.as_deref().map(|s| s.trim()).unwrap_or("");
620 if raw_content.is_empty() {
621 continue;
622 }
623 let latex_content =
625 if raw_content.starts_with("$$") && raw_content.ends_with("$$") {
626 &raw_content[2..raw_content.len() - 2]
627 } else if raw_content.starts_with('$') && raw_content.ends_with('$') {
628 &raw_content[1..raw_content.len() - 1]
629 } else {
630 raw_content
631 };
632
633 let is_inline = {
640 let has_prev_text = (0..idx)
641 .rev()
642 .find(|&i| {
643 let t = elements[i].element_type;
644 !t.is_formula() && t != LayoutElementType::FormulaNumber
645 })
646 .is_some_and(|i| {
647 let prev = &elements[i];
648 (prev.element_type == LayoutElementType::Text
649 || prev.element_type == LayoutElementType::ReferenceContent)
650 && is_same_line(&element.bbox, &prev.bbox)
651 });
652
653 let has_next_text = ((idx + 1)..elements.len())
654 .find(|&i| {
655 let t = elements[i].element_type;
656 !t.is_formula() && t != LayoutElementType::FormulaNumber
657 })
658 .is_some_and(|i| {
659 let next = &elements[i];
660 (next.element_type == LayoutElementType::Text
661 || next.element_type == LayoutElementType::ReferenceContent)
662 && is_same_line(&element.bbox, &next.bbox)
663 });
664
665 has_prev_text && has_next_text
668 };
669
670 if is_inline {
671 md.push('$');
673 md.push_str(latex_content);
674 md.push_str("$ ");
675 } else {
676 if !md.is_empty() {
678 md.push_str("\n\n");
679 }
680 md.push_str("$$");
681 md.push_str(latex_content);
682 md.push_str("$$");
683 }
684 }
685 LayoutElementType::Image | LayoutElementType::Chart => {
687 if !md.is_empty() {
688 md.push_str("\n\n");
689 }
690 md.push_str("<div style=\"text-align: center;\"><img src=\"");
692 let img_name = format!(
694 "imgs/img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg",
695 if element.element_type == LayoutElementType::Chart {
696 "chart"
697 } else {
698 "image"
699 },
700 element.bbox.x_min(),
701 element.bbox.y_min(),
702 element.bbox.x_max(),
703 element.bbox.y_max()
704 );
705 md.push_str(&img_name);
706 md.push_str("\" alt=\"Image\" width=\"");
707 let image_width = element.bbox.x_max() - element.bbox.x_min();
709 let width_pct = (image_width / original_image_width * 100.0) as u32;
710 let width_pct = width_pct.clamp(1, 100);
711 md.push_str(&format!("{}%", width_pct));
712 md.push_str("\" /></div>");
713 }
714 LayoutElementType::Seal => {
716 if !md.is_empty() {
717 md.push_str("\n\n");
718 }
719 md.push_str("![Seal]");
720 if let Some(text) = &element.text {
721 md.push_str("\n> ");
722 md.push_str(text);
723 }
724 }
725 _ if element.element_type.is_caption() => {
727 if let Some(text) = &element.text {
728 if !md.is_empty() {
729 md.push_str("\n\n");
730 }
731 let cleaned = clean_ocr_text(text);
732 md.push_str("<div style=\"text-align: center;\">");
733 md.push_str(&cleaned);
734 md.push_str(" </div>");
735 }
736 }
737 LayoutElementType::Abstract => {
739 if let Some(text) = &element.text {
740 if !md.is_empty() {
741 md.push_str("\n\n");
742 }
743 let formatted = format_first_line(text, " ", &["abstract", "摘要"], "## ");
744 md.push_str(&formatted);
745 }
746 }
747 LayoutElementType::Reference => {
749 if let Some(text) = &element.text {
750 if !md.is_empty() {
751 md.push_str("\n\n");
752 }
753 let formatted =
754 format_first_line(text, "\n", &["references", "参考文献"], "## ");
755 md.push_str(&formatted);
756 }
757 }
758 LayoutElementType::Content => {
760 if let Some(text) = &element.text {
761 if !md.is_empty() {
762 md.push_str("\n\n");
763 }
764 let formatted = format_content_block(text);
765 md.push_str(&formatted);
766 }
767 }
768 LayoutElementType::Footnote => {
770 if let Some(text) = &element.text {
771 if !md.is_empty() {
772 md.push_str("\n\n");
773 }
774 let formatted = format_vision_footnote_block(text);
775 md.push_str(&formatted);
776 }
777 }
778 LayoutElementType::List => {
780 if let Some(text) = &element.text {
781 if !md.is_empty() {
782 md.push_str("\n\n");
783 }
784 let cleaned = format_text_block(text);
785 for line in cleaned.lines() {
787 let line = line.trim();
788 if !line.is_empty() {
789 md.push_str("- ");
790 md.push_str(line);
791 md.push('\n');
792 }
793 }
794 }
795 }
796 LayoutElementType::Algorithm => {
798 if let Some(text) = &element.text {
799 if !md.is_empty() {
800 md.push_str("\n\n");
801 }
802 md.push_str(text.trim_matches('\n'));
803 }
804 }
805 _ if element.element_type.is_header() || element.element_type.is_footer() => {
807 continue;
810 }
811 _ => {
813 if let Some(text) = &element.text {
814 let cleaned = clean_ocr_text(text);
815 if has_bullet_markers(&cleaned) {
816 if !md.is_empty() {
817 md.push_str("\n\n");
818 }
819 format_as_bullet_list(&cleaned, &mut md);
820 } else if is_continuation {
821 let formatted = format_text_block(text);
822 md.push_str(&formatted);
823 } else {
824 if !md.is_empty() {
825 md.push_str("\n\n");
826 }
827 let formatted = format_text_block(text);
828 md.push_str(&formatted);
829 }
830 }
831 }
832 }
833
834 if element.element_type == LayoutElementType::Text
835 || element.element_type == LayoutElementType::ReferenceContent
836 {
837 prev_text_element = Some(element);
838 }
839 }
840 md.trim().to_string()
841 }
842
843 pub fn calculate_continuation_flags(&self) -> PageContinuationFlags {
852 let elements = &self.layout_elements;
853
854 if elements.is_empty() {
855 return PageContinuationFlags::new(true, true);
856 }
857
858 let page_width = self
860 .rectified_img
861 .as_ref()
862 .map(|img| img.width() as f32)
863 .or_else(|| {
864 elements
865 .iter()
866 .map(|e| e.bbox.x_max())
867 .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
868 });
869
870 let text_elements: Vec<_> = elements
872 .iter()
873 .filter(|e| {
874 matches!(
875 e.element_type,
876 LayoutElementType::Text
877 | LayoutElementType::DocTitle
878 | LayoutElementType::ParagraphTitle
879 | LayoutElementType::Abstract
880 | LayoutElementType::Reference
881 )
882 })
883 .collect();
884
885 if text_elements.is_empty() {
886 return PageContinuationFlags::new(true, true);
887 }
888
889 let first = &text_elements[0];
891 let paragraph_start = is_new_paragraph_start(first, page_width);
892
893 let last = &text_elements[text_elements.len() - 1];
895 let paragraph_end = is_paragraph_complete(last, page_width);
896
897 PageContinuationFlags::new(paragraph_start, paragraph_end)
898 }
899
900 pub fn to_html(&self) -> String {
904 let mut html = String::from(
905 "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n",
906 );
907
908 for element in &self.layout_elements {
909 match element.element_type {
910 LayoutElementType::DocTitle => {
912 html.push_str("<h1>");
913 if let Some(text) = &element.text {
914 html.push_str(&Self::escape_html(text));
915 }
916 html.push_str("</h1>\n");
917 }
918 LayoutElementType::ParagraphTitle => {
920 html.push_str("<h2>");
921 if let Some(text) = &element.text {
922 html.push_str(&Self::escape_html(text));
923 }
924 html.push_str("</h2>\n");
925 }
926 LayoutElementType::Table => {
928 if let Some(table) =
929 self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
930 {
931 if let Some(table_html) = &table.html_structure {
932 let simplified = simplify_table_html(table_html);
934 let styled = simplified.replacen(
935 "<table>",
936 "<table border=\"1\" style=\"border-collapse: collapse;\">",
937 1,
938 );
939 html.push_str(&styled);
940 html.push('\n');
941 } else {
942 html.push_str("<p>[Table]</p>\n");
943 }
944 } else {
945 html.push_str("<p>[Table]</p>\n");
946 }
947 }
948 LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
950 html.push_str("<p class=\"formula\">$$");
951 if let Some(latex) = &element.text {
952 html.push_str(&Self::escape_html(latex));
953 }
954 html.push_str("$$</p>\n");
955 }
956 LayoutElementType::Image | LayoutElementType::Chart => {
958 html.push_str("<figure>\n<img alt=\"Figure\" />\n");
959 if let Some(caption) = &element.text {
960 html.push_str("<figcaption>");
961 html.push_str(&Self::escape_html(caption));
962 html.push_str("</figcaption>\n");
963 }
964 html.push_str("</figure>\n");
965 }
966 LayoutElementType::Seal => {
968 html.push_str("<figure class=\"seal\">\n<img alt=\"Seal\" />\n");
969 if let Some(text) = &element.text {
970 html.push_str("<figcaption>");
971 html.push_str(&Self::escape_html(text));
972 html.push_str("</figcaption>\n");
973 }
974 html.push_str("</figure>\n");
975 }
976 _ if element.element_type.is_caption() => {
978 if let Some(text) = &element.text {
979 html.push_str("<figcaption>");
980 html.push_str(&Self::escape_html(text));
981 html.push_str("</figcaption>\n");
982 }
983 }
984 LayoutElementType::Abstract => {
986 html.push_str("<section class=\"abstract\">\n<h3>Abstract</h3>\n<p>");
987 if let Some(text) = &element.text {
988 html.push_str(&Self::escape_html(text));
989 }
990 html.push_str("</p>\n</section>\n");
991 }
992 LayoutElementType::Reference | LayoutElementType::ReferenceContent => {
994 html.push_str("<section class=\"references\">\n<p>");
995 if let Some(text) = &element.text {
996 html.push_str(&Self::escape_html(text));
997 }
998 html.push_str("</p>\n</section>\n");
999 }
1000 LayoutElementType::List => {
1002 html.push_str("<ul>\n");
1003 if let Some(text) = &element.text {
1004 for line in text.lines() {
1005 html.push_str("<li>");
1006 html.push_str(&Self::escape_html(line));
1007 html.push_str("</li>\n");
1008 }
1009 }
1010 html.push_str("</ul>\n");
1011 }
1012 _ if element.element_type.is_header() => {
1014 html.push_str("<header>");
1015 if let Some(text) = &element.text {
1016 html.push_str(&Self::escape_html(text));
1017 }
1018 html.push_str("</header>\n");
1019 }
1020 _ if element.element_type.is_footer() => {
1022 html.push_str("<footer>");
1023 if let Some(text) = &element.text {
1024 html.push_str(&Self::escape_html(text));
1025 }
1026 html.push_str("</footer>\n");
1027 }
1028 _ => {
1030 if let Some(text) = &element.text {
1031 html.push_str("<p>");
1032 html.push_str(&Self::escape_html(text));
1033 html.push_str("</p>\n");
1034 }
1035 }
1036 }
1037 }
1038 html.push_str("</body>\n</html>");
1039 html
1040 }
1041
1042 fn escape_html(text: &str) -> String {
1044 text.replace('&', "&")
1045 .replace('<', "<")
1046 .replace('>', ">")
1047 .replace('"', """)
1048 .replace('\'', "'")
1049 }
1050
1051 pub fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
1053 serde_json::to_value(self)
1054 }
1055
1056 pub fn save_results(
1073 &self,
1074 output_dir: impl AsRef<Path>,
1075 to_json: bool,
1076 to_html: bool,
1077 ) -> std::io::Result<()> {
1078 let output_dir = output_dir.as_ref();
1079 if !output_dir.exists() {
1080 std::fs::create_dir_all(output_dir)?;
1081 }
1082
1083 let input_path = Path::new(self.input_path.as_ref());
1084 let stem = if let Some(path_str) = input_path.to_str() {
1086 if let Some(hash_idx) = path_str.rfind('#') {
1087 let base = &path_str[..hash_idx];
1089 let page_num = &path_str[hash_idx + 1..];
1090 let base_stem = Path::new(base)
1091 .file_stem()
1092 .and_then(|s| s.to_str())
1093 .unwrap_or("result");
1094 format!("{}_{}", base_stem, page_num)
1095 } else {
1096 input_path
1097 .file_stem()
1098 .and_then(|s| s.to_str())
1099 .unwrap_or("result")
1100 .to_string()
1101 }
1102 } else {
1103 "result".to_string()
1104 };
1105
1106 if to_json {
1108 let json_path = output_dir.join(format!("{}.json", stem));
1109 let json_file = std::fs::File::create(json_path)?;
1110 serde_json::to_writer_pretty(json_file, self)?;
1111 }
1112
1113 if to_html {
1115 let html_path = output_dir.join(format!("{}.html", stem));
1116 std::fs::write(html_path, self.to_html())?;
1117 }
1118
1119 Ok(())
1120 }
1121}
1122
1123fn get_seg_flag(current: &LayoutElement, prev: Option<&LayoutElement>) -> bool {
1137 const COORD_THRESHOLD: f32 = 10.0;
1138
1139 let seg_start = current.seg_start_x.unwrap_or(current.bbox.x_min());
1140 let mut context_left = current.bbox.x_min();
1141 let mut context_right = current.bbox.x_max();
1142
1143 if let Some(prev) = prev {
1144 let prev_seg_end = prev.seg_end_x.unwrap_or(prev.bbox.x_max());
1145 let prev_num_lines = prev.num_lines.unwrap_or(1);
1146
1147 let overlap_blocks = context_left < prev.bbox.x_max() && context_right > prev.bbox.x_min();
1149
1150 let edge_distance;
1151 if overlap_blocks {
1152 context_left = context_left.min(prev.bbox.x_min());
1153 context_right = context_right.max(prev.bbox.x_max());
1154 edge_distance = 0.0;
1155 } else {
1156 edge_distance = (current.bbox.x_min() - prev.bbox.x_max()).abs();
1157 }
1158
1159 let prev_end_space_small = (context_right - prev_seg_end).abs() < COORD_THRESHOLD;
1160 let current_start_space_small = seg_start - context_left < COORD_THRESHOLD;
1161 let prev_lines_more_than_one = prev_num_lines > 1;
1162 let blocks_close = edge_distance
1163 < (prev.bbox.x_max() - prev.bbox.x_min())
1164 .max(current.bbox.x_max() - current.bbox.x_min());
1165
1166 if prev_end_space_small
1167 && current_start_space_small
1168 && prev_lines_more_than_one
1169 && blocks_close
1170 {
1171 return false; }
1173
1174 true } else {
1176 if seg_start - context_left < COORD_THRESHOLD {
1178 return false; }
1180 true
1181 }
1182}
1183
1184fn is_new_paragraph_start(element: &LayoutElement, page_width: Option<f32>) -> bool {
1189 let left = element.bbox.x_min();
1190 let threshold = page_width.map_or(50.0, |w| w * 0.05); left <= threshold
1192}
1193
1194fn is_paragraph_complete(element: &LayoutElement, page_width: Option<f32>) -> bool {
1199 let right = element.bbox.x_max();
1200
1201 if let Some(width) = page_width {
1203 let right_margin = width * 0.1;
1204 return right <= (width - right_margin);
1205 }
1206
1207 true
1209}
1210
1211pub fn concatenate_markdown_pages(results: &[StructureResult]) -> String {
1224 if results.is_empty() {
1225 return String::new();
1226 }
1227
1228 if results.len() == 1 {
1229 return results[0].to_markdown();
1230 }
1231
1232 let mut markdown = String::new();
1233 let mut prev_page_end_flag = true; for result in results.iter() {
1236 let flags = result
1237 .page_continuation_flags
1238 .as_ref()
1239 .cloned()
1240 .unwrap_or_else(|| result.calculate_continuation_flags());
1241
1242 let page_markdown = result.to_markdown();
1243
1244 if page_markdown.trim().is_empty() {
1246 prev_page_end_flag = flags.paragraph_end;
1247 continue;
1248 }
1249
1250 let page_first_continues = !flags.paragraph_start;
1251 let _page_last_continues = !flags.paragraph_end;
1252
1253 if page_first_continues && !prev_page_end_flag {
1255 let last_char = markdown.chars().last();
1258 let first_char = page_markdown.chars().next();
1259
1260 let last_is_chinese = last_char.is_some_and(is_chinese_char);
1261 let first_is_chinese = first_char.is_some_and(is_chinese_char);
1262
1263 if !last_is_chinese && !first_is_chinese {
1264 markdown.push(' ');
1266 markdown.push_str(page_markdown.trim_start());
1267 } else {
1268 markdown.push_str(page_markdown.trim_start());
1270 }
1271 } else {
1272 if !markdown.is_empty() {
1274 markdown.push_str("\n\n");
1275 }
1276 markdown.push_str(&page_markdown);
1277 }
1278
1279 prev_page_end_flag = flags.paragraph_end;
1280 }
1281
1282 markdown.trim().to_string()
1283}
1284
/// Flattens OCR text to a single line: a hyphen directly followed by a newline
/// is removed (joining the split word) and every other newline becomes a space.
fn clean_ocr_text(text: &str) -> String {
    let mut flattened = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '-' if chars.peek() == Some(&'\n') => {
                // Drop both the hyphen and the line break.
                chars.next();
            }
            '\n' => flattened.push(' '),
            other => flattened.push(other),
        }
    }

    flattened
}
1298
/// Turns the first non-blank piece of `text` into a heading when it matches a
/// known section name.
///
/// `text` is split on `spliter`. Blank leading pieces are kept as they are. The
/// first piece with content is compared (ASCII case-insensitively, after
/// trimming) against `templates`; on a match it is replaced by
/// `heading_prefix + piece + "\n"`. All other pieces are left untouched and the
/// pieces are joined again with `spliter`.
fn format_first_line(
    text: &str,
    spliter: &str,
    templates: &[&str],
    heading_prefix: &str,
) -> String {
    let mut awaiting_first = true;

    let pieces: Vec<String> = text
        .split(spliter)
        .map(|piece| {
            if awaiting_first {
                let core = piece.trim();
                if !core.is_empty() {
                    awaiting_first = false;
                    if templates.iter().any(|&name| core.eq_ignore_ascii_case(name)) {
                        return format!("{}{}\n", heading_prefix, core);
                    }
                }
            }
            piece.to_string()
        })
        .collect();

    pieces.join(spliter)
}
1341
/// Prepares a text block for Markdown: hyphenated line breaks are joined,
/// double newlines are collapsed to one, and every remaining newline is then
/// expanded to a blank line so each source line becomes its own paragraph.
fn format_text_block(text: &str) -> String {
    let joined_words = text.replace("-\n", "");
    let single_breaks = joined_words.split("\n\n").collect::<Vec<_>>().join("\n");
    single_breaks.split('\n').collect::<Vec<_>>().join("\n\n")
}
1355
/// Formats a `Content` block line by line: a hyphen before a newline is
/// replaced by a space, and every newline is then prefixed with a space.
fn format_content_block(text: &str) -> String {
    let without_hyphens = text.split("-\n").collect::<Vec<_>>().join(" \n");
    without_hyphens.split('\n').collect::<Vec<_>>().join(" \n")
}
1366
/// Formats a footnote block: hyphenated line breaks are joined, double
/// newlines are collapsed to one, and each remaining newline becomes a blank
/// line.
fn format_vision_footnote_block(text: &str) -> String {
    let mut formatted = text.replace("-\n", "");
    formatted = formatted.replace("\n\n", "\n");
    formatted = formatted.replace('\n', "\n\n");
    formatted
}
1375
/// Characters treated as bullet markers: text containing any of them is
/// rendered as a Markdown list, split at each marker.
const BULLET_MARKERS: &[char] = &['•', '●', '◦', '▪', '◆'];
1378
1379fn has_bullet_markers(text: &str) -> bool {
1381 BULLET_MARKERS.iter().any(|&m| text.contains(m))
1382}
1383
1384fn format_as_bullet_list(text: &str, md: &mut String) {
1389 for item in text.split(|c: char| BULLET_MARKERS.contains(&c)) {
1390 let item = item.trim();
1391 if !item.is_empty() {
1392 md.push_str("- ");
1393 md.push_str(item);
1394 md.push('\n');
1395 }
1396 }
1397}
1398
/// Returns `true` for characters in the CJK Unified Ideographs block, its
/// Extension A block, or the supplementary ranges U+20000–U+2A6DF and
/// U+2A700–U+2EBEF.
fn is_chinese_char(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'
            | '\u{3400}'..='\u{4DBF}'
            | '\u{20000}'..='\u{2A6DF}'
            | '\u{2A700}'..='\u{2B73F}'
            | '\u{2B740}'..='\u{2B81F}'
            | '\u{2B820}'..='\u{2CEAF}'
            | '\u{2CEB0}'..='\u{2EBEF}'
    )
}
1415
/// Returns `true` for ASCII lowercase letters (`a`–`z`) only.
fn is_lowercase(c: char) -> bool {
    matches!(c, 'a'..='z')
}
1420
/// Returns `true` for ASCII uppercase letters (`A`–`Z`) only.
fn is_uppercase(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
1425
/// Returns `true` for ASCII decimal digits (`0`–`9`) only.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
1430
1431fn dehyphenate(text: &str) -> String {
1438 let mut result = String::with_capacity(text.len());
1439 let chars: Vec<char> = text.chars().collect();
1440 let len = chars.len();
1441 let mut i = 0;
1442
1443 let is_url_context = |pos: usize| -> bool {
1445 let start = pos.saturating_sub(10);
1446 let end = (pos + 5).min(len);
1447 let window: String = chars[start..end].iter().collect();
1448 window.contains("http") || window.contains("www") || window.contains("://")
1449 };
1450
1451 while i < len {
1452 if chars[i] == '-' {
1453 if is_url_context(i) {
1454 result.push('-');
1455 i += 1;
1456 continue;
1457 }
1458
1459 let is_artifact = if i + 1 < len && chars[i + 1] == '\n' {
1462 if i + 2 < len {
1464 is_lowercase(chars[i + 2])
1465 } else {
1466 false
1467 }
1468 } else {
1469 false
1470 };
1471
1472 if is_artifact {
1473 i += 1; } else {
1476 result.push('-');
1477 }
1478 } else {
1479 result.push(chars[i]);
1480 }
1481 i += 1;
1482 }
1483
1484 result
1485}
1486
1487fn fix_merged_words(text: &str) -> String {
1493 let mut result = String::with_capacity(text.len());
1494 let chars: Vec<char> = text.chars().collect();
1495 let mut i = 0;
1496
1497 while i < chars.len() {
1498 let current = chars[i];
1499
1500 if i > 0 {
1501 let prev = chars[i - 1];
1502
1503 if is_lowercase(prev) && is_lowercase(current) {
1506 if i > 1 && chars[i - 2] == '\'' {
1509 result.push(' ');
1510 }
1511 } else if is_lowercase(prev) && is_uppercase(current) {
1514 if i + 1 < chars.len() && is_lowercase(chars[i + 1]) {
1517 result.push(' ');
1518 }
1519 }
1520 else if ((is_digit(prev) || prev == '%') && is_uppercase(current))
1524 || (is_letter(prev)
1525 && is_digit(current)
1526 && i + 1 < chars.len()
1527 && is_letter(chars[i + 1]))
1528 {
1529 result.push(' ');
1530 }
1531 }
1532
1533 result.push(current);
1534 i += 1;
1535 }
1536
1537 result
1538}
1539
/// Reports whether `c` is an ASCII letter of either case.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic()
}
1544
/// Strips the `<html>`/`<body>` wrapper tags (opening and closing) from `html`,
/// leaving everything else untouched. Matching is case-sensitive.
fn simplify_table_html(html: &str) -> String {
    // Removed one after another, in this order.
    const WRAPPER_TAGS: [&str; 4] = ["<html>", "</html>", "<body>", "</body>"];

    WRAPPER_TAGS
        .iter()
        .fold(html.to_owned(), |markup, tag| markup.replace(*tag, ""))
}
1555
1556pub fn postprocess_text(text: &str) -> String {
1563 let text = dehyphenate(text);
1564 let text = fix_merged_words(&text);
1565
1566 let mut result = String::new();
1568 let mut in_space = false;
1569
1570 for c in text.chars() {
1571 if c.is_whitespace() {
1572 if !in_space && !result.is_empty() {
1573 result.push(' ');
1574 in_space = true;
1575 }
1576 } else {
1577 if c == '.' && !result.is_empty() {
1579 let last = result.chars().last().unwrap();
1580 if is_letter(last) || is_digit(last) {
1581 result.push('.');
1582 in_space = true;
1583 continue;
1584 }
1585 }
1586 if in_space && matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')' | ']' | '}') {
1588 result.pop(); result.push(c);
1590 continue;
1591 }
1592 result.push(c);
1593 in_space = false;
1594 }
1595 }
1596
1597 result
1598}
1599
/// Drops repeated bold section headers from `markdown`.
///
/// A line counts as a section header when, after trimming, it has the form
/// `**name**` with a non-empty `name`. The first occurrence of each name is
/// kept; later lines with the same name are removed. All other lines pass
/// through unchanged. Lines are re-joined with `\n`, and blank lines at the
/// very start of the output are not emitted.
fn deduplicate_sections(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    // Names borrow from `markdown`, so no per-header allocation is needed.
    let mut seen_sections: std::collections::HashSet<&str> = std::collections::HashSet::new();

    for line in markdown.lines() {
        let section_name = line
            .trim()
            .strip_prefix("**")
            .and_then(|rest| rest.strip_suffix("**"))
            .filter(|name| !name.is_empty());

        if let Some(name) = section_name {
            // `insert` returns false when the name was already recorded.
            if !seen_sections.insert(name) {
                continue;
            }
        }

        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }

    result
}
1638
1639fn is_same_line(bbox1: &BoundingBox, bbox2: &BoundingBox) -> bool {
1644 let y1_min = bbox1.y_min();
1645 let y1_max = bbox1.y_max();
1646 let y2_min = bbox2.y_min();
1647 let y2_max = bbox2.y_max();
1648
1649 let overlap_start = y1_min.max(y2_min);
1651 let overlap_end = y1_max.min(y2_max);
1652 let overlap = (overlap_end - overlap_start).max(0.0);
1653
1654 let height1 = y1_max - y1_min;
1656 let height2 = y2_max - y2_min;
1657 let min_height = height1.min(height2);
1658
1659 min_height > 0.0 && overlap / min_height > 0.5
1661}
1662
/// Removes display-formula blocks that contain nothing but blank lines.
///
/// A block starts at a line that trims to `$$` and ends at the next such line.
/// - If every line between the two delimiters is blank (or there are none),
///   both delimiters and the lines between them are removed, along with one
///   blank line directly after the block if present.
/// - Otherwise the whole block, delimiters included, is copied through
///   unchanged, so a closing `$$` is never mistaken for a new opening one.
/// - An opening `$$` with no closing delimiter is kept when any non-blank line
///   follows it, and dropped when only blank lines (or nothing) follow.
///
/// Lines are re-joined with `\n`, and blank lines at the very start of the
/// output are not emitted.
fn filter_empty_formulas(markdown: &str) -> String {
    fn is_delimiter(line: &str) -> bool {
        line.trim() == "$$"
    }

    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    let lines: Vec<&str> = markdown.lines().collect();
    let mut kept: Vec<&str> = Vec::with_capacity(lines.len());
    let mut i = 0;

    while i < lines.len() {
        if !is_delimiter(lines[i]) {
            kept.push(lines[i]);
            i += 1;
            continue;
        }

        // Index of the delimiter that closes the block opened at `i`, if any.
        let closing = lines[i + 1..]
            .iter()
            .position(|candidate| is_delimiter(candidate))
            .map(|offset| i + 1 + offset);

        match closing {
            Some(end) => {
                if lines[i + 1..end].iter().all(|inner| is_blank(inner)) {
                    // Empty formula: skip the block and one trailing blank line.
                    i = end + 1;
                    if i < lines.len() && is_blank(lines[i]) {
                        i += 1;
                    }
                } else {
                    kept.extend_from_slice(&lines[i..=end]);
                    i = end + 1;
                }
            }
            None => {
                // Unterminated block: keep the delimiter only if content follows.
                if lines[i + 1..].iter().any(|rest| !is_blank(rest)) {
                    kept.push(lines[i]);
                }
                i += 1;
            }
        }
    }

    let mut result = String::with_capacity(markdown.len());
    for line in kept {
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}
1726
1727pub fn postprocess_markdown(markdown: &str) -> String {
1731 let markdown = filter_empty_formulas(markdown);
1732 let markdown = deduplicate_sections(&markdown);
1733
1734 let mut result = String::new();
1736 let mut in_code_block = false;
1737 let mut in_formula = false;
1738
1739 for line in markdown.lines() {
1740 let trimmed = line.trim();
1741
1742 if trimmed.starts_with("```") {
1744 in_code_block = !in_code_block;
1745 result.push_str(line);
1746 result.push('\n');
1747 continue;
1748 }
1749
1750 if trimmed == "$$" {
1752 in_formula = !in_formula;
1753 result.push_str(line);
1754 result.push('\n');
1755 continue;
1756 }
1757
1758 if in_code_block {
1760 result.push_str(line);
1761 result.push('\n');
1762 continue;
1763 }
1764
1765 if in_formula {
1768 let contains_dollar = line.contains('$');
1771 let is_plain_text = line.split_whitespace().count() > 3 && !line.contains('\\');
1772
1773 if contains_dollar && is_plain_text {
1774 result.push_str(&line.replace('$', "\\$"));
1775 } else if contains_dollar {
1776 result.push_str(&line.replace('$', "\\$"));
1780 } else {
1781 result.push_str(line);
1782 }
1783 result.push('\n');
1784 continue;
1785 }
1786
1787 if trimmed.starts_with('#')
1789 || trimmed.starts_with('*')
1790 || trimmed.starts_with('>')
1791 || trimmed.starts_with('|')
1792 || trimmed.starts_with('-')
1793 || trimmed.starts_with('+')
1794 {
1795 result.push_str(line);
1796 } else {
1797 result.push_str(&postprocess_text(line));
1798 }
1799 result.push('\n');
1800 }
1801
1802 result
1803}
1804
/// Helpers that operate on a slice of per-page results rather than a single one.
pub trait StructureResultExt {
    /// Renders all of `results` as a single markdown string.
    fn to_concatenated_markdown(results: &[Self]) -> String
    where
        Self: Sized;

    /// Writes `results` under `output_dir`.
    ///
    /// `base_name` is the file stem for combined output files; `to_json`,
    /// `to_markdown` and `to_html` select which formats are written.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while creating directories or writing files.
    fn save_multi_page_results(
        results: &[Self],
        output_dir: impl AsRef<std::path::Path>,
        base_name: &str,
        to_json: bool,
        to_markdown: bool,
        to_html: bool,
    ) -> std::io::Result<()>
    where
        Self: Sized;
}
1824
1825impl StructureResultExt for StructureResult {
1826 fn to_concatenated_markdown(results: &[Self]) -> String {
1827 concatenate_markdown_pages(results)
1828 }
1829
1830 fn save_multi_page_results(
1831 results: &[Self],
1832 output_dir: impl AsRef<std::path::Path>,
1833 base_name: &str,
1834 to_json: bool,
1835 to_markdown: bool,
1836 to_html: bool,
1837 ) -> std::io::Result<()>
1838 where
1839 Self: Sized,
1840 {
1841 let output_dir = output_dir.as_ref();
1842 if !output_dir.exists() {
1843 std::fs::create_dir_all(output_dir)?;
1844 }
1845
1846 for (idx, result) in results.iter().enumerate() {
1848 let page_dir = output_dir.join(format!("page_{:03}", idx));
1849 std::fs::create_dir_all(&page_dir)?;
1850 result.save_results(&page_dir, to_json, to_html)?;
1851 }
1852
1853 if to_markdown {
1855 let concat_md_path = output_dir.join(format!("{}.md", base_name));
1856 std::fs::write(concat_md_path, Self::to_concatenated_markdown(results))?;
1857 }
1858
1859 if to_json {
1861 let concat_json_path = output_dir.join(format!("{}.json", base_name));
1862 let json_file = std::fs::File::create(concat_json_path)?;
1863 serde_json::to_writer_pretty(json_file, &results)?;
1864 }
1865
1866 Ok(())
1867 }
1868}
1869
/// One element of a page layout: a bounding box, its element type and a
/// confidence score, plus optional label, text and positional metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutElement {
    /// Bounding box of the element.
    pub bbox: BoundingBox,
    /// Element type of this element.
    pub element_type: LayoutElementType,
    /// Confidence score supplied at construction.
    pub confidence: f32,
    /// Optional label string; `None` until set (e.g. via `with_label`).
    pub label: Option<String>,
    /// Optional text content; `None` until set (e.g. via `with_text`).
    pub text: Option<String>,
    /// Optional ordering index.
    /// NOTE(review): presumably the reading-order position — confirm against the code that assigns it.
    pub order_index: Option<u32>,
    /// Omitted from serialized output when `None`.
    /// NOTE(review): presumably the x coordinate where the element's text starts — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seg_start_x: Option<f32>,
    /// Omitted from serialized output when `None`.
    /// NOTE(review): presumably the x coordinate where the element's text ends — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seg_end_x: Option<f32>,
    /// Omitted from serialized output when `None`.
    /// NOTE(review): presumably the number of text lines in the element — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_lines: Option<u32>,
}
1905
1906impl LayoutElement {
1907 pub fn new(bbox: BoundingBox, element_type: LayoutElementType, confidence: f32) -> Self {
1909 Self {
1910 bbox,
1911 element_type,
1912 confidence,
1913 label: None,
1914 text: None,
1915 order_index: None,
1916 seg_start_x: None,
1917 seg_end_x: None,
1918 num_lines: None,
1919 }
1920 }
1921
1922 pub fn with_label(mut self, label: impl Into<String>) -> Self {
1924 self.label = Some(label.into());
1925 self
1926 }
1927
1928 pub fn with_text(mut self, text: impl Into<String>) -> Self {
1930 self.text = Some(text.into());
1931 self
1932 }
1933}
1934
/// Kind of a layout element.
///
/// Each variant has a canonical snake_case label (see `as_str`) and belongs to
/// a coarser category (see `semantic_category`); the groups below follow those
/// categories.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LayoutElementType {
    // Titles and body text.
    /// Label `doc_title`.
    DocTitle,
    /// Label `paragraph_title`.
    ParagraphTitle,
    /// Label `text`.
    Text,
    /// Label `content`.
    Content,
    /// Label `abstract`.
    Abstract,

    // Visual, table and formula regions.
    /// Label `image`.
    Image,
    /// Label `table`.
    Table,
    /// Label `chart`.
    Chart,
    /// Label `formula`.
    Formula,

    // Captions.
    /// Label `figure_title`.
    FigureTitle,
    /// Label `table_title`.
    TableTitle,
    /// Label `chart_title`.
    ChartTitle,
    /// Label `figure_table_chart_title`.
    FigureTableChartTitle,

    // Page headers and footers.
    /// Label `header`.
    Header,
    /// Label `header_image`.
    HeaderImage,
    /// Label `footer`.
    Footer,
    /// Label `footer_image`.
    FooterImage,
    /// Label `footnote`.
    Footnote,

    // Special-purpose elements.
    /// Label `seal`.
    Seal,
    /// Label `number`.
    Number,
    /// Label `reference`.
    Reference,
    /// Label `reference_content`.
    ReferenceContent,
    /// Label `algorithm`.
    Algorithm,
    /// Label `formula_number`.
    FormulaNumber,
    /// Label `aside_text`.
    AsideText,
    /// Label `list`.
    List,

    /// Label `region`.
    Region,

    /// Label `other`; also what `from_label` returns for unrecognized labels.
    Other,
}
2047
2048impl LayoutElementType {
2049 pub fn as_str(&self) -> &'static str {
2053 match self {
2054 LayoutElementType::DocTitle => "doc_title",
2056 LayoutElementType::ParagraphTitle => "paragraph_title",
2057 LayoutElementType::Text => "text",
2058 LayoutElementType::Content => "content",
2059 LayoutElementType::Abstract => "abstract",
2060
2061 LayoutElementType::Image => "image",
2063 LayoutElementType::Table => "table",
2064 LayoutElementType::Chart => "chart",
2065 LayoutElementType::Formula => "formula",
2066
2067 LayoutElementType::FigureTitle => "figure_title",
2069 LayoutElementType::TableTitle => "table_title",
2070 LayoutElementType::ChartTitle => "chart_title",
2071 LayoutElementType::FigureTableChartTitle => "figure_table_chart_title",
2072
2073 LayoutElementType::Header => "header",
2075 LayoutElementType::HeaderImage => "header_image",
2076 LayoutElementType::Footer => "footer",
2077 LayoutElementType::FooterImage => "footer_image",
2078 LayoutElementType::Footnote => "footnote",
2079
2080 LayoutElementType::Seal => "seal",
2082 LayoutElementType::Number => "number",
2083 LayoutElementType::Reference => "reference",
2084 LayoutElementType::ReferenceContent => "reference_content",
2085 LayoutElementType::Algorithm => "algorithm",
2086 LayoutElementType::FormulaNumber => "formula_number",
2087 LayoutElementType::AsideText => "aside_text",
2088 LayoutElementType::List => "list",
2089
2090 LayoutElementType::Region => "region",
2092
2093 LayoutElementType::Other => "other",
2095 }
2096 }
2097
2098 pub fn from_label(label: &str) -> Self {
2103 match label.to_lowercase().as_str() {
2104 "doc_title" => LayoutElementType::DocTitle,
2106 "paragraph_title" | "title" => LayoutElementType::ParagraphTitle,
2107 "text" | "paragraph" => LayoutElementType::Text,
2108 "content" => LayoutElementType::Content,
2109 "abstract" => LayoutElementType::Abstract,
2110
2111 "image" | "figure" => LayoutElementType::Image,
2113 "table" => LayoutElementType::Table,
2114 "chart" | "flowchart" => LayoutElementType::Chart,
2115 "formula" | "equation" | "display_formula" | "inline_formula" => {
2116 LayoutElementType::Formula
2117 }
2118
2119 "figure_title" => LayoutElementType::FigureTitle,
2121 "table_title" => LayoutElementType::TableTitle,
2122 "chart_title" => LayoutElementType::ChartTitle,
2123 "figure_table_chart_title" | "caption" => LayoutElementType::FigureTableChartTitle,
2124
2125 "header" => LayoutElementType::Header,
2127 "header_image" => LayoutElementType::HeaderImage,
2128 "footer" => LayoutElementType::Footer,
2129 "footer_image" => LayoutElementType::FooterImage,
2130 "footnote" | "vision_footnote" => LayoutElementType::Footnote,
2131
2132 "seal" => LayoutElementType::Seal,
2134 "number" => LayoutElementType::Number,
2135 "reference" => LayoutElementType::Reference,
2136 "reference_content" => LayoutElementType::ReferenceContent,
2137 "algorithm" => LayoutElementType::Algorithm,
2138 "formula_number" => LayoutElementType::FormulaNumber,
2139 "aside_text" => LayoutElementType::AsideText,
2140 "list" => LayoutElementType::List,
2141 "vertical_text" => LayoutElementType::Text,
2142
2143 "region" => LayoutElementType::Region,
2145
2146 _ => LayoutElementType::Other,
2149 }
2150 }
2151
2152 pub fn semantic_category(&self) -> &'static str {
2171 match self {
2172 LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle => "title",
2174
2175 LayoutElementType::Text | LayoutElementType::Content | LayoutElementType::Abstract => {
2177 "text"
2178 }
2179
2180 LayoutElementType::Image | LayoutElementType::Chart => "visual",
2182
2183 LayoutElementType::Table => "table",
2185
2186 LayoutElementType::FigureTitle
2188 | LayoutElementType::TableTitle
2189 | LayoutElementType::ChartTitle
2190 | LayoutElementType::FigureTableChartTitle => "caption",
2191
2192 LayoutElementType::Header | LayoutElementType::HeaderImage => "header",
2194
2195 LayoutElementType::Footer
2197 | LayoutElementType::FooterImage
2198 | LayoutElementType::Footnote => "footer",
2199
2200 LayoutElementType::Formula | LayoutElementType::FormulaNumber => "formula",
2202
2203 LayoutElementType::Seal
2205 | LayoutElementType::Number
2206 | LayoutElementType::Reference
2207 | LayoutElementType::ReferenceContent
2208 | LayoutElementType::Algorithm
2209 | LayoutElementType::AsideText => "special",
2210
2211 LayoutElementType::List => "list",
2213
2214 LayoutElementType::Region => "region",
2216
2217 LayoutElementType::Other => "other",
2219 }
2220 }
2221
2222 pub fn is_title(&self) -> bool {
2224 matches!(
2225 self,
2226 LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle
2227 )
2228 }
2229
2230 pub fn is_visual(&self) -> bool {
2232 matches!(self, LayoutElementType::Image | LayoutElementType::Chart)
2233 }
2234
2235 pub fn is_caption(&self) -> bool {
2237 matches!(
2238 self,
2239 LayoutElementType::FigureTitle
2240 | LayoutElementType::TableTitle
2241 | LayoutElementType::ChartTitle
2242 | LayoutElementType::FigureTableChartTitle
2243 )
2244 }
2245
2246 pub fn is_header(&self) -> bool {
2248 matches!(
2249 self,
2250 LayoutElementType::Header | LayoutElementType::HeaderImage
2251 )
2252 }
2253
2254 pub fn is_footer(&self) -> bool {
2256 matches!(
2257 self,
2258 LayoutElementType::Footer
2259 | LayoutElementType::FooterImage
2260 | LayoutElementType::Footnote
2261 )
2262 }
2263
2264 pub fn is_formula(&self) -> bool {
2266 matches!(
2267 self,
2268 LayoutElementType::Formula | LayoutElementType::FormulaNumber
2269 )
2270 }
2271
2272 pub fn should_ocr(&self) -> bool {
2274 matches!(
2275 self,
2276 LayoutElementType::Text
2277 | LayoutElementType::Content
2278 | LayoutElementType::Abstract
2279 | LayoutElementType::DocTitle
2280 | LayoutElementType::ParagraphTitle
2281 | LayoutElementType::FigureTitle
2282 | LayoutElementType::TableTitle
2283 | LayoutElementType::ChartTitle
2284 | LayoutElementType::FigureTableChartTitle
2285 | LayoutElementType::Header
2286 | LayoutElementType::HeaderImage
2287 | LayoutElementType::Footer
2288 | LayoutElementType::FooterImage
2289 | LayoutElementType::Footnote
2290 | LayoutElementType::Reference
2291 | LayoutElementType::ReferenceContent
2292 | LayoutElementType::Algorithm
2293 | LayoutElementType::AsideText
2294 | LayoutElementType::List
2295 | LayoutElementType::Number
2296 )
2297 }
2298}
2299
2300pub fn remove_overlapping_layout_elements(
2305 layout_elements: &mut Vec<LayoutElement>,
2306 overlap_threshold: f32,
2307) -> usize {
2308 use std::collections::HashSet;
2309
2310 if layout_elements.len() <= 1 {
2311 return 0;
2312 }
2313
2314 let bboxes: Vec<_> = layout_elements.iter().map(|e| e.bbox.clone()).collect();
2315 let labels: Vec<&str> = layout_elements
2316 .iter()
2317 .map(|e| e.element_type.as_str())
2318 .collect();
2319
2320 let remove_indices =
2321 crate::processors::get_overlap_removal_indices(&bboxes, &labels, overlap_threshold);
2322 if remove_indices.is_empty() {
2323 return 0;
2324 }
2325
2326 let remove_set: HashSet<usize> = remove_indices.into_iter().collect();
2327 let before = layout_elements.len();
2328
2329 let mut idx = 0;
2330 layout_elements.retain(|_| {
2331 let keep = !remove_set.contains(&idx);
2332 idx += 1;
2333 keep
2334 });
2335
2336 before.saturating_sub(layout_elements.len())
2337}
2338
2339pub fn apply_standardized_layout_label_fixes(layout_elements: &mut [LayoutElement]) {
2343 if layout_elements.is_empty() {
2344 return;
2345 }
2346
2347 let mut footnote_indices: Vec<usize> = Vec::new();
2348 let mut paragraph_title_indices: Vec<usize> = Vec::new();
2349 let mut bottom_text_y_max: f32 = 0.0;
2350 let mut max_block_area: f32 = 0.0;
2351 let mut doc_title_num: usize = 0;
2352
2353 for (idx, elem) in layout_elements.iter().enumerate() {
2354 let area =
2355 (elem.bbox.x_max() - elem.bbox.x_min()) * (elem.bbox.y_max() - elem.bbox.y_min());
2356 max_block_area = max_block_area.max(area);
2357
2358 match elem.element_type {
2359 LayoutElementType::Footnote => footnote_indices.push(idx),
2360 LayoutElementType::ParagraphTitle => paragraph_title_indices.push(idx),
2361 LayoutElementType::Text => {
2362 bottom_text_y_max = bottom_text_y_max.max(elem.bbox.y_max());
2363 }
2364 LayoutElementType::DocTitle => doc_title_num += 1,
2365 _ => {}
2366 }
2367 }
2368
2369 for idx in footnote_indices {
2370 if layout_elements[idx].bbox.y_max() < bottom_text_y_max {
2371 layout_elements[idx].element_type = LayoutElementType::Text;
2372 layout_elements[idx].label = Some("text".to_string());
2373 }
2374 }
2375
2376 let only_one_paragraph_title = paragraph_title_indices.len() == 1 && doc_title_num == 0;
2377 if only_one_paragraph_title {
2378 let idx = paragraph_title_indices[0];
2379 let area = (layout_elements[idx].bbox.x_max() - layout_elements[idx].bbox.x_min())
2380 * (layout_elements[idx].bbox.y_max() - layout_elements[idx].bbox.y_min());
2381
2382 let title_area_ratio_threshold = 0.3f32;
2383 if area > max_block_area * title_area_ratio_threshold {
2384 layout_elements[idx].element_type = LayoutElementType::DocTitle;
2385 layout_elements[idx].label = Some("doc_title".to_string());
2386 }
2387 }
2388}
2389
/// Result for one table: its box, table type, optional confidences, cells and
/// optional HTML structure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableResult {
    /// Bounding box of the table.
    pub bbox: BoundingBox,
    /// Table type of this table.
    pub table_type: TableType,
    /// Confidence of the table-type classification, when available.
    pub classification_confidence: Option<f32>,
    /// Confidence of the structure recognition, when available.
    pub structure_confidence: Option<f32>,
    /// Cells of the table; empty until populated.
    pub cells: Vec<TableCell>,
    /// HTML rendering of the table structure, when available.
    pub html_structure: Option<String>,
    /// Optional per-cell texts; an entry may itself be `None`.
    /// NOTE(review): presumably index-aligned with `cells` — confirm against the producer.
    pub cell_texts: Option<Vec<Option<String>>>,
    /// Structure tokens, when available. Excluded from (de)serialization.
    #[serde(skip)]
    pub structure_tokens: Option<Vec<String>>,
    /// Detected cell bounding boxes, when available. Excluded from (de)serialization.
    #[serde(skip)]
    pub detected_cell_bboxes: Option<Vec<BoundingBox>>,
    /// End-to-end flag; `false` by default. Excluded from (de)serialization.
    /// NOTE(review): presumably marks results from an end-to-end table model — confirm.
    #[serde(skip)]
    pub is_e2e: bool,
}
2421
2422impl TableResult {
2423 pub fn new(bbox: BoundingBox, table_type: TableType) -> Self {
2425 Self {
2426 bbox,
2427 table_type,
2428 classification_confidence: None,
2429 structure_confidence: None,
2430 cells: Vec::new(),
2431 html_structure: None,
2432 cell_texts: None,
2433 structure_tokens: None,
2434 detected_cell_bboxes: None,
2435 is_e2e: false,
2436 }
2437 }
2438
2439 pub fn with_classification_confidence(mut self, confidence: f32) -> Self {
2441 self.classification_confidence = Some(confidence);
2442 self
2443 }
2444
2445 pub fn with_structure_confidence(mut self, confidence: f32) -> Self {
2447 self.structure_confidence = Some(confidence);
2448 self
2449 }
2450
2451 pub fn with_cells(mut self, cells: Vec<TableCell>) -> Self {
2453 self.cells = cells;
2454 self
2455 }
2456
2457 pub fn with_html_structure(mut self, html: impl Into<String>) -> Self {
2459 self.html_structure = Some(html.into());
2460 self
2461 }
2462
2463 pub fn with_cell_texts(mut self, texts: Vec<Option<String>>) -> Self {
2465 self.cell_texts = Some(texts);
2466 self
2467 }
2468
2469 pub fn with_structure_tokens(mut self, tokens: Vec<String>) -> Self {
2471 self.structure_tokens = Some(tokens);
2472 self
2473 }
2474
2475 pub fn with_detected_cell_bboxes(mut self, bboxes: Vec<BoundingBox>) -> Self {
2477 self.detected_cell_bboxes = Some(bboxes);
2478 self
2479 }
2480
2481 pub fn with_e2e(mut self, is_e2e: bool) -> Self {
2483 self.is_e2e = is_e2e;
2484 self
2485 }
2486
2487 pub fn confidence(&self) -> Option<f32> {
2499 match (self.classification_confidence, self.structure_confidence) {
2500 (Some(cls), Some(str)) => Some(cls.min(str)),
2501 (None, Some(str)) => Some(str),
2502 (Some(cls), None) => Some(cls),
2503 (None, None) => None,
2504 }
2505 }
2506
2507 pub fn has_structure(&self) -> bool {
2512 !self.cells.is_empty() || self.html_structure.is_some()
2513 }
2514}
2515
/// Table type recorded on a `TableResult`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TableType {
    /// NOTE(review): presumably a table with visible ruling lines — confirm.
    Wired,
    /// NOTE(review): presumably a table without ruling lines — confirm.
    Wireless,
    /// The table type is not known.
    Unknown,
}
2526
/// One cell of a table: its box and confidence, plus optional grid position,
/// span and text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableCell {
    /// Bounding box of the cell.
    pub bbox: BoundingBox,
    /// Row index, when known (set together with `col` by `with_position`).
    pub row: Option<usize>,
    /// Column index, when known (set together with `row` by `with_position`).
    pub col: Option<usize>,
    /// Row span, when known (set together with `col_span` by `with_span`).
    pub row_span: Option<usize>,
    /// Column span, when known (set together with `row_span` by `with_span`).
    pub col_span: Option<usize>,
    /// Confidence score supplied at construction.
    pub confidence: f32,
    /// Text content of the cell, when available.
    pub text: Option<String>,
}
2545
2546impl TableCell {
2547 pub fn new(bbox: BoundingBox, confidence: f32) -> Self {
2549 Self {
2550 bbox,
2551 row: None,
2552 col: None,
2553 row_span: None,
2554 col_span: None,
2555 confidence,
2556 text: None,
2557 }
2558 }
2559
2560 pub fn with_position(mut self, row: usize, col: usize) -> Self {
2562 self.row = Some(row);
2563 self.col = Some(col);
2564 self
2565 }
2566
2567 pub fn with_span(mut self, row_span: usize, col_span: usize) -> Self {
2569 self.row_span = Some(row_span);
2570 self.col_span = Some(col_span);
2571 self
2572 }
2573
2574 pub fn with_text(mut self, text: impl Into<String>) -> Self {
2576 self.text = Some(text.into());
2577 self
2578 }
2579}
2580
/// Result for one formula: its box, LaTeX string and confidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormulaResult {
    /// Bounding box of the formula.
    pub bbox: BoundingBox,
    /// LaTeX string for the formula.
    pub latex: String,
    /// Confidence score supplied at construction.
    pub confidence: f32,
}
2591
2592impl FormulaResult {
2593 pub fn new(bbox: BoundingBox, latex: impl Into<String>, confidence: f32) -> Self {
2595 Self {
2596 bbox,
2597 latex: latex.into(),
2598 confidence,
2599 }
2600 }
2601}
2602
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an element of `kind` over `bbox` with confidence 1.0 and the given text.
    fn element(kind: LayoutElementType, bbox: BoundingBox, text: &str) -> LayoutElement {
        LayoutElement::new(bbox, kind, 1.0).with_text(text)
    }

    /// Shorthand for a paragraph-title element.
    fn paragraph_title(bbox: BoundingBox, text: &str) -> LayoutElement {
        element(LayoutElementType::ParagraphTitle, bbox, text)
    }

    /// Wraps `elements` in a fresh result for `test.jpg` at index 0.
    fn page_with(elements: Vec<LayoutElement>) -> StructureResult {
        StructureResult::new("test.jpg", 0).with_layout_elements(elements)
    }

    #[test]
    fn test_structure_result_creation() {
        let fresh = StructureResult::new("test.jpg", 0);

        assert_eq!(fresh.input_path.as_ref(), "test.jpg");
        assert_eq!(fresh.index, 0);
        assert!(fresh.layout_elements.is_empty());
        assert!(fresh.tables.is_empty());
        assert!(fresh.formulas.is_empty());
        assert!(fresh.text_regions.is_none());
    }

    #[test]
    fn test_layout_element_type_as_str() {
        let expected = [
            (LayoutElementType::Text, "text"),
            (LayoutElementType::Table, "table"),
            (LayoutElementType::Formula, "formula"),
        ];

        for (kind, label) in expected.iter() {
            assert_eq!(kind.as_str(), *label);
        }
    }

    #[test]
    fn test_table_result_creation() {
        let table = TableResult::new(
            BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0),
            TableType::Wired,
        );

        assert_eq!(table.table_type, TableType::Wired);
        assert!(table.cells.is_empty());
        assert!(table.html_structure.is_none());
    }

    #[test]
    fn test_structure_result_export() {
        let bbox = BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0);
        let page = page_with(vec![
            element(LayoutElementType::DocTitle, bbox.clone(), "Test Document"),
            element(LayoutElementType::Text, bbox, "Hello world"),
        ]);

        let md = page.to_markdown();
        assert!(md.contains("# Test Document"));
        assert!(md.contains("Hello world"));

        let html = page.to_html();
        assert!(html.contains("<h1>Test Document</h1>"));
        assert!(html.contains("<p>Hello world</p>"));
    }

    #[test]
    fn test_format_title_with_level_keywords() {
        // Keyword titles come back at level 2 with their text unchanged.
        for keyword in ["Abstract", "References:"].iter() {
            let (level, text) = format_title_with_level(keyword, None);
            assert_eq!(level, 2);
            assert_eq!(text, *keyword);
        }
    }

    #[test]
    fn test_format_title_with_level_cluster_fallback() {
        let (level, text) = format_title_with_level("Unnumbered Heading", Some(4));

        assert_eq!(level, 4);
        assert_eq!(text, "Unnumbered Heading");
    }

    #[test]
    fn test_to_markdown_skips_footnote() {
        let page = page_with(vec![
            element(
                LayoutElementType::Text,
                BoundingBox::from_coords(0.0, 0.0, 100.0, 30.0),
                "Body",
            ),
            element(
                LayoutElementType::Footnote,
                BoundingBox::from_coords(0.0, 40.0, 100.0, 60.0),
                "Footnote text",
            ),
        ]);

        let md = page.to_markdown();
        assert!(md.contains("Body"));
        assert!(!md.contains("Footnote text"));
    }

    #[test]
    fn test_to_markdown_doc_title_joins_lines_with_space() {
        let page = page_with(vec![element(
            LayoutElementType::DocTitle,
            BoundingBox::from_coords(0.0, 0.0, 100.0, 20.0),
            "Main\nTitle",
        )]);

        assert!(page.to_markdown().contains("# Main Title"));
    }

    #[test]
    fn test_to_markdown_content_uses_soft_breaks() {
        let page = page_with(vec![element(
            LayoutElementType::Content,
            BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0),
            "1 Intro\n2 Method",
        )]);

        assert!(page.to_markdown().contains("1 Intro  \n2 Method"));
    }

    #[test]
    fn test_infer_paragraph_title_levels_by_height() {
        // Three titles of decreasing height.
        let titles = vec![
            paragraph_title(BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0), "Large"),
            paragraph_title(BoundingBox::from_coords(0.0, 50.0, 100.0, 74.0), "Medium"),
            paragraph_title(BoundingBox::from_coords(0.0, 80.0, 100.0, 98.0), "Small"),
        ];

        let levels = infer_paragraph_title_levels(&titles);
        let [l0, l1, l2] = [levels.get(&0), levels.get(&1), levels.get(&2)]
            .map(|level| level.copied().unwrap_or(2));
        assert!(l0 <= l1 && l1 <= l2);
    }

    #[test]
    fn test_infer_paragraph_title_levels_semantic_vote_wins_tie() {
        let titles = vec![
            paragraph_title(BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0), "1.1 Detail"),
            paragraph_title(BoundingBox::from_coords(0.0, 50.0, 100.0, 70.0), "2 Intro"),
        ];

        let levels = infer_paragraph_title_levels(&titles);
        assert_eq!(levels.get(&0).copied(), Some(3));
        assert_eq!(levels.get(&1).copied(), Some(2));
    }

    #[test]
    fn test_infer_paragraph_title_levels_uses_relative_indent_signal() {
        // Same height, second title shifted right by 40.
        let titles = vec![
            paragraph_title(BoundingBox::from_coords(0.0, 0.0, 100.0, 24.0), "Heading A"),
            paragraph_title(BoundingBox::from_coords(40.0, 40.0, 140.0, 64.0), "Heading B"),
        ];

        let levels = infer_paragraph_title_levels(&titles);
        let left_level = levels.get(&0).copied().unwrap_or(2);
        let indented_level = levels.get(&1).copied().unwrap_or(2);
        assert!(left_level < indented_level);
    }
}