use super::text_region::TextRegion;
use crate::processors::BoundingBox;
use image::RgbImage;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;
use std::sync::LazyLock;
/// Matches a heading that starts with a numbering prefix.
///
/// Capture groups:
/// 1. the numbering token (Arabic, parenthesized, Chinese or Roman numerals),
/// 2. the whitespace separating the numbering from the title,
/// 3. the remaining title text.
///
/// The pattern is compiled once, lazily; an invalid pattern is a programming
/// error and panics on first use.
static TITLE_NUMBERING_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?x)
        ^\s*
        (
            # Arabic numerals: 1, 1.2, 1.2.3, etc.
            [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?
            |
            # Parenthesized Arabic numerals, ASCII or full-width parentheses: (1), (1.2), etc.
            [(（][1-9][0-9]*(?:\.[1-9][0-9]*)*[)）]
            |
            # One or more Chinese numerals with optional punctuation: 一、 十二、
            [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[、.]?
            |
            # Parenthesized Chinese numerals, ASCII or full-width parentheses
            [(（][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[)）]
            |
            # Roman numerals with delimiter (period or followed by space)
            (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b)
        )
        (\s+)
        (.*)
        $
        ",
    )
    .unwrap_or_else(|e| panic!("Invalid title numbering regex: {e}"))
});
/// Derives a heading level and display text from the wording of a title.
///
/// Returns `Some((2, trimmed_title))` for the well-known section keywords
/// (ABSTRACT, INTRODUCTION, REFERENCES, REFERENCE; case-insensitive, trailing
/// colons ignored). Otherwise, when the title starts with a numbering prefix,
/// the level is `2 + number of dots in the prefix` (capped to 2..=6) and the
/// text is the prefix without trailing dots followed by the title.
/// Returns `None` when neither rule applies.
fn semantic_title_level_and_format(cleaned: &str) -> Option<(usize, String)> {
    let stripped = cleaned.trim();
    let upper = stripped.trim_end_matches(':').to_ascii_uppercase();
    let is_section_keyword =
        ["ABSTRACT", "INTRODUCTION", "REFERENCES", "REFERENCE"].contains(&upper.as_str());
    if is_section_keyword {
        return Some((2, stripped.to_string()));
    }

    let caps = TITLE_NUMBERING_REGEX.captures(cleaned)?;
    let prefix = caps.get(1).map_or("", |m| m.as_str().trim());
    let body = caps.get(3).map_or("", |m| m.as_str());

    // Dots are counted before the trailing dot is trimmed from the prefix.
    let depth = prefix.matches('.').count();
    let level = (depth + 2).clamp(2, 6);

    let bare_prefix = prefix.trim_end_matches('.');
    let formatted = if body.is_empty() {
        bare_prefix.to_string()
    } else {
        format!("{} {}", bare_prefix, body.trim_start())
    };
    Some((level, formatted))
}
/// Returns only the heading level that the title's wording implies, if any.
///
/// Line-break hyphenation (`-\n`) is removed and remaining newlines become
/// spaces before the title is analysed.
fn semantic_title_level(text: &str) -> Option<usize> {
    let flattened = text.replace("-\n", "").replace('\n', " ");
    let (level, _) = semantic_title_level_and_format(&flattened)?;
    Some(level)
}
/// Chooses the heading level and display text for a paragraph title.
///
/// The wording-based result wins when available; otherwise the flattened
/// title is returned with `clustered_level` (default 2) capped to 2..=6.
fn format_title_with_level(title: &str, clustered_level: Option<usize>) -> (usize, String) {
    let flattened = title.replace("-\n", "").replace('\n', " ");
    match semantic_title_level_and_format(&flattened) {
        Some(semantic) => semantic,
        None => (clustered_level.unwrap_or(2).clamp(2, 6), flattened),
    }
}
/// Votes a heading level for every `ParagraphTitle` element.
///
/// Three signals are combined per title: the level implied by its wording
/// (weight 2), a level clustered from the per-line height (weight 1, taller
/// ranks first) and a level clustered from the left edge (weight 1, smaller x
/// ranks first). The returned map is keyed by the element's index in
/// `elements`; it is empty when there are no paragraph titles.
fn infer_paragraph_title_levels(elements: &[LayoutElement]) -> HashMap<usize, usize> {
    let mut title_indices = Vec::new();
    for (position, element) in elements.iter().enumerate() {
        if element.element_type == LayoutElementType::ParagraphTitle {
            title_indices.push(position);
        }
    }
    if title_indices.is_empty() {
        return HashMap::new();
    }

    let mut height_samples: Vec<(usize, f32)> = Vec::with_capacity(title_indices.len());
    let mut indent_samples: Vec<(usize, f32)> = Vec::with_capacity(title_indices.len());
    let mut semantic_levels: HashMap<usize, usize> = HashMap::new();
    for &position in &title_indices {
        let element = &elements[position];

        // Approximate line height: box height divided by the line count.
        let box_height = (element.bbox.y_max() - element.bbox.y_min()).max(1.0);
        let line_count = element.num_lines.unwrap_or(1).max(1) as f32;
        let line_height = (box_height / line_count).max(1.0);
        if line_height.is_finite() {
            height_samples.push((position, line_height));
        }

        let left_edge = element.bbox.x_min();
        if left_edge.is_finite() {
            indent_samples.push((position, left_edge));
        }

        if let Some(level) = element.text.as_deref().and_then(semantic_title_level) {
            semantic_levels.insert(position, level);
        }
    }

    let font_levels = infer_levels_by_kmeans_feature(&height_samples, true);
    let relative_levels = infer_levels_by_kmeans_feature(&indent_samples, false);

    let mut voted = HashMap::with_capacity(title_indices.len());
    for position in title_indices {
        let semantic = semantic_levels.get(&position).copied();
        let by_font = font_levels.get(&position).copied();
        let by_indent = relative_levels.get(&position).copied();

        // Slot 0 is unused; levels 1..=6 index directly into the tally.
        let mut tally = [0u8; 7];
        for (candidate, weight) in [(semantic, 2u8), (by_font, 1), (by_indent, 1)] {
            if let Some(level) = candidate {
                tally[level.clamp(1, 6)] += weight;
            }
        }

        let mut winner = semantic.unwrap_or(2);
        let mut top_votes = 0u8;
        for (level, &votes) in tally.iter().enumerate().skip(1) {
            if votes > top_votes {
                top_votes = votes;
                winner = level;
            } else if votes == top_votes && votes > 0 {
                // Tie: the semantic level is preferred, then the smaller level.
                let candidate_is_semantic = semantic == Some(level);
                let winner_is_semantic = semantic == Some(winner);
                let prefer_candidate = (candidate_is_semantic && !winner_is_semantic)
                    || (candidate_is_semantic == winner_is_semantic && level < winner);
                if prefer_candidate {
                    winner = level;
                }
            }
        }
        if top_votes == 0 {
            winner = semantic.or(by_font).or(by_indent).unwrap_or(2);
        }
        voted.insert(position, winner.clamp(1, 6));
    }
    voted
}
/// Clusters a one-dimensional feature with k-means and maps clusters to
/// heading levels.
///
/// * `samples` - `(element index, feature value)` pairs; non-finite values
///   are ignored.
/// * `descending` - when `true` the cluster with the largest centroid gets
///   level 2; otherwise the cluster with the smallest centroid does.
///
/// Up to four clusters are used (bounded by the number of distinct values,
/// compared with a 1e-3 tolerance) and 16 refinement iterations are run.
/// Levels start at 2 and are capped at 6. Only clusters that actually own at
/// least one sample are ranked, so the returned levels are contiguous.
///
/// Returns an empty map when fewer than two finite samples exist or when all
/// values are (nearly) identical.
fn infer_levels_by_kmeans_feature(
    samples: &[(usize, f32)],
    descending: bool,
) -> HashMap<usize, usize> {
    // Index of the centroid closest to `value`; ties go to the lowest index.
    fn nearest_centroid(centroids: &[f32], value: f32) -> usize {
        let mut best_idx = 0usize;
        let mut best_dist = f32::INFINITY;
        for (idx, c) in centroids.iter().enumerate() {
            let dist = (value - c).abs();
            if dist < best_dist {
                best_dist = dist;
                best_idx = idx;
            }
        }
        best_idx
    }

    let clean_samples: Vec<(usize, f32)> = samples
        .iter()
        .copied()
        .filter(|(_, v)| v.is_finite())
        .collect();
    if clean_samples.len() < 2 {
        return HashMap::new();
    }

    let mut values: Vec<f32> = clean_samples.iter().map(|(_, v)| *v).collect();
    values.sort_by(|a, b| a.total_cmp(b));
    let unique_count = values
        .windows(2)
        .filter(|w| (w[1] - w[0]).abs() > 1e-3)
        .count()
        + 1;
    let k = unique_count.clamp(1, 4).min(clean_samples.len());
    if k <= 1 {
        return HashMap::new();
    }

    // Seed the centroids from evenly spaced quantiles of the sorted values.
    let mut centroids: Vec<f32> = (0..k)
        .map(|i| {
            let pos = ((i as f32 + 0.5) / k as f32 * values.len() as f32).floor() as usize;
            values[pos.min(values.len() - 1)]
        })
        .collect();

    for _ in 0..16 {
        let mut sums = vec![0.0f32; k];
        let mut counts = vec![0usize; k];
        for &(_, value) in &clean_samples {
            let cluster = nearest_centroid(&centroids, value);
            sums[cluster] += value;
            counts[cluster] += 1;
        }
        // A cluster that received no samples keeps its previous centroid.
        for ((centroid, sum), count) in centroids.iter_mut().zip(&sums).zip(&counts) {
            if *count > 0 {
                *centroid = sum / *count as f32;
            }
        }
    }

    // Final assignment of every sample to its cluster.
    let assignments: Vec<(usize, usize)> = clean_samples
        .iter()
        .map(|&(element_idx, value)| (element_idx, nearest_centroid(&centroids, value)))
        .collect();
    let mut occupied = vec![false; k];
    for &(_, cluster) in &assignments {
        occupied[cluster] = true;
    }

    // Rank only occupied clusters: an empty cluster (possible when the seed
    // quantiles coincide) must not consume a rank, otherwise the resulting
    // heading levels would contain gaps.
    let mut centroid_order: Vec<(usize, f32)> = centroids
        .iter()
        .copied()
        .enumerate()
        .filter(|&(cluster, _)| occupied[cluster])
        .collect();
    if descending {
        centroid_order.sort_by(|a, b| b.1.total_cmp(&a.1));
    } else {
        centroid_order.sort_by(|a, b| a.1.total_cmp(&b.1));
    }
    let mut rank_of_cluster = vec![0usize; k];
    for (rank, (cluster, _)) in centroid_order.into_iter().enumerate() {
        rank_of_cluster[cluster] = rank;
    }

    assignments
        .into_iter()
        .map(|(element_idx, cluster)| (element_idx, (rank_of_cluster[cluster] + 2).clamp(2, 6)))
        .collect()
}
/// A region of the page that references a set of layout elements by index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegionBlock {
    /// Bounding box of the region.
    pub bbox: BoundingBox,
    /// Confidence score attached to the region
    /// (presumably the detector's score — confirm with the producer).
    pub confidence: f32,
    /// Optional ordering index (presumably reading order — TODO confirm).
    pub order_index: Option<u32>,
    /// Indices of the member elements (presumably into
    /// `StructureResult::layout_elements` — TODO confirm).
    pub element_indices: Vec<usize>,
}
/// Flags describing how a page's text relates to the neighbouring pages.
///
/// `concatenate_markdown_pages` joins two pages without a paragraph break
/// when the earlier page has `paragraph_end == false` and the later page has
/// `paragraph_start == false`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContinuationFlags {
    /// `true` when the first text element of the page starts a new paragraph.
    pub paragraph_start: bool,
    /// `true` when the last text element of the page ends its paragraph.
    pub paragraph_end: bool,
}
impl PageContinuationFlags {
pub fn new(paragraph_start: bool, paragraph_end: bool) -> Self {
Self {
paragraph_start,
paragraph_end,
}
}
pub fn as_tuple(&self) -> (bool, bool) {
(self.paragraph_start, self.paragraph_end)
}
}
/// Structure-analysis result for a single input (one image or document page).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructureResult {
    /// Path of the analysed input. `save_results` treats a trailing
    /// `#<suffix>` as a page marker when deriving output file names.
    pub input_path: Arc<str>,
    /// Index of this result (presumably the position within a batch or
    /// document — TODO confirm with the producer).
    pub index: usize,
    /// Layout elements, in the order in which they are rendered by
    /// `to_markdown` and `to_html`.
    pub layout_elements: Vec<LayoutElement>,
    /// Recognised tables; matched to `Table` layout elements by bounding-box IoU.
    pub tables: Vec<TableResult>,
    /// Recognised formulas.
    pub formulas: Vec<FormulaResult>,
    /// Optional text regions (presumably raw OCR output — TODO confirm).
    pub text_regions: Option<Vec<TextRegion>>,
    /// Optional orientation angle (units not visible here — TODO confirm).
    pub orientation_angle: Option<f32>,
    /// Optional region blocks grouping layout elements.
    pub region_blocks: Option<Vec<RegionBlock>>,
    /// Optional rectified image; never serialized. When present, its width is
    /// used as the page width by `to_markdown` and `calculate_continuation_flags`.
    #[serde(skip)]
    pub rectified_img: Option<Arc<RgbImage>>,
    /// Optional precomputed continuation flags; when absent,
    /// `concatenate_markdown_pages` computes them on demand.
    pub page_continuation_flags: Option<PageContinuationFlags>,
}
impl StructureResult {
pub fn new(input_path: impl Into<Arc<str>>, index: usize) -> Self {
Self {
input_path: input_path.into(),
index,
layout_elements: Vec::new(),
tables: Vec::new(),
formulas: Vec::new(),
text_regions: None,
orientation_angle: None,
region_blocks: None,
rectified_img: None,
page_continuation_flags: None,
}
}
pub fn with_layout_elements(mut self, elements: Vec<LayoutElement>) -> Self {
self.layout_elements = elements;
self
}
pub fn with_tables(mut self, tables: Vec<TableResult>) -> Self {
self.tables = tables;
self
}
pub fn with_formulas(mut self, formulas: Vec<FormulaResult>) -> Self {
self.formulas = formulas;
self
}
pub fn with_text_regions(mut self, regions: Vec<TextRegion>) -> Self {
self.text_regions = Some(regions);
self
}
pub fn with_region_blocks(mut self, blocks: Vec<RegionBlock>) -> Self {
self.region_blocks = Some(blocks);
self
}
pub fn with_page_continuation_flags(mut self, flags: PageContinuationFlags) -> Self {
self.page_continuation_flags = Some(flags);
self
}
/// Renders the layout elements of this page as Markdown.
///
/// Elements are emitted in their stored order. Page furniture (numbers,
/// footnotes, headers, footers, aside text) is skipped, as are low-confidence
/// `Text` elements that overlap a table. Titles become `#` headings (the
/// level of paragraph titles comes from `infer_paragraph_title_levels` and
/// the title wording), tables and images become centered HTML snippets,
/// formulas become `$...$` / `$$...$$`, and a `Text` element that continues
/// the previous text element is appended without a paragraph break.
///
/// The returned string is trimmed of leading and trailing whitespace.
pub fn to_markdown(&self) -> String {
    // Removes one matching pair of `$$` or `$` delimiters, if present.
    // Uses prefix/suffix stripping so that degenerate inputs such as "$" or
    // "$$" cannot produce an inverted slice range.
    fn strip_math_delimiters(raw: &str) -> &str {
        raw.strip_prefix("$$")
            .and_then(|inner| inner.strip_suffix("$$"))
            .or_else(|| {
                raw.strip_prefix('$')
                    .and_then(|inner| inner.strip_suffix('$'))
            })
            .unwrap_or(raw)
    }

    let table_bboxes: Vec<&BoundingBox> = self
        .layout_elements
        .iter()
        .filter(|e| e.element_type == LayoutElementType::Table)
        .map(|e| &e.bbox)
        .collect();
    // Page width: the rectified image width when available, otherwise the
    // right-most element edge, otherwise 1.0.
    let original_image_width = self
        .rectified_img
        .as_ref()
        .map(|img| img.width() as f32)
        .or_else(|| {
            self.layout_elements
                .iter()
                .map(|e| e.bbox.x_max())
                .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x))))
        })
        .unwrap_or(1.0);
    let mut md = String::new();
    let elements = &self.layout_elements;
    let paragraph_title_levels = infer_paragraph_title_levels(elements);
    let mut prev_text_element: Option<&LayoutElement> = None;
    for (idx, element) in elements.iter().enumerate() {
        if matches!(
            element.element_type,
            LayoutElementType::Number
                | LayoutElementType::Footnote
                | LayoutElementType::Header
                | LayoutElementType::HeaderImage
                | LayoutElementType::Footer
                | LayoutElementType::FooterImage
                | LayoutElementType::AsideText
        ) {
            continue;
        }
        if element.element_type == LayoutElementType::Text {
            let overlaps_table = table_bboxes
                .iter()
                .any(|table_bbox| element.bbox.ioa(table_bbox) > 0.3);
            if overlaps_table && element.confidence < 0.7 {
                continue;
            }
        }
        let seg_start_flag = get_seg_flag(element, prev_text_element);
        let is_continuation = element.element_type == LayoutElementType::Text
            && prev_text_element.is_some()
            && !seg_start_flag;
        match element.element_type {
            LayoutElementType::DocTitle => {
                if !md.is_empty() {
                    md.push_str("\n\n");
                }
                if let Some(text) = &element.text {
                    let cleaned = clean_ocr_text(text);
                    let keyword = cleaned.trim().trim_end_matches(':').to_ascii_uppercase();
                    // Section keywords mislabeled as document titles are demoted.
                    if matches!(
                        keyword.as_str(),
                        "ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
                    ) {
                        md.push_str("## ");
                    } else {
                        md.push_str("# ");
                    }
                    md.push_str(&cleaned);
                }
            }
            LayoutElementType::ParagraphTitle => {
                if !md.is_empty() {
                    md.push_str("\n\n");
                }
                if let Some(text) = &element.text {
                    let cleaned = clean_ocr_text(text);
                    let clustered = paragraph_title_levels.get(&idx).copied();
                    let (level, formatted_title) = format_title_with_level(&cleaned, clustered);
                    md.push_str(&"#".repeat(level));
                    md.push(' ');
                    md.push_str(&formatted_title);
                } else {
                    md.push_str("## ");
                }
            }
            LayoutElementType::Table => {
                if !md.is_empty() {
                    md.push_str("\n\n");
                }
                if let Some(table) = self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5)
                {
                    if let Some(html) = &table.html_structure {
                        let simplified = simplify_table_html(html);
                        let table_with_border =
                            simplified.replacen("<table>", "<table border=\"1\">", 1);
                        let cleaned = clean_ocr_text(&table_with_border);
                        md.push_str("<div style=\"text-align: center;\">");
                        md.push_str(&cleaned);
                        md.push_str("</div>");
                    } else {
                        md.push_str("[Table]");
                    }
                } else {
                    md.push_str("[Table]");
                }
            }
            LayoutElementType::FormulaNumber => {
                continue;
            }
            LayoutElementType::Formula => {
                let raw_content = element.text.as_deref().map(|s| s.trim()).unwrap_or("");
                if raw_content.is_empty() {
                    continue;
                }
                let latex_content = strip_math_delimiters(raw_content);
                // A formula is inline only when the nearest non-formula
                // neighbours on both sides are text on the same line.
                let is_inline = {
                    let has_prev_text = (0..idx)
                        .rev()
                        .find(|&i| {
                            let t = elements[i].element_type;
                            !t.is_formula() && t != LayoutElementType::FormulaNumber
                        })
                        .is_some_and(|i| {
                            let prev = &elements[i];
                            (prev.element_type == LayoutElementType::Text
                                || prev.element_type == LayoutElementType::ReferenceContent)
                                && is_same_line(&element.bbox, &prev.bbox)
                        });
                    let has_next_text = ((idx + 1)..elements.len())
                        .find(|&i| {
                            let t = elements[i].element_type;
                            !t.is_formula() && t != LayoutElementType::FormulaNumber
                        })
                        .is_some_and(|i| {
                            let next = &elements[i];
                            (next.element_type == LayoutElementType::Text
                                || next.element_type == LayoutElementType::ReferenceContent)
                                && is_same_line(&element.bbox, &next.bbox)
                        });
                    has_prev_text && has_next_text
                };
                if is_inline {
                    md.push('$');
                    md.push_str(latex_content);
                    md.push_str("$ ");
                } else {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    md.push_str("$$");
                    md.push_str(latex_content);
                    md.push_str("$$");
                }
            }
            LayoutElementType::Image | LayoutElementType::Chart => {
                if !md.is_empty() {
                    md.push_str("\n\n");
                }
                md.push_str("<div style=\"text-align: center;\"><img src=\"");
                let img_name = format!(
                    "imgs/img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg",
                    if element.element_type == LayoutElementType::Chart {
                        "chart"
                    } else {
                        "image"
                    },
                    element.bbox.x_min(),
                    element.bbox.y_min(),
                    element.bbox.x_max(),
                    element.bbox.y_max()
                );
                md.push_str(&img_name);
                md.push_str("\" alt=\"Image\" width=\"");
                // Width is expressed as a percentage of the page width.
                let image_width = element.bbox.x_max() - element.bbox.x_min();
                let width_pct = (image_width / original_image_width * 100.0) as u32;
                let width_pct = width_pct.clamp(1, 100);
                md.push_str(&format!("{}%", width_pct));
                md.push_str("\" /></div>");
            }
            LayoutElementType::Seal => {
                if !md.is_empty() {
                    md.push_str("\n\n");
                }
                md.push_str("![Seal]");
                if let Some(text) = &element.text {
                    md.push_str("\n> ");
                    md.push_str(text);
                }
            }
            _ if element.element_type.is_caption() => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    let cleaned = clean_ocr_text(text);
                    md.push_str("<div style=\"text-align: center;\">");
                    md.push_str(&cleaned);
                    md.push_str(" </div>");
                }
            }
            LayoutElementType::Abstract => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    let formatted = format_first_line(text, " ", &["abstract", "摘要"], "## ");
                    md.push_str(&formatted);
                }
            }
            LayoutElementType::Reference => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    let formatted =
                        format_first_line(text, "\n", &["references", "参考文献"], "## ");
                    md.push_str(&formatted);
                }
            }
            LayoutElementType::Content => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    let formatted = format_content_block(text);
                    md.push_str(&formatted);
                }
            }
            // NOTE(review): footnotes are skipped by the filter at the top of
            // the loop, so this arm is currently unreachable. It is kept so
            // that footnote rendering can be re-enabled by relaxing the filter.
            LayoutElementType::Footnote => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    let formatted = format_vision_footnote_block(text);
                    md.push_str(&formatted);
                }
            }
            LayoutElementType::List => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    let cleaned = format_text_block(text);
                    for line in cleaned.lines() {
                        let line = line.trim();
                        if !line.is_empty() {
                            md.push_str("- ");
                            md.push_str(line);
                            md.push('\n');
                        }
                    }
                }
            }
            LayoutElementType::Algorithm => {
                if let Some(text) = &element.text {
                    if !md.is_empty() {
                        md.push_str("\n\n");
                    }
                    md.push_str(text.trim_matches('\n'));
                }
            }
            _ if element.element_type.is_header() || element.element_type.is_footer() => {
                continue;
            }
            _ => {
                if let Some(text) = &element.text {
                    let cleaned = clean_ocr_text(text);
                    if has_bullet_markers(&cleaned) {
                        if !md.is_empty() {
                            md.push_str("\n\n");
                        }
                        format_as_bullet_list(&cleaned, &mut md);
                    } else if is_continuation {
                        // Continues the previous paragraph: no separator.
                        let formatted = format_text_block(text);
                        md.push_str(&formatted);
                    } else {
                        if !md.is_empty() {
                            md.push_str("\n\n");
                        }
                        let formatted = format_text_block(text);
                        md.push_str(&formatted);
                    }
                }
            }
        }
        if element.element_type == LayoutElementType::Text
            || element.element_type == LayoutElementType::ReferenceContent
        {
            prev_text_element = Some(element);
        }
    }
    md.trim().to_string()
}
/// Estimates whether this page starts and ends on paragraph boundaries.
///
/// Only text-like elements (text, titles, abstract, reference) are
/// considered. The page width is the rectified image width when available,
/// otherwise the right-most element edge. Pages without any text-like
/// element report `(true, true)`.
pub fn calculate_continuation_flags(&self) -> PageContinuationFlags {
    let elements = &self.layout_elements;
    if elements.is_empty() {
        return PageContinuationFlags::new(true, true);
    }

    let page_width: Option<f32> = match self.rectified_img.as_ref() {
        Some(img) => Some(img.width() as f32),
        None => elements.iter().map(|e| e.bbox.x_max()).reduce(f32::max),
    };

    let mut textual = elements.iter().filter(|e| {
        matches!(
            e.element_type,
            LayoutElementType::Text
                | LayoutElementType::DocTitle
                | LayoutElementType::ParagraphTitle
                | LayoutElementType::Abstract
                | LayoutElementType::Reference
        )
    });
    let Some(first) = textual.next() else {
        return PageContinuationFlags::new(true, true);
    };
    // With a single text-like element, it is both the first and the last.
    let last = textual.last().unwrap_or(first);

    PageContinuationFlags::new(
        is_new_paragraph_start(first, page_width),
        is_paragraph_complete(last, page_width),
    )
}
/// Renders the layout elements as a standalone HTML document.
///
/// Element text is HTML-escaped; recognised table markup is inserted as-is
/// (after `simplify_table_html`) with a border style added to the first
/// `<table>` tag.
pub fn to_html(&self) -> String {
    // Writes `open`, the escaped text when present, then `close`.
    let wrap_always = |out: &mut String, open: &str, text: &Option<String>, close: &str| {
        out.push_str(open);
        if let Some(content) = text {
            out.push_str(&Self::escape_html(content));
        }
        out.push_str(close);
    };
    // Writes the whole wrapped fragment only when text is present.
    let wrap_if_present = |out: &mut String, open: &str, text: &Option<String>, close: &str| {
        if let Some(content) = text {
            out.push_str(open);
            out.push_str(&Self::escape_html(content));
            out.push_str(close);
        }
    };

    let mut html = String::new();
    html.push_str("<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n");

    for element in self.layout_elements.iter() {
        let text = &element.text;
        match element.element_type {
            LayoutElementType::DocTitle => wrap_always(&mut html, "<h1>", text, "</h1>\n"),
            LayoutElementType::ParagraphTitle => wrap_always(&mut html, "<h2>", text, "</h2>\n"),
            LayoutElementType::Table => {
                let matched = self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5);
                if let Some(table) = matched {
                    if let Some(table_html) = &table.html_structure {
                        let simplified = simplify_table_html(table_html);
                        let styled = simplified.replacen(
                            "<table>",
                            "<table border=\"1\" style=\"border-collapse: collapse;\">",
                            1,
                        );
                        html.push_str(&styled);
                        html.push('\n');
                    } else {
                        html.push_str("<p>[Table]</p>\n");
                    }
                } else {
                    html.push_str("<p>[Table]</p>\n");
                }
            }
            LayoutElementType::Formula | LayoutElementType::FormulaNumber => {
                wrap_always(&mut html, "<p class=\"formula\">$$", text, "$$</p>\n")
            }
            LayoutElementType::Image | LayoutElementType::Chart => {
                html.push_str("<figure>\n<img alt=\"Figure\" />\n");
                wrap_if_present(&mut html, "<figcaption>", text, "</figcaption>\n");
                html.push_str("</figure>\n");
            }
            LayoutElementType::Seal => {
                html.push_str("<figure class=\"seal\">\n<img alt=\"Seal\" />\n");
                wrap_if_present(&mut html, "<figcaption>", text, "</figcaption>\n");
                html.push_str("</figure>\n");
            }
            _ if element.element_type.is_caption() => {
                wrap_if_present(&mut html, "<figcaption>", text, "</figcaption>\n")
            }
            LayoutElementType::Abstract => wrap_always(
                &mut html,
                "<section class=\"abstract\">\n<h3>Abstract</h3>\n<p>",
                text,
                "</p>\n</section>\n",
            ),
            LayoutElementType::Reference | LayoutElementType::ReferenceContent => wrap_always(
                &mut html,
                "<section class=\"references\">\n<p>",
                text,
                "</p>\n</section>\n",
            ),
            LayoutElementType::List => {
                html.push_str("<ul>\n");
                if let Some(content) = text {
                    for item in content.lines() {
                        html.push_str("<li>");
                        html.push_str(&Self::escape_html(item));
                        html.push_str("</li>\n");
                    }
                }
                html.push_str("</ul>\n");
            }
            _ if element.element_type.is_header() => {
                wrap_always(&mut html, "<header>", text, "</header>\n")
            }
            _ if element.element_type.is_footer() => {
                wrap_always(&mut html, "<footer>", text, "</footer>\n")
            }
            _ => wrap_if_present(&mut html, "<p>", text, "</p>\n"),
        }
    }

    html.push_str("</body>\n</html>");
    html
}
/// Escapes the characters that are significant in HTML text and attribute
/// values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is processed in a single pass, so an ampersand that is produced
/// by an escape sequence is never escaped a second time.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Serializes this result into a `serde_json::Value`.
///
/// Fields marked `#[serde(skip)]` (the rectified image) are not included.
///
/// # Errors
///
/// Returns the error reported by `serde_json::to_value`.
pub fn to_json_value(&self) -> serde_json::Result<serde_json::Value> {
    serde_json::to_value(self)
}
/// Writes this result to `output_dir` as `<stem>.json` and/or `<stem>.html`.
///
/// The output directory is created when missing. The file stem is derived
/// from `input_path`: for a path of the form `<file>#<page>` it is
/// `<file stem>_<page>`, otherwise it is the plain file stem; `"result"` is
/// used when no stem can be determined.
///
/// # Errors
///
/// Returns any I/O error from creating the directory or writing the files,
/// and JSON serialization errors converted into `std::io::Error`.
pub fn save_results(
    &self,
    output_dir: impl AsRef<Path>,
    to_json: bool,
    to_html: bool,
) -> std::io::Result<()> {
    let output_dir = output_dir.as_ref();
    // `create_dir_all` already succeeds when the directory exists, so no
    // separate existence check is needed.
    std::fs::create_dir_all(output_dir)?;

    let raw_path: &str = &self.input_path;
    let stem_of = |path: &str| -> String {
        Path::new(path)
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("result")
            .to_string()
    };
    // The text after the last '#' is treated as a page marker.
    let stem = match raw_path.rsplit_once('#') {
        Some((base, page_num)) => format!("{}_{}", stem_of(base), page_num),
        None => stem_of(raw_path),
    };

    if to_json {
        let json_path = output_dir.join(format!("{}.json", stem));
        // The serializer issues many small writes; buffer them and flush
        // explicitly so that write errors are reported instead of being
        // swallowed when the writer is dropped.
        let mut writer = std::io::BufWriter::new(std::fs::File::create(json_path)?);
        serde_json::to_writer_pretty(&mut writer, self)?;
        std::io::Write::flush(&mut writer)?;
    }
    if to_html {
        let html_path = output_dir.join(format!("{}.html", stem));
        std::fs::write(html_path, self.to_html())?;
    }
    Ok(())
}
}
/// Decides whether `current` starts a new text segment (`true`) or continues
/// the previous text element (`false`).
///
/// Without a previous element, the block starts a new segment only when its
/// first line is indented by at least the coordinate threshold. With a
/// previous element, the block is a continuation only when all of these hold:
/// the previous block's last line reaches the right edge of the shared
/// context, the current block starts flush left, the previous block has more
/// than one line, and the two blocks are horizontally close.
fn get_seg_flag(current: &LayoutElement, prev: Option<&LayoutElement>) -> bool {
    const COORD_THRESHOLD: f32 = 10.0;

    let cur_left = current.bbox.x_min();
    let cur_right = current.bbox.x_max();
    let seg_start = current.seg_start_x.unwrap_or(cur_left);

    let Some(prev) = prev else {
        let starts_flush = seg_start - cur_left < COORD_THRESHOLD;
        return !starts_flush;
    };

    let prev_left = prev.bbox.x_min();
    let prev_right = prev.bbox.x_max();
    let prev_seg_end = prev.seg_end_x.unwrap_or(prev_right);

    // Horizontally overlapping blocks share one context; otherwise the
    // context is the current block and the gap between the blocks matters.
    let overlaps = cur_left < prev_right && cur_right > prev_left;
    let (context_left, context_right, edge_distance) = if overlaps {
        (cur_left.min(prev_left), cur_right.max(prev_right), 0.0)
    } else {
        (cur_left, cur_right, (cur_left - prev_right).abs())
    };

    let prev_fills_line = (context_right - prev_seg_end).abs() < COORD_THRESHOLD;
    let starts_flush = seg_start - context_left < COORD_THRESHOLD;
    let prev_multiline = prev.num_lines.unwrap_or(1) > 1;
    let widest_block = (prev_right - prev_left).max(cur_right - cur_left);
    let blocks_close = edge_distance < widest_block;

    let continues_previous = prev_fills_line && starts_flush && prev_multiline && blocks_close;
    !continues_previous
}
/// Returns `true` when the element's left edge lies within the left margin:
/// 5% of the page width when known, otherwise 50.0.
fn is_new_paragraph_start(element: &LayoutElement, page_width: Option<f32>) -> bool {
    let threshold = match page_width {
        Some(width) => width * 0.05,
        None => 50.0,
    };
    element.bbox.x_min() <= threshold
}
/// Returns `true` when the element ends before the right 10% of the page,
/// or when the page width is unknown.
fn is_paragraph_complete(element: &LayoutElement, page_width: Option<f32>) -> bool {
    match page_width {
        Some(width) => element.bbox.x_max() <= width - width * 0.1,
        None => true,
    }
}
/// Joins the Markdown of several pages into one document.
///
/// Pages are separated by a blank line unless the previous page ended
/// mid-paragraph and the current page continues it; in that case the text is
/// appended directly, with a single space unless a Chinese character is on
/// either side of the seam. Pages that render to empty Markdown are skipped
/// but still update the "previous page ended" state.
pub fn concatenate_markdown_pages(results: &[StructureResult]) -> String {
    match results {
        [] => return String::new(),
        [only] => return only.to_markdown(),
        _ => {}
    }

    let mut combined = String::new();
    let mut previous_ended = true;
    for page in results {
        let flags = match &page.page_continuation_flags {
            Some(stored) => stored.clone(),
            None => page.calculate_continuation_flags(),
        };
        let page_md = page.to_markdown();

        if !page_md.trim().is_empty() {
            let continues_paragraph = !flags.paragraph_start && !previous_ended;
            if continues_paragraph {
                let tail_is_chinese = combined.chars().next_back().is_some_and(is_chinese_char);
                let head_is_chinese = page_md.chars().next().is_some_and(is_chinese_char);
                if !(tail_is_chinese || head_is_chinese) {
                    combined.push(' ');
                }
                combined.push_str(page_md.trim_start());
            } else {
                if !combined.is_empty() {
                    combined.push_str("\n\n");
                }
                combined.push_str(&page_md);
            }
        }
        previous_ended = flags.paragraph_end;
    }
    combined.trim().to_string()
}
/// Flattens OCR text onto one line: hyphen-newline pairs are removed and the
/// remaining newlines become spaces.
fn clean_ocr_text(text: &str) -> String {
    let rejoined: String = text.split("-\n").collect();
    rejoined
        .chars()
        .map(|ch| if ch == '\n' { ' ' } else { ch })
        .collect()
}
/// Turns the first non-blank piece of `text` into a heading when it matches
/// one of `templates`.
///
/// `text` is split on `spliter`. The first piece that is not blank after
/// trimming is compared (ASCII case-insensitively) against `templates`; on a
/// match it is replaced by `heading_prefix + trimmed piece + "\n"`. All other
/// pieces are kept verbatim and the pieces are re-joined with `spliter`.
fn format_first_line(
    text: &str,
    spliter: &str,
    templates: &[&str],
    heading_prefix: &str,
) -> String {
    let mut heading_checked = false;
    let pieces: Vec<String> = text
        .split(spliter)
        .map(|piece| {
            if heading_checked {
                return piece.to_string();
            }
            let candidate = piece.trim();
            if candidate.is_empty() {
                return piece.to_string();
            }
            // Only the first non-blank piece is ever considered.
            heading_checked = true;
            let is_heading = templates.iter().any(|t| candidate.eq_ignore_ascii_case(t));
            if is_heading {
                format!("{}{}\n", heading_prefix, candidate)
            } else {
                piece.to_string()
            }
        })
        .collect();
    pieces.join(spliter)
}
/// Normalises a text block for Markdown: hyphen-newline pairs are removed,
/// double newlines collapse to one, and every remaining newline becomes a
/// paragraph break (`\n\n`).
fn format_text_block(text: &str) -> String {
    let rejoined: String = text.split("-\n").collect();
    let collapsed = rejoined.split("\n\n").collect::<Vec<_>>().join("\n");
    collapsed.split('\n').collect::<Vec<_>>().join("\n\n")
}
/// Formats a content block so that every line break is preceded by trailing
/// whitespace: a hyphen-newline pair becomes two spaces plus a newline, any
/// other newline becomes one space plus a newline.
fn format_content_block(text: &str) -> String {
    let mut formatted = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '-' if chars.peek() == Some(&'\n') => {
                // Consume the newline that belongs to the hyphen.
                chars.next();
                formatted.push_str("  \n");
            }
            '\n' => formatted.push_str(" \n"),
            other => formatted.push(other),
        }
    }
    formatted
}
/// Normalises a footnote block: hyphen-newline pairs are removed, double
/// newlines collapse to one, and every remaining newline becomes a paragraph
/// break (`\n\n`).
fn format_vision_footnote_block(text: &str) -> String {
    let without_hyphen_breaks: String = text.split("-\n").collect();
    let single_spaced = without_hyphen_breaks
        .split("\n\n")
        .collect::<Vec<_>>()
        .join("\n");
    single_spaced.split('\n').collect::<Vec<_>>().join("\n\n")
}
/// Characters treated as bullet markers when detecting and splitting
/// bullet lists embedded in a text block.
const BULLET_MARKERS: &[char] = &['•', '●', '◦', '▪', '◆'];
/// Returns `true` when `text` contains at least one bullet marker character.
fn has_bullet_markers(text: &str) -> bool {
    text.chars().any(|ch| BULLET_MARKERS.contains(&ch))
}
/// Splits `text` at bullet markers and appends each non-blank item to `md`
/// as a `- item` line terminated by a newline.
fn format_as_bullet_list(text: &str, md: &mut String) {
    text.split(BULLET_MARKERS)
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
        .for_each(|entry| {
            md.push_str("- ");
            md.push_str(entry);
            md.push('\n');
        });
}
/// Returns `true` when `c` lies in one of the CJK ideograph ranges checked
/// here: U+4E00..=U+9FFF, U+3400..=U+4DBF and U+20000..=U+2EBEF (excluding
/// the gap U+2A6E0..=U+2A6FF).
fn is_chinese_char(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'
            | '\u{3400}'..='\u{4DBF}'
            | '\u{20000}'..='\u{2A6DF}'
            | '\u{2A700}'..='\u{2B73F}'
            | '\u{2B740}'..='\u{2B81F}'
            | '\u{2B820}'..='\u{2CEAF}'
            | '\u{2CEB0}'..='\u{2EBEF}'
    )
}
/// Returns `true` for ASCII lowercase letters (`a`..=`z`) only.
fn is_lowercase(c: char) -> bool {
    matches!(c, 'a'..='z')
}
/// Returns `true` for ASCII uppercase letters (`A`..=`Z`) only.
fn is_uppercase(c: char) -> bool {
    matches!(c, 'A'..='Z')
}
/// Returns `true` for ASCII decimal digits (`0`..=`9`) only.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
/// Removes line-break hyphenation artifacts.
///
/// A hyphen directly followed by a newline and an ASCII lowercase letter is
/// dropped together with the newline, re-joining the split word. Hyphens
/// whose surroundings (10 characters before, 5 after) look like a URL are
/// always kept, as are all other hyphens.
fn dehyphenate(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let total = chars.len();

    let near_url = |pos: usize| -> bool {
        let lo = pos.saturating_sub(10);
        let hi = (pos + 5).min(total);
        let window: String = chars[lo..hi].iter().collect();
        window.contains("http") || window.contains("www") || window.contains("://")
    };

    let mut joined = String::with_capacity(text.len());
    let mut pos = 0;
    while pos < total {
        let ch = chars[pos];
        if ch != '-' {
            joined.push(ch);
            pos += 1;
            continue;
        }
        if near_url(pos) {
            joined.push('-');
            pos += 1;
            continue;
        }
        let splits_word = chars.get(pos + 1) == Some(&'\n')
            && chars.get(pos + 2).is_some_and(|&next| is_lowercase(next));
        if splits_word {
            // Skip both the hyphen and the newline.
            pos += 2;
        } else {
            joined.push('-');
            pos += 1;
        }
    }
    joined
}
fn fix_merged_words(text: &str) -> String {
let mut result = String::with_capacity(text.len());
let chars: Vec<char> = text.chars().collect();
let mut i = 0;
while i < chars.len() {
let current = chars[i];
if i > 0 {
let prev = chars[i - 1];
if is_lowercase(prev) && is_lowercase(current) {
if i > 1 && chars[i - 2] == '\'' {
result.push(' ');
}
} else if is_lowercase(prev) && is_uppercase(current) {
if i + 1 < chars.len() && is_lowercase(chars[i + 1]) {
result.push(' ');
}
}
else if ((is_digit(prev) || prev == '%') && is_uppercase(current))
|| (is_letter(prev)
&& is_digit(current)
&& i + 1 < chars.len()
&& is_letter(chars[i + 1]))
{
result.push(' ');
}
}
result.push(current);
i += 1;
}
result
}
/// Returns `true` for ASCII letters of either case.
fn is_letter(c: char) -> bool {
    is_uppercase(c) || is_lowercase(c)
}
/// Removes the `<html>` and `<body>` wrapper tags (opening and closing) from
/// recognised table markup, leaving everything else untouched.
fn simplify_table_html(html: &str) -> String {
    const WRAPPER_TAGS: [&str; 4] = ["<html>", "</html>", "<body>", "</body>"];
    WRAPPER_TAGS
        .iter()
        .fold(html.to_string(), |markup, tag| markup.replace(tag, ""))
}
/// Cleans up OCR text: removes hyphenation artifacts, separates merged
/// words, collapses whitespace runs into single spaces and attaches closing
/// punctuation to the preceding word.
///
/// Leading whitespace is dropped. A space that directly precedes one of
/// `. , ! ? ; : ) ] }` is removed.
pub fn postprocess_text(text: &str) -> String {
    let text = dehyphenate(text);
    let text = fix_merged_words(&text);
    let mut result = String::with_capacity(text.len());
    // `true` while further whitespace must be suppressed.
    let mut in_space = false;
    for c in text.chars() {
        if c.is_whitespace() {
            if !in_space && !result.is_empty() {
                result.push(' ');
                in_space = true;
            }
            continue;
        }
        if c == '.' {
            if let Some(last) = result.chars().next_back() {
                if is_letter(last) || is_digit(last) {
                    result.push('.');
                    // NOTE(review): this suppresses whitespace after a period
                    // that follows a letter or digit ("a. b" becomes "a.b").
                    // Kept as-is; confirm that this is intended.
                    in_space = true;
                    continue;
                }
            }
        }
        if in_space && matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')' | ']' | '}') {
            // `in_space` can be set without a space having been pushed (see
            // the period branch above), so only remove an actual trailing
            // space; popping unconditionally would delete real characters
            // such as the period in "etc.)" or the dots of an ellipsis.
            if result.ends_with(' ') {
                result.pop();
            }
            result.push(c);
            continue;
        }
        result.push(c);
        in_space = false;
    }
    result
}
/// Drops repeated bold section headers.
///
/// A line counts as a section header when, after trimming, it has the form
/// `**name**` with a non-empty name. The first occurrence of each name is
/// kept and later occurrences are removed. All other lines are passed
/// through unchanged; lines are re-joined with `\n`.
fn deduplicate_sections(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    let mut seen_sections: std::collections::HashSet<&str> = std::collections::HashSet::new();
    for line in markdown.lines() {
        let section_name = line
            .trim()
            .strip_prefix("**")
            .and_then(|rest| rest.strip_suffix("**"))
            .filter(|name| !name.is_empty());
        if let Some(name) = section_name {
            // `insert` returns false when the name was already recorded.
            if !seen_sections.insert(name) {
                continue;
            }
        }
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}
/// Returns `true` when the two boxes overlap vertically by more than half of
/// the height of the shorter box. Boxes with non-positive height never match.
fn is_same_line(bbox1: &BoundingBox, bbox2: &BoundingBox) -> bool {
    let (top1, bottom1) = (bbox1.y_min(), bbox1.y_max());
    let (top2, bottom2) = (bbox2.y_min(), bbox2.y_max());

    let shared = (bottom1.min(bottom2) - top1.max(top2)).max(0.0);
    let shorter = (bottom1 - top1).min(bottom2 - top2);

    shorter > 0.0 && shared / shorter > 0.5
}
/// Removes display-formula blocks that contain no content.
///
/// A block is delimited by lines consisting solely of `$$`. Opening and
/// closing delimiters are paired, so a closing delimiter is never mistaken
/// for the start of a new block:
/// * when every line between the pair is blank, the whole block is dropped
///   together with one blank line that directly follows it;
/// * otherwise the block is copied through unchanged;
/// * an unpaired `$$` is kept only when some non-blank line follows it.
///
/// Lines are re-joined with `\n`.
fn filter_empty_formulas(markdown: &str) -> String {
    fn is_delimiter(line: &str) -> bool {
        line.trim() == "$$"
    }
    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }
    fn push_line(out: &mut String, line: &str) {
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(line);
    }

    let lines: Vec<&str> = markdown.lines().collect();
    let mut result = String::with_capacity(markdown.len());
    let mut i = 0;
    while i < lines.len() {
        let line = lines[i];
        if !is_delimiter(line) {
            push_line(&mut result, line);
            i += 1;
            continue;
        }

        let closing = lines[i + 1..]
            .iter()
            .position(|candidate| is_delimiter(candidate))
            .map(|offset| i + 1 + offset);
        match closing {
            Some(close) => {
                let body_is_blank = lines[i + 1..close].iter().all(|inner| is_blank(inner));
                if body_is_blank {
                    // Drop the empty block and one blank separator line after it.
                    i = close + 1;
                    if i < lines.len() && is_blank(lines[i]) {
                        i += 1;
                    }
                } else {
                    for kept in &lines[i..=close] {
                        push_line(&mut result, kept);
                    }
                    i = close + 1;
                }
            }
            None => {
                let has_content = lines[i + 1..].iter().any(|rest| !is_blank(rest));
                if has_content {
                    push_line(&mut result, line);
                }
                i += 1;
            }
        }
    }
    result
}
/// Post-processes a Markdown document line by line.
///
/// Empty formula blocks and repeated bold section headers are removed first.
/// Then:
/// * fenced code blocks and the `$$` delimiter lines are copied verbatim;
/// * inside a `$$` block every `$` is escaped as `\$`;
/// * lines that start with a Markdown marker (`#`, `*`, `>`, `|`, `-`, `+`)
///   are copied verbatim;
/// * every other line is cleaned with `postprocess_text`.
///
/// Every emitted line is terminated by `\n`.
pub fn postprocess_markdown(markdown: &str) -> String {
    let markdown = filter_empty_formulas(markdown);
    let markdown = deduplicate_sections(&markdown);
    let mut result = String::with_capacity(markdown.len());
    let mut in_code_block = false;
    let mut in_formula = false;
    for line in markdown.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with("```") {
            in_code_block = !in_code_block;
            result.push_str(line);
        } else if trimmed == "$$" {
            // Checked before the code-block state, as delimiter lines toggle
            // the formula state even inside a code fence.
            in_formula = !in_formula;
            result.push_str(line);
        } else if in_code_block {
            result.push_str(line);
        } else if in_formula {
            if line.contains('$') {
                result.push_str(&line.replace('$', "\\$"));
            } else {
                result.push_str(line);
            }
        } else if trimmed.starts_with(['#', '*', '>', '|', '-', '+']) {
            result.push_str(line);
        } else {
            result.push_str(&postprocess_text(line));
        }
        result.push('\n');
    }
    result
}
/// Multi-page helpers that operate on a slice of per-page results.
pub trait StructureResultExt {
    /// Renders all pages and joins them into a single Markdown document.
    fn to_concatenated_markdown(results: &[Self]) -> String
    where
        Self: Sized;
    /// Saves every page plus combined outputs under `output_dir`.
    ///
    /// `base_name` is the file stem of the combined outputs; `to_json`,
    /// `to_markdown` and `to_html` select which formats are written.
    ///
    /// # Errors
    ///
    /// Returns an `std::io::Error` when writing fails.
    fn save_multi_page_results(
        results: &[Self],
        output_dir: impl AsRef<std::path::Path>,
        base_name: &str,
        to_json: bool,
        to_markdown: bool,
        to_html: bool,
    ) -> std::io::Result<()>
    where
        Self: Sized;
}
impl StructureResultExt for StructureResult {
    /// Joins the Markdown of all pages via `concatenate_markdown_pages`.
    fn to_concatenated_markdown(results: &[Self]) -> String {
        concatenate_markdown_pages(results)
    }

    /// Saves each page into `output_dir/page_NNN/` (JSON and/or HTML) and
    /// writes the combined `<base_name>.md` / `<base_name>.json` files into
    /// `output_dir`.
    ///
    /// `to_html` only affects the per-page output; no combined HTML file is
    /// written.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from creating directories or writing files, and
    /// JSON serialization errors converted into `std::io::Error`.
    fn save_multi_page_results(
        results: &[Self],
        output_dir: impl AsRef<std::path::Path>,
        base_name: &str,
        to_json: bool,
        to_markdown: bool,
        to_html: bool,
    ) -> std::io::Result<()>
    where
        Self: Sized,
    {
        let output_dir = output_dir.as_ref();
        // `create_dir_all` already succeeds when the directory exists.
        std::fs::create_dir_all(output_dir)?;

        for (idx, result) in results.iter().enumerate() {
            let page_dir = output_dir.join(format!("page_{:03}", idx));
            std::fs::create_dir_all(&page_dir)?;
            result.save_results(&page_dir, to_json, to_html)?;
        }
        if to_markdown {
            let concat_md_path = output_dir.join(format!("{}.md", base_name));
            std::fs::write(concat_md_path, Self::to_concatenated_markdown(results))?;
        }
        if to_json {
            let concat_json_path = output_dir.join(format!("{}.json", base_name));
            // Buffer the serializer's many small writes and flush explicitly
            // so that write errors are not lost when the writer is dropped.
            let mut writer = std::io::BufWriter::new(std::fs::File::create(concat_json_path)?);
            serde_json::to_writer_pretty(&mut writer, &results)?;
            std::io::Write::flush(&mut writer)?;
        }
        Ok(())
    }
}
/// One layout region together with its optional metadata.
///
/// `new` fills in the three required fields and leaves every optional field
/// as `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutElement {
/// Bounding box of the element.
pub bbox: BoundingBox,
/// Kind of element; see `LayoutElementType`.
pub element_type: LayoutElementType,
/// Confidence value for this element.
/// NOTE(review): presumably the detector score; the valid range is not visible here.
pub confidence: f32,
/// Optional label string, set through `with_label`.
pub label: Option<String>,
/// Optional text content, set through `with_text`.
pub text: Option<String>,
/// Optional ordering index.
/// NOTE(review): presumably the reading-order position — confirm against the code that assigns it.
pub order_index: Option<u32>,
/// Optional start x-coordinate of a segment; left out of serialized output when `None`.
/// NOTE(review): exact meaning of "segment" is not visible here — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub seg_start_x: Option<f32>,
/// Optional end x-coordinate of a segment; left out of serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub seg_end_x: Option<f32>,
/// Optional line count; left out of serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub num_lines: Option<u32>,
}
impl LayoutElement {
    /// Builds an element from its box, kind and confidence.
    ///
    /// All optional metadata (`label`, `text`, `order_index`, `seg_start_x`,
    /// `seg_end_x`, `num_lines`) starts out as `None`.
    pub fn new(bbox: BoundingBox, element_type: LayoutElementType, confidence: f32) -> Self {
        Self {
            // Required fields supplied by the caller.
            bbox,
            element_type,
            confidence,
            // Optional metadata, filled in later.
            num_lines: None,
            seg_end_x: None,
            seg_start_x: None,
            order_index: None,
            text: None,
            label: None,
        }
    }

    /// Returns the element with `label` replaced by the given value.
    pub fn with_label(self, label: impl Into<String>) -> Self {
        Self {
            label: Some(label.into()),
            ..self
        }
    }

    /// Returns the element with `text` replaced by the given value.
    pub fn with_text(self, text: impl Into<String>) -> Self {
        Self {
            text: Some(text.into()),
            ..self
        }
    }
}
/// Kind of a layout element.
///
/// The label noted on each variant is the string returned by `as_str`;
/// `from_label` additionally accepts a few aliases and maps anything it does
/// not recognize to `Other`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LayoutElementType {
/// Label `doc_title`.
DocTitle,
/// Label `paragraph_title`.
ParagraphTitle,
/// Label `text`.
Text,
/// Label `content`.
Content,
/// Label `abstract`.
Abstract,
/// Label `image`.
Image,
/// Label `table`.
Table,
/// Label `chart`.
Chart,
/// Label `formula`.
Formula,
/// Label `figure_title`.
FigureTitle,
/// Label `table_title`.
TableTitle,
/// Label `chart_title`.
ChartTitle,
/// Label `figure_table_chart_title`.
FigureTableChartTitle,
/// Label `header`.
Header,
/// Label `header_image`.
HeaderImage,
/// Label `footer`.
Footer,
/// Label `footer_image`.
FooterImage,
/// Label `footnote`.
Footnote,
/// Label `seal`.
Seal,
/// Label `number`.
Number,
/// Label `reference`.
Reference,
/// Label `reference_content`.
ReferenceContent,
/// Label `algorithm`.
Algorithm,
/// Label `formula_number`.
FormulaNumber,
/// Label `aside_text`.
AsideText,
/// Label `list`.
List,
/// Label `region`.
Region,
/// Label `other`; also the fallback for labels `from_label` does not recognize.
Other,
}
impl LayoutElementType {
pub fn as_str(&self) -> &'static str {
match self {
LayoutElementType::DocTitle => "doc_title",
LayoutElementType::ParagraphTitle => "paragraph_title",
LayoutElementType::Text => "text",
LayoutElementType::Content => "content",
LayoutElementType::Abstract => "abstract",
LayoutElementType::Image => "image",
LayoutElementType::Table => "table",
LayoutElementType::Chart => "chart",
LayoutElementType::Formula => "formula",
LayoutElementType::FigureTitle => "figure_title",
LayoutElementType::TableTitle => "table_title",
LayoutElementType::ChartTitle => "chart_title",
LayoutElementType::FigureTableChartTitle => "figure_table_chart_title",
LayoutElementType::Header => "header",
LayoutElementType::HeaderImage => "header_image",
LayoutElementType::Footer => "footer",
LayoutElementType::FooterImage => "footer_image",
LayoutElementType::Footnote => "footnote",
LayoutElementType::Seal => "seal",
LayoutElementType::Number => "number",
LayoutElementType::Reference => "reference",
LayoutElementType::ReferenceContent => "reference_content",
LayoutElementType::Algorithm => "algorithm",
LayoutElementType::FormulaNumber => "formula_number",
LayoutElementType::AsideText => "aside_text",
LayoutElementType::List => "list",
LayoutElementType::Region => "region",
LayoutElementType::Other => "other",
}
}
pub fn from_label(label: &str) -> Self {
match label.to_lowercase().as_str() {
"doc_title" => LayoutElementType::DocTitle,
"paragraph_title" | "title" => LayoutElementType::ParagraphTitle,
"text" | "paragraph" => LayoutElementType::Text,
"content" => LayoutElementType::Content,
"abstract" => LayoutElementType::Abstract,
"image" | "figure" => LayoutElementType::Image,
"table" => LayoutElementType::Table,
"chart" | "flowchart" => LayoutElementType::Chart,
"formula" | "equation" | "display_formula" | "inline_formula" => {
LayoutElementType::Formula
}
"figure_title" => LayoutElementType::FigureTitle,
"table_title" => LayoutElementType::TableTitle,
"chart_title" => LayoutElementType::ChartTitle,
"figure_table_chart_title" | "caption" => LayoutElementType::FigureTableChartTitle,
"header" => LayoutElementType::Header,
"header_image" => LayoutElementType::HeaderImage,
"footer" => LayoutElementType::Footer,
"footer_image" => LayoutElementType::FooterImage,
"footnote" | "vision_footnote" => LayoutElementType::Footnote,
"seal" => LayoutElementType::Seal,
"number" => LayoutElementType::Number,
"reference" => LayoutElementType::Reference,
"reference_content" => LayoutElementType::ReferenceContent,
"algorithm" => LayoutElementType::Algorithm,
"formula_number" => LayoutElementType::FormulaNumber,
"aside_text" => LayoutElementType::AsideText,
"list" => LayoutElementType::List,
"vertical_text" => LayoutElementType::Text,
"region" => LayoutElementType::Region,
_ => LayoutElementType::Other,
}
}
pub fn semantic_category(&self) -> &'static str {
match self {
LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle => "title",
LayoutElementType::Text | LayoutElementType::Content | LayoutElementType::Abstract => {
"text"
}
LayoutElementType::Image | LayoutElementType::Chart => "visual",
LayoutElementType::Table => "table",
LayoutElementType::FigureTitle
| LayoutElementType::TableTitle
| LayoutElementType::ChartTitle
| LayoutElementType::FigureTableChartTitle => "caption",
LayoutElementType::Header | LayoutElementType::HeaderImage => "header",
LayoutElementType::Footer
| LayoutElementType::FooterImage
| LayoutElementType::Footnote => "footer",
LayoutElementType::Formula | LayoutElementType::FormulaNumber => "formula",
LayoutElementType::Seal
| LayoutElementType::Number
| LayoutElementType::Reference
| LayoutElementType::ReferenceContent
| LayoutElementType::Algorithm
| LayoutElementType::AsideText => "special",
LayoutElementType::List => "list",
LayoutElementType::Region => "region",
LayoutElementType::Other => "other",
}
}
pub fn is_title(&self) -> bool {
matches!(
self,
LayoutElementType::DocTitle | LayoutElementType::ParagraphTitle
)
}
pub fn is_visual(&self) -> bool {
matches!(self, LayoutElementType::Image | LayoutElementType::Chart)
}
pub fn is_caption(&self) -> bool {
matches!(
self,
LayoutElementType::FigureTitle
| LayoutElementType::TableTitle
| LayoutElementType::ChartTitle
| LayoutElementType::FigureTableChartTitle
)
}
pub fn is_header(&self) -> bool {
matches!(
self,
LayoutElementType::Header | LayoutElementType::HeaderImage
)
}
pub fn is_footer(&self) -> bool {
matches!(
self,
LayoutElementType::Footer
| LayoutElementType::FooterImage
| LayoutElementType::Footnote
)
}
pub fn is_formula(&self) -> bool {
matches!(
self,
LayoutElementType::Formula | LayoutElementType::FormulaNumber
)
}
pub fn should_ocr(&self) -> bool {
matches!(
self,
LayoutElementType::Text
| LayoutElementType::Content
| LayoutElementType::Abstract
| LayoutElementType::DocTitle
| LayoutElementType::ParagraphTitle
| LayoutElementType::FigureTitle
| LayoutElementType::TableTitle
| LayoutElementType::ChartTitle
| LayoutElementType::FigureTableChartTitle
| LayoutElementType::Header
| LayoutElementType::HeaderImage
| LayoutElementType::Footer
| LayoutElementType::FooterImage
| LayoutElementType::Footnote
| LayoutElementType::Reference
| LayoutElementType::ReferenceContent
| LayoutElementType::Algorithm
| LayoutElementType::AsideText
| LayoutElementType::List
| LayoutElementType::Number
)
}
}
/// Drops the layout elements that `get_overlap_removal_indices` marks for
/// removal and reports how many were dropped.
///
/// The helper receives every element's bounding box and type label together
/// with `overlap_threshold`. Relative order of the surviving elements is
/// preserved. With zero or one element nothing is done and `0` is returned.
pub fn remove_overlapping_layout_elements(
    layout_elements: &mut Vec<LayoutElement>,
    overlap_threshold: f32,
) -> usize {
    use std::collections::HashSet;

    let original_len = layout_elements.len();
    if original_len <= 1 {
        return 0;
    }

    let bboxes: Vec<_> = layout_elements
        .iter()
        .map(|element| element.bbox.clone())
        .collect();
    let labels: Vec<&str> = layout_elements
        .iter()
        .map(|element| element.element_type.as_str())
        .collect();

    let doomed: HashSet<usize> =
        crate::processors::get_overlap_removal_indices(&bboxes, &labels, overlap_threshold)
            .into_iter()
            .collect();
    if doomed.is_empty() {
        return 0;
    }

    // Rebuild the vector from the elements whose original position is not
    // marked for removal.
    let survivors: Vec<LayoutElement> = std::mem::take(layout_elements)
        .into_iter()
        .enumerate()
        .filter(|(position, _)| !doomed.contains(position))
        .map(|(_, element)| element)
        .collect();
    *layout_elements = survivors;

    original_len.saturating_sub(layout_elements.len())
}
/// Applies two label corrections to a set of layout elements.
///
/// 1. A `Footnote` whose bottom edge (`y_max`) lies above the lowest bottom
///    edge of any `Text` element is relabelled as `Text`.
/// 2. If there is exactly one `ParagraphTitle` and no `DocTitle`, and that
///    title's box area exceeds 30% of the largest box area among all
///    elements, it is promoted to `DocTitle`.
///
/// Both `element_type` and `label` are updated on a relabelled element.
/// An empty slice is left untouched.
pub fn apply_standardized_layout_label_fixes(layout_elements: &mut [LayoutElement]) {
    /// Area of an element's bounding box.
    fn bbox_area(element: &LayoutElement) -> f32 {
        let bbox = &element.bbox;
        (bbox.x_max() - bbox.x_min()) * (bbox.y_max() - bbox.y_min())
    }

    /// Minimum share of the largest box area a lone paragraph title must
    /// exceed to be promoted.
    const TITLE_AREA_RATIO_THRESHOLD: f32 = 0.3;

    if layout_elements.is_empty() {
        return;
    }

    // Gather all statistics before any label is changed.
    let largest_area = layout_elements
        .iter()
        .map(bbox_area)
        .fold(0.0_f32, f32::max);
    let lowest_text_bottom = layout_elements
        .iter()
        .filter(|element| element.element_type == LayoutElementType::Text)
        .map(|element| element.bbox.y_max())
        .fold(0.0_f32, f32::max);
    let has_doc_title = layout_elements
        .iter()
        .any(|element| element.element_type == LayoutElementType::DocTitle);
    let sole_paragraph_title = {
        let mut title_positions = layout_elements
            .iter()
            .enumerate()
            .filter(|(_, element)| element.element_type == LayoutElementType::ParagraphTitle)
            .map(|(position, _)| position);
        match (title_positions.next(), title_positions.next()) {
            (Some(position), None) => Some(position),
            _ => None,
        }
    };

    // Footnotes sitting above the bottom-most text are treated as body text.
    for element in layout_elements
        .iter_mut()
        .filter(|element| element.element_type == LayoutElementType::Footnote)
    {
        if element.bbox.y_max() < lowest_text_bottom {
            element.element_type = LayoutElementType::Text;
            element.label = Some("text".to_string());
        }
    }

    // A single, comparatively large paragraph title becomes the document title.
    if let (Some(position), false) = (sole_paragraph_title, has_doc_title) {
        let title = &mut layout_elements[position];
        if bbox_area(title) > largest_area * TITLE_AREA_RATIO_THRESHOLD {
            title.element_type = LayoutElementType::DocTitle;
            title.label = Some("doc_title".to_string());
        }
    }
}
/// Recognition result for a single table.
///
/// Fields marked `#[serde(skip)]` are neither serialized nor deserialized;
/// on deserialization they take their `Default` value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableResult {
/// Bounding box of the table.
pub bbox: BoundingBox,
/// Table type; see `TableType`.
pub table_type: TableType,
/// Optional confidence of the type classification; combined in `confidence()`.
pub classification_confidence: Option<f32>,
/// Optional confidence of the structure recognition; combined in `confidence()`.
pub structure_confidence: Option<f32>,
/// Cells of the table; empty until set through `with_cells`.
pub cells: Vec<TableCell>,
/// Optional HTML structure; its presence counts as structure in `has_structure()`.
pub html_structure: Option<String>,
/// Optional per-entry texts, each of which may itself be absent.
/// NOTE(review): presumably index-aligned with `cells` — confirm against the producer.
pub cell_texts: Option<Vec<Option<String>>>,
/// Optional structure tokens; skipped by serde.
#[serde(skip)]
pub structure_tokens: Option<Vec<String>>,
/// Optional detected cell bounding boxes; skipped by serde.
#[serde(skip)]
pub detected_cell_bboxes: Option<Vec<BoundingBox>>,
/// Flag set through `with_e2e`; skipped by serde.
/// NOTE(review): presumably marks results from an end-to-end model — confirm.
#[serde(skip)]
pub is_e2e: bool,
}
impl TableResult {
    /// Builds a result holding only the box and table type.
    ///
    /// Confidences and optional data start as `None`, `cells` starts empty
    /// and `is_e2e` starts as `false`.
    pub fn new(bbox: BoundingBox, table_type: TableType) -> Self {
        Self {
            bbox,
            table_type,
            is_e2e: false,
            cells: Vec::new(),
            detected_cell_bboxes: None,
            structure_tokens: None,
            cell_texts: None,
            html_structure: None,
            structure_confidence: None,
            classification_confidence: None,
        }
    }

    /// Returns the result with the classification confidence set.
    pub fn with_classification_confidence(self, confidence: f32) -> Self {
        Self {
            classification_confidence: Some(confidence),
            ..self
        }
    }

    /// Returns the result with the structure confidence set.
    pub fn with_structure_confidence(self, confidence: f32) -> Self {
        Self {
            structure_confidence: Some(confidence),
            ..self
        }
    }

    /// Returns the result with its cells replaced.
    pub fn with_cells(self, cells: Vec<TableCell>) -> Self {
        Self { cells, ..self }
    }

    /// Returns the result with the HTML structure set.
    pub fn with_html_structure(self, html: impl Into<String>) -> Self {
        Self {
            html_structure: Some(html.into()),
            ..self
        }
    }

    /// Returns the result with the cell texts set.
    pub fn with_cell_texts(self, texts: Vec<Option<String>>) -> Self {
        Self {
            cell_texts: Some(texts),
            ..self
        }
    }

    /// Returns the result with the structure tokens set.
    pub fn with_structure_tokens(self, tokens: Vec<String>) -> Self {
        Self {
            structure_tokens: Some(tokens),
            ..self
        }
    }

    /// Returns the result with the detected cell boxes set.
    pub fn with_detected_cell_bboxes(self, bboxes: Vec<BoundingBox>) -> Self {
        Self {
            detected_cell_bboxes: Some(bboxes),
            ..self
        }
    }

    /// Returns the result with the `is_e2e` flag set to the given value.
    pub fn with_e2e(self, is_e2e: bool) -> Self {
        Self { is_e2e, ..self }
    }

    /// Combined confidence of the result.
    ///
    /// When both the classification and structure confidences are present the
    /// smaller one is returned; when only one is present that one is
    /// returned; otherwise `None`.
    pub fn confidence(&self) -> Option<f32> {
        match (self.classification_confidence, self.structure_confidence) {
            (Some(classification), Some(structure)) => Some(classification.min(structure)),
            (classification, structure) => classification.or(structure),
        }
    }

    /// True when the result carries an HTML structure or at least one cell.
    pub fn has_structure(&self) -> bool {
        self.html_structure.is_some() || !self.cells.is_empty()
    }
}
/// Classification of a table.
///
/// NOTE(review): `Wired` / `Wireless` presumably distinguish tables with and
/// without visible ruling lines — confirm against the classifier that
/// produces these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TableType {
/// Table classified as wired.
Wired,
/// Table classified as wireless.
Wireless,
/// Table whose type is not known.
Unknown,
}
/// A single cell of a table.
///
/// `new` sets only `bbox` and `confidence`; position, span and text start as
/// `None` and are filled in through the `with_*` builders.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableCell {
/// Bounding box of the cell.
pub bbox: BoundingBox,
/// Optional row index, set together with `col` by `with_position`.
pub row: Option<usize>,
/// Optional column index, set together with `row` by `with_position`.
pub col: Option<usize>,
/// Optional row span, set together with `col_span` by `with_span`.
pub row_span: Option<usize>,
/// Optional column span, set together with `row_span` by `with_span`.
pub col_span: Option<usize>,
/// Confidence value for this cell.
pub confidence: f32,
/// Optional text content, set through `with_text`.
pub text: Option<String>,
}
impl TableCell {
    /// Builds a cell from its box and confidence.
    ///
    /// Position, span and text all start as `None`.
    pub fn new(bbox: BoundingBox, confidence: f32) -> Self {
        Self {
            bbox,
            confidence,
            text: None,
            col_span: None,
            row_span: None,
            col: None,
            row: None,
        }
    }

    /// Returns the cell with its row and column set.
    pub fn with_position(self, row: usize, col: usize) -> Self {
        Self {
            row: Some(row),
            col: Some(col),
            ..self
        }
    }

    /// Returns the cell with its row span and column span set.
    pub fn with_span(self, row_span: usize, col_span: usize) -> Self {
        Self {
            row_span: Some(row_span),
            col_span: Some(col_span),
            ..self
        }
    }

    /// Returns the cell with its text set.
    pub fn with_text(self, text: impl Into<String>) -> Self {
        Self {
            text: Some(text.into()),
            ..self
        }
    }
}
/// Recognition result for a single formula.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormulaResult {
/// Bounding box of the formula.
pub bbox: BoundingBox,
/// LaTeX string for the formula.
pub latex: String,
/// Confidence value for this formula.
pub confidence: f32,
}
impl FormulaResult {
    /// Builds a formula result from its box, LaTeX source and confidence.
    ///
    /// `latex` accepts anything convertible into a `String`.
    pub fn new(bbox: BoundingBox, latex: impl Into<String>, confidence: f32) -> Self {
        let latex: String = latex.into();
        Self {
            bbox,
            latex,
            confidence,
        }
    }
}
// Unit tests for result construction, Markdown/HTML export, title formatting
// and paragraph-title level inference.
#[cfg(test)]
mod tests {
use super::*;
// A freshly created result keeps its path and index and has no content yet.
#[test]
fn test_structure_result_creation() {
let result = StructureResult::new("test.jpg", 0);
assert_eq!(result.input_path.as_ref(), "test.jpg");
assert_eq!(result.index, 0);
assert!(result.layout_elements.is_empty());
assert!(result.tables.is_empty());
assert!(result.formulas.is_empty());
assert!(result.text_regions.is_none());
}
// `as_str` yields the canonical snake_case label.
#[test]
fn test_layout_element_type_as_str() {
assert_eq!(LayoutElementType::Text.as_str(), "text");
assert_eq!(LayoutElementType::Table.as_str(), "table");
assert_eq!(LayoutElementType::Formula.as_str(), "formula");
}
// A new table result keeps its type and starts without cells or HTML.
#[test]
fn test_table_result_creation() {
let bbox = BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0);
let table = TableResult::new(bbox, TableType::Wired);
assert_eq!(table.table_type, TableType::Wired);
assert!(table.cells.is_empty());
assert!(table.html_structure.is_none());
}
// A document title and a text element show up in both Markdown and HTML output.
#[test]
fn test_structure_result_export() {
let bbox = BoundingBox::from_coords(0.0, 0.0, 100.0, 100.0);
let mut result = StructureResult::new("test.jpg", 0);
let title = LayoutElement::new(bbox.clone(), LayoutElementType::DocTitle, 1.0)
.with_text("Test Document");
let text =
LayoutElement::new(bbox.clone(), LayoutElementType::Text, 1.0).with_text("Hello world");
result = result.with_layout_elements(vec![title, text]);
let md = result.to_markdown();
assert!(md.contains("# Test Document"));
assert!(md.contains("Hello world"));
let html = result.to_html();
assert!(html.contains("<h1>Test Document</h1>"));
assert!(html.contains("<p>Hello world</p>"));
}
// Known section keywords get level 2, with or without a trailing colon,
// and the text (including the colon) is returned unchanged.
#[test]
fn test_format_title_with_level_keywords() {
let (level, text) = format_title_with_level("Abstract", None);
assert_eq!(level, 2);
assert_eq!(text, "Abstract");
let (level, text) = format_title_with_level("References:", None);
assert_eq!(level, 2);
assert_eq!(text, "References:");
}
// A heading without numbering or keyword falls back to the supplied clustered level.
#[test]
fn test_format_title_with_level_cluster_fallback() {
let (level, text) = format_title_with_level("Unnumbered Heading", Some(4));
assert_eq!(level, 4);
assert_eq!(text, "Unnumbered Heading");
}
// Footnote elements are left out of the Markdown output.
#[test]
fn test_to_markdown_skips_footnote() {
let mut result = StructureResult::new("test.jpg", 0);
let body = LayoutElement::new(
BoundingBox::from_coords(0.0, 0.0, 100.0, 30.0),
LayoutElementType::Text,
1.0,
)
.with_text("Body");
let footnote = LayoutElement::new(
BoundingBox::from_coords(0.0, 40.0, 100.0, 60.0),
LayoutElementType::Footnote,
1.0,
)
.with_text("Footnote text");
result = result.with_layout_elements(vec![body, footnote]);
let md = result.to_markdown();
assert!(md.contains("Body"));
assert!(!md.contains("Footnote text"));
}
// A line break inside a document title becomes a single space in the heading.
#[test]
fn test_to_markdown_doc_title_joins_lines_with_space() {
let mut result = StructureResult::new("test.jpg", 0);
let title = LayoutElement::new(
BoundingBox::from_coords(0.0, 0.0, 100.0, 20.0),
LayoutElementType::DocTitle,
1.0,
)
.with_text("Main\nTitle");
result = result.with_layout_elements(vec![title]);
let md = result.to_markdown();
assert!(md.contains("# Main Title"));
}
// Line breaks inside a `Content` element are emitted as two spaces followed by a newline.
#[test]
fn test_to_markdown_content_uses_soft_breaks() {
let mut result = StructureResult::new("test.jpg", 0);
let toc = LayoutElement::new(
BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0),
LayoutElementType::Content,
1.0,
)
.with_text("1 Intro\n2 Method");
result = result.with_layout_elements(vec![toc]);
let md = result.to_markdown();
assert!(md.contains("1 Intro  \n2 Method"));
}
// Taller title boxes never receive a deeper level than shorter ones.
#[test]
fn test_infer_paragraph_title_levels_by_height() {
let titles = vec![
LayoutElement::new(
BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("Large"),
LayoutElement::new(
BoundingBox::from_coords(0.0, 50.0, 100.0, 74.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("Medium"),
LayoutElement::new(
BoundingBox::from_coords(0.0, 80.0, 100.0, 98.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("Small"),
];
let levels = infer_paragraph_title_levels(&titles);
// Titles missing from the map are treated as level 2.
let l0 = levels.get(&0).copied().unwrap_or(2);
let l1 = levels.get(&1).copied().unwrap_or(2);
let l2 = levels.get(&2).copied().unwrap_or(2);
assert!(l0 <= l1 && l1 <= l2);
}
// Numbering decides the level even when the box heights suggest the opposite:
// the taller "1.1" title gets level 3 and the shorter "2" title gets level 2.
#[test]
fn test_infer_paragraph_title_levels_semantic_vote_wins_tie() {
let titles = vec![
LayoutElement::new(
BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("1.1 Detail"),
LayoutElement::new(
BoundingBox::from_coords(0.0, 50.0, 100.0, 70.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("2 Intro"),
];
let levels = infer_paragraph_title_levels(&titles);
assert_eq!(levels.get(&0).copied(), Some(3));
assert_eq!(levels.get(&1).copied(), Some(2));
}
// With equal heights and no numbering, the indented title gets the deeper level.
#[test]
fn test_infer_paragraph_title_levels_uses_relative_indent_signal() {
let titles = vec![
LayoutElement::new(
BoundingBox::from_coords(0.0, 0.0, 100.0, 24.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("Heading A"),
LayoutElement::new(
BoundingBox::from_coords(40.0, 40.0, 140.0, 64.0),
LayoutElementType::ParagraphTitle,
1.0,
)
.with_text("Heading B"),
];
let levels = infer_paragraph_title_levels(&titles);
let left_level = levels.get(&0).copied().unwrap_or(2);
let indented_level = levels.get(&1).copied().unwrap_or(2);
assert!(left_level < indented_level);
}
}