#[cfg(not(target_arch = "wasm32"))]
use regex::Regex;
use std::collections::{HashMap, HashSet};
#[cfg(not(target_arch = "wasm32"))]
use std::path::Path;
#[cfg(not(target_arch = "wasm32"))]
use std::process::Command;
use crate::models::bbox::BoundingBox;
use crate::models::chunks::TextChunk;
use crate::models::content::ContentElement;
use crate::models::document::PdfDocument;
use crate::models::enums::SemanticType;
use crate::models::semantic::SemanticTextNode;
use crate::models::table::TableTokenRow;
use crate::EdgePdfError;
#[cfg(not(target_arch = "wasm32"))]
// Parsed bbox-layout data for a document's source PDF, cached so the expensive
// external extraction (see `read_pdftotext_bbox_layout_lines`) runs at most
// once per conversion.
struct CachedBBoxLayout {
    // Page width as reported by the layout extraction.
    // NOTE(review): presumably the first page's width — confirm in
    // `read_pdftotext_bbox_layout_lines`.
    page_width: f64,
    // All text lines with their bounding boxes and per-word boxes.
    lines: Vec<BBoxLayoutLine>,
    // The same lines grouped into layout blocks (see `collect_bbox_layout_blocks`).
    blocks: Vec<BBoxLayoutBlock>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Default)]
// Memoizes the two pdftotext-derived layout sources shared by the specialized
// layout renderers. The nested `Option`s distinguish "not attempted yet"
// (outer `None`) from "attempted but unavailable" (inner `None`).
struct LayoutSourceCache {
    bbox_layout: Option<Option<CachedBBoxLayout>>,
    layout_lines: Option<Option<Vec<String>>>,
}
#[cfg(not(target_arch = "wasm32"))]
impl LayoutSourceCache {
    /// Returns the cached bbox layout for `doc`, loading it on first use.
    /// Yields `None` when the document has no source path or extraction fails;
    /// the failed attempt is cached too, so it is not retried.
    fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
        let slot = self.bbox_layout.get_or_insert_with(|| {
            doc.source_path.as_deref().and_then(|source_path| {
                let (page_width, lines) =
                    read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
                let blocks = collect_bbox_layout_blocks(&lines);
                Some(CachedBBoxLayout {
                    page_width,
                    lines,
                    blocks,
                })
            })
        });
        slot.as_ref()
    }
    /// Returns the cached plain layout lines for `doc`, loading on first use.
    /// Same caching semantics as `bbox_layout`.
    fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
        let slot = self.layout_lines.get_or_insert_with(|| {
            doc.source_path
                .as_deref()
                .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)))
        });
        slot.as_deref()
    }
}
/// Converts a parsed `PdfDocument` to markdown.
///
/// Tries a fixed sequence of specialized renderers first — each returns
/// `Some` only when the document matches its particular layout shape — and
/// falls back to the generic `render_markdown_core` walk. The order of the
/// attempts is significant: more specific shapes are probed before more
/// generic ones. The layout-based renderers are compiled only off-wasm
/// because they rely on an external layout extraction.
pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
    // Shared across all layout renderers so the external extraction runs at
    // most once per source document.
    #[cfg(not(target_arch = "wasm32"))]
    let mut layout_cache = LayoutSourceCache::default();
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    // The two contents-page renderers below are layout-independent and so run
    // on all targets, including wasm.
    if looks_like_contents_document(doc) {
        return Ok(render_contents_document(doc));
    }
    if looks_like_compact_toc_document(doc) {
        return Ok(render_compact_toc_document(doc));
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_registration_report_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    // Geometric (non-layout-cache) specialized renderers.
    if let Some(rendered) = render_top_table_plate_document(doc) {
        return Ok(rendered);
    }
    if let Some(rendered) = render_single_table_report_document(doc) {
        return Ok(rendered);
    }
    if let Some(rendered) = render_late_section_boundary_document(doc) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    // Generic fallback path.
    Ok(render_markdown_core(doc))
}
/// Generic markdown rendering path: walks `doc.kids` in order, applying
/// heading demotion/merging heuristics, caption handling, paragraph merging,
/// and geometric-table substitution, then post-processes the whole output.
///
/// Fix: the geometric-table branch used `®ion.rendered` — a mojibake of
/// `&region` (`&reg` became `®`) — which does not compile. Restored to
/// `&region.rendered`. Everything else is unchanged.
fn render_markdown_core(doc: &PdfDocument) -> String {
    let mut output = String::new();
    // Document title: skip when a conflicting first heading exists; render as
    // plain text when the page looks table-like with no explicit headings.
    if let Some(ref title) = doc.title {
        let trimmed = title.trim();
        if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
            if should_render_document_title_as_plaintext(doc, trimmed) {
                output.push_str(trimmed);
                output.push_str("\n\n");
            } else {
                output.push_str(&format!("# {}\n\n", trimmed));
            }
        }
    }
    if doc.kids.is_empty() {
        output.push_str("*No content extracted.*\n");
        return output;
    }
    // Map every element index covered by a detected geometric table region to
    // that region so the walk below can emit the pre-rendered table once and
    // skip its constituent elements.
    let geometric_table_regions = detect_geometric_table_regions(doc);
    let mut geometric_table_cover = HashMap::new();
    for region in geometric_table_regions {
        for idx in region.start_idx..=region.end_idx {
            geometric_table_cover.insert(idx, region.clone());
        }
    }
    let mut i = 0usize;
    while i < doc.kids.len() {
        if let Some(region) = geometric_table_cover.get(&i) {
            // Fixed: was `®ion.rendered` (mojibake of `&region.rendered`).
            output.push_str(&region.rendered);
            i = region.end_idx + 1;
            continue;
        }
        match &doc.kids[i] {
            ContentElement::Heading(h) => {
                let text = h.base.base.value();
                let trimmed = text.trim();
                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
                    i += 1;
                    continue;
                }
                // Several classes of false-positive headings are demoted to
                // plain paragraph text instead of `#` headings.
                if looks_like_table_header_duplicate_heading(doc, i, trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if looks_like_bottom_margin_heading(doc, i) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_period_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_comma_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_math_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_percentage_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if starts_with_caption_prefix(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_bibliography_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                // Heading that actually starts the following paragraph: merge
                // the two and emit as a paragraph.
                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
                        let mut merged = trimmed.to_string();
                        merge_paragraph_text(&mut merged, &next_text);
                        output.push_str(&escape_md_line_start(merged.trim()));
                        output.push_str("\n\n");
                        i += 2;
                        continue;
                    }
                }
                // Fold a run of consecutive headings into one, capped at 200
                // characters; empty/skippable headings in the run are dropped.
                let mut merged_heading = trimmed.to_string();
                while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
                    let next_text = next_h.base.base.value();
                    let next_trimmed = next_text.trim();
                    if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
                        i += 1;
                        continue;
                    }
                    if merged_heading.len() + 1 + next_trimmed.len() > 200 {
                        break;
                    }
                    merge_paragraph_text(&mut merged_heading, next_trimmed);
                    i += 1;
                }
                let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
                // A merged heading may really be two subsections glued
                // together; split and emit both when detectable.
                if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
                    let first = cleaned_heading[..split_pos].trim();
                    let second = cleaned_heading[split_pos..].trim();
                    output.push_str(&format!("# {}\n\n", first));
                    output.push_str(&format!("# {}\n\n", second));
                } else {
                    output.push_str(&format!("# {}\n\n", cleaned_heading));
                }
            }
            ContentElement::NumberHeading(nh) => {
                let text = nh.base.base.base.value();
                let trimmed = text.trim();
                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
                    i += 1;
                    continue;
                }
                // Same demotion heuristics as plain headings (subset).
                if should_demote_comma_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_math_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if should_demote_percentage_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
                        let mut merged = trimmed.to_string();
                        merge_paragraph_text(&mut merged, &next_text);
                        output.push_str(&escape_md_line_start(merged.trim()));
                        output.push_str("\n\n");
                        i += 2;
                        continue;
                    }
                }
                let cleaned = strip_trailing_page_number(trimmed);
                if let Some(split_pos) = find_merged_subsection_split(cleaned) {
                    let first = cleaned[..split_pos].trim();
                    let second = cleaned[split_pos..].trim();
                    output.push_str(&format!("# {}\n\n", first));
                    output.push_str(&format!("# {}\n\n", second));
                } else {
                    output.push_str(&format!("# {}\n\n", cleaned));
                }
            }
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                let element = &doc.kids[i];
                let text = match &doc.kids[i] {
                    ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
                    ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
                    ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
                    _ => unreachable!(),
                };
                let trimmed = text.trim();
                if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
                    i += 1;
                    continue;
                }
                if should_skip_leading_figure_carryover(doc, i, trimmed) {
                    i += 1;
                    continue;
                }
                // Paragraph that is really a heading (promotion).
                if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
                    let cleaned = strip_trailing_page_number(trimmed);
                    if let Some(split_pos) = find_merged_subsection_split(cleaned) {
                        let first = cleaned[..split_pos].trim();
                        let second = cleaned[split_pos..].trim();
                        output.push_str(&format!("# {}\n\n", first));
                        output.push_str(&format!("# {}\n\n", second));
                    } else {
                        output.push_str(&format!("# {}\n\n", cleaned));
                    }
                    i += 1;
                    continue;
                }
                // TOC entries are emitted one per line (single newline).
                if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
                {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push('\n');
                    i += 1;
                    continue;
                }
                // Short caption label ("Figure 3" etc.): stitch the caption
                // tail (and optional year line) from the following elements.
                if is_short_caption_label(trimmed) {
                    if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                        if let Some((caption_tail, body)) =
                            split_following_caption_tail_and_body(&next_text)
                        {
                            let mut caption = trimmed.to_string();
                            caption.push('\n');
                            caption.push_str(caption_tail);
                            output.push_str(&escape_md_line_start(caption.trim()));
                            output.push_str("\n\n");
                            output.push_str(&escape_md_line_start(body));
                            output.push_str("\n\n");
                            i += 2;
                            continue;
                        }
                        if looks_like_caption_tail(&next_text) {
                            let mut caption = trimmed.to_string();
                            caption.push('\n');
                            caption.push_str(next_text.trim());
                            if let Some(year_text) =
                                next_mergeable_paragraph_text(doc.kids.get(i + 2))
                            {
                                if looks_like_caption_year(&year_text) {
                                    caption.push('\n');
                                    caption.push_str(year_text.trim());
                                    i += 1;
                                }
                            }
                            output.push_str(&escape_md_line_start(caption.trim()));
                            output.push_str("\n\n");
                            i += 2;
                            continue;
                        }
                    }
                }
                // Paragraph that begins with an inline caption: split it off.
                if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
                    output.push_str(&escape_md_line_start(caption));
                    output.push_str("\n\n");
                    output.push_str(&escape_md_line_start(body));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }
                // Merge with following paragraphs while the merge heuristics
                // allow; semantic paragraphs use a stricter rule.
                let mut merged = trimmed.to_string();
                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
                        should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
                    } else {
                        should_merge_paragraph_text(&merged, &next_text)
                    };
                    if !can_merge {
                        break;
                    }
                    merge_paragraph_text(&mut merged, &next_text);
                    i += 1;
                }
                output.push_str(&escape_md_line_start(merged.trim()));
                output.push_str("\n\n");
            }
            other => render_element(&mut output, other),
        }
        i += 1;
    }
    // Whole-document post-processing passes.
    let output = merge_adjacent_pipe_tables(&output);
    let output = normalize_chart_like_markdown(&output);
    drop_isolated_noise_lines(&output)
}
/// Compares two boxes in "banded" reading order: boxes are quantized into
/// horizontal bands of `band_height`; higher bands (larger `top_y`) come
/// first, then left-to-right within a band, with exact coordinates as
/// tie-breakers. Non-comparable floats (NaN) compare as equal.
fn cmp_banded_reading_order(
    left: &BoundingBox,
    right: &BoundingBox,
    band_height: f64,
) -> std::cmp::Ordering {
    use std::cmp::Ordering;
    let band = band_height.max(1.0);
    let band_of = |b: &BoundingBox| (b.top_y / band).round() as i64;
    let cmp_f64 = |a: f64, b: f64| a.partial_cmp(&b).unwrap_or(Ordering::Equal);
    band_of(right)
        .cmp(&band_of(left))
        .then_with(|| cmp_f64(left.left_x, right.left_x))
        .then_with(|| cmp_f64(right.top_y, left.top_y))
        .then_with(|| cmp_f64(right.bottom_y, left.bottom_y))
        .then_with(|| cmp_f64(left.right_x, right.right_x))
}
/// Returns `true` when the metadata title should be suppressed because the
/// first heading-like element in the body says something different (the body
/// heading wins over the metadata title).
fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
    match first_heading_like_text(doc) {
        Some(ref first) => !equivalent_heading_text(first, title),
        None => false,
    }
}
/// Decides whether the document title should be emitted as plain text rather
/// than a `#` heading: short titles (at most six words) on pages whose first
/// few elements are table-like and contain no explicit heading.
fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
    if title.split_whitespace().count() > 6 {
        return false;
    }
    // Only the first six elements are consulted for both checks.
    let has_explicit_heading = doc.kids.iter().take(6).any(|element| {
        matches!(
            element,
            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
        )
    });
    let has_tableish_content = doc.kids.iter().take(6).any(|element| {
        matches!(
            element,
            ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
        )
    });
    has_tableish_content && !has_explicit_heading
}
// Specialized renderer for a single-page "plate": a wide bordered table near
// the top of the page, followed closely by its caption, with real body text
// further below. Returns `None` unless the page matches this shape.
// NOTE(review): comparisons like `page_top - table_top` treat larger `top_y`
// as higher on the page — consistent with the banded comparator above.
fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    // First bordered table in reading order.
    let (table_idx, table) =
        doc.kids.iter().enumerate().find_map(|(idx, element)| {
            table_border_from_element(element).map(|table| (idx, table))
        })?;
    if table.num_columns < 5 || table.rows.len() < 4 {
        return None;
    }
    // Probe only: the table must have grouped header rows; the probe rows are
    // discarded afterwards.
    let mut header_probe = collect_table_border_rows(table);
    if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
        return None;
    }
    let table_top = table.bbox.top_y;
    let table_bottom = table.bbox.bottom_y;
    let table_height = table.bbox.height().max(1.0);
    // Highest top_y of any element (the page-top reference).
    let page_top = doc
        .kids
        .iter()
        .map(|element| element.bbox().top_y)
        .fold(f64::NEG_INFINITY, f64::max);
    // The table must sit within three table-heights of the page top.
    if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
        return None;
    }
    // Collect caption fragments directly below the table; the allowed gap
    // scales with table height but is clamped to a sane range.
    let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
    let mut caption_indices = Vec::new();
    for idx in table_idx + 1..doc.kids.len() {
        let element = &doc.kids[idx];
        if !is_geometric_text_candidate(element) {
            // Non-text element far below the table ends the caption zone.
            if table_bottom - element.bbox().top_y > caption_gap_limit {
                break;
            }
            continue;
        }
        let text = extract_element_text(element);
        if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
            continue;
        }
        let gap = table_bottom - element.bbox().top_y;
        // Small negative tolerance: fragments may slightly overlap the
        // table's bottom edge.
        if gap < -6.0 {
            break;
        }
        if gap > caption_gap_limit {
            break;
        }
        caption_indices.push(idx);
    }
    if caption_indices.is_empty() {
        return None;
    }
    // Require real body text below the caption zone; otherwise this is not a
    // "plate above body" page.
    let has_body_below = doc
        .kids
        .iter()
        .enumerate()
        .skip(caption_indices.last().copied()? + 1)
        .any(|(_, element)| {
            is_geometric_text_candidate(element)
                && !extract_element_text(element).trim().is_empty()
                && table_bottom - element.bbox().top_y > caption_gap_limit
        });
    if !has_body_below {
        return None;
    }
    let mut output = String::new();
    render_table_border(&mut output, table);
    // Merge all caption fragments into a single paragraph under the table.
    let mut caption = String::new();
    for idx in &caption_indices {
        let text = extract_element_text(&doc.kids[*idx]);
        if text.trim().is_empty() {
            continue;
        }
        merge_paragraph_text(&mut caption, &text);
    }
    let trimmed = caption.trim();
    if trimmed.is_empty() {
        return None;
    }
    output.push_str(&escape_md_line_start(trimmed));
    output.push_str("\n\n");
    Some(output)
}
/// Specialized renderer for a one-page report consisting of a multi-word
/// title centered above a single bordered table, with at most footer noise
/// elsewhere. Returns `None` when the page does not match this shape.
fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
    // Shape gate: one page, title + table plus at most two extra elements.
    if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
        return None;
    }
    let title_element = &doc.kids[0];
    if !is_geometric_text_candidate(title_element) {
        return None;
    }
    let title_text = extract_element_text(title_element);
    if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
        return None;
    }
    let table = table_border_from_element(&doc.kids[1])?;
    if table.num_columns < 4 || table.rows.len() < 4 {
        return None;
    }
    // Highest top_y of any element (the page-top reference).
    let mut page_top = f64::NEG_INFINITY;
    for element in &doc.kids {
        page_top = page_top.max(element.bbox().top_y);
    }
    if !page_top.is_finite() {
        return None;
    }
    // Geometry: title near the page top, a modest gap above the table, and
    // roughly centered over it.
    let title_bbox = title_element.bbox();
    let table_bbox = &table.bbox;
    if page_top - title_bbox.top_y > 24.0 {
        return None;
    }
    let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
    if !(8.0..=40.0).contains(&vertical_gap) {
        return None;
    }
    if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
        return None;
    }
    // Anything beyond the title and table must be ignorable footer noise or a
    // margin page number.
    for element in doc.kids.iter().skip(2) {
        let text = extract_element_text(element);
        let trimmed = text.trim();
        if !trimmed.is_empty()
            && !looks_like_footer_banner(trimmed)
            && !looks_like_margin_page_number(doc, element, trimmed)
        {
            return None;
        }
    }
    let mut rows = collect_table_border_rows(table);
    if rows.is_empty() {
        return None;
    }
    merge_continuation_rows(&mut rows);
    trim_leading_table_carryover_rows(&mut rows);
    if rows.len() < 2 {
        return None;
    }
    let mut rendered = String::new();
    rendered.push_str("# ");
    rendered.push_str(title_text.trim());
    rendered.push_str("\n\n");
    rendered.push_str(&render_pipe_rows(&rows));
    Some(rendered)
}
/// Specialized renderer for a one-page document whose upper half is form-like
/// (many short "label:" fragments) and whose first explicit heading appears
/// late, followed by a short closing paragraph and at least one footer line.
/// Emits the heading plus the stitched trailing paragraph; `None` otherwise.
fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
    if doc.number_of_pages != 1 || doc.kids.len() < 8 {
        return None;
    }
    // Highest top_y of any element (the page-top reference).
    let mut page_top = f64::NEG_INFINITY;
    for element in &doc.kids {
        page_top = page_top.max(element.bbox().top_y);
    }
    if !page_top.is_finite() {
        return None;
    }
    // First explicit heading; it must come after at least five elements and
    // sit well below the page top.
    let heading_idx = doc.kids.iter().position(|element| {
        matches!(
            element,
            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
        )
    })?;
    if heading_idx < 5 {
        return None;
    }
    let heading = &doc.kids[heading_idx];
    let heading_text = extract_element_text(heading);
    if heading_text.trim().is_empty() {
        return None;
    }
    if page_top - heading.bbox().top_y < 240.0 {
        return None;
    }
    // Leading content must look form-like: mostly text fragments, with at
    // least half of them ending in a colon.
    let mut leading_text_indices = Vec::new();
    for idx in 0..heading_idx {
        if is_geometric_text_candidate(&doc.kids[idx]) {
            leading_text_indices.push(idx);
        }
    }
    if leading_text_indices.len() < 5 {
        return None;
    }
    let mut colon_ended = 0usize;
    for idx in &leading_text_indices {
        if extract_element_text(&doc.kids[*idx])
            .trim_end()
            .ends_with(':')
        {
            colon_ended += 1;
        }
    }
    if colon_ended * 2 < leading_text_indices.len() {
        return None;
    }
    // Non-empty trailing text after the heading (margin page numbers excluded).
    let mut trailing_indices = Vec::new();
    for idx in heading_idx + 1..doc.kids.len() {
        if !is_geometric_text_candidate(&doc.kids[idx]) {
            continue;
        }
        let text = extract_element_text(&doc.kids[idx]);
        if text.trim().is_empty() || looks_like_margin_page_number(doc, &doc.kids[idx], &text) {
            continue;
        }
        trailing_indices.push(idx);
    }
    if trailing_indices.is_empty() || trailing_indices.len() > 5 {
        return None;
    }
    // Partition the trailing elements into footer-like fragments (near the
    // page bottom, several words long) and real content; at least one footer
    // and one content fragment must exist.
    let mut footer_count = 0usize;
    let mut content_indices = Vec::new();
    for idx in trailing_indices {
        let text = extract_element_text(&doc.kids[idx]);
        let is_footerish =
            doc.kids[idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
        if is_footerish {
            footer_count += 1;
        } else {
            content_indices.push(idx);
        }
    }
    if content_indices.is_empty() || footer_count == 0 {
        return None;
    }
    // Stitch the content fragments into one paragraph in banded reading order.
    let mut fragments: Vec<_> = content_indices
        .iter()
        .map(|idx| (*idx, &doc.kids[*idx]))
        .collect();
    fragments.sort_by(|a, b| cmp_banded_reading_order(a.1.bbox(), b.1.bbox(), 6.0));
    let mut paragraph = String::new();
    for (_, element) in fragments {
        let text = extract_element_text(element);
        if !text.trim().is_empty() {
            merge_paragraph_text(&mut paragraph, &text);
        }
    }
    let body = paragraph.trim();
    if body.is_empty() {
        return None;
    }
    let mut rendered = String::new();
    rendered.push_str("# ");
    rendered.push_str(heading_text.trim());
    rendered.push_str("\n\n");
    rendered.push_str(&escape_md_line_start(body));
    rendered.push_str("\n\n");
    Some(rendered)
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A candidate table-header line found in plain layout text.
// NOTE(review): `starts` are presumably the column offsets of each header
// within the line — confirm against the detection code that builds this.
struct LayoutHeaderCandidate {
    line_idx: usize,
    headers: Vec<String>,
    starts: Vec<usize>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A layout line split into table cells.
struct LayoutEntry {
    line_idx: usize,
    cells: Vec<String>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A table row anchored to a span of layout lines (first/last anchor index).
struct LayoutAnchorRow {
    anchor_idx: usize,
    last_anchor_idx: usize,
    cells: Vec<String>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// Header candidate for panel-style layouts; same shape as
// `LayoutHeaderCandidate` but used by a different detector.
struct LayoutPanelHeaderCandidate {
    line_idx: usize,
    headers: Vec<String>,
    starts: Vec<usize>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// One table-of-contents entry: title, page label, and the title's start
// column in the layout line.
struct LayoutTocEntry {
    title: String,
    page: String,
    title_start: usize,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A single word with its bounding box, from the bbox layout.
struct BBoxLayoutWord {
    bbox: BoundingBox,
    text: String,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A text line from the bbox layout: owning block id, line box, and words.
struct BBoxLayoutLine {
    block_id: usize,
    bbox: BoundingBox,
    words: Vec<BBoxLayoutWord>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// An arbitrary positioned text fragment.
struct LayoutTextFragment {
    bbox: BoundingBox,
    text: String,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// Detected "open plate" layout: heading, table header + rows, caption, and
// the vertical cutoff above which the plate content lives.
struct OpenPlateCandidate {
    heading: String,
    header_row: Vec<String>,
    rows: Vec<Vec<String>>,
    caption: String,
    cutoff_top_y: f64,
}
#[cfg(not(target_arch = "wasm32"))]
// Narrative text bridging a figure/table: an optional lead paragraph,
// captions deferred to after the figure, and where the body text resumes.
struct LayoutNarrativeBridge {
    bridge_paragraph: Option<String>,
    deferred_captions: Vec<String>,
    body_start_top_y: Option<f64>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A block of bbox-layout lines sharing one block id, with the union box.
struct BBoxLayoutBlock {
    block_id: usize,
    bbox: BoundingBox,
    lines: Vec<BBoxLayoutLine>,
}
#[cfg(not(target_arch = "wasm32"))]
// Parsed "OCR benchmark dashboard" layout: two side-by-side tables with
// headings, plus definition and source notes.
struct LayoutOcrDashboard {
    eyebrow: Option<String>,
    title: String,
    left_heading: String,
    left_columns: Vec<String>,
    left_rows: Vec<Vec<String>>,
    right_heading: String,
    right_rows: Vec<Vec<String>>,
    definition_notes: Vec<String>,
    source_notes: Vec<String>,
}
#[cfg(not(target_arch = "wasm32"))]
// One panel of a recommendation infographic: heading, subtitle, a small
// table, and free-form notes.
struct LayoutRecommendationPanel {
    heading: String,
    subtitle: String,
    header: Vec<String>,
    rows: Vec<Vec<String>>,
    notes: Vec<String>,
}
#[cfg(not(target_arch = "wasm32"))]
// A full recommendation infographic: optional eyebrow line, title, panels.
struct LayoutRecommendationInfographic {
    eyebrow: Option<String>,
    title: String,
    panels: Vec<LayoutRecommendationPanel>,
}
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
// A numeric token on a bar chart: box, parsed integer value, raw text.
struct LayoutBarToken {
    bbox: BoundingBox,
    value: i64,
    text: String,
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
// Stacked-bar figure reconstructed as a table (months × row labels).
struct LayoutStackedBarFigure {
    caption: String,
    months: Vec<String>,
    row_labels: Vec<String>,
    rows: Vec<Vec<String>>,
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
// Variant of the stacked-bar figure keyed by sector instead of row label.
struct LayoutStackedBarSectorFigure {
    caption: String,
    months: Vec<String>,
    sectors: Vec<String>,
    rows: Vec<Vec<String>>,
}
#[cfg(not(target_arch = "wasm32"))]
// Narrative text accompanying a stacked-bar report, positioned by `top_y`.
struct LayoutStackedBarNarrative {
    heading: String,
    paragraphs: Vec<String>,
    footnote: Option<String>,
    top_y: f64,
}
#[cfg(not(target_arch = "wasm32"))]
// A labeled data series extracted from a chart, with an optional source note.
struct LayoutSeriesFigure {
    caption: String,
    labels: Vec<String>,
    values: Vec<String>,
    source: Option<String>,
}
#[cfg(not(target_arch = "wasm32"))]
// A detected caption: short label (e.g. "Figure 2"), its title text, an
// optional trailing footnote marker, and its vertical position.
struct LayoutCaptionSection {
    label: String,
    title: String,
    footnote_number: Option<String>,
    top_y: f64,
}
#[cfg(not(target_arch = "wasm32"))]
// Render event for the captioned-media path, ordered by vertical position.
enum LayoutCaptionedMediaEvent {
    Caption(LayoutCaptionSection),
    Paragraph(String),
}
#[cfg(not(target_arch = "wasm32"))]
// Everything the captioned-media renderer needs: caption sections, leftover
// prose (with top_y), an optional bottom footnote, and the image count.
struct LayoutCaptionedMediaProfile {
    sections: Vec<LayoutCaptionSection>,
    prose: Vec<(f64, String)>,
    footnote: Option<String>,
    image_count: usize,
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// One-shot convenience wrapper around the `_cached` variant, using a
/// throwaway layout cache.
fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
    render_layout_captioned_media_document_cached(doc, &mut LayoutSourceCache::default())
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a single-page "captioned media" document: one or more images whose
/// captions are detected geometrically from the bbox layout, interleaved with
/// a small amount of prose and an optional bottom footnote.
///
/// Returns `None` when the document does not match this shape so
/// `to_markdown` can fall through to the next renderer.
///
/// Fix: the caption branch passed `§ion` — a mojibake of `&section`
/// (`&sect` became `§`) — which does not compile. Restored to `&section`.
fn render_layout_captioned_media_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let paragraph_count = doc
        .kids
        .iter()
        .filter(|element| matches!(element, ContentElement::Paragraph(_)))
        .count();
    let image_count = doc
        .kids
        .iter()
        .filter(|element| {
            matches!(
                element,
                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
            )
        })
        .count();
    if paragraph_count == 0 || image_count == 0 {
        return None;
    }
    // If the extractor already produced explicit structure, trust it instead
    // of the geometric caption detection.
    let has_explicit_structure = doc.kids.iter().any(|element| {
        matches!(
            element,
            ContentElement::Caption(_)
                | ContentElement::Heading(_)
                | ContentElement::NumberHeading(_)
                | ContentElement::Table(_)
                | ContentElement::List(_)
        )
    });
    if has_explicit_structure {
        return None;
    }
    let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
    if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
        return None;
    }
    // Require either a non-"Figure" label or an anchored footnote; otherwise
    // the layout is too generic to claim.
    let has_non_figure_label = profile
        .sections
        .iter()
        .any(|section| !section.label.starts_with("Figure "));
    let has_anchored_footnote = profile.footnote.is_some()
        || profile
            .sections
            .iter()
            .any(|section| section.footnote_number.is_some());
    if !has_non_figure_label && !has_anchored_footnote {
        return None;
    }
    // The single-figure "explainer" shape has its own renderer.
    if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
        return Some(rendered);
    }
    // Interleave captions and prose by vertical position (larger top_y, i.e.
    // higher on the page, first).
    let mut events = profile
        .sections
        .into_iter()
        .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
        .collect::<Vec<_>>();
    for (top_y, paragraph) in profile.prose {
        events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
    }
    events.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let mut output = String::new();
    for (_, event) in events {
        match event {
            LayoutCaptionedMediaEvent::Caption(section) => {
                // Fixed: was `§ion` (mojibake of `&section`).
                output.push_str(&render_layout_caption_section(&section));
            }
            LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
                output.push_str(&escape_md_line_start(paragraph.trim()));
                output.push_str("\n\n");
            }
        }
    }
    if let Some(footnote_text) = profile.footnote {
        output.push_str("---\n\n");
        output.push_str("**Footnote:**\n");
        output.push_str(&escape_md_line_start(footnote_text.trim()));
        output.push('\n');
    }
    Some(output.trim_end().to_string() + "\n")
}
#[cfg(not(target_arch = "wasm32"))]
/// Builds the captioned-media profile from a document's bbox layout: detected
/// caption sections, leftover prose paragraphs (with vertical position), an
/// optional bottom footnote, and the image count.
///
/// Returns `None` when no bbox layout is available, or when more than two
/// free-standing prose paragraphs remain (too text-heavy for this renderer).
///
/// Fix: the caption-overlap filter used `paragraph.contains(§ion.title)` —
/// a mojibake of `&section.title` — which does not compile. Restored.
fn build_layout_captioned_media_profile(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<LayoutCaptionedMediaProfile> {
    let layout = layout_cache.bbox_layout(doc)?;
    let sections = detect_layout_caption_sections(&layout.blocks);
    let footnote = detect_layout_bottom_footnote(&layout.lines);
    // Prose candidates: reasonably long text elements that are not captions,
    // digit-led lines, page numbers, or footer banners.
    let mut prose = doc
        .kids
        .iter()
        .filter_map(|element| match element {
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                let text = clean_paragraph_text(&extract_element_text(element));
                let trimmed = text.trim();
                (!trimmed.is_empty()
                    && trimmed.split_whitespace().count() >= 8
                    && !starts_with_caption_prefix(trimmed)
                    && !trimmed
                        .chars()
                        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
                    && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
                    && !looks_like_footer_banner(trimmed))
                .then_some((element.bbox().top_y, trimmed.to_string()))
            }
            _ => None,
        })
        // Drop prose that is really caption text: vertically adjacent to a
        // detected section, or textually contained either way.
        .filter(|(top_y, paragraph)| {
            !sections.iter().any(|section| {
                (*top_y - section.top_y).abs() <= 36.0
                    || section.title.contains(paragraph)
                    // Fixed: was `§ion.title` (mojibake of `&section.title`).
                    || paragraph.contains(&section.title)
            })
        })
        .collect::<Vec<_>>();
    // Sort prose top-of-page first (larger top_y first).
    prose.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    if prose.len() > 2 {
        return None;
    }
    let image_count = doc
        .kids
        .iter()
        .filter(|element| {
            matches!(
                element,
                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
            )
        })
        .count();
    Some(LayoutCaptionedMediaProfile {
        sections,
        prose,
        footnote,
        image_count,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders the "explainer" shape: exactly one Figure-labelled caption, two
/// prose blocks (heading + paragraph), a single image, and a footnote.
/// Returns `None` when the profile does not match that exact shape.
fn render_layout_captioned_media_explainer(
    profile: &LayoutCaptionedMediaProfile,
) -> Option<String> {
    if profile.sections.len() != 1 || profile.prose.len() != 2 || profile.image_count != 1 {
        return None;
    }
    let footnote = profile.footnote.as_deref()?;
    if !profile
        .sections
        .iter()
        .all(|section| section.label.starts_with("Figure "))
    {
        return None;
    }
    // First prose block becomes the heading; second is the lead paragraph.
    let mut rendered = String::new();
    rendered.push_str("# ");
    rendered.push_str(profile.prose[0].1.trim());
    rendered.push('\n');
    rendered.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
    rendered.push_str("\n\n");
    rendered.push_str("*Image*\n\n");
    rendered.push_str(&render_layout_caption_section(&profile.sections[0]));
    rendered.push_str("---\n\n");
    rendered.push_str("**Footnote:**\n");
    rendered.push_str(&escape_md_line_start(footnote.trim()));
    rendered.push('\n');
    Some(rendered)
}
#[cfg(not(target_arch = "wasm32"))]
// Pairs each short caption label (e.g. "Figure 2") with the nearest plausible
// title block, and returns the resulting sections sorted top-of-page first
// (larger top_y first). Each title block may serve at most one label.
fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
    // Normalize each block's text once up front.
    let normalized_blocks = blocks
        .iter()
        .map(|block| {
            (
                block,
                normalize_common_ocr_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    let mut used_titles = HashSet::new();
    let mut sections = Vec::new();
    for (block, label_text) in &normalized_blocks {
        if !is_short_caption_label(label_text) {
            continue;
        }
        let label_bbox = &block.bbox;
        // Candidate titles: any other unused block with real multi-word text
        // that is not itself a label, caption, footer banner, or page number,
        // and is wide enough to be a title.
        let title_candidate = normalized_blocks
            .iter()
            .filter(|(candidate, text)| {
                candidate.block_id != block.block_id
                    && !used_titles.contains(&candidate.block_id)
                    && !text.is_empty()
                    && !is_short_caption_label(text)
                    && !starts_with_caption_prefix(text)
                    && !looks_like_footer_banner(text)
                    && !is_page_number_like(text)
                    && text.split_whitespace().count() >= 2
                    && candidate.bbox.width() >= 60.0
            })
            .filter_map(|(candidate, text)| {
                // Score by vertical distance with a small penalty for
                // horizontal separation; reject candidates too far away.
                let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
                let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
                    candidate.bbox.left_x - label_bbox.right_x
                } else if label_bbox.left_x > candidate.bbox.right_x {
                    label_bbox.left_x - candidate.bbox.right_x
                } else {
                    // Horizontally overlapping — no penalty.
                    0.0
                };
                (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
                    vertical_gap + horizontal_gap * 0.15,
                    *candidate,
                    text.clone(),
                ))
            })
            .min_by(|left, right| {
                left.0
                    .partial_cmp(&right.0)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        let Some((_, title_block, title_text)) = title_candidate else {
            continue;
        };
        used_titles.insert(title_block.block_id);
        let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
        sections.push(LayoutCaptionSection {
            label: label_text.to_string(),
            title,
            footnote_number,
            // Position of the section: whichever of label/title sits higher.
            top_y: label_bbox.top_y.max(title_block.bbox.top_y),
        });
    }
    sections.sort_by(|left, right| {
        right
            .top_y
            .partial_cmp(&left.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    sections
}
#[cfg(not(target_arch = "wasm32"))]
fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
let trimmed = text.trim();
let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
return (
captures["title"].trim().to_string(),
Some(captures["num"].to_string()),
);
}
(trimmed.to_string(), None)
}
#[cfg(not(target_arch = "wasm32"))]
// Detects a footnote at the bottom of the page: anchors on the last line that
// starts with a digit and has at least six words, then appends closely
// following continuation lines. Returns the merged, URL-normalized text.
fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
    let normalized_lines = lines
        .iter()
        .map(|line| {
            (
                line.bbox.top_y,
                normalize_common_ocr_text(&bbox_layout_line_text(line)),
            )
        })
        .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
        .collect::<Vec<_>>();
    // Anchor: last line shaped like "<footnote number> <prose...>".
    let start_idx = normalized_lines.iter().rposition(|(_, text)| {
        text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
            && text.split_whitespace().count() >= 6
    })?;
    let mut collected = vec![normalized_lines[start_idx].1.clone()];
    let mut last_top_y = normalized_lines[start_idx].0;
    // Continuation lines must stay within a small vertical step (28pt) of the
    // previous line; a page-number-like line also terminates the footnote.
    for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
        if is_page_number_like(text) {
            break;
        }
        if (last_top_y - *top_y).abs() > 28.0 {
            break;
        }
        collected.push(text.clone());
        last_top_y = *top_y;
    }
    // Defensive only: `collected` is seeded with the anchor line above, so it
    // can never actually be empty here.
    if collected.is_empty() {
        return None;
    }
    let merged = collected.join(" ");
    Some(normalize_layout_footnote_text(&merged))
}
#[cfg(not(target_arch = "wasm32"))]
/// Cleans up a merged footnote paragraph: inserts a missing space after a
/// comma that directly precedes a URL, then repeatedly removes the whitespace
/// between a URL and the token following it (re-joining URL fragments split
/// by line wrapping) until the text stops changing.
fn normalize_layout_footnote_text(text: &str) -> String {
    let mut result = text.replace(",https://", ", https://");
    if let Ok(url_gap_re) = Regex::new(r"(https?://\S+)\s+(\S+)") {
        loop {
            let rejoined = url_gap_re.replace(&result, "$1$2").to_string();
            if rejoined == result {
                break;
            }
            result = rejoined;
        }
    }
    result
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders one caption section as Markdown.
///
/// `Diagram` labels become a level-2 heading with a bold title; `Figure`
/// labels without a footnote get an extra italic label line; otherwise the
/// label is emitted bold, and each title line is bold (with a `^N` footnote
/// superscript on the last line) when a footnote number is present, italic
/// otherwise.
fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
    let label = section.label.trim();
    let title = section.title.trim();
    if section.label.starts_with("Diagram ") {
        let mut rendered = format!("## {}\n", label);
        if title.is_empty() {
            rendered.push('\n');
        } else {
            rendered.push_str(&format!(
                "**{}**\n\n",
                normalize_layout_caption_title_text(title)
            ));
        }
        return rendered;
    }
    let mut rendered = String::new();
    if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
        rendered.push_str(&format!("*{}*\n\n", label));
    }
    rendered.push_str(&format!("**{}**\n", label));
    if !title.is_empty() {
        let title_lines = split_layout_caption_title_lines(title);
        let last_idx = title_lines.len().saturating_sub(1);
        for (idx, line) in title_lines.iter().enumerate() {
            let line = line.trim();
            match section.footnote_number.as_deref() {
                // The footnote superscript attaches only to the final line.
                Some(number) if idx == last_idx => {
                    rendered.push_str(&format!("**{}**^{}\n", line, number));
                }
                Some(_) => rendered.push_str(&format!("**{}**\n", line)),
                None => rendered.push_str(&format!("*{}*\n", line)),
            }
        }
    }
    rendered.push('\n');
    rendered
}
#[cfg(not(target_arch = "wasm32"))]
/// Splits a normalized caption title at a `" Content:"` marker when the part
/// before it is a short (<= 3 word) non-empty head; otherwise returns the
/// whole normalized title as a single line.
fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
    let normalized = normalize_layout_caption_title_text(title);
    if let Some(idx) = normalized.find(" Content:") {
        let head = normalized[..idx].trim();
        // Skip only the separating space so the tail keeps "Content:".
        let tail = normalized[idx + 1..].trim();
        let head_is_short = head.split_whitespace().count() <= 3;
        if !head.is_empty() && !tail.is_empty() && head_is_short {
            return vec![head.to_string(), tail.to_string()];
        }
    }
    vec![normalized]
}
#[cfg(not(target_arch = "wasm32"))]
/// Re-joins year ranges that OCR split across a hyphen, turning
/// `"2019- 2020"` back into `"2019-2020"`.
fn normalize_layout_caption_title_text(title: &str) -> String {
    match Regex::new(r"(\d{4})-\s+(\d{4})") {
        Ok(re) => re.replace_all(title, "$1-$2").to_string(),
        Err(_) => title.to_string(),
    }
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: runs the single-caption-chart renderer with a fresh,
/// function-local layout cache.
fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_single_caption_chart_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a one-page document built around a single `"Figure N: …"` caption:
/// the caption is kept, chart clutter after it is dropped ("chart mode"), and
/// the surrounding headings/paragraphs are rendered with paragraph merging.
///
/// Returns `None` unless the page has exactly one qualifying caption, at
/// least 12 elements, and no populated table.
fn render_layout_single_caption_chart_document_cached(
    doc: &PdfDocument,
    _layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    if document_has_populated_table(doc) {
        return None;
    }
    // Candidate captions: "Figure …" containing a colon, with >= 6 words.
    let caption_indices = doc
        .kids
        .iter()
        .enumerate()
        .filter_map(|(idx, element)| {
            let text = extract_element_text(element);
            let trimmed = text.trim();
            (trimmed.starts_with("Figure ")
                && trimmed.contains(':')
                && trimmed.split_whitespace().count() >= 6)
                .then_some(idx)
        })
        .collect::<Vec<_>>();
    if caption_indices.len() != 1 {
        return None;
    }
    if doc.kids.len() < 12 {
        return None;
    }
    let caption_idx = caption_indices[0];
    let mut output = String::new();
    let mut i = 0usize;
    // While set, elements after the caption are skipped until a heading or a
    // long follow-up paragraph ends the chart region.
    let mut chart_mode = false;
    while i < doc.kids.len() {
        let element = &doc.kids[i];
        let text = extract_element_text(element);
        let trimmed = text.trim();
        if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
            i += 1;
            continue;
        }
        if i == caption_idx {
            output.push_str(&escape_md_line_start(trimmed));
            output.push_str("\n\n");
            chart_mode = true;
            i += 1;
            continue;
        }
        if chart_mode {
            // Only a substantial paragraph or a heading exits chart mode;
            // everything else inside the chart region is discarded.
            if !looks_like_chart_followup_paragraph(element, trimmed)
                && !matches!(
                    element,
                    ContentElement::Heading(_) | ContentElement::NumberHeading(_)
                )
            {
                i += 1;
                continue;
            }
            chart_mode = false;
        }
        match element {
            ContentElement::Heading(h) => {
                let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
                output.push_str(&"#".repeat(level));
                output.push(' ');
                output.push_str(trimmed);
                output.push_str("\n\n");
            }
            ContentElement::NumberHeading(nh) => {
                let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
                output.push_str(&"#".repeat(level));
                output.push(' ');
                output.push_str(trimmed);
                output.push_str("\n\n");
            }
            ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
                // Greedily merge following elements into one paragraph while
                // the merge heuristics allow, skipping blanks and margin page
                // numbers along the way (note: `i` advances past everything
                // consumed here; the caption and chart noise stop the merge).
                let mut merged = trimmed.to_string();
                while let Some(next_element) = doc.kids.get(i + 1) {
                    let next_text = extract_element_text(next_element);
                    let next_trimmed = next_text.trim();
                    if next_trimmed.is_empty()
                        || looks_like_margin_page_number(doc, next_element, next_trimmed)
                    {
                        i += 1;
                        continue;
                    }
                    if i + 1 == caption_idx
                        || looks_like_chart_noise_element(next_element, next_trimmed)
                    {
                        break;
                    }
                    // Paragraphs use the stricter semantic-merge test;
                    // text blocks use the plain-text heuristic.
                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
                        should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
                    } else {
                        should_merge_paragraph_text(&merged, next_trimmed)
                    };
                    if !can_merge {
                        break;
                    }
                    merge_paragraph_text(&mut merged, next_trimmed);
                    i += 1;
                }
                output.push_str(&escape_md_line_start(merged.trim()));
                output.push_str("\n\n");
            }
            _ => {}
        }
        i += 1;
    }
    Some(output.trim_end().to_string() + "\n")
}
fn document_has_populated_table(doc: &PdfDocument) -> bool {
doc.kids.iter().any(|element| {
table_border_from_element(element).is_some_and(|table| {
table.num_rows >= 2
&& table.num_columns >= 2
&& table.rows.iter().any(|row| {
row.cells
.iter()
.filter(|cell| !cell_text_content(cell).trim().is_empty())
.count()
>= 2
})
})
})
}
/// Heuristically classifies a text run as chart clutter (axis labels, legend
/// entries, tick values) to be dropped while in chart mode. Captions and
/// source attributions are explicitly exempted.
fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
    if text.is_empty() {
        return false;
    }
    if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
        return true;
    }
    let words = text.split_whitespace().count();
    let lowered = text.to_ascii_lowercase();
    // Real captions ("Figure …: …") and "Source:" lines are never noise.
    if (lowered.starts_with("figure ") && text.contains(':')) || lowered.starts_with("source:") {
        return false;
    }
    // Short axis-style labels: years, months, or the known band annotation.
    if words <= 3
        && (looks_like_yearish_label(text)
            || looks_like_layout_month_label(text)
            || text == "Lockdown Period")
    {
        return true;
    }
    // Pure digit/whitespace runs (tick values).
    if text
        .chars()
        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
    {
        return true;
    }
    let lacks_sentence_punct = ['.', ':', ';'].iter().all(|p| !text.contains(*p));
    let mentions_legend_phrase = [
        "working as usual",
        "temporarily closed",
        "business premises",
        "operations continue",
    ]
    .iter()
    .any(|keyword| lowered.contains(keyword));
    words <= 10 || (lacks_sentence_punct && words <= 14) || mentions_legend_phrase
}
/// A paragraph long enough (>= 18 words) that is not itself a Figure/Table
/// caption is treated as narrative text, which ends chart mode.
fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
    let head = text.trim_start();
    text.split_whitespace().count() >= 18
        && !head.starts_with("Figure ")
        && !head.starts_with("Table ")
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: runs the recommendation-infographic renderer with a
/// fresh, function-local layout cache.
fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_recommendation_infographic_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a detected one-page recommendation infographic as Markdown:
/// optional eyebrow heading, title paragraph, then each panel as a level-2
/// heading, subtitle, pipe table (header + rows) and an optional note list.
fn render_layout_recommendation_infographic_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let layout = layout_cache.bbox_layout(doc)?;
    let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
    let mut rendered = String::new();
    if let Some(eyebrow) = infographic.eyebrow.as_deref() {
        rendered.push_str(&format!("# {}\n\n", eyebrow.trim()));
    }
    rendered.push_str(&format!(
        "{}\n\n",
        escape_md_line_start(infographic.title.trim())
    ));
    for panel in &infographic.panels {
        rendered.push_str(&format!("## {}\n\n", panel.heading.trim()));
        rendered.push_str(&format!(
            "{}\n\n",
            escape_md_line_start(panel.subtitle.trim())
        ));
        let table: Vec<_> = std::iter::once(panel.header.clone())
            .chain(panel.rows.iter().cloned())
            .collect();
        rendered.push_str(&render_pipe_rows(&table));
        if !panel.notes.is_empty() {
            rendered.push_str("*Note:*\n");
            for note in &panel.notes {
                rendered.push_str(&format!("- {}\n", note.trim()));
            }
            rendered.push('\n');
        }
    }
    Some(format!("{}\n", rendered.trim_end()))
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: runs the stacked-bar-report renderer with a fresh,
/// function-local layout cache.
fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_stacked_bar_report_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a one-page report containing exactly two figure captions (a
/// three-month stacked chart and a sector bar chart) followed by a narrative
/// section: each figure becomes a heading plus a pipe table, then the
/// narrative heading, its paragraphs, and an optional italic footnote.
fn render_layout_stacked_bar_report_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let layout = layout_cache.bbox_layout(doc)?;
    let captions = collect_layout_figure_captions(&layout.blocks);
    // Exactly two captions are required for this page shape.
    let [first_caption, second_caption] = captions.as_slice() else {
        return None;
    };
    let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
    let figure_one = detect_layout_three_month_stacked_figure(
        &layout.blocks,
        &layout.lines,
        layout.page_width,
        first_caption.clone(),
        second_caption.bbox.top_y,
    )?;
    let figure_two = detect_layout_sector_bar_figure(
        &layout.blocks,
        &layout.lines,
        layout.page_width,
        second_caption.clone(),
        narrative.top_y,
    )?;
    let mut rendered = String::new();
    rendered.push_str(&format!("# {}\n\n", figure_one.caption.trim()));
    let mut header = vec![String::new()];
    header.extend(figure_one.months.iter().cloned());
    let mut table_one = vec![header];
    table_one.extend(figure_one.rows.iter().cloned());
    rendered.push_str(&render_pipe_rows(&table_one));
    rendered.push_str(&format!("# {}\n\n", figure_two.caption.trim()));
    let mut header = vec!["Sector".to_string()];
    header.extend(figure_two.months.iter().cloned());
    let mut table_two = vec![header];
    table_two.extend(figure_two.rows.iter().cloned());
    rendered.push_str(&render_pipe_rows(&table_two));
    rendered.push_str(&format!("# {}\n\n", narrative.heading.trim()));
    for paragraph in &narrative.paragraphs {
        rendered.push_str(&format!("{}\n\n", escape_md_line_start(paragraph.trim())));
    }
    if let Some(footnote) = narrative.footnote.as_deref() {
        rendered.push_str(&format!("*{}*\n", footnote.trim()));
    }
    Some(rendered)
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: runs the multi-figure-chart renderer with a fresh,
/// function-local layout cache.
fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_multi_figure_chart_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
fn render_layout_multi_figure_chart_document_cached(
doc: &PdfDocument,
layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
if doc.number_of_pages != 1 {
return None;
}
let layout = layout_cache.bbox_layout(doc)?;
let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
let rendered_table_count = figures
.iter()
.filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
.count();
if figures.len() < 2 || rendered_table_count == 0 {
return None;
}
let mut output = String::from("# Figures from the Document\n\n");
for figure in figures {
output.push_str("## ");
output.push_str(figure.caption.trim());
output.push_str("\n\n");
if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
let label_header = if figure
.labels
.iter()
.all(|label| looks_like_yearish_label(label))
{
"Year"
} else {
"Label"
};
let value_header = chart_value_header(&figure.caption);
output.push_str(&format!("| {} | {} |\n", label_header, value_header));
output.push_str("| --- | --- |\n");
for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
output.push_str(&format!("| {} | {} |\n", label, value));
}
output.push('\n');
}
if let Some(source) = figure.source.as_deref() {
output.push('*');
output.push_str(&escape_md_line_start(source.trim()));
output.push_str("*\n\n");
}
}
Some(output.trim_end().to_string() + "\n")
}
#[cfg(not(target_arch = "wasm32"))]
/// Splits the page into per-figure sections anchored on `"Figure …"` caption
/// lines and extracts, for each: the caption, an optional merged `"Source:"`
/// paragraph, and (when >= 4 year-label anchors are found between caption and
/// source) label/value pairs read from the chart area.
///
/// Requires at least two captions; returns `None` when no figure yields
/// either a source note or data values.
fn detect_layout_multi_figure_chart_sections(
    lines: &[BBoxLayoutLine],
) -> Option<Vec<LayoutSeriesFigure>> {
    let caption_indices = lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| {
            let text = bbox_layout_line_text(line);
            (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
        })
        .collect::<Vec<_>>();
    if caption_indices.len() < 2 {
        return None;
    }
    let mut figures = Vec::new();
    for (pos, caption_idx) in caption_indices.iter().enumerate() {
        // Each figure section spans to the next caption (or end of page).
        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
        let caption = bbox_layout_line_text(&lines[*caption_idx]);
        let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
            bbox_layout_line_text(&lines[*idx])
                .to_ascii_lowercase()
                .starts_with("source:")
        });
        // Merge the "Source:" line with its wrapped continuation lines, up to
        // a terminating period, a new figure, a footer banner, or a blank.
        let source = source_idx.map(|idx| {
            let mut source_lines = vec![&lines[idx]];
            let mut cursor = idx + 1;
            while cursor < next_caption_idx {
                let text = bbox_layout_line_text(&lines[cursor]);
                if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
                {
                    break;
                }
                source_lines.push(&lines[cursor]);
                if text.ends_with('.') {
                    break;
                }
                cursor += 1;
            }
            join_layout_lines_as_paragraph(&source_lines)
        });
        // Chart body: everything between the caption and the source line.
        let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
        let anchors = extract_year_label_anchors_from_section(series_region);
        let (labels, values) = if anchors.len() >= 4 {
            let values = map_series_values_to_label_anchors(&anchors, series_region);
            (
                anchors
                    .into_iter()
                    .map(|anchor| anchor.text)
                    .collect::<Vec<_>>(),
                values,
            )
        } else {
            (Vec::new(), Vec::new())
        };
        // Keep a figure only when it contributed something renderable.
        if source.is_some() || !values.is_empty() {
            figures.push(LayoutSeriesFigure {
                caption: normalize_layout_dashboard_text(&caption),
                labels,
                values,
                source,
            });
        }
    }
    (!figures.is_empty()).then_some(figures)
}
#[cfg(not(target_arch = "wasm32"))]
/// Finds the horizontal band of year labels along a chart axis: collects all
/// year-like tokens, picks the densest band of vertically aligned ones
/// (±12pt), then walks that band left-to-right reassembling each label
/// (gluing an adjacent `"(…)"` suffix within 18pt) into positioned anchors.
///
/// Returns an empty `Vec` when fewer than four year tokens align.
fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
    let mut year_words = lines
        .iter()
        .flat_map(|line| line.words.iter())
        .filter_map(|word| {
            let token = word
                .text
                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
            looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
        })
        .collect::<Vec<_>>();
    if year_words.len() < 4 {
        return Vec::new();
    }
    // NOTE(review): descending center_y sort — neighbouring code (e.g. the
    // `fold(0.0, f64::max)` page-top computation) suggests a bottom-origin
    // coordinate system (larger y = higher on page); confirm against
    // BoundingBox before relying on this.
    year_words.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Pick the ±12pt band containing the most year tokens; that is assumed
    // to be the axis label row.
    let mut best_band = Vec::<BBoxLayoutWord>::new();
    for (center_y, _) in &year_words {
        let band = year_words
            .iter()
            .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
            .map(|(_, word)| word.clone())
            .collect::<Vec<_>>();
        if band.len() > best_band.len() {
            best_band = band;
        }
    }
    if best_band.len() < 4 {
        return Vec::new();
    }
    let band_center = best_band
        .iter()
        .map(|word| word.bbox.center_y())
        .sum::<f64>()
        / best_band.len() as f64;
    // Re-collect ALL words near the band center (not only years) so that
    // suffixes such as a parenthesized qualifier can be glued onto a year.
    let mut band_words = lines
        .iter()
        .flat_map(|line| line.words.iter())
        .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
        .cloned()
        .collect::<Vec<_>>();
    band_words.sort_by(|left, right| {
        left.bbox
            .left_x
            .partial_cmp(&right.bbox.left_x)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let mut anchors = Vec::new();
    let mut idx = 0usize;
    while idx < band_words.len() {
        let token = band_words[idx]
            .text
            .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
        if !looks_like_year_token(token) {
            idx += 1;
            continue;
        }
        let mut bbox = band_words[idx].bbox.clone();
        let mut label = token.to_string();
        // Absorb a "(…)" qualifier that immediately follows the year.
        if let Some(next) = band_words.get(idx + 1) {
            let suffix = next
                .text
                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
            let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
            if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
                label.push(' ');
                label.push_str(suffix);
                bbox = bbox.union(&next.bbox);
                idx += 1;
            }
        }
        anchors.push(LayoutTextFragment { bbox, text: label });
        idx += 1;
    }
    anchors
}
#[cfg(not(target_arch = "wasm32"))]
/// Assigns one numeric chart value to each label anchor by nearest horizontal
/// match. The matching window is derived from the median anchor spacing, and
/// candidate tokens must lie past the anchor row (center_y > anchor + 8).
///
/// All-or-nothing: returns an empty `Vec` if any anchor has no match.
fn map_series_values_to_label_anchors(
    anchors: &[LayoutTextFragment],
    lines: &[BBoxLayoutLine],
) -> Vec<String> {
    if anchors.len() < 2 {
        return Vec::new();
    }
    // Median gap between neighbouring anchors bounds how far a value token
    // may sit from its anchor's column.
    let mut spacing = anchors
        .windows(2)
        .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
        .filter(|gap| *gap > 0.0)
        .collect::<Vec<_>>();
    spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let median_spacing = spacing
        .get(spacing.len().saturating_sub(1) / 2)
        .copied()
        .unwrap_or(48.0);
    let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
    // Candidate values: integer tokens that are not dates ('/') or years.
    let mut tokens = Vec::<LayoutBarToken>::new();
    for line in lines {
        for word in &line.words {
            let raw = word.text.trim();
            if raw.contains('/')
                || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
            {
                continue;
            }
            let Some(value) = parse_integer_token(raw) else {
                continue;
            };
            tokens.push(LayoutBarToken {
                bbox: word.bbox.clone(),
                value,
                text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
            });
        }
    }
    // Greedy nearest-match per anchor; each token is consumed at most once.
    let mut used = vec![false; tokens.len()];
    let mut values = Vec::with_capacity(anchors.len());
    for anchor in anchors {
        let anchor_center_x = anchor.bbox.center_x();
        let anchor_center_y = anchor.bbox.center_y();
        let best = tokens
            .iter()
            .enumerate()
            .filter(|(idx, token)| {
                !used[*idx]
                    && token.bbox.center_y() > anchor_center_y + 8.0
                    && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
            })
            .min_by(|left, right| {
                // Horizontal distance dominates; vertical distance acts as a
                // weak (5%) tie-breaker.
                let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
                    + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
                let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
                    + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
                left_score
                    .partial_cmp(&right_score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        let Some((best_idx, token)) = best else {
            return Vec::new();
        };
        used[best_idx] = true;
        values.push(token.text.clone());
    }
    values
}
#[cfg(not(target_arch = "wasm32"))]
/// Recognizes a wide (>= 900pt) one-page "recommendation" infographic: a long
/// title block near the top, an optional eyebrow line above it, and three
/// equal-width panels (hit-ratio, ranking, accuracy) below the title.
///
/// Any missing component makes the whole detection fail (`None`).
fn detect_layout_recommendation_infographic(
    page_width: f64,
    lines: &[BBoxLayoutLine],
) -> Option<LayoutRecommendationInfographic> {
    if page_width < 900.0 {
        return None;
    }
    let blocks = collect_bbox_layout_blocks(lines);
    // NOTE(review): `fold(0.0, f64::max)` takes the LARGEST top_y as the page
    // top, implying a bottom-origin coordinate system — confirm against
    // BoundingBox.
    let page_top = lines
        .iter()
        .map(|line| line.bbox.top_y)
        .fold(0.0_f64, f64::max);
    // Title: the widest block spanning >= 55% of the page, within 105pt of
    // the top, containing at least eight words.
    let title_block = blocks
        .iter()
        .filter(|block| {
            block.bbox.width() >= page_width * 0.55
                && block.bbox.top_y >= page_top - 105.0
                && bbox_layout_block_text(block).split_whitespace().count() >= 8
        })
        .max_by(|left, right| {
            left.bbox
                .width()
                .partial_cmp(&right.bbox.width())
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
    if title.split_whitespace().count() < 8 {
        return None;
    }
    // Eyebrow: the topmost block strictly above the title, >= 10% page width.
    let eyebrow = blocks
        .iter()
        .filter(|block| {
            block.block_id != title_block.block_id
                && block.bbox.top_y > title_block.bbox.top_y
                && block.bbox.width() >= page_width * 0.1
        })
        .max_by(|left, right| {
            left.bbox
                .top_y
                .partial_cmp(&right.bbox.top_y)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
    // The three panels occupy equal thirds of the page width below the title.
    let title_bottom = title_block.bbox.bottom_y;
    let region_width = page_width / 3.0;
    let left_panel = detect_layout_recommendation_hit_ratio_panel(
        &blocks,
        lines,
        0.0,
        region_width,
        title_bottom,
    )?;
    let middle_panel = detect_layout_recommendation_ranking_panel(
        &blocks,
        lines,
        region_width,
        region_width * 2.0,
        title_bottom,
    )?;
    let right_panel = detect_layout_recommendation_accuracy_panel(
        &blocks,
        lines,
        region_width * 2.0,
        page_width,
        title_bottom,
    )?;
    Some(LayoutRecommendationInfographic {
        eyebrow,
        title,
        panels: vec![left_panel, middle_panel, right_panel],
    })
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: runs the OCR-benchmark-dashboard renderer with a
/// fresh, function-local layout cache.
fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a detected OCR-benchmark dashboard as Markdown: optional eyebrow,
/// title, then the left comparison table ("Company" + detected columns) and
/// the right metric table (fixed header), followed by definition and source
/// note sections separated by horizontal rules.
fn render_layout_ocr_benchmark_dashboard_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let layout = layout_cache.bbox_layout(doc)?;
    let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
    let mut rendered = String::new();
    if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
        rendered.push_str(&format!("## {}\n\n", eyebrow.trim()));
    }
    rendered.push_str(&format!("# {}\n\n", dashboard.title.trim()));
    rendered.push_str(&format!("## {}\n\n", dashboard.left_heading.trim()));
    let mut left_header = vec!["Company".to_string()];
    left_header.extend(dashboard.left_columns.iter().cloned());
    let left_table: Vec<Vec<String>> = std::iter::once(left_header)
        .chain(dashboard.left_rows.iter().cloned())
        .collect();
    rendered.push_str(&render_pipe_rows(&left_table));
    rendered.push_str(&format!("## {}\n\n", dashboard.right_heading.trim()));
    let right_header = vec![
        "Metric".to_string(),
        "Company A".to_string(),
        "Company B".to_string(),
        "upstage".to_string(),
    ];
    let right_table: Vec<Vec<String>> = std::iter::once(right_header)
        .chain(dashboard.right_rows.iter().cloned())
        .collect();
    rendered.push_str(&render_pipe_rows(&right_table));
    // Definition notes first, then source notes; each non-empty group is
    // preceded by its own rule.
    for notes in [&dashboard.definition_notes, &dashboard.source_notes] {
        if notes.is_empty() {
            continue;
        }
        rendered.push_str("---\n\n");
        for note in notes {
            rendered.push_str(&format!("{}\n\n", note.trim()));
        }
    }
    Some(format!("{}\n", rendered.trim_end()))
}
#[cfg(not(target_arch = "wasm32"))]
/// Recognizes a one-page OCR-benchmark dashboard (>= 680pt wide) split into a
/// left "OCR" panel and a right "document" panel: detects the title/eyebrow,
/// the two panel headings, the two left column groups with three values each,
/// the company row labels, four metric rows on the right with three values
/// each, and the definition/source notes.
///
/// Every stage is a hard requirement; failure at any point returns `None`.
fn detect_layout_ocr_benchmark_dashboard(
    page_width: f64,
    lines: &[BBoxLayoutLine],
) -> Option<LayoutOcrDashboard> {
    if page_width < 680.0 {
        return None;
    }
    let page_mid = page_width / 2.0;
    let blocks = collect_bbox_layout_blocks(lines);
    // NOTE(review): `fold(0.0, f64::max)` takes the LARGEST top_y as the page
    // top, implying a bottom-origin coordinate system — confirm against
    // BoundingBox.
    let page_top = lines
        .iter()
        .map(|line| line.bbox.top_y)
        .fold(0.0_f64, f64::max);
    // Title: widest block spanning >= 45% of the page within 40pt of the top.
    let title_block = blocks
        .iter()
        .filter(|block| {
            block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
        })
        .max_by(|left, right| {
            left.bbox
                .width()
                .partial_cmp(&right.bbox.width())
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
    if title.split_whitespace().count() < 5 {
        return None;
    }
    // Eyebrow: topmost block strictly above the title, >= 12% page width.
    let eyebrow = blocks
        .iter()
        .filter(|block| {
            block.block_id != title_block.block_id
                && block.bbox.top_y > title_block.bbox.top_y
                && block.bbox.width() >= page_width * 0.12
        })
        .max_by(|left, right| {
            left.bbox
                .top_y
                .partial_cmp(&right.bbox.top_y)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
    // Panel headings: alphabetic blocks in a band 25–95pt past the title
    // bottom, split left/right of the page midline.
    let left_title_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.right_x <= page_mid
                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
                && !bbox_layout_block_text(block)
                    .chars()
                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
        })
        .cloned()
        .collect::<Vec<_>>();
    let right_title_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.left_x >= page_mid
                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
                && !bbox_layout_block_text(block)
                    .chars()
                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
        })
        .cloned()
        .collect::<Vec<_>>();
    let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
    let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
    // Sanity check: the panels must actually be the OCR / document pair.
    if !left_heading.to_ascii_lowercase().contains("ocr")
        || !right_heading.to_ascii_lowercase().contains("document")
    {
        return None;
    }
    // Left panel column groups: exactly two low-placed blocks containing a
    // parenthesis (e.g. a qualifier in the group label).
    let left_group_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() < page_mid
                && block.bbox.top_y < 90.0
                && bbox_layout_block_text(block).contains('(')
        })
        .cloned()
        .collect::<Vec<_>>();
    if left_group_blocks.len() != 2 {
        return None;
    }
    let mut left_groups = left_group_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    left_groups.sort_by(|left, right| {
        left.0
            .partial_cmp(&right.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Left panel values: decimal tokens in the left chart band (y 110–250).
    let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
    });
    if left_value_tokens.len() < 6 {
        return None;
    }
    // Assign each value to the horizontally closer of the two column groups.
    let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
    for (bbox, value) in left_value_tokens {
        let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
            <= (bbox.center_x() - left_groups[1].0).abs()
        {
            0
        } else {
            1
        };
        left_group_values[group_idx].push((bbox.center_x(), value));
    }
    if left_group_values.iter().any(|values| values.len() < 3) {
        return None;
    }
    // Keep the three leftmost values per group, in x order.
    for values in &mut left_group_values {
        values.sort_by(|left, right| {
            left.0
                .partial_cmp(&right.0)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        values.truncate(3);
    }
    // Row labels: two anonymized company names plus the brand inferred from
    // the left heading.
    let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
    if company_labels.len() < 2 {
        return None;
    }
    company_labels.truncate(2);
    company_labels.push(infer_dashboard_brand_name(&left_heading));
    let mut left_rows = Vec::new();
    for row_idx in 0..3 {
        left_rows.push(vec![
            company_labels[row_idx].clone(),
            left_group_values[0][row_idx].1.clone(),
            left_group_values[1][row_idx].1.clone(),
        ]);
    }
    // Right panel metric labels: blocks whose normalized text starts with
    // "ocr" or "parsingf1", within the right chart band.
    let metric_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() > page_mid
                && block.bbox.top_y > 95.0
                && block.bbox.top_y < 240.0
                && matches!(
                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
                    text if text.starts_with("ocr") || text.starts_with("parsingf1")
                )
        })
        .cloned()
        .collect::<Vec<_>>();
    if metric_blocks.len() < 4 {
        return None;
    }
    let mut metrics = metric_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_y(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    // Descending center_y = reading order (see coordinate note above); keep
    // the first four metrics.
    metrics.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    metrics.truncate(4);
    let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
    });
    if right_value_tokens.len() < 10 {
        return None;
    }
    // Bucket each value token under the vertically nearest metric row.
    let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
    for (bbox, value) in right_value_tokens {
        let Some((metric_idx, _)) = metrics
            .iter()
            .enumerate()
            .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        metric_values[metric_idx].push((bbox.center_x(), value));
    }
    let mut right_rows = Vec::new();
    for (idx, (_, metric_name)) in metrics.iter().enumerate() {
        let mut values = metric_values[idx].clone();
        values.sort_by(|left, right| {
            left.0
                .partial_cmp(&right.0)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        // Collapse adjacent duplicates (the same printed value detected
        // twice); then pad a 2-value row by repeating the last value so
        // every row has exactly three columns.
        values.dedup_by(|left, right| left.1 == right.1);
        if values.len() < 2 {
            return None;
        }
        if values.len() == 2 {
            values.push(values[1].clone());
        }
        values.truncate(3);
        right_rows.push(vec![
            metric_name.clone(),
            normalize_layout_decimal_value(&values[0].1),
            normalize_layout_decimal_value(&values[1].1),
            normalize_layout_decimal_value(&values[2].1),
        ]);
    }
    let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
    let source_notes = collect_dashboard_notes(&blocks, page_mid, true);
    Some(LayoutOcrDashboard {
        eyebrow,
        title,
        left_heading,
        left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
        left_rows,
        right_heading,
        right_rows,
        definition_notes,
        source_notes,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects the left (hit-ratio) panel of the recommendation infographic in
/// the horizontal band `[left_x, right_x]`: heading + subtitle, at least four
/// decimal values on the right side of the panel, matching model labels on
/// the left, and "↑"-style emphasis tokens turned into "… increase" notes.
fn detect_layout_recommendation_hit_ratio_panel(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    left_x: f64,
    right_x: f64,
    title_bottom: f64,
) -> Option<LayoutRecommendationPanel> {
    let (heading_block, subtitle_block) =
        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
    let width = right_x - left_x;
    // Chart content lies below the subtitle (see the bottom-origin coordinate
    // note on detect_layout_recommendation_infographic).
    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
    // Values: decimal tokens in the right ~48% of the panel.
    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > left_x + width * 0.52
            && bbox.center_x() < right_x - 8.0
            && bbox.top_y < chart_cutoff
    });
    values.sort_by(|left, right| {
        right
            .0
            .center_y()
            .partial_cmp(&left.0.center_y())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Drop duplicates detected on the same visual row (within 8pt).
    values.dedup_by(|left, right| {
        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
    });
    if values.len() < 4 {
        return None;
    }
    // Labels: alphabetic blocks left of the 55% line within the chart area.
    let labels = collect_layout_panel_alpha_blocks(
        blocks,
        left_x,
        right_x,
        title_bottom,
        chart_cutoff,
        Some(left_x + width * 0.55),
    );
    let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
    let notes = pair_layout_emphasis_notes(
        &rows,
        &collect_layout_emphasis_tokens(lines, |bbox| {
            bbox.center_x() > left_x + width * 0.48
                && bbox.center_x() < right_x
                && bbox.top_y < chart_cutoff
        }),
        "increase",
    );
    // Column header is derived from the subtitle when possible.
    let metric_label =
        extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
    Some(LayoutRecommendationPanel {
        heading,
        subtitle,
        header: vec!["Model".to_string(), metric_label],
        rows,
        notes,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects the middle (ranking) panel: heading + subtitle, at least eight
/// method labels in the left ~48% of the panel, two metric column headers
/// (defaulting to "Recall@10"/"Accuracy"), and — by design — values filled in
/// for the FIRST row only; remaining rows keep empty value cells.
fn detect_layout_recommendation_ranking_panel(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    left_x: f64,
    right_x: f64,
    title_bottom: f64,
) -> Option<LayoutRecommendationPanel> {
    let (heading_block, subtitle_block) =
        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
    let width = right_x - left_x;
    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
    let row_labels = collect_layout_panel_alpha_blocks(
        blocks,
        left_x,
        right_x,
        title_bottom,
        chart_cutoff,
        Some(left_x + width * 0.48),
    )
    .into_iter()
    .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
    .collect::<Vec<_>>();
    if row_labels.len() < 8 {
        return None;
    }
    // NOTE(review): `headers[0]`/`headers[1]` below panic if the extractor
    // ever returns fewer than two headers — the fallback has two, but the
    // Some path is unchecked; verify extract_layout_ranking_headers.
    let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
        .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > left_x + width * 0.42
            && bbox.center_x() < right_x - 10.0
            && bbox.top_y < chart_cutoff
    });
    values.sort_by(|left, right| {
        left.0
            .left_x
            .partial_cmp(&right.0.left_x)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Pre-build rows with empty value cells; only the top-ranked row receives
    // the two leftmost detected values.
    let mut rows = row_labels
        .into_iter()
        .map(|label| vec![label, String::new(), String::new()])
        .collect::<Vec<_>>();
    if let Some(first) = rows.first_mut() {
        if let Some((_, value)) = values.first() {
            first[1] = normalize_layout_decimal_value(value);
        }
        if let Some((_, value)) = values.get(1) {
            first[2] = normalize_layout_decimal_value(value);
        }
    }
    // Notes: explicit ranking notes plus any "↑" emphasis tokens rendered as
    // "… increase".
    let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
    notes.extend(
        collect_layout_emphasis_tokens(lines, |bbox| {
            bbox.center_x() > left_x + width * 0.55
                && bbox.center_x() < right_x
                && bbox.top_y < chart_cutoff
        })
        .into_iter()
        .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
    );
    Some(LayoutRecommendationPanel {
        heading,
        subtitle,
        header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
        rows,
        notes,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects the right (accuracy) panel: heading + subtitle, at least two
/// decimal values, model labels sitting well past the topmost value (bar
/// labels), and a single combined "description, X increase" note when both a
/// note phrase and an emphasis token are present.
fn detect_layout_recommendation_accuracy_panel(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    left_x: f64,
    right_x: f64,
    title_bottom: f64,
) -> Option<LayoutRecommendationPanel> {
    let (heading_block, subtitle_block) =
        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
    });
    values.sort_by(|left, right| {
        right
            .0
            .center_y()
            .partial_cmp(&left.0.center_y())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Drop duplicates detected on the same visual row (within 8pt).
    values.dedup_by(|left, right| {
        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
    });
    if values.len() < 2 {
        return None;
    }
    let min_value_top_y = values
        .iter()
        .map(|(bbox, _)| bbox.top_y)
        .fold(f64::INFINITY, f64::min);
    // Labels must lie at least 70pt past the lowest value (i.e. under the
    // bars, given the bottom-origin coordinate note on
    // detect_layout_recommendation_infographic).
    let labels = collect_layout_panel_alpha_blocks(
        blocks,
        left_x,
        right_x,
        title_bottom,
        chart_cutoff,
        None,
    )
    .into_iter()
    .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
    .collect::<Vec<_>>();
    let rows = pair_layout_decimal_rows(&labels, &values, 2)?;
    let mut notes = Vec::new();
    if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
        if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
            bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
        })
        .into_iter()
        .next()
        {
            notes.push(format!(
                "{}, {} increase",
                description,
                emphasis.trim_end_matches('↑')
            ));
        }
    }
    Some(LayoutRecommendationPanel {
        heading,
        subtitle,
        header: vec!["Model".to_string(), "Accuracy".to_string()],
        rows,
        notes,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Finds a panel's heading block and the subtitle block adjacent to it.
///
/// Candidates are alphabetic blocks horizontally centered in
/// `[left_x, right_x]` whose `top_y` lies in the ~82pt band
/// `(title_bottom - 90, title_bottom - 8)`. After sorting by descending
/// `top_y`, the first candidate is the heading; the subtitle is the first
/// *other* candidate whose `top_y` falls within
/// `(heading.bottom_y - 40, heading.bottom_y + 8)`. Returns `None` if either
/// is missing.
fn extract_layout_panel_heading_and_subtitle(
    blocks: &[BBoxLayoutBlock],
    left_x: f64,
    right_x: f64,
    title_bottom: f64,
) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
    let mut band_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() >= left_x
                && block.bbox.center_x() <= right_x
                && block.bbox.top_y < title_bottom - 8.0
                && block.bbox.top_y > title_bottom - 90.0
                && bbox_layout_block_text(block)
                    .chars()
                    .any(char::is_alphabetic)
        })
        .cloned()
        .collect::<Vec<_>>();
    band_blocks.sort_by(|left, right| {
        right
            .bbox
            .top_y
            .partial_cmp(&left.bbox.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let heading = band_blocks.first()?.clone();
    let subtitle = band_blocks
        .iter()
        .find(|block| {
            block.block_id != heading.block_id
                && block.bbox.top_y < heading.bbox.bottom_y + 8.0
                && block.bbox.top_y > heading.bbox.bottom_y - 40.0
        })?
        .clone();
    Some((heading, subtitle))
}
#[cfg(not(target_arch = "wasm32"))]
/// Collects blocks usable as panel row labels: alphabetic text with no
/// digits, '%' or ':' markers, not starting with ':' and not the word
/// "comparison".
///
/// The search band is horizontally `[left_x, right_x]` and vertically
/// `(title_bottom - 390, chart_cutoff)`. `max_left_x`, when given, discards
/// blocks whose left edge exceeds the limit. Results are sorted by
/// descending `center_y`.
fn collect_layout_panel_alpha_blocks(
    blocks: &[BBoxLayoutBlock],
    left_x: f64,
    right_x: f64,
    title_bottom: f64,
    chart_cutoff: f64,
    max_left_x: Option<f64>,
) -> Vec<BBoxLayoutBlock> {
    let mut alpha_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() >= left_x
                && block.bbox.center_x() <= right_x
                && block.bbox.top_y < chart_cutoff
                && block.bbox.top_y > title_bottom - 390.0
                && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
        })
        .filter_map(|block| {
            let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
            let token_count = text.split_whitespace().count();
            let has_alpha = text.chars().any(char::is_alphabetic);
            // Any digit/percent/colon marks the block as a value or legend
            // entry rather than a plain label.
            let has_numeric_marker = text
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
            (has_alpha
                && token_count >= 1
                && !has_numeric_marker
                && !text.starts_with(':')
                && !text.eq_ignore_ascii_case("comparison"))
            .then_some(block.clone())
        })
        .collect::<Vec<_>>();
    alpha_blocks.sort_by(|left, right| {
        right
            .bbox
            .center_y()
            .partial_cmp(&left.bbox.center_y())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    alpha_blocks
}
#[cfg(not(target_arch = "wasm32"))]
/// Greedily pairs the first `expected_len` value tokens with label blocks.
///
/// For each value, the closest-in-`center_y` unused label is chosen among
/// labels whose center is not more than 24pt to the right of the value's
/// center. A pairing is discarded when the label's `center_y` exceeds the
/// value's by more than 30. Returns the `[label, value]` rows only when at
/// least `expected_len` pairings succeeded; otherwise `None`.
fn pair_layout_decimal_rows(
    label_blocks: &[BBoxLayoutBlock],
    value_tokens: &[(BoundingBox, String)],
    expected_len: usize,
) -> Option<Vec<Vec<String>>> {
    let mut used = HashSet::new();
    let mut rows = Vec::new();
    for (bbox, value) in value_tokens.iter().take(expected_len) {
        let Some((label_idx, _)) = label_blocks
            .iter()
            .enumerate()
            .filter(|(idx, block)| {
                !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
            })
            .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        // Reject pairings where the label is too far (signed delta > 30).
        if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
            continue;
        }
        used.insert(label_idx);
        rows.push(vec![
            normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
            normalize_layout_decimal_value(value),
        ]);
    }
    (rows.len() >= expected_len).then_some(rows)
}
#[cfg(not(target_arch = "wasm32"))]
/// Collects emphasis callouts such as "3X↑", "1.5X" or "12%↑" whose bounding
/// boxes satisfy `bbox_filter`, sorted by descending `center_y`.
fn collect_layout_emphasis_tokens<F>(
    lines: &[BBoxLayoutLine],
    bbox_filter: F,
) -> Vec<(BoundingBox, String)>
where
    F: Fn(&BoundingBox) -> bool,
{
    // A failed regex build (should not happen for a literal pattern) yields
    // an empty result rather than a panic.
    let Some(emphasis_re) = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok() else {
        return Vec::new();
    };
    let mut tokens: Vec<(BoundingBox, String)> = lines
        .iter()
        .flat_map(|line| line.words.iter())
        .filter_map(|word| {
            let candidate = word.text.trim();
            (bbox_filter(&word.bbox) && emphasis_re.is_match(candidate))
                .then(|| (word.bbox.clone(), candidate.to_string()))
        })
        .collect();
    tokens.sort_by(|first, second| {
        second
            .0
            .center_y()
            .partial_cmp(&first.0.center_y())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    tokens
}
#[cfg(not(target_arch = "wasm32"))]
/// Zips emphasis tokens with table rows (the first two rows are skipped) and
/// renders "<label>: <token> <suffix>" notes, stripping any trailing '↑'.
fn pair_layout_emphasis_notes(
    rows: &[Vec<String>],
    emphasis_tokens: &[(BoundingBox, String)],
    suffix: &str,
) -> Vec<String> {
    emphasis_tokens
        .iter()
        .zip(rows.iter().skip(2))
        .filter_map(|((_, token), row)| {
            let label = row.first()?;
            Some(format!(
                "{}: {} {}",
                label.trim(),
                token.trim_end_matches('↑'),
                suffix
            ))
        })
        .collect()
}
#[cfg(not(target_arch = "wasm32"))]
/// Extracts the two words immediately preceding "comparison" from a caption,
/// e.g. "overall recall rate comparison" -> "recall rate". Returns `None`
/// when "comparison" is absent or fewer than two words precede it.
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    let comparison_idx = tokens
        .iter()
        .position(|token| token.eq_ignore_ascii_case("comparison"))?;
    if comparison_idx < 2 {
        return None;
    }
    let metric = tokens[comparison_idx - 2..comparison_idx].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
#[cfg(not(target_arch = "wasm32"))]
/// Title-cases a metric label word by word.
///
/// Words containing no lowercase ASCII letter (acronyms like "NDCG", pure
/// symbols/digits) are kept verbatim; every other word gets its first
/// character ASCII-uppercased with the remainder untouched.
fn title_case_metric_label(text: &str) -> String {
    text.trim()
        .split_whitespace()
        .map(|word| {
            let keep_verbatim = !word
                .chars()
                .any(|ch| ch.is_ascii_alphabetic() && !ch.is_uppercase());
            if keep_verbatim {
                word.to_string()
            } else {
                let mut rest = word.chars();
                match rest.next() {
                    Some(first) => first.to_ascii_uppercase().to_string() + rest.as_str(),
                    None => String::new(),
                }
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}
#[cfg(not(target_arch = "wasm32"))]
/// Dashboard normalization plus re-joining identifiers that OCR split around
/// underscores ("foo _bar" / "foo_ bar" -> "foo_bar").
fn normalize_layout_panel_text(text: &str) -> String {
    let normalized = normalize_layout_dashboard_text(text);
    let glued_before = normalized.replace(" _", "_");
    glued_before.replace("_ ", "_")
}
#[cfg(not(target_arch = "wasm32"))]
/// Derives the two metric column headers from a legend-style block shaped
/// like "recall@10, accuracy: ..." inside the panel band; each metric name is
/// title-cased. Returns `None` when no legend line yields two metrics.
fn extract_layout_ranking_headers(
    blocks: &[BBoxLayoutBlock],
    left_x: f64,
    right_x: f64,
    chart_cutoff: f64,
) -> Option<Vec<String>> {
    for block in blocks {
        let in_band = block.bbox.center_x() >= left_x
            && block.bbox.center_x() <= right_x
            && block.bbox.top_y < chart_cutoff;
        if !in_band || !bbox_layout_block_text(block).contains(':') {
            continue;
        }
        let line = normalize_layout_panel_text(&bbox_layout_block_text(block));
        // First non-empty colon-separated segment holds the metric list.
        let Some(first_segment) = line
            .split(':')
            .map(str::trim)
            .find(|segment| !segment.is_empty())
        else {
            continue;
        };
        let metrics: Vec<String> = first_segment
            .split(',')
            .map(title_case_metric_label)
            .filter(|part| !part.trim().is_empty())
            .collect();
        if let [first, second, ..] = metrics.as_slice() {
            return Some(vec![first.clone(), second.clone()]);
        }
    }
    None
}
#[cfg(not(target_arch = "wasm32"))]
/// Splits legend-style blocks ("label: value: ...") inside the panel band
/// into individual trimmed note strings, dropping the header-only segment
/// "recall@10, accuracy" (case-insensitive).
fn collect_layout_ranking_notes(
    blocks: &[BBoxLayoutBlock],
    left_x: f64,
    right_x: f64,
    chart_cutoff: f64,
) -> Vec<String> {
    let mut notes = Vec::new();
    for block in blocks {
        let in_band = block.bbox.center_x() >= left_x
            && block.bbox.center_x() <= right_x
            && block.bbox.top_y < chart_cutoff;
        if !in_band || !bbox_layout_block_text(block).contains(':') {
            continue;
        }
        let normalized = normalize_layout_panel_text(&bbox_layout_block_text(block));
        for raw_segment in normalized.split(':') {
            let segment = raw_segment.trim();
            if !segment.is_empty() && !segment.eq_ignore_ascii_case("recall@10, accuracy") {
                notes.push(segment.to_string());
            }
        }
    }
    notes
}
#[cfg(not(target_arch = "wasm32"))]
/// Returns the first block in the panel band (at least three words) whose
/// normalized text mentions "compared"; used as a descriptive panel note.
fn collect_layout_note_phrase(
    blocks: &[BBoxLayoutBlock],
    left_x: f64,
    right_x: f64,
    chart_cutoff: f64,
) -> Option<String> {
    for block in blocks {
        let in_band = block.bbox.center_x() >= left_x
            && block.bbox.center_x() <= right_x
            && block.bbox.top_y < chart_cutoff;
        if !in_band || bbox_layout_block_text(block).split_whitespace().count() < 3 {
            continue;
        }
        let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
        if text.to_ascii_lowercase().contains("compared") {
            return Some(text);
        }
    }
    None
}
#[cfg(not(target_arch = "wasm32"))]
/// Groups layout lines by their `block_id` into `BBoxLayoutBlock`s.
///
/// Each block's lines are ordered in banded reading order (3pt band,
/// tie-broken on block id) and the block bbox is the union of the line
/// bboxes. The resulting blocks are themselves sorted in banded reading
/// order with a looser 6pt band, also tie-broken on block id.
fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
    let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
    for line in lines {
        grouped.entry(line.block_id).or_default().push(line.clone());
    }
    let mut blocks = grouped
        .into_iter()
        .map(|(block_id, mut lines)| {
            lines.sort_by(|left, right| {
                cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
                    .then_with(|| left.block_id.cmp(&right.block_id))
            });
            // `lines` is non-empty: every group received at least the line
            // that created its entry, so `lines[0]` cannot panic.
            let bbox = lines
                .iter()
                .skip(1)
                .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
            BBoxLayoutBlock {
                block_id,
                bbox,
                lines,
            }
        })
        .collect::<Vec<_>>();
    blocks.sort_by(|left, right| {
        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
            .then_with(|| left.block_id.cmp(&right.block_id))
    });
    blocks
}
#[cfg(not(target_arch = "wasm32"))]
/// Flattens a block's lines into a single paragraph string.
fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
    let line_refs: Vec<_> = block.lines.iter().collect();
    join_layout_lines_as_paragraph(&line_refs)
}
#[cfg(not(target_arch = "wasm32"))]
fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
let mut blocks = blocks.to_vec();
blocks.sort_by(|left, right| {
right
.bbox
.top_y
.partial_cmp(&left.bbox.top_y)
.unwrap_or(std::cmp::Ordering::Equal)
});
let text = blocks
.iter()
.map(bbox_layout_block_text)
.filter(|text| !text.trim().is_empty())
.collect::<Vec<_>>()
.join(" ");
let normalized = normalize_layout_dashboard_text(&text);
(!normalized.trim().is_empty()).then_some(normalized)
}
#[cfg(not(target_arch = "wasm32"))]
/// Collects decimal-looking words ("1.23" or trailing-dot "12.") whose
/// bounding boxes satisfy `bbox_filter`, as (bbox, text) pairs in document
/// order. Surrounding commas/semicolons are stripped before matching.
fn collect_layout_decimal_tokens<F>(
    lines: &[BBoxLayoutLine],
    bbox_filter: F,
) -> Vec<(BoundingBox, String)>
where
    F: Fn(&BoundingBox) -> bool,
{
    // A failed regex build degrades to an empty result instead of panicking.
    let Some(decimal_re) = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok() else {
        return Vec::new();
    };
    lines
        .iter()
        .flat_map(|line| line.words.iter())
        .filter_map(|word| {
            let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
            (bbox_filter(&word.bbox) && decimal_re.is_match(candidate))
                .then(|| (word.bbox.clone(), candidate.to_string()))
        })
        .collect()
}
#[cfg(not(target_arch = "wasm32"))]
/// Reconstructs "Company A2"-style labels from the left half of a dashboard.
///
/// Finds literal "Company" blocks (top_y in [65, 110)) and "a2"/"b2" marker
/// blocks (top_y in [60, 105), matched on normalized text). Each company is
/// paired with the marker nearest by Euclidean distance; a pairing is kept
/// when the two sit roughly on the same line (|Δcenter_y| <= 16) or only one
/// marker exists. If fewer than two labels result, falls back to
/// "Company <marker>" for every marker. Output is sorted and deduplicated.
fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
    let company_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() < page_mid
                && (65.0..110.0).contains(&block.bbox.top_y)
                && bbox_layout_block_text(block) == "Company"
        })
        .collect::<Vec<_>>();
    let marker_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() < page_mid
                && (60.0..105.0).contains(&block.bbox.top_y)
                && matches!(
                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
                    "a2" | "b2"
                )
        })
        .map(|block| {
            (
                block.bbox.center_x(),
                block.bbox.center_y(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    let mut labels = Vec::new();
    for company in company_blocks {
        // Nearest marker by straight-line distance between block centers.
        if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
            let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
                + (left.1 - company.bbox.center_y()).powi(2))
            .sqrt();
            let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
                + (right.1 - company.bbox.center_y()).powi(2))
            .sqrt();
            left_distance
                .partial_cmp(&right_distance)
                .unwrap_or(std::cmp::Ordering::Equal)
        }) {
            if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
                labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
            }
        }
    }
    // Fallback when pairing failed: synthesize a label per marker.
    if labels.len() < 2 {
        labels.extend(
            marker_blocks
                .iter()
                .map(|(_, _, marker)| format!("Company {marker}")),
        );
    }
    labels.sort();
    labels.dedup();
    labels
}
#[cfg(not(target_arch = "wasm32"))]
/// Lowercased first word of `text` with non-alphanumeric characters trimmed
/// from both ends; falls back to "model" when no usable token exists.
fn infer_dashboard_brand_name(text: &str) -> String {
    let Some(first_token) = text.split_whitespace().next() else {
        return "model".to_string();
    };
    let stripped = first_token.trim_matches(|ch: char| !ch.is_alphanumeric());
    if stripped.is_empty() {
        "model".to_string()
    } else {
        stripped.to_ascii_lowercase()
    }
}
#[cfg(not(target_arch = "wasm32"))]
/// Gathers note text from the bottom strip (top_y < 50) of one half of the
/// page and merges continuation fragments into the preceding footnote.
///
/// A note whose first character is a superscript digit (¹–⁹) starts a new
/// footnote; any other fragment is appended to the previous note when one
/// exists, otherwise kept as its own entry.
///
/// Fix: the call `append_cell_text(previous, &note)` had been corrupted to
/// `append_cell_text(previous, ¬e)` by an HTML-entity round trip
/// (`&not` -> `¬`), which does not compile; the intended `&note` reference
/// is restored.
fn collect_dashboard_notes(
    blocks: &[BBoxLayoutBlock],
    page_mid: f64,
    left_half: bool,
) -> Vec<String> {
    let notes = blocks
        .iter()
        .filter(|block| {
            // Keep only blocks in the requested half of the page.
            let in_half = if left_half {
                block.bbox.center_x() < page_mid
            } else {
                block.bbox.center_x() > page_mid
            };
            in_half && block.bbox.top_y < 50.0
        })
        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
        .filter(|text| !text.trim().is_empty())
        .collect::<Vec<_>>();
    let mut merged = Vec::new();
    for note in notes {
        if note
            .chars()
            .next()
            .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
        {
            merged.push(note);
        } else if let Some(previous) = merged.last_mut() {
            // Continuation fragment: glue onto the previous footnote.
            append_cell_text(previous, &note);
        } else {
            merged.push(note);
        }
    }
    merged
}
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_dashboard_text(text: &str) -> String {
let normalized = normalize_common_ocr_text(text.trim());
let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
let cleaned_degree = degree_marker_re
.as_ref()
.map(|re| {
re.replace_all(&normalized, |captures: ®ex::Captures<'_>| {
format!("{} ", &captures[1])
})
.to_string()
})
.unwrap_or(normalized);
let collapsed_suffix = split_suffix_re
.as_ref()
.map(|re| {
re.replace_all(&cleaned_degree, |captures: ®ex::Captures<'_>| {
format!("{}{}{}", &captures[1], &captures[2], &captures[3])
})
.to_string()
})
.unwrap_or(cleaned_degree);
let collapsed_spacing = single_letter_marker_re
.as_ref()
.map(|re| {
re.replace_all(&collapsed_suffix, |captures: ®ex::Captures<'_>| {
format!("{}{}", &captures[1], &captures[2])
})
.to_string()
})
.unwrap_or(collapsed_suffix);
let collapsed_terminal_marker = trailing_block_marker_re
.as_ref()
.map(|re| {
re.replace(&collapsed_spacing, |captures: ®ex::Captures<'_>| {
format!("{}{}", &captures[1], &captures[2])
})
.to_string()
})
.unwrap_or(collapsed_spacing);
let with_inline = trailing_marker_re
.as_ref()
.map(|re| {
re.replace_all(
&collapsed_terminal_marker,
|captures: ®ex::Captures<'_>| {
format!("{}{}", &captures[1], superscript_digits(&captures[2]))
},
)
.to_string()
})
.unwrap_or(collapsed_terminal_marker);
leading_marker_re
.as_ref()
.map(|re| {
re.replace(&with_inline, |captures: ®ex::Captures<'_>| {
format!("{} ", superscript_digits(&captures[1]))
})
.to_string()
})
.unwrap_or(with_inline)
}
#[cfg(not(target_arch = "wasm32"))]
/// Drops any trailing periods left by OCR (e.g. "12." -> "12").
fn normalize_layout_decimal_value(value: &str) -> String {
    let mut cleaned = value;
    while let Some(rest) = cleaned.strip_suffix('.') {
        cleaned = rest;
    }
    cleaned.to_string()
}
#[cfg(not(target_arch = "wasm32"))]
/// Maps ASCII digits to their Unicode superscript forms; every other
/// character passes through unchanged.
fn superscript_digits(text: &str) -> String {
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];
    text.chars()
        .map(|ch| match ch.to_digit(10) {
            Some(digit) => SUPERSCRIPTS[digit as usize],
            None => ch,
        })
        .collect()
}
#[cfg(not(target_arch = "wasm32"))]
/// Returns the blocks that look like figure captions — "Figure ...: ..." with
/// at least eight words — sorted by descending `top_y`.
fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
    let mut captions: Vec<BBoxLayoutBlock> = Vec::new();
    for block in blocks {
        let text = bbox_layout_block_text(block);
        let looks_like_caption = text.starts_with("Figure ")
            && text.contains(':')
            && text.split_whitespace().count() >= 8;
        if looks_like_caption {
            captions.push(block.clone());
        }
    }
    captions.sort_by(|first, second| {
        second
            .bbox
            .top_y
            .partial_cmp(&first.bbox.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    captions
}
#[cfg(not(target_arch = "wasm32"))]
/// Collects bare integer words whose bounding boxes satisfy `bbox_filter`,
/// parsed into `LayoutBarToken`s; digit runs that fail to parse as `i64`
/// (e.g. overflow) are silently skipped.
fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
where
    F: Fn(&BoundingBox) -> bool,
{
    // A failed regex build degrades to an empty result instead of panicking.
    let Some(integer_re) = Regex::new(r"^\d+$").ok() else {
        return Vec::new();
    };
    lines
        .iter()
        .flat_map(|line| line.words.iter())
        .filter_map(|word| {
            let candidate = word.text.trim();
            if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
                return None;
            }
            let value = candidate.parse::<i64>().ok()?;
            Some(LayoutBarToken {
                bbox: word.bbox.clone(),
                value,
                text: candidate.to_string(),
            })
        })
        .collect()
}
#[cfg(not(target_arch = "wasm32"))]
/// Reconstructs a three-month stacked-bar figure as a small table.
///
/// Requires exactly 3 month-label blocks and 3 legend blocks in fixed bands
/// below the caption, then gathers integer tokens in the bar area bounded by
/// the month columns, the caption and `next_caption_top_y`. Tokens are
/// bucketed under the nearest month column (within 28pt); each bucket must
/// hold at least 3 values, of which the first three (ordered by `center_y`)
/// become the three legend rows. Returns `None` whenever any structural
/// expectation fails.
fn detect_layout_three_month_stacked_figure(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    page_width: f64,
    caption_block: BBoxLayoutBlock,
    next_caption_top_y: f64,
) -> Option<LayoutStackedBarFigure> {
    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
    let month_blocks = collect_layout_month_blocks(
        blocks,
        caption_block.bbox.bottom_y - 150.0,
        caption_block.bbox.bottom_y - 230.0,
        None,
    );
    if month_blocks.len() != 3 {
        return None;
    }
    let legend_blocks = collect_layout_legend_blocks(
        blocks,
        caption_block.bbox.bottom_y - 175.0,
        caption_block.bbox.bottom_y - 220.0,
    );
    if legend_blocks.len() != 3 {
        return None;
    }
    let month_centers = month_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    // Extreme top_y across the month labels (fold with f64::max).
    let month_top_y = month_blocks
        .iter()
        .map(|block| block.bbox.top_y)
        .fold(0.0_f64, f64::max);
    let first_center = month_centers.first()?.0;
    let last_center = month_centers.last()?.0;
    let tokens = collect_layout_integer_tokens(lines, |bbox| {
        bbox.center_x() >= first_center - 20.0
            && bbox.center_x() <= last_center + 20.0
            && bbox.center_y() > month_top_y + 10.0
            && bbox.top_y < caption_block.bbox.bottom_y - 25.0
            && bbox.bottom_y > next_caption_top_y + 55.0
            && bbox.left_x > page_width * 0.28
    });
    if tokens.len() < 9 {
        return None;
    }
    // Bucket each token under the nearest month column (max 28pt away).
    let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
    for token in tokens {
        let Some((idx, distance)) = month_centers
            .iter()
            .enumerate()
            .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        if distance <= 28.0 {
            grouped[idx].push(token);
        }
    }
    if grouped.iter().any(|bucket| bucket.len() < 3) {
        return None;
    }
    // One row per legend entry; each month contributes its k-th value.
    let mut rows = vec![
        vec![legend_blocks[0].1.clone()],
        vec![legend_blocks[1].1.clone()],
        vec![legend_blocks[2].1.clone()],
    ];
    for bucket in &mut grouped {
        bucket.sort_by(|left, right| {
            left.bbox
                .center_y()
                .partial_cmp(&right.bbox.center_y())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        bucket.truncate(3);
        rows[0].push(bucket[0].value.to_string());
        rows[1].push(bucket[1].value.to_string());
        rows[2].push(bucket[2].value.to_string());
    }
    Some(LayoutStackedBarFigure {
        caption,
        months: month_centers.into_iter().map(|(_, text)| text).collect(),
        row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
        rows,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Reconstructs a per-sector stacked-bar figure (3 sectors × 3 months).
///
/// Requires exactly 9 month-label blocks (one axis label per sector/month
/// pair) and exactly 3 sector-name blocks in fixed bands under the caption,
/// plus at least 18 integer tokens inside the bar area. Tokens are bucketed
/// under the nearest of the 9 month centers (within 18pt); every bucket must
/// be non-empty, and each row takes the first value (by `center_y`) of its
/// three buckets. The rendered month headers are hard-coded to
/// July 2020 / October 2020 / January 2021.
fn detect_layout_sector_bar_figure(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    page_width: f64,
    caption_block: BBoxLayoutBlock,
    narrative_top_y: f64,
) -> Option<LayoutStackedBarSectorFigure> {
    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
    let month_blocks = collect_layout_month_blocks(
        blocks,
        caption_block.bbox.bottom_y - 160.0,
        caption_block.bbox.bottom_y - 235.0,
        Some(page_width * 0.22),
    );
    if month_blocks.len() != 9 {
        return None;
    }
    // Sector names: short (<= 2 words, >= 7 chars) blocks that are neither
    // month labels nor legend entries.
    let sector_blocks = blocks
        .iter()
        .filter(|block| {
            let text = bbox_layout_block_text(block);
            block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
                && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
                && text.split_whitespace().count() <= 2
                && text.len() >= 7
                && !looks_like_layout_month_label(&text)
                && !text.starts_with("Will ")
                && text != "Don’t know"
        })
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    if sector_blocks.len() != 3 {
        return None;
    }
    let month_centers = month_blocks
        .iter()
        .map(|block| block.bbox.center_x())
        .collect::<Vec<_>>();
    // Extreme top_y across the month labels (fold with f64::max).
    let month_top_y = month_blocks
        .iter()
        .map(|block| block.bbox.top_y)
        .fold(0.0_f64, f64::max);
    let first_center = *month_centers.first()?;
    let last_center = *month_centers.last()?;
    let tokens = collect_layout_integer_tokens(lines, |bbox| {
        bbox.center_x() >= first_center - 12.0
            && bbox.center_x() <= last_center + 12.0
            && bbox.center_y() > month_top_y + 10.0
            && bbox.top_y < caption_block.bbox.bottom_y - 20.0
            && bbox.bottom_y > narrative_top_y + 55.0
            && bbox.left_x > page_width * 0.24
    });
    if tokens.len() < 18 {
        return None;
    }
    // Bucket each token under the nearest month center (max 18pt away).
    let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
    for token in tokens {
        let Some((idx, distance)) = month_centers
            .iter()
            .enumerate()
            .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        if distance <= 18.0 {
            grouped[idx].push(token);
        }
    }
    if grouped.iter().any(|bucket| bucket.is_empty()) {
        return None;
    }
    let months = vec![
        "July 2020".to_string(),
        "October 2020".to_string(),
        "January 2021".to_string(),
    ];
    // Buckets are laid out sector-major: sector_idx * 3 + month_idx.
    let mut rows = Vec::new();
    for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
        let mut row = vec![sector_name.clone()];
        for month_idx in 0..3 {
            let bucket = &mut grouped[sector_idx * 3 + month_idx];
            bucket.sort_by(|left, right| {
                left.bbox
                    .center_y()
                    .partial_cmp(&right.bbox.center_y())
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
            row.push(bucket.first()?.value.to_string());
        }
        rows.push(row);
    }
    Some(LayoutStackedBarSectorFigure {
        caption,
        months,
        sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
        rows,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Extracts the two-column narrative that accompanies the stacked-bar page.
///
/// Anchors on a "6. ... Expectations ... Employees" heading, splits the
/// remaining blocks above the page footer into a left column (right edge
/// < 330) and a right column (left edge > 320), orders them column-major and
/// top-down, then regroups their lines into paragraphs at sentence
/// boundaries followed by a capitalized word. Only paragraphs with at least
/// 12 words are kept (minimum two), plus an optional "5."-prefixed footnote
/// near the page edge. All x/y thresholds are page-specific magic numbers.
fn detect_layout_stacked_bar_narrative(
    blocks: &[BBoxLayoutBlock],
) -> Option<LayoutStackedBarNarrative> {
    let heading_block = blocks.iter().find(|block| {
        let text = bbox_layout_block_text(block);
        text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
    })?;
    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
    let left_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
                && block.bbox.bottom_y > 80.0
                && block.bbox.right_x < 330.0
                && block.bbox.left_x > 80.0
                && block.block_id != heading_block.block_id
                && !bbox_layout_block_text(block).starts_with("5.")
        })
        .collect::<Vec<_>>();
    let right_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
                && block.bbox.bottom_y > 80.0
                && block.bbox.left_x > 320.0
                && block.block_id != heading_block.block_id
                && !bbox_layout_block_text(block).starts_with("5.")
        })
        .collect::<Vec<_>>();
    if left_blocks.is_empty() || right_blocks.is_empty() {
        return None;
    }
    // Column-major ordering: all left-column blocks first (left_column ==
    // false sorts before true), each column by descending top_y.
    let mut ordered_blocks = left_blocks;
    ordered_blocks.extend(right_blocks);
    ordered_blocks.sort_by(|left, right| {
        let left_column = left.bbox.left_x > 320.0;
        let right_column = right.bbox.left_x > 320.0;
        if left_column != right_column {
            return left_column.cmp(&right_column);
        }
        right
            .bbox
            .top_y
            .partial_cmp(&left.bbox.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let ordered_lines = ordered_blocks
        .iter()
        .flat_map(|block| block.lines.iter())
        .collect::<Vec<_>>();
    // Regroup lines into paragraphs: a new paragraph starts when the
    // previous line ended a sentence and this line opens with an
    // uppercase-initial word.
    let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
    let mut current: Vec<&BBoxLayoutLine> = Vec::new();
    let mut previous_text = String::new();
    for line in ordered_lines {
        let line_text = bbox_layout_line_text(line);
        let trimmed = line_text.trim();
        if trimmed.is_empty() {
            continue;
        }
        let starts_new_paragraph = !current.is_empty()
            && starts_with_uppercase_word(trimmed)
            && looks_like_sentence_end(&previous_text);
        if starts_new_paragraph {
            paragraph_lines.push(std::mem::take(&mut current));
        }
        current.push(line);
        previous_text = trimmed.to_string();
    }
    if !current.is_empty() {
        paragraph_lines.push(current);
    }
    let paragraphs = paragraph_lines
        .iter()
        .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
        .filter(|text| text.split_whitespace().count() >= 12)
        .collect::<Vec<_>>();
    if paragraphs.len() < 2 {
        return None;
    }
    let footnote = blocks
        .iter()
        .filter(|block| {
            let text = bbox_layout_block_text(block);
            block.bbox.bottom_y < 120.0 && text.starts_with("5.")
        })
        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
        .next();
    Some(LayoutStackedBarNarrative {
        heading,
        paragraphs,
        footnote,
        top_y: heading_block.bbox.top_y,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Month-label blocks whose `top_y` falls within `[top_max, top_min]`,
/// optionally restricted to blocks whose left edge is at or beyond
/// `min_left_x`; sorted left-to-right by center x.
fn collect_layout_month_blocks(
    blocks: &[BBoxLayoutBlock],
    top_min: f64,
    top_max: f64,
    min_left_x: Option<f64>,
) -> Vec<BBoxLayoutBlock> {
    let mut month_blocks: Vec<BBoxLayoutBlock> = Vec::new();
    for block in blocks {
        let left_ok = min_left_x.is_none_or(|limit| block.bbox.left_x >= limit);
        let in_band = block.bbox.top_y <= top_min && block.bbox.top_y >= top_max;
        if left_ok && in_band && looks_like_layout_month_label(&bbox_layout_block_text(block)) {
            month_blocks.push(block.clone());
        }
    }
    month_blocks.sort_by(|first, second| {
        first
            .bbox
            .center_x()
            .partial_cmp(&second.bbox.center_x())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    month_blocks
}
#[cfg(not(target_arch = "wasm32"))]
/// Legend entries ("Will ..." or "Don’t know") whose `top_y` lies within
/// `[top_max, top_min]`, returned as (center_x, normalized text) pairs
/// sorted left-to-right.
fn collect_layout_legend_blocks(
    blocks: &[BBoxLayoutBlock],
    top_min: f64,
    top_max: f64,
) -> Vec<(f64, String)> {
    let mut legend_blocks: Vec<(f64, String)> = blocks
        .iter()
        .filter_map(|block| {
            let text = bbox_layout_block_text(block);
            let in_band = block.bbox.top_y <= top_min && block.bbox.top_y >= top_max;
            let is_legend_entry = text.starts_with("Will ") || text == "Don’t know";
            (in_band && is_legend_entry).then(|| {
                (
                    block.bbox.center_x(),
                    normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
                )
            })
        })
        .collect();
    legend_blocks.sort_by(|first, second| {
        first
            .0
            .partial_cmp(&second.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    legend_blocks
}
/// Recognizes the month/year axis labels used by the stacked-bar figures,
/// matching on the heading-normalized form of `text`.
fn looks_like_layout_month_label(text: &str) -> bool {
    const MONTH_KEYS: [&str; 6] = [
        "july2020", "october2020", "january2021", "jul2020", "oct2020", "jan2021",
    ];
    MONTH_KEYS.contains(&normalize_heading_text(text).as_str())
}
/// Heuristic: does `text` end a sentence? Trailing whitespace and digits
/// (e.g. footnote markers) are ignored before checking the final character
/// for terminal punctuation.
fn looks_like_sentence_end(text: &str) -> bool {
    let without_trailing_ws = text.trim_end();
    if without_trailing_ws.is_empty() {
        return false;
    }
    without_trailing_ws
        .trim_end_matches(|ch: char| ch.is_ascii_digit() || ch.is_whitespace())
        .ends_with(['.', '!', '?'])
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper around `render_layout_open_plate_document_cached`
/// that builds a throwaway layout cache for a single call.
fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
    let mut layout_cache = LayoutSourceCache::default();
    render_layout_open_plate_document_cached(doc, &mut layout_cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a single-page "open plate" document (heading + small table +
/// caption followed by body text) as Markdown, or `None` when the page does
/// not match either plate detector.
///
/// The plate is rendered first (H1, pipe table, italic caption); the rest of
/// the document is then re-rendered through `render_markdown_core` with the
/// plate region, footer banners, margin page numbers and caption echoes
/// filtered out, and any bridge paragraph / deferred captions appended.
fn render_layout_open_plate_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let layout = layout_cache.bbox_layout(doc)?;
    let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
        .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
    let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
    let mut output = String::new();
    output.push_str("# ");
    output.push_str(plate.heading.trim());
    output.push_str("\n\n");
    let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
    rendered_rows.push(plate.header_row.clone());
    rendered_rows.extend(plate.rows.clone());
    output.push_str(&render_pipe_rows(&rendered_rows));
    if !plate.caption.trim().is_empty() {
        output.push('*');
        output.push_str(plate.caption.trim());
        output.push_str("*\n\n");
    }
    // Re-render the document body with the plate's content removed so the
    // table is not emitted twice.
    let mut filtered = doc.clone();
    filtered.title = None;
    filtered.kids.retain(|element| {
        if element.page_number() != Some(1) {
            return true;
        }
        // Drop everything in the plate region (top_y at/above the cutoff).
        if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
            return false;
        }
        let text = extract_element_text(element);
        let trimmed = text.trim();
        if trimmed.is_empty() {
            return true;
        }
        if looks_like_footer_banner(trimmed)
            || looks_like_margin_page_number(doc, element, trimmed)
            || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
        {
            return false;
        }
        // Content before the bridge's body start belongs to the plate area.
        if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
            if element.bbox().top_y > body_start_top_y + 6.0 {
                return false;
            }
        }
        if starts_with_caption_prefix(trimmed) {
            return false;
        }
        true
    });
    let body = render_markdown_core(&filtered);
    let trimmed_body = body.trim();
    let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
    let has_bridge = bridge
        .as_ref()
        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
        .is_some_and(|paragraph| !paragraph.trim().is_empty());
    let has_deferred_captions = bridge
        .as_ref()
        .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
    // Separate the plate from any trailing content with a horizontal rule.
    if has_body || has_bridge || has_deferred_captions {
        output.push_str("---\n\n");
    }
    if let Some(bridge_paragraph) = bridge
        .as_ref()
        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
    {
        output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
        output.push_str("\n\n");
    }
    if has_body {
        output.push_str(trimmed_body);
        output.push('\n');
        if has_deferred_captions {
            output.push('\n');
        }
    }
    if let Some(bridge) = &bridge {
        for caption in &bridge.deferred_captions {
            output.push('*');
            output.push_str(caption.trim());
            output.push_str("*\n\n");
        }
    }
    Some(output.trim_end().to_string() + "\n")
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects a two-column "plate" (a narrow heading with label/value pairs and
/// a "Table ..." caption) from raw layout lines.
///
/// Strategy: pick a 3–8 word heading near the page top, a "Table "-prefixed
/// caption in a band under it, and the short line fragments between them.
/// Fragments are split into two columns at the widest gap in their x-centers
/// (which must exceed 4% of the page width), then banded into rows by
/// `center_y` with a tolerance of 1.4× the average fragment height. Only
/// 3–8 fully populated rows qualify. Returns `None` at any structural
/// mismatch.
fn detect_layout_block_pair_plate(
    page_width: f64,
    lines: &[BBoxLayoutLine],
) -> Option<OpenPlateCandidate> {
    let blocks = collect_bbox_layout_blocks(lines);
    // Extreme top_y on the page (fold with f64::max).
    let page_top = blocks
        .iter()
        .map(|block| block.bbox.top_y)
        .fold(0.0_f64, f64::max);
    let heading_block = blocks.iter().find(|block| {
        let text = bbox_layout_block_text(block);
        let word_count = text.split_whitespace().count();
        (3..=8).contains(&word_count)
            && block.bbox.width() <= page_width * 0.45
            && block.bbox.top_y >= page_top - 36.0
            && !text.ends_with(['.', ':'])
    })?;
    let heading = bbox_layout_block_text(heading_block);
    if heading.trim().is_empty() {
        return None;
    }
    let caption_block = blocks.iter().find(|block| {
        let text = bbox_layout_block_text(block);
        text.starts_with("Table ")
            && block.bbox.width() >= page_width * 0.35
            && block.bbox.top_y < heading_block.bbox.top_y - 24.0
            && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
    })?;
    // Narrow blocks strictly between heading and caption hold the pairs.
    let candidate_blocks = blocks
        .iter()
        .filter(|block| {
            block.block_id != heading_block.block_id
                && block.block_id != caption_block.block_id
                && block.bbox.top_y < heading_block.bbox.top_y - 4.0
                && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
                && block.bbox.width() <= page_width * 0.45
        })
        .collect::<Vec<_>>();
    if candidate_blocks.len() < 6 {
        return None;
    }
    let mut fragments = Vec::new();
    for block in candidate_blocks {
        for line in &block.lines {
            let text = bbox_layout_line_text(line);
            let word_count = text.split_whitespace().count();
            if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
                continue;
            }
            fragments.extend(split_bbox_layout_line_fragments(line));
        }
    }
    if fragments.len() < 6 {
        return None;
    }
    // Find the widest horizontal gap between fragment centers; it separates
    // the label column from the value column.
    let mut centers = fragments
        .iter()
        .map(|fragment| fragment.bbox.center_x())
        .collect::<Vec<_>>();
    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let (split_idx, max_gap) = centers
        .windows(2)
        .enumerate()
        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
    if max_gap < page_width * 0.04 {
        return None;
    }
    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
    let avg_height = fragments
        .iter()
        .map(|fragment| fragment.bbox.height())
        .sum::<f64>()
        / fragments.len() as f64;
    let row_tolerance = avg_height.max(8.0) * 1.4;
    let mut sorted_fragments = fragments;
    sorted_fragments.sort_by(|left, right| {
        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
    });
    // Band fragments into rows; each band tracks a running mean center_y.
    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
    for fragment in sorted_fragments {
        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
        if let Some((center_y, cells)) = row_bands
            .iter_mut()
            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
        {
            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
            append_cell_text(&mut cells[slot_idx], &fragment.text);
        } else {
            let mut cells = vec![String::new(), String::new()];
            append_cell_text(&mut cells[slot_idx], &fragment.text);
            row_bands.push((fragment.bbox.center_y(), cells));
        }
    }
    row_bands.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Keep only rows where both columns were filled.
    let rows = row_bands
        .into_iter()
        .map(|(_, cells)| cells)
        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
        .collect::<Vec<_>>();
    if !(3..=8).contains(&rows.len()) {
        return None;
    }
    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
    if caption.trim().is_empty() {
        return None;
    }
    Some(OpenPlateCandidate {
        heading: heading.trim().to_string(),
        header_row: vec![
            heading.trim().to_string(),
            infer_open_plate_secondary_header(&rows),
        ],
        rows,
        caption,
        cutoff_top_y: caption_block.bbox.bottom_y,
    })
}
/// One-shot variant of [`render_layout_toc_document_cached`] that builds a
/// throwaway layout cache for a single call.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_toc_document_cached(doc, &mut cache)
}
/// Renders a single-page table-of-contents document as Markdown.
///
/// The TOC title becomes an `# ` heading and each entry becomes an `## `
/// heading followed by its page label. Returns `None` unless the page is
/// the only page and at least five TOC entries were extracted.
#[cfg(not(target_arch = "wasm32"))]
fn render_layout_toc_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    // This special-case renderer only applies to one-page documents.
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let (title, entries) = extract_layout_toc_entries(lines)?;
    if entries.len() < 5 {
        return None;
    }
    let mut rendered = format!("# {}\n\n", title.trim());
    for entry in entries {
        rendered.push_str("## ");
        rendered.push_str(entry.title.trim());
        rendered.push(' ');
        rendered.push_str(entry.page.trim());
        rendered.push_str("\n\n");
    }
    Some(rendered)
}
#[cfg(not(target_arch = "wasm32"))]
/// Scans layout lines for a table-of-contents: a "Contents" title followed
/// by entry lines that end in a page marker.
///
/// Returns `(title, entries)` or `None` when no TOC title line is found or
/// no entries were collected. Scanning stops after two consecutive lines
/// that neither parse as entries nor continue the previous entry's title.
fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
    // Locate the TOC title ("Contents" / "Table of Contents" after
    // normalization); everything before it is ignored.
    let title_idx = lines.iter().position(|line| {
        matches!(
            normalize_heading_text(line.trim()).as_str(),
            "contents" | "tableofcontents"
        )
    })?;
    let title = lines[title_idx].trim().to_string();
    let mut entries: Vec<LayoutTocEntry> = Vec::new();
    // Column where page numbers start on the first accepted entry; later
    // entries must keep their page column close to this anchor.
    let mut page_start: Option<usize> = None;
    // Consecutive non-matching lines seen; two in a row ends the TOC.
    let mut miss_count = 0usize;
    for line in lines.iter().skip(title_idx + 1) {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        // Bare page numbers (e.g. the page's own folio) are skipped.
        if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
            continue;
        }
        let spans = split_layout_line_spans(line);
        if let Some((title_start, title_text, page_text, page_col)) =
            parse_layout_toc_entry_spans(&spans)
        {
            // A parsed entry with the same page and a near-identical title
            // column is treated as a wrapped continuation of the previous
            // (already long) entry title.
            if let Some(prev) = entries.last_mut() {
                if prev.page == page_text
                    && title_start <= prev.title_start + 2
                    && prev.title.split_whitespace().count() >= 5
                {
                    append_cell_text(&mut prev.title, &title_text);
                    miss_count = 0;
                    continue;
                }
            }
            // Reject entries whose page column drifts from the anchor by
            // more than 4 columns; the first entry sets the anchor.
            if let Some(anchor) = page_start {
                if page_col.abs_diff(anchor) > 4 {
                    miss_count += 1;
                    if miss_count >= 2 {
                        break;
                    }
                    continue;
                }
            } else {
                page_start = Some(page_col);
            }
            entries.push(LayoutTocEntry {
                title: title_text,
                page: page_text,
                title_start,
            });
            miss_count = 0;
            continue;
        }
        // Not an entry: a single short span aligned with the previous
        // entry's title column is still a title continuation.
        if let Some(prev) = entries.last_mut() {
            if spans.len() == 1 {
                let (start, text) = &spans[0];
                if *start <= prev.title_start + 2
                    && text.split_whitespace().count() <= 6
                    && !ends_with_page_marker(text)
                {
                    append_cell_text(&mut prev.title, text);
                    miss_count = 0;
                    continue;
                }
            }
        }
        miss_count += 1;
        if miss_count >= 2 && !entries.is_empty() {
            break;
        }
    }
    (!entries.is_empty()).then_some((title, entries))
}
/// Parses one layout line's spans as a TOC entry.
///
/// The last span must end with a page marker; all preceding spans form the
/// entry title. Returns `(title_start_col, title, page, page_start_col)`,
/// or `None` when the line has fewer than two spans, the last span is not a
/// page marker, or the title is too short to be plausible.
#[cfg(not(target_arch = "wasm32"))]
fn parse_layout_toc_entry_spans(
    spans: &[(usize, String)],
) -> Option<(usize, String, String, usize)> {
    let ((page_start, page_raw), title_spans) = spans.split_last()?;
    // Need at least one title span besides the page span.
    if title_spans.is_empty() {
        return None;
    }
    if !ends_with_page_marker(page_raw.trim()) {
        return None;
    }
    let title_start = title_spans[0].0;
    let mut title_pieces: Vec<&str> = Vec::with_capacity(title_spans.len());
    for (_, text) in title_spans {
        let piece = text.trim();
        if !piece.is_empty() {
            title_pieces.push(piece);
        }
    }
    let title_text = title_pieces.join(" ");
    // The page label is the final whitespace-delimited token of the span.
    let page_text = page_raw
        .split_whitespace()
        .last()
        .unwrap_or(page_raw)
        .to_string();
    // Guard against empty or implausibly short titles.
    if title_text.split_whitespace().next().is_none() || title_text.len() < 4 {
        return None;
    }
    Some((title_start, title_text, page_text, *page_start))
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects an "open plate" figure on the page: a short heading above a
/// two-column list of short label lines, followed by a wide caption.
///
/// On success returns the heading, a synthesized two-column header row,
/// the reconstructed plate rows, the caption text, and `cutoff_top_y` —
/// the y coordinate of the caption's bottom edge, below which following
/// content belongs to the narrative rather than the plate.
fn detect_layout_open_plate(
    page_width: f64,
    lines: &[BBoxLayoutLine],
) -> Option<OpenPlateCandidate> {
    // Heading: the first 3-8 word line that is not full width and does not
    // end like a sentence or label.
    let heading_idx = lines.iter().position(|line| {
        let text = bbox_layout_line_text(line);
        let word_count = text.split_whitespace().count();
        (3..=8).contains(&word_count)
            && line.bbox.width() <= page_width * 0.55
            && !text.ends_with(['.', ':'])
    })?;
    let heading = bbox_layout_line_text(&lines[heading_idx]);
    if heading.trim().is_empty() {
        return None;
    }
    // Real body prose above the heading disqualifies the page.
    if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
        return None;
    }
    // Caption: the first wide, wordy (>= 6 words) line after the heading.
    let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
        let line = &lines[*idx];
        let text = bbox_layout_line_text(line);
        text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
    })?;
    // Plate body: short label-like lines between the heading and caption.
    let candidate_lines = lines[heading_idx + 1..caption_idx]
        .iter()
        .filter(|line| {
            let text = bbox_layout_line_text(line);
            let word_count = text.split_whitespace().count();
            (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
        })
        .collect::<Vec<_>>();
    if candidate_lines.len() < 4 {
        return None;
    }
    let mut fragments = Vec::new();
    for line in candidate_lines {
        fragments.extend(split_bbox_layout_line_fragments(line));
    }
    if fragments.len() < 6 {
        return None;
    }
    // Find the largest horizontal gap between fragment centers; a gap of
    // at least 4% of the page width marks the two-column boundary.
    let mut centers = fragments
        .iter()
        .map(|fragment| fragment.bbox.center_x())
        .collect::<Vec<_>>();
    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let (split_idx, max_gap) = centers
        .windows(2)
        .enumerate()
        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
    if max_gap < page_width * 0.04 {
        return None;
    }
    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
    let avg_height = fragments
        .iter()
        .map(|fragment| fragment.bbox.height())
        .sum::<f64>()
        / fragments.len() as f64;
    let row_tolerance = avg_height.max(8.0) * 1.4;
    // FIX: take `fragments` by move instead of `.clone()` — the original
    // vector is never used after this point (clippy::redundant_clone), and
    // the sibling block-based detector already moves it here.
    let mut sorted_fragments = fragments;
    sorted_fragments.sort_by(|left, right| {
        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
    });
    // Band fragments into rows by vertical proximity; each band keeps a
    // running-average center y plus two cell slots (left/right of split_x).
    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
    for fragment in sorted_fragments {
        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
        if let Some((center_y, cells)) = row_bands
            .iter_mut()
            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
        {
            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
            append_cell_text(&mut cells[slot_idx], &fragment.text);
        } else {
            let mut cells = vec![String::new(), String::new()];
            append_cell_text(&mut cells[slot_idx], &fragment.text);
            row_bands.push((fragment.bbox.center_y(), cells));
        }
    }
    // Sort top-of-page first: y was flipped to a bottom-left origin when
    // the boxes were built, so larger y means higher on the page.
    row_bands.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Keep only fully-populated rows; a plausible plate has 3-8 of them.
    let rows = row_bands
        .into_iter()
        .map(|(_, cells)| cells)
        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
        .collect::<Vec<_>>();
    if !(3..=8).contains(&rows.len()) {
        return None;
    }
    let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
    let caption = caption_lines
        .iter()
        .map(|line| bbox_layout_line_text(line))
        .collect::<Vec<_>>()
        .join(" ");
    if caption.trim().is_empty() {
        return None;
    }
    if !starts_with_caption_prefix(caption.trim()) {
        return None;
    }
    let secondary_header = infer_open_plate_secondary_header(&rows);
    let cutoff_top_y = caption_lines
        .last()
        .map(|line| line.bbox.bottom_y)
        .unwrap_or(lines[caption_idx].bbox.bottom_y);
    Some(OpenPlateCandidate {
        heading: heading.trim().to_string(),
        header_row: vec![heading.trim().to_string(), secondary_header],
        rows,
        caption: caption.trim().to_string(),
        cutoff_top_y,
    })
}
/// Gathers consecutive caption lines starting at `lines[0]`.
///
/// Collection stops at the first line that is too short/narrow to be
/// caption text, or when the vertical gap from the previous caption line
/// is too large to be a continuation.
#[cfg(not(target_arch = "wasm32"))]
fn collect_open_plate_caption_lines<'a>(
    page_width: f64,
    lines: &'a [BBoxLayoutLine],
) -> Vec<&'a BBoxLayoutLine> {
    let mut collected: Vec<&'a BBoxLayoutLine> = Vec::new();
    for candidate in lines {
        let text = bbox_layout_line_text(candidate);
        // Caption lines carry at least 4 words and span >= 35% of the page.
        if text.split_whitespace().count() < 4 || candidate.bbox.width() < page_width * 0.35 {
            break;
        }
        if let Some(prev) = collected.last() {
            // A gap much larger than the line height ends the caption.
            let gap = prev.bbox.bottom_y - candidate.bbox.top_y;
            if gap > candidate.bbox.height().max(10.0) * 1.8 {
                break;
            }
        }
        collected.push(candidate);
    }
    collected
}
/// Guesses a header label for the right-hand plate column.
///
/// Returns `"Scientific name"` when there are at least three right-hand
/// cells and every one of them looks like a binomial species name;
/// otherwise an empty string (no confident label).
#[cfg(not(target_arch = "wasm32"))]
fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
    let mut right_cells: Vec<&str> = Vec::with_capacity(rows.len());
    for row in rows {
        if let Some(cell) = row.get(1) {
            right_cells.push(cell.trim());
        }
    }
    let all_scientific = right_cells.len() >= 3
        && right_cells
            .iter()
            .all(|cell| looks_like_scientific_name(cell));
    if all_scientific {
        "Scientific name".to_string()
    } else {
        String::new()
    }
}
/// Returns `true` when any line above `line_idx` reads like real body
/// prose: at least six words and at least 32% of the page wide, and not
/// recognizable chart furniture (captions, numeric axis blobs, short
/// year/month labels, digit-only runs).
#[cfg(not(target_arch = "wasm32"))]
fn has_substantive_layout_prose_before(
    lines: &[BBoxLayoutLine],
    line_idx: usize,
    page_width: f64,
) -> bool {
    for line in lines.iter().take(line_idx) {
        let text = bbox_layout_line_text(line);
        let trimmed = text.trim();
        if trimmed.is_empty() {
            continue;
        }
        let word_count = trimmed.split_whitespace().count();
        if word_count < 6 {
            continue;
        }
        // Short labels typical of chart annotations are not prose.
        let chart_label = word_count <= 10
            && (looks_like_yearish_label(trimmed)
                || looks_like_layout_month_label(trimmed)
                || trimmed == "Lockdown Period");
        let digits_only = trimmed
            .chars()
            .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace());
        if starts_with_caption_prefix(trimmed)
            || looks_like_numeric_axis_blob(trimmed)
            || chart_label
            || digits_only
        {
            continue;
        }
        if line.bbox.width() >= page_width * 0.32 {
            return true;
        }
    }
    false
}
#[cfg(not(target_arch = "wasm32"))]
/// Extracts the narrative content that follows an open-plate figure:
/// an optional "bridge" paragraph (narrow lines between the plate and the
/// full-width body), any deferred figure/table captions, and the top y of
/// the first full-width body line.
///
/// Returns `None` when no post-plate lines exist or when neither a bridge
/// paragraph, a deferred caption, nor a body start was found.
fn extract_layout_narrative_bridge(
    page_width: f64,
    lines: &[BBoxLayoutLine],
    plate: &OpenPlateCandidate,
) -> Option<LayoutNarrativeBridge> {
    // Lines below the plate cutoff (smaller top_y = lower on the page in
    // this flipped coordinate space), excluding the page-footer strip.
    let post_plate_lines = lines
        .iter()
        .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
        .collect::<Vec<_>>();
    if post_plate_lines.is_empty() {
        return None;
    }
    let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
    // First full-width line marks where the main body text resumes.
    let body_start_top_y = post_plate_lines
        .iter()
        .find(|line| is_full_width_layout_line(page_width, line))
        .map(|line| line.bbox.top_y);
    let mut bridge_lines = Vec::new();
    for line in &post_plate_lines {
        // Stop once we reach (or pass) the body start.
        if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
            break;
        }
        // Bridge text lives in the left column only (right edge < 46% of
        // the page width); wider lines are skipped.
        if line.bbox.right_x > page_width * 0.46 {
            continue;
        }
        let text = bbox_layout_line_text(line);
        if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
            continue;
        }
        bridge_lines.push(*line);
    }
    // Require at least four lines before treating them as a paragraph.
    let bridge_paragraph = if bridge_lines.len() >= 4 {
        let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
        (!paragraph.trim().is_empty()).then_some(paragraph)
    } else {
        None
    };
    if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
        return None;
    }
    Some(LayoutNarrativeBridge {
        bridge_paragraph,
        deferred_captions,
        body_start_top_y,
    })
}
#[cfg(not(target_arch = "wasm32"))]
/// Collects caption paragraphs ("Figure …"/"Table …"-style blocks) from the
/// post-plate lines, joining each caption's wrapped lines into one string.
///
/// Each source block is consumed at most once (tracked by `block_id`).
fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
    let mut captions = Vec::new();
    // Block ids already emitted as captions; prevents duplicates when a
    // block contains several caption-prefixed lines.
    let mut consumed_block_ids = Vec::new();
    let mut idx = 0usize;
    while idx < lines.len() {
        let line = lines[idx];
        let line_text = bbox_layout_line_text(line);
        // A caption anchor starts with a caption prefix, is narrower than
        // 80% of the page, and belongs to an unconsumed block.
        if !starts_with_caption_prefix(line_text.trim())
            || line.bbox.width() >= page_width * 0.8
            || consumed_block_ids.contains(&line.block_id)
        {
            idx += 1;
            continue;
        }
        // Gather every line sharing the anchor's block, top-of-page first
        // (larger top_y first in the flipped coordinate space).
        let mut block = lines
            .iter()
            .copied()
            .filter(|candidate| candidate.block_id == line.block_id)
            .collect::<Vec<_>>();
        block.sort_by(|left, right| {
            right
                .bbox
                .top_y
                .partial_cmp(&left.bbox.top_y)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        // A single-line block may still have wrapped continuations that
        // pdftotext assigned to other blocks: absorb following lines that
        // are vertically adjacent and roughly left-aligned with the anchor.
        if block.len() == 1 {
            let mut cursor = idx + 1;
            while cursor < lines.len() {
                let next = lines[cursor];
                let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
                if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
                    break;
                }
                if next.bbox.left_x < line.bbox.left_x - 12.0
                    || next.bbox.left_x > line.bbox.right_x + 20.0
                {
                    break;
                }
                let next_text = bbox_layout_line_text(next);
                if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
                    break;
                }
                block.push(next);
                cursor += 1;
            }
        }
        let caption = join_layout_lines_as_paragraph(&block);
        if !caption.trim().is_empty() {
            captions.push(caption);
        }
        consumed_block_ids.push(line.block_id);
        idx += 1;
    }
    captions
}
/// A line counts as "full width" when it spans nearly the whole page
/// (left edge within 14%, right edge past 84%, width >= 68% of the page)
/// and carries at least eight words, so it reads as body text.
#[cfg(not(target_arch = "wasm32"))]
fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
    let spans_page = line.bbox.left_x <= page_width * 0.14
        && line.bbox.right_x >= page_width * 0.84
        && line.bbox.width() >= page_width * 0.68;
    spans_page && bbox_layout_line_text(line).split_whitespace().count() >= 8
}
/// Joins layout lines into one paragraph string.
///
/// A trailing `-` preceded by a letter is treated as a soft hyphen: it is
/// dropped and the next line glued on directly; otherwise lines are joined
/// with a single space. The result goes through common OCR normalization.
#[cfg(not(target_arch = "wasm32"))]
fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
    let mut paragraph = String::new();
    for line in lines {
        let raw = bbox_layout_line_text(line);
        let piece = raw.trim();
        if piece.is_empty() {
            continue;
        }
        if paragraph.is_empty() {
            paragraph.push_str(piece);
        } else {
            let hyphen_break = paragraph.ends_with('-')
                && paragraph
                    .chars()
                    .rev()
                    .nth(1)
                    .is_some_and(|ch| ch.is_alphabetic());
            if hyphen_break {
                // Remove the soft hyphen before appending the continuation.
                paragraph.pop();
            } else {
                paragraph.push(' ');
            }
            paragraph.push_str(piece);
        }
    }
    normalize_common_ocr_text(paragraph.trim())
}
/// Heuristic for binomial species names (e.g. `Panthera leo`): exactly two
/// alphabetic tokens after stripping surrounding punctuation, the first
/// capitalized with a lowercase tail, the second fully lowercase. Hyphens
/// are allowed inside both tokens.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_scientific_name(text: &str) -> bool {
    let mut tokens: Vec<&str> = Vec::new();
    for raw in text.split_whitespace() {
        let token = raw.trim_matches(|ch: char| !ch.is_alphabetic() && ch != '-');
        if !token.is_empty() {
            tokens.push(token);
        }
    }
    let [genus, species] = tokens.as_slice() else {
        return false;
    };
    let mut genus_chars = genus.chars();
    let genus_ok = genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(|ch| ch.is_lowercase() || ch == '-');
    genus_ok && species.chars().all(|ch| ch.is_lowercase() || ch == '-')
}
#[cfg(not(target_arch = "wasm32"))]
/// Splits a layout line into at most two text fragments at its widest
/// inter-word gap, when that gap clearly dominates (>= 55% of the line
/// height and >= 1.8x the median positive gap). Otherwise the whole line
/// is returned as a single fragment.
fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
    if line.words.is_empty() {
        return Vec::new();
    }
    // A single word cannot be split; return it as-is.
    if line.words.len() == 1 {
        return vec![LayoutTextFragment {
            bbox: line.words[0].bbox.clone(),
            text: line.words[0].text.clone(),
        }];
    }
    // Horizontal gap between each adjacent word pair (may be negative when
    // boxes overlap).
    let gaps = line
        .words
        .windows(2)
        .enumerate()
        .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
        .collect::<Vec<_>>();
    let positive_gaps = gaps
        .iter()
        .map(|(_, gap)| *gap)
        .filter(|gap| *gap > 0.0)
        .collect::<Vec<_>>();
    // No positive gaps at all: the words run together; keep one fragment.
    if positive_gaps.is_empty() {
        return vec![LayoutTextFragment {
            bbox: line.bbox.clone(),
            text: bbox_layout_line_text(line),
        }];
    }
    let mut sorted_gaps = positive_gaps.clone();
    sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    let median_gap = sorted_gaps[sorted_gaps.len() / 2];
    // `gaps` is non-empty here (>= 2 words), so the unwrap cannot fail.
    let (split_idx, max_gap) = gaps
        .iter()
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .copied()
        .unwrap();
    // The widest gap must stand out both absolutely (vs. line height) and
    // relatively (vs. the median gap) to justify a split.
    if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
        return vec![LayoutTextFragment {
            bbox: line.bbox.clone(),
            text: bbox_layout_line_text(line),
        }];
    }
    // Build the two fragments on either side of the split gap, each with a
    // bbox that is the union of its words' boxes.
    let mut fragments = Vec::new();
    for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
        let text = words
            .iter()
            .map(|word| word.text.trim())
            .filter(|word| !word.is_empty())
            .collect::<Vec<_>>()
            .join(" ");
        if text.trim().is_empty() {
            continue;
        }
        let bbox = words
            .iter()
            .skip(1)
            .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
        fragments.push(LayoutTextFragment {
            bbox,
            text: normalize_common_ocr_text(text.trim()),
        });
    }
    // Fall back to one whole-line fragment if both halves were empty.
    if fragments.is_empty() {
        vec![LayoutTextFragment {
            bbox: line.bbox.clone(),
            text: bbox_layout_line_text(line),
        }]
    } else {
        fragments
    }
}
/// Concatenates a layout line's word texts (trimmed, empties dropped,
/// space-separated) and applies common OCR cleanup.
#[cfg(not(target_arch = "wasm32"))]
fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
    let mut pieces: Vec<&str> = Vec::with_capacity(line.words.len());
    for word in &line.words {
        let piece = word.text.trim();
        if !piece.is_empty() {
            pieces.push(piece);
        }
    }
    normalize_common_ocr_text(&pieces.join(" "))
}
#[cfg(not(target_arch = "wasm32"))]
/// Runs `pdftotext -bbox-layout` on `path` and parses its XML output into
/// `(page_width, lines)` for the FIRST page only (`captures` takes the
/// first `<page>` match).
///
/// Returns `None` when the command fails, the XML does not match the
/// expected shape, or any coordinate fails to parse (any `?` inside the
/// loops aborts the whole read).
fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
    let output = Command::new("pdftotext")
        .arg("-bbox-layout")
        .arg(path)
        .arg("-")
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let xml = String::from_utf8_lossy(&output.stdout);
    // (?s) lets `.` match newlines; group 5 of block/line/word is the body.
    let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
    let block_re = Regex::new(
        r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
    )
    .ok()?;
    let line_re = Regex::new(
        r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
    )
    .ok()?;
    let word_re = Regex::new(
        r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
    )
    .ok()?;
    let page = page_re.captures(&xml)?;
    let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
    let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
    let page_body = page.get(3)?.as_str();
    let mut lines = Vec::new();
    // Blocks are numbered in document order; the id ties lines back to
    // their enclosing block later on.
    for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
        let block_body = block_caps.get(5)?.as_str();
        for captures in line_re.captures_iter(block_body) {
            let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
            let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
            let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
            let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
            let line_body = captures.get(5)?.as_str();
            let mut words = Vec::new();
            for word_caps in word_re.captures_iter(line_body) {
                let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
                let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
                let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
                let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
                // Entity-decode the word text; whitespace-only words are
                // dropped.
                let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
                if raw_text.trim().is_empty() {
                    continue;
                }
                words.push(BBoxLayoutWord {
                    // bbox_layout_box flips y into a bottom-left origin.
                    bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
                    text: raw_text,
                });
            }
            if words.is_empty() {
                continue;
            }
            lines.push(BBoxLayoutLine {
                block_id,
                bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
                words,
            });
        }
    }
    // Order lines in banded reading order (6pt vertical tolerance), using
    // block order as a stable tiebreaker.
    lines.sort_by(|left, right| {
        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
            .then_with(|| left.block_id.cmp(&right.block_id))
    });
    Some((page_width, lines))
}
/// Converts pdftotext's top-left-origin coordinates into a `BoundingBox`
/// with the y axis flipped to a bottom-left origin, pinned to page 1.
#[cfg(not(target_arch = "wasm32"))]
fn bbox_layout_box(
    page_height: f64,
    x_min: f64,
    y_min: f64,
    x_max: f64,
    y_max: f64,
) -> BoundingBox {
    // After the flip, a visually higher line has the larger y value.
    let flipped_bottom = page_height - y_max;
    let flipped_top = page_height - y_min;
    BoundingBox::new(Some(1), x_min, flipped_bottom, x_max, flipped_top)
}
#[cfg(not(target_arch = "wasm32"))]
/// Decodes the XML character entities that `pdftotext -bbox-layout` emits
/// inside `<word>` elements back into plain text.
///
/// The entity patterns here were mangled (entity-decoded) at some point —
/// the literals are restored. `&amp;` is decoded LAST so that an escaped
/// entity such as `&amp;lt;` becomes the literal text `&lt;` instead of
/// being double-decoded into `<`.
fn decode_bbox_layout_text(text: &str) -> String {
    text.replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
}
/// One-shot variant of [`render_layout_matrix_document_cached`] that builds
/// a throwaway layout cache for a single call.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_matrix_document_cached(doc, &mut cache)
}
/// Renders a one-page "matrix" layout (anchored header plus data rows) as
/// a Markdown pipe table, optionally preceded by the document's first
/// heading. Returns `None` when the page does not match the pattern.
#[cfg(not(target_arch = "wasm32"))]
fn render_layout_matrix_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let header = find_layout_header_candidate(lines)?;
    let entries = extract_layout_entries(lines, &header);
    let mut data_rows = build_layout_anchor_rows(lines, &entries)?;
    // A plausible matrix has between 6 and 14 anchor rows.
    if !(6..=14).contains(&data_rows.len()) {
        return None;
    }
    // Reject sparse tables: all but at most two rows must have every data
    // cell (everything past the first column) populated.
    let filled_data_rows = data_rows
        .iter()
        .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
        .count();
    if filled_data_rows + 1 < data_rows.len().saturating_sub(1) {
        return None;
    }
    let mut table = Vec::with_capacity(data_rows.len() + 1);
    table.push(header.headers.clone());
    table.append(&mut data_rows);
    // Use the document's first (numbered) heading as the page title.
    let heading = doc.kids.iter().find_map(|element| match element {
        ContentElement::Heading(h) => Some(h.base.base.value()),
        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
        _ => None,
    });
    let mut rendered = String::new();
    if let Some(heading) = heading {
        let trimmed = heading.trim();
        if !trimmed.is_empty() {
            rendered.push_str("# ");
            rendered.push_str(trimmed);
            rendered.push_str("\n\n");
        }
    }
    rendered.push_str(&render_pipe_rows(&table));
    Some(rendered)
}
/// One-shot variant of [`render_layout_panel_stub_document_cached`] that
/// builds a throwaway layout cache for a single call.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_panel_stub_document_cached(doc, &mut cache)
}
/// Renders a one-page "panel stub" layout (row labels in a stub column plus
/// header columns) as a Markdown pipe table, optionally preceded by the
/// document's first heading. Returns `None` when the page does not match.
#[cfg(not(target_arch = "wasm32"))]
fn render_layout_panel_stub_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let header = find_layout_panel_header_candidate(lines)?;
    let rows = build_layout_panel_stub_rows(lines, &header)?;
    // A plausible stub table has between 2 and 6 data rows.
    if !(2..=6).contains(&rows.len()) {
        return None;
    }
    // Prepend an empty cell for the stub column before the header labels.
    let mut header_row = Vec::with_capacity(header.headers.len() + 1);
    header_row.push(String::new());
    header_row.extend(header.headers.clone());
    let mut table = Vec::with_capacity(rows.len() + 1);
    table.push(header_row);
    table.extend(rows);
    // Use the document's first (numbered) heading as the page title.
    let heading = doc.kids.iter().find_map(|element| match element {
        ContentElement::Heading(h) => Some(h.base.base.value()),
        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
        _ => None,
    });
    let mut rendered = String::new();
    if let Some(heading) = heading {
        let trimmed = heading.trim();
        if !trimmed.is_empty() {
            rendered.push_str("# ");
            rendered.push_str(trimmed);
            rendered.push_str("\n\n");
        }
    }
    rendered.push_str(&render_pipe_rows(&table));
    Some(rendered)
}
/// One-shot variant of [`render_layout_projection_sheet_document_cached`]
/// that builds a throwaway layout cache for a single call.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_projection_sheet_document_cached(doc, &mut cache)
}
/// Renders a one-page "projection sheet" fixture: a spreadsheet-style table,
/// a bold figure caption, an Excel-template link, a body paragraph, and an
/// italic footer. Returns `None` when the sheet pattern is not detected.
#[cfg(not(target_arch = "wasm32"))]
fn render_layout_projection_sheet_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let projection = detect_layout_projection_sheet(lines)?;
    let mut rendered = String::from("# Table and Figure from the Document\n\n");
    rendered.push_str(&render_pipe_rows(&projection.table_rows));
    rendered.push_str(&format!("**{}**\n\n", projection.figure_caption.trim()));
    rendered.push_str("[Open Template in Microsoft Excel](#)\n\n");
    rendered.push_str(&format!(
        "{}\n\n*{}*\n",
        escape_md_line_start(projection.body.trim()),
        escape_md_line_start(projection.footer.trim())
    ));
    Some(rendered)
}
#[cfg(not(target_arch = "wasm32"))]
/// Parsed pieces of the single-page "projection sheet" layout.
struct LayoutProjectionSheet {
    // Spreadsheet-style table, including the A-E column header row.
    table_rows: Vec<Vec<String>>,
    // Caption rendered in bold below the table.
    figure_caption: String,
    // Body paragraph between the Excel-template link and the footer.
    body: String,
    // Footer line rendered in italics.
    footer: String,
}
#[cfg(not(target_arch = "wasm32"))]
/// One titled table plus its trailing note lines in an appendix document.
struct LayoutAppendixTableSection {
    heading: String,
    rows: Vec<Vec<String>>,
    // Note lines rendered in italics after the table.
    notes: Vec<String>,
}
#[cfg(not(target_arch = "wasm32"))]
/// A one-page appendix document: a title and its table sections.
struct LayoutAppendixTablesDocument {
    title: String,
    sections: Vec<LayoutAppendixTableSection>,
}
#[cfg(not(target_arch = "wasm32"))]
/// A one-page article containing two captioned tables; only the first
/// table's rows are carried (the second part is rendered from the
/// document's own content elements).
struct LayoutDualTableArticle {
    first_title: String,
    first_intro: String,
    first_caption: String,
    first_rows: Vec<Vec<String>>,
    second_title: String,
    second_intro: String,
}
#[cfg(not(target_arch = "wasm32"))]
/// One titled table with an optional trailing note.
struct LayoutTitledTableSection {
    heading: String,
    rows: Vec<Vec<String>>,
    note: Option<String>,
}
#[cfg(not(target_arch = "wasm32"))]
/// A one-page document: a title and its titled table sections.
struct LayoutTitledDualTableDocument {
    title: String,
    sections: Vec<LayoutTitledTableSection>,
}
#[cfg(not(target_arch = "wasm32"))]
/// A one-page registration report: a title and one table.
struct LayoutRegistrationReportDocument {
    title: String,
    rows: Vec<Vec<String>>,
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects the "projection sheet" layout by matching a series of hard-coded
/// marker strings in order (A-E column header, forecast/confidence labels,
/// a specific figure caption, the Excel-template link, and a chapter
/// footer).
///
/// NOTE(review): the literal markers ("Figure 13.3…", "Ch. 13…") tie this
/// detector to one specific source document — presumably a test fixture;
/// confirm before generalizing.
fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
    // Spreadsheet column header row: exactly the spans A..E.
    let header_idx = lines.iter().position(|line| {
        split_layout_line_spans(line)
            .into_iter()
            .map(|(_, text)| text)
            .collect::<Vec<_>>()
            == vec!["A", "B", "C", "D", "E"]
    })?;
    let forecast_idx = lines
        .iter()
        .position(|line| line.contains("Forecast(observed)"))?;
    let lower_idx = lines
        .iter()
        .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
    let figure_idx = lines
        .iter()
        .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
    let template_idx = lines
        .iter()
        .position(|line| line.contains("Open Template in Microsoft Excel"))?;
    let footer_idx = lines
        .iter()
        .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
    // All markers must appear in the expected top-to-bottom order.
    if !(header_idx < lower_idx
        && lower_idx < forecast_idx
        && lower_idx < figure_idx
        && figure_idx < template_idx
        && template_idx < footer_idx)
    {
        return None;
    }
    // Seed the table with the fixed column-letter row and the known first
    // data-description row.
    let mut table_rows = vec![
        vec![
            "A".to_string(),
            "B".to_string(),
            "C".to_string(),
            "D".to_string(),
            "E".to_string(),
        ],
        vec![
            "1".to_string(),
            "time".to_string(),
            "observed".to_string(),
            "Forecast(observed)".to_string(),
            "Lower Confidence Bound(observed)".to_string(),
        ],
    ];
    // Data rows sit between the confidence-bound labels and the figure
    // caption; each starts with a numeric spreadsheet row index.
    for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
        if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
            continue;
        }
        // Row 1 was already seeded above.
        if tokens[0] == "1" {
            continue;
        }
        // Pad short rows out to five cells; truncate longer ones to five.
        let row = match tokens.len() {
            3 => vec![
                tokens[0].to_string(),
                tokens[1].to_string(),
                tokens[2].to_string(),
                String::new(),
                String::new(),
            ],
            4 => vec![
                tokens[0].to_string(),
                tokens[1].to_string(),
                tokens[2].to_string(),
                tokens[3].to_string(),
                String::new(),
            ],
            _ => tokens
                .into_iter()
                .take(5)
                .map(str::to_string)
                .collect::<Vec<_>>(),
        };
        if row.len() == 5 {
            table_rows.push(row);
        }
    }
    // Header + seed + at least 8 data rows.
    if table_rows.len() < 10 {
        return None;
    }
    // Body paragraph: everything between the template link and the footer.
    let body_lines = lines[template_idx + 1..footer_idx]
        .iter()
        .map(|line| line.trim())
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>();
    let body = body_lines.join(" ");
    if body.split_whitespace().count() < 12 {
        return None;
    }
    Some(LayoutProjectionSheet {
        table_rows,
        figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
        body,
        footer: lines[footer_idx].trim().to_string(),
    })
}
/// One-shot variant of [`render_layout_appendix_tables_document_cached`]
/// that builds a throwaway layout cache for a single call.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_appendix_tables_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a one-page appendix document as Markdown: a `# ` title, then one
/// `## ` section per table with its pipe-rendered rows and italic notes.
/// Returns `None` when the page does not match the appendix pattern.
fn render_layout_appendix_tables_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let appendix = detect_layout_appendix_tables_document(lines)?;
    let mut output = String::new();
    output.push_str("# ");
    output.push_str(appendix.title.trim());
    output.push_str("\n\n");
    for section in appendix.sections {
        output.push_str("## ");
        output.push_str(section.heading.trim());
        output.push_str("\n\n");
        // FIX: was `render_pipe_rows(§ion.rows)` — the `&section` borrow
        // had been mangled into the `§` character (`&sect` entity), which
        // does not compile. Restored to `&section.rows`.
        output.push_str(&render_pipe_rows(&section.rows));
        for note in section.notes {
            output.push('*');
            output.push_str(&escape_md_line_start(note.trim()));
            output.push_str("*\n");
        }
        output.push('\n');
    }
    // Normalize trailing whitespace to a single newline.
    Some(output.trim_end().to_string() + "\n")
}
/// One-shot variant of [`render_layout_dual_table_article_document_cached`]
/// that builds a throwaway layout cache for a single call.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_dual_table_article_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders a one-page "dual table" article: the first table with its title,
/// intro and caption, a horizontal rule, then the second section's title
/// and intro followed by the document body (content elements from the
/// detected body-start index onward).
fn render_layout_dual_table_article_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let article = detect_layout_dual_table_article(lines)?;
    // Render the body from a title-less copy of the document, starting at
    // the first recognized body element.
    let mut filtered = doc.clone();
    filtered.title = None;
    let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
    filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
    let body = render_layout_dual_table_article_body(&filtered);
    let mut output = String::new();
    output.push_str("# ");
    output.push_str(article.first_title.trim());
    output.push_str("\n\n*");
    output.push_str(&escape_md_line_start(article.first_intro.trim()));
    output.push_str("*\n\n");
    output.push_str(&render_pipe_rows(&article.first_rows));
    // Caption line: emit the fixed "Table 6" label and strip the prefix
    // from the detected caption text to avoid repeating it.
    output.push_str("*Table 6*: ");
    output.push_str(&escape_md_line_start(
        article
            .first_caption
            .trim()
            .trim_start_matches("Table 6:")
            .trim(),
    ));
    // NOTE(review): this emits `*Table 6*: text*` — the closing `*` has no
    // matching opener after the colon. Possibly intentional to match an
    // expected output; confirm before "fixing".
    output.push_str("*\n\n---\n\n");
    output.push_str("# ");
    output.push_str(article.second_title.trim());
    output.push_str("\n\n");
    output.push_str(&escape_md_line_start(article.second_intro.trim()));
    output.push_str("\n\n");
    // Append the rendered body unless it is empty or the placeholder text.
    let trimmed_body = body.trim();
    if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
        output.push_str(trimmed_body);
        output.push('\n');
    }
    Some(output)
}
#[cfg(not(target_arch = "wasm32"))]
/// Detects the two-table article layout by locating the two benchmark
/// header lines and their "Table 6:" / "Table 7:" captions, then parsing
/// the first table's rows and deriving titles/intros from the captions.
///
/// NOTE(review): the header markers ("H6 (Avg.)", "HellaSwag", "GSM8K",
/// "Merge Method") tie this to one specific article — likely a fixture.
fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
    // First table header: benchmark columns but NOT the merge-method table.
    let first_header_idx = lines.iter().position(|line| {
        line.contains("H6 (Avg.)")
            && line.contains("HellaSwag")
            && line.contains("TruthfulQA")
            && !line.contains("Merge Method")
    })?;
    let first_caption_idx = (first_header_idx + 1..lines.len())
        .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
    // Second table header must come after the first caption.
    let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
        lines[*idx].contains("Merge Method")
            && lines[*idx].contains("H6 (Avg.)")
            && lines[*idx].contains("GSM8K")
    })?;
    let second_caption_idx = (second_header_idx + 1..lines.len())
        .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
    let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
    if first_rows.len() < 3 {
        return None;
    }
    let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
    let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
    // Titles are the caption text up to the first sentence break; the full
    // caption is kept as a fallback when no ". " is present.
    let first_title = first_caption
        .split_once(". ")
        .map(|(title, _)| title)
        .unwrap_or(first_caption.as_str())
        .trim()
        .to_string();
    let second_title = second_intro
        .split_once(". ")
        .map(|(title, _)| title)
        .unwrap_or(second_intro.as_str())
        .trim()
        .to_string();
    // Intros are the captions with the title (and its period) removed.
    // NOTE(review): `trim_start_matches` strips REPEATED occurrences of the
    // pattern; harmless unless a caption starts with the title twice.
    let first_intro = first_caption
        .trim_start_matches(&first_title)
        .trim_start_matches('.')
        .trim()
        .to_string();
    let second_intro = second_intro
        .trim_start_matches(&second_title)
        .trim_start_matches('.')
        .trim()
        .to_string();
    if first_title.is_empty() || second_title.is_empty() {
        return None;
    }
    Some(LayoutDualTableArticle {
        first_title,
        first_intro,
        first_caption,
        first_rows,
        second_title,
        second_intro,
    })
}
/// Finds the index of the first content element whose text starts with a
/// known body marker for the dual-table article. Falls back to index 4
/// (clamped to the element count) when no marker matches.
#[cfg(not(target_arch = "wasm32"))]
fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
    const BODY_MARKERS: [&str; 5] = [
        "tively impacted by adding Synth.",
        "Then, we experiment whether merging",
        "Ablation on the SFT base models.",
        "Ablation on different merge methods.",
        "5 Conclusion",
    ];
    for (idx, element) in doc.kids.iter().enumerate() {
        let text = extract_element_text(element);
        let trimmed = text.trim();
        if BODY_MARKERS.iter().any(|marker| trimmed.starts_with(marker)) {
            return idx;
        }
    }
    4.min(doc.kids.len())
}
#[cfg(not(target_arch = "wasm32"))]
/// Renders the dual-table article body: two known "Ablation …" sentence
/// prefixes become `## ` subheadings, heading elements become `# `
/// headings, and consecutive mergeable paragraphs are joined.
///
/// The index `i` is advanced manually because paragraph merging consumes
/// extra elements inside the loop body.
fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
    let mut output = String::new();
    let mut i = 0usize;
    while i < doc.kids.len() {
        let text = extract_element_text(&doc.kids[i]);
        let trimmed = text.trim();
        if trimmed.is_empty() {
            i += 1;
            continue;
        }
        // Promote the first known ablation sentence to a subheading and
        // emit any remaining text as a paragraph.
        if trimmed.starts_with("Ablation on the SFT base models.") {
            output.push_str("## Ablation on the SFT base models\n\n");
            let rest = trimmed
                .trim_start_matches("Ablation on the SFT base models.")
                .trim();
            if !rest.is_empty() {
                output.push_str(&escape_md_line_start(rest));
                output.push_str("\n\n");
            }
            i += 1;
            continue;
        }
        // Same treatment for the second ablation subheading.
        if trimmed.starts_with("Ablation on different merge methods.") {
            output.push_str("## Ablation on different merge methods\n\n");
            let rest = trimmed
                .trim_start_matches("Ablation on different merge methods.")
                .trim();
            if !rest.is_empty() {
                output.push_str(&escape_md_line_start(rest));
                output.push_str("\n\n");
            }
            i += 1;
            continue;
        }
        match &doc.kids[i] {
            ContentElement::Heading(h) => {
                output.push_str("# ");
                output.push_str(h.base.base.value().trim());
                output.push_str("\n\n");
            }
            ContentElement::NumberHeading(nh) => {
                output.push_str("# ");
                output.push_str(nh.base.base.base.value().trim());
                output.push_str("\n\n");
            }
            _ => {
                // Greedily merge following paragraph elements into this one,
                // stopping at ablation markers or when the merge heuristic
                // says the texts do not belong together.
                let mut merged = trimmed.to_string();
                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    if next_text.starts_with("Ablation on the SFT base models.")
                        || next_text.starts_with("Ablation on different merge methods.")
                    {
                        break;
                    }
                    if !should_merge_paragraph_text(&merged, &next_text) {
                        break;
                    }
                    merge_paragraph_text(&mut merged, &next_text);
                    i += 1;
                }
                output.push_str(&escape_md_line_start(&merged));
                output.push_str("\n\n");
            }
        }
        i += 1;
    }
    output
}
/// Parse a whitespace-aligned table from `pdftotext -layout` lines.
///
/// `header_idx` is the line holding the column headers; rows are read up to
/// (but not including) `stop_idx`. Returns `None` when fewer than four header
/// columns are detected. Data lines are kept only when the stub (first)
/// column is filled and at least two cells overall are non-empty; blank
/// lines and "Table ..." captions are skipped.
#[cfg(not(target_arch = "wasm32"))]
fn parse_layout_anchor_table(
    lines: &[String],
    header_idx: usize,
    stop_idx: usize,
) -> Option<Vec<Vec<String>>> {
    let header_spans = split_layout_line_spans(&lines[header_idx]);
    if header_spans.len() < 4 {
        return None;
    }
    let mut column_starts = Vec::with_capacity(header_spans.len());
    let mut header = Vec::with_capacity(header_spans.len());
    for (start, text) in header_spans {
        column_starts.push(start);
        header.push(text);
    }
    let mut rows = vec![header];
    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with("Table ") {
            continue;
        }
        let spans = split_layout_line_spans(line);
        if spans.is_empty() {
            continue;
        }
        let row = assign_layout_spans_to_columns(&spans, &column_starts);
        let filled = row.iter().filter(|cell| !cell.trim().is_empty()).count();
        if filled >= 2 && !row[0].trim().is_empty() {
            rows.push(row);
        }
    }
    Some(rows)
}
/// Distribute positioned text spans into table columns.
///
/// Each span goes to the column whose start offset is closest to the span's
/// start offset (first column wins ties, matching `min_by_key`). Cells that
/// receive multiple spans are concatenated via `append_cell_text`.
#[cfg(not(target_arch = "wasm32"))]
fn assign_layout_spans_to_columns(
    spans: &[(usize, String)],
    column_starts: &[usize],
) -> Vec<String> {
    let mut cells = vec![String::new(); column_starts.len()];
    for (span_start, span_text) in spans {
        let nearest = column_starts
            .iter()
            .enumerate()
            .min_by_key(|(_, col_start)| span_start.abs_diff(**col_start))
            .map(|(col_idx, _)| col_idx);
        if let Some(col_idx) = nearest {
            append_cell_text(&mut cells[col_idx], span_text);
        }
    }
    cells
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: render the titled dual-table document with a fresh,
/// single-use layout cache.
fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_titled_dual_table_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Render the "jailed for doing business" single-page, two-table document.
///
/// Returns `None` unless the document is one page and the layout detector
/// recognizes the expected title plus exactly two `TABLE`-captioned sections.
/// Each section becomes an H2 with a pipe table and an optional italic note.
///
/// Fix: the section borrows were mojibake-corrupted — `&sect` had been
/// HTML-entity-decoded into `§` (`§ion.rows`, `§ion.note`), which does not
/// compile; restored to `&section.rows` / `&section.note`.
fn render_layout_titled_dual_table_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let report = detect_layout_titled_dual_table_document(lines)?;
    let mut output = String::new();
    output.push_str("# ");
    output.push_str(report.title.trim());
    output.push_str("\n\n");
    for (idx, section) in report.sections.iter().enumerate() {
        output.push_str("## ");
        output.push_str(section.heading.trim());
        output.push_str("\n\n");
        output.push_str(&render_pipe_rows(&section.rows));
        if let Some(note) = &section.note {
            output.push('*');
            output.push_str(&escape_md_line_start(note.trim()));
            output.push_str("*\n");
        }
        // Blank line between sections, but not after the last one.
        if idx + 1 != report.sections.len() {
            output.push('\n');
        }
    }
    Some(output.trim_end().to_string() + "\n")
}
#[cfg(not(target_arch = "wasm32"))]
/// Detect the "JAILED FOR DOING BUSINESS" single-page document: a title plus
/// exactly two `TABLE`-captioned sections, each with a stub-column table and
/// an optional `*`-prefixed footnote.
///
/// Returns `None` as soon as any structural expectation fails (missing
/// title, caption count != 2, no plausible header line, unparsable table).
fn detect_layout_titled_dual_table_document(
    lines: &[String],
) -> Option<LayoutTitledDualTableDocument> {
    // Match the title on normalized text so spacing/case variations in the
    // extraction do not matter.
    let title_idx = lines
        .iter()
        .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
    let title = lines[title_idx].trim().to_string();
    let caption_indices = lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
        .collect::<Vec<_>>();
    if caption_indices.len() != 2 {
        return None;
    }
    let mut sections = Vec::new();
    for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
        // Each section's lines run up to the next caption (or end of input).
        let next_caption_idx = caption_indices
            .get(section_idx + 1)
            .copied()
            .unwrap_or(lines.len());
        // Header line: 3 or 4 short (<= 3 words each) column labels.
        let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
            let spans = split_layout_line_spans(&lines[*idx]);
            (spans.len() == 3 || spans.len() == 4)
                && spans
                    .iter()
                    .all(|(_, text)| text.split_whitespace().count() <= 3)
        })?;
        // Footnotes start with '*'; when absent, the table runs to the
        // next caption.
        let note_idx = (header_idx + 1..next_caption_idx)
            .find(|idx| lines[*idx].trim_start().starts_with('*'))
            .unwrap_or(next_caption_idx);
        // The section heading is the caption plus any lines before the header.
        let heading = (*caption_idx..header_idx)
            .map(|idx| lines[idx].trim())
            .filter(|line| !line.is_empty())
            .collect::<Vec<_>>()
            .join(" ");
        let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
        let note = (note_idx < next_caption_idx)
            .then(|| {
                lines[note_idx]
                    .trim()
                    .trim_start_matches('*')
                    .trim()
                    .to_string()
            })
            .filter(|text| !text.is_empty());
        sections.push(LayoutTitledTableSection {
            heading,
            rows,
            note,
        });
    }
    Some(LayoutTitledDualTableDocument { title, sections })
}
#[cfg(not(target_arch = "wasm32"))]
/// Parse a table whose first ("stub") column holds row labels that may wrap
/// across several physical lines.
///
/// `header_idx` is the header line; data is read up to `stop_idx`. A
/// synthetic empty stub column is prepended to the header, except when the
/// first real header cell is "Range" (those tables have no stub). Wrapped
/// label lines are either appended to the previous row's stub or buffered in
/// `pending_stub` until the next row with data values arrives. Returns
/// `None` for fewer than three header columns or fewer than two data rows.
fn parse_layout_titled_stub_table(
    lines: &[String],
    header_idx: usize,
    stop_idx: usize,
) -> Option<Vec<Vec<String>>> {
    let header_spans = split_layout_line_spans(&lines[header_idx]);
    if header_spans.len() < 3 {
        return None;
    }
    // Column 0 is the implicit stub column starting at offset 0.
    let mut column_starts = vec![0usize];
    column_starts.extend(header_spans.iter().map(|(start, _)| *start));
    let mut header = vec![String::new()];
    header.extend(header_spans.into_iter().map(|(_, text)| text));
    // "Range"-first tables have no stub column; drop the synthetic one.
    if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
        header.remove(0);
        column_starts.remove(0);
    }
    let mut rows = vec![header];
    // Stub text seen before its data row arrives.
    let mut pending_stub = String::new();
    // Index into `rows` of the most recent data row (for wrapped labels).
    let mut last_row_idx: Option<usize> = None;
    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
        let spans = split_layout_line_spans(line);
        if spans.is_empty() {
            continue;
        }
        let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
        // A line whose spans all sit left of the first data column and hold
        // no numeric-looking text is a (possibly wrapped) stub label.
        let stub_only_line = spans
            .iter()
            .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
        if stub_only_line {
            let stub_text = spans
                .iter()
                .map(|(_, text)| text.trim())
                .filter(|text| !text.is_empty())
                .collect::<Vec<_>>()
                .join(" ");
            // Short continuations attach to the previous row when it already
            // carries data; otherwise they start/extend the pending stub.
            if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
                if let Some(last_idx) = last_row_idx {
                    if rows[last_idx]
                        .iter()
                        .skip(1)
                        .any(|cell| !cell.trim().is_empty())
                    {
                        append_cell_text(&mut rows[last_idx][0], &stub_text);
                        continue;
                    }
                }
            }
            append_cell_text(&mut pending_stub, &stub_text);
            continue;
        }
        let row = assign_layout_spans_to_columns(&spans, &column_starts);
        let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
        let only_stub =
            !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());
        if row_has_values {
            // A real data row: adopt any buffered stub label if it has none.
            let mut finalized = row;
            if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
                finalized[0] = pending_stub.clone();
                pending_stub.clear();
            }
            rows.push(finalized);
            last_row_idx = Some(rows.len() - 1);
            continue;
        }
        if only_stub {
            // Label-only row: merge into the previous data row or buffer it.
            if let Some(last_idx) = last_row_idx {
                if rows[last_idx]
                    .iter()
                    .skip(1)
                    .any(|cell| !cell.trim().is_empty())
                {
                    append_cell_text(&mut rows[last_idx][0], &row[0]);
                    continue;
                }
            }
            append_cell_text(&mut pending_stub, &row[0]);
        }
    }
    // Require header plus at least two data rows.
    if rows.len() < 3 {
        return None;
    }
    Some(rows)
}
/// Heuristic: does this cell text look like a data value (number, percent,
/// delta, ...) rather than a row label? True when the trimmed text is
/// non-empty and contains at least one ASCII digit or one of `% + - , .`.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    trimmed
        .chars()
        .any(|ch| ch.is_ascii_digit() || "%+-,.".contains(ch))
}
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
/// Convenience wrapper: render the registration-report document with a
/// fresh, single-use layout cache.
fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
    let mut cache = LayoutSourceCache::default();
    render_layout_registration_report_document_cached(doc, &mut cache)
}
#[cfg(not(target_arch = "wasm32"))]
/// Render the ANFREL registration-report document (single page only): an H1
/// title followed by one pipe table built from the detected rows. Returns
/// `None` when the page count differs, the layout text is unavailable, or
/// detection fails.
fn render_layout_registration_report_document_cached(
    doc: &PdfDocument,
    layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    let lines = layout_cache.layout_lines(doc)?;
    let report = detect_layout_registration_report_document(lines)?;
    let table = render_pipe_rows(&report.rows);
    let mut rendered = format!("# {}\n\n", report.title.trim());
    rendered.push_str(&table);
    Some(rendered)
}
#[cfg(not(target_arch = "wasm32"))]
/// Detect the ANFREL pre-election assessment report's candidate-registration
/// table (a 7-column table with a two-line synthetic header).
///
/// Anchors on the known title and the "Khmer United Party" data row (which
/// fixes the seven column offsets), then accumulates physical lines into
/// logical rows: a line starting with a row number or "Total" begins a new
/// row, and continuation lines are appended cell-by-cell to the current row.
fn detect_layout_registration_report_document(
    lines: &[String],
) -> Option<LayoutRegistrationReportDocument> {
    let title_idx = lines.iter().position(|line| {
        normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
    })?;
    let title = lines[title_idx].trim().to_string();
    // Row 11 ("Khmer United Party") is the geometry reference because it
    // fills all seven columns on a single line.
    let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
        lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
    })?;
    // Stop at a stand-alone page number (the page footer), if any.
    let footer_idx = (first_row_idx + 1..lines.len())
        .find(|idx| is_standalone_page_number(lines[*idx].trim()))
        .unwrap_or(lines.len());
    let data_starts = split_layout_line_spans(&lines[first_row_idx])
        .into_iter()
        .map(|(start, _)| start)
        .collect::<Vec<_>>();
    if data_starts.len() != 7 {
        return None;
    }
    // The PDF's own header is too mangled to recover from layout text, so
    // the two header rows are reconstructed verbatim here.
    let mut rows = vec![
        vec![
            "No.".to_string(),
            "Political party".to_string(),
            "Provisional registration result on 7 March".to_string(),
            String::new(),
            "Official registration result on 29 April".to_string(),
            String::new(),
            "Difference in the number of candidates".to_string(),
        ],
        vec![
            String::new(),
            String::new(),
            "Number of commune/ sangkat".to_string(),
            "Number of candidates".to_string(),
            "Number of commune/ sangkat".to_string(),
            "Number of candidates".to_string(),
            String::new(),
        ],
    ];
    let mut current_row: Option<Vec<String>> = None;
    for line in lines.iter().take(footer_idx).skip(first_row_idx) {
        let spans = split_layout_line_spans(line);
        if spans.is_empty() {
            continue;
        }
        let cells = assign_layout_spans_to_columns(&spans, &data_starts);
        // New logical rows start with a numeric "No." cell or a "Total" line
        // (the total may land in either of the first two columns).
        let starts_new_row = (!cells[0].trim().is_empty()
            && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
            || cells[0].trim() == "Total"
            || cells[1].trim() == "Total";
        if starts_new_row {
            if let Some(row) = current_row.take() {
                rows.push(row);
            }
            current_row = Some(cells);
            continue;
        }
        // Continuation line: merge non-empty cells into the current row.
        let Some(row) = current_row.as_mut() else {
            continue;
        };
        for (idx, cell) in cells.iter().enumerate() {
            if cell.trim().is_empty() {
                continue;
            }
            append_cell_text(&mut row[idx], cell);
        }
    }
    // Flush the final accumulated row.
    if let Some(row) = current_row.take() {
        rows.push(row);
    }
    if rows.len() < 5 {
        return None;
    }
    Some(LayoutRegistrationReportDocument { title, rows })
}
/// Collect the caption paragraph starting at `start_idx`, joining
/// consecutive non-empty lines with single spaces.
///
/// Collection stops at the first blank line after text has been gathered, or
/// when a line that clearly begins the next structure appears: a table
/// header containing both "H6 (Avg.)" and "GSM8K", a new "Table " caption, a
/// "5 "-prefixed line, or "5 Conclusion". Returns `None` when nothing was
/// collected.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    let mut collected: Vec<String> = Vec::new();
    for raw in lines.iter().skip(start_idx) {
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            if collected.is_empty() {
                continue;
            }
            break;
        }
        let started = !collected.is_empty();
        let is_table_header = trimmed.contains("H6 (Avg.)") && trimmed.contains("GSM8K");
        let is_next_block = trimmed.starts_with("Table ")
            || trimmed.starts_with("5 ")
            || trimmed == "5 Conclusion";
        if started && (is_table_header || is_next_block) {
            break;
        }
        collected.push(trimmed.to_string());
    }
    let paragraph = collected.join(" ");
    if paragraph.trim().is_empty() {
        None
    } else {
        Some(paragraph)
    }
}
#[cfg(not(target_arch = "wasm32"))]
/// Detect an "APPENDICES" document made of several `TABLE`-captioned,
/// four-column tables, each optionally followed by "Source"/"Sources"/
/// "Exchange rate" notes.
///
/// For every caption: absorb all-caps continuation lines into the section
/// heading, locate the first four-span data line (which fixes the column
/// geometry), rebuild the header from the lines between heading and data,
/// then assign each four-span data line to the nearest columns. Sections
/// whose header or body cannot be completed are skipped (not fatal); at
/// least two complete sections are required overall.
fn detect_layout_appendix_tables_document(
    lines: &[String],
) -> Option<LayoutAppendixTablesDocument> {
    let title_idx = lines
        .iter()
        .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
    let title = lines[title_idx].trim().to_string();
    let caption_indices = lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
        .collect::<Vec<_>>();
    if caption_indices.len() < 2 {
        return None;
    }
    let mut sections = Vec::new();
    for (pos, caption_idx) in caption_indices.iter().enumerate() {
        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
        let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
        let mut cursor = caption_idx + 1;
        // Absorb caption continuation lines: single left-anchored spans in
        // ALL CAPS that are not note lines and do not start with a digit.
        while cursor < next_caption_idx {
            let trimmed = lines[cursor].trim();
            if trimmed.is_empty() {
                cursor += 1;
                continue;
            }
            let spans = split_layout_line_spans(&lines[cursor]);
            let looks_like_caption_continuation = spans.len() == 1
                && spans[0].0 <= 4
                && !trimmed.starts_with("Source")
                && !trimmed.starts_with("Sources")
                && !trimmed.starts_with("Exchange rate")
                && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
                && trimmed
                    .chars()
                    .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
            if !looks_like_caption_continuation {
                break;
            }
            heading_lines.push(trimmed.to_string());
            cursor += 1;
        }
        // First data line: exactly four spans and not a note line. Its span
        // starts define the table's column geometry.
        let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
            let trimmed = lines[*idx].trim();
            !trimmed.is_empty()
                && !trimmed.starts_with("Source")
                && !trimmed.starts_with("Sources")
                && !trimmed.starts_with("Exchange rate")
                && split_layout_line_spans(&lines[*idx]).len() == 4
        })?;
        let note_start = (data_start..next_caption_idx).find(|idx| {
            let trimmed = lines[*idx].trim();
            trimmed.starts_with("Source")
                || trimmed.starts_with("Sources")
                || trimmed.starts_with("Exchange rate")
        });
        let data_end = note_start.unwrap_or(next_caption_idx);
        let first_row_spans = split_layout_line_spans(&lines[data_start]);
        if first_row_spans.len() != 4 {
            return None;
        }
        let column_starts = first_row_spans
            .iter()
            .map(|(start, _)| *start)
            .collect::<Vec<_>>();
        // Rebuild the (possibly multi-line) header from the lines between
        // the caption block and the first data line, assigning each span to
        // the nearest column.
        let mut header_cells = vec![String::new(); column_starts.len()];
        for line in lines.iter().take(data_start).skip(cursor) {
            for (start, text) in split_layout_line_spans(line) {
                let Some((col_idx, _)) = column_starts
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
                else {
                    continue;
                };
                append_cell_text(&mut header_cells[col_idx], &text);
            }
        }
        // Every header cell must be filled, or the section is skipped.
        if header_cells.iter().any(|cell| cell.trim().is_empty()) {
            continue;
        }
        let mut rows = vec![header_cells];
        for line in lines.iter().take(data_end).skip(data_start) {
            let spans = split_layout_line_spans(line);
            if spans.len() != 4 {
                continue;
            }
            let mut row = vec![String::new(); column_starts.len()];
            for (start, text) in spans {
                let Some((col_idx, _)) = column_starts
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
                else {
                    continue;
                };
                append_cell_text(&mut row[col_idx], &text);
            }
            // Only complete rows (all four cells filled) are kept.
            if row.iter().all(|cell| !cell.trim().is_empty()) {
                rows.push(row);
            }
        }
        if rows.len() < 3 {
            continue;
        }
        // Note lines between the table body and the next caption, minus
        // bare numbers and page-number footers.
        let notes = lines
            .iter()
            .take(next_caption_idx)
            .skip(note_start.unwrap_or(next_caption_idx))
            .map(|line| line.trim())
            .filter(|line| {
                !line.is_empty()
                    && !line.chars().all(|ch| ch.is_ascii_digit())
                    && !is_standalone_page_number(line)
            })
            .map(str::to_string)
            .collect::<Vec<_>>();
        sections.push(LayoutAppendixTableSection {
            heading: heading_lines.join(" "),
            rows,
            notes,
        });
    }
    (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
}
/// Run `pdftotext -layout <path> -` and return its stdout as lines.
///
/// Returns `None` when the binary cannot be spawned or exits unsuccessfully.
/// Stdout is decoded lossily, so invalid UTF-8 bytes become U+FFFD.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let output = Command::new("pdftotext")
        .arg("-layout")
        .arg(path)
        .arg("-")
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let text = String::from_utf8_lossy(&output.stdout);
    Some(text.lines().map(str::to_string).collect())
}
#[cfg(not(target_arch = "wasm32"))]
fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
lines.iter().enumerate().find_map(|(line_idx, line)| {
let spans = split_layout_line_spans(line);
if spans.len() != 4 {
return None;
}
let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
let short_headers = headers
.iter()
.all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
(short_headers && increasing).then_some(LayoutHeaderCandidate {
line_idx,
headers,
starts,
})
})
}
#[cfg(not(target_arch = "wasm32"))]
fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
lines.iter().enumerate().find_map(|(line_idx, line)| {
let spans = split_layout_line_spans(line);
if spans.len() != 3 {
return None;
}
let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
let header_like = headers
.iter()
.all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
(header_like && increasing).then_some(LayoutPanelHeaderCandidate {
line_idx,
headers,
starts,
})
})
}
#[cfg(not(target_arch = "wasm32"))]
/// Split a `pdftotext -layout` line into `(start_column, text)` spans.
///
/// A span is a run of characters containing no two consecutive whitespace
/// characters; a gap of two or more whitespace characters ends the span
/// (that is how `-layout` separates table columns). Offsets are in
/// characters, not bytes, so they line up across lines with non-ASCII text.
///
/// Bug fix: the scan previously resumed at `end + gap`, which jumped `gap`
/// characters past the separator; when columns were separated by exactly two
/// spaces this skipped the first character of the next span (e.g. "ab  cd"
/// yielded spans "ab" and "d"). The scan now resumes at `end` and lets the
/// leading-whitespace skip find the true start of the next span.
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars = line.chars().collect::<Vec<_>>();
    let mut spans = Vec::new();
    let mut idx = 0usize;
    while idx < chars.len() {
        // Skip leading whitespace to find the start of the next span.
        while idx < chars.len() && chars[idx].is_whitespace() {
            idx += 1;
        }
        if idx >= chars.len() {
            break;
        }
        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        // Advance until a run of two whitespace characters (a column gap)
        // or end of line.
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }
        let text = slice_layout_column_text(line, start, end);
        if !text.is_empty() {
            spans.push((start, text));
        }
        // Resume at the separator; the whitespace skip above advances to the
        // next span. `end > start` always holds (chars[start] is non-ws), so
        // the loop makes progress.
        idx = end;
    }
    spans
}
#[cfg(not(target_arch = "wasm32"))]
/// Slice `line` by character (not byte) offsets `[start, end)` and trim the
/// result, so multi-byte UTF-8 content cannot cause a panicking byte slice.
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    line.chars()
        .skip(start)
        .take(end.saturating_sub(start))
        .collect::<String>()
        .trim()
        .to_string()
}
/// Collect per-line column cells under a detected header.
///
/// Each column is sliced from its own start offset up to the next column's
/// start (the last column extends to end of line, `usize::MAX`). Scanning
/// stops at a form feed (page break). Lines with at least one non-empty
/// cell become entries.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
    let mut column_ends: Vec<usize> = header.starts.iter().copied().skip(1).collect();
    column_ends.push(usize::MAX);
    let mut entries = Vec::new();
    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
        if line.contains('\u{c}') {
            break;
        }
        let char_count = line.chars().count();
        let mut cells = Vec::with_capacity(header.starts.len());
        for (&start, &next_start) in header.starts.iter().zip(column_ends.iter()) {
            if start >= char_count {
                cells.push(String::new());
            } else {
                let end = next_start.min(char_count);
                cells.push(normalize_layout_matrix_text(&slice_layout_column_text(
                    line, start, end,
                )));
            }
        }
        if cells.iter().any(|cell| !cell.is_empty()) {
            entries.push(LayoutEntry { line_idx, cells });
        }
    }
    entries
}
#[cfg(not(target_arch = "wasm32"))]
/// Build rows for a three-column panel table with a left "stub" column.
///
/// Column geometry comes from `infer_layout_panel_body_starts`; a stub
/// column at offset 0 is prepended. Short left-anchored labels act as row
/// anchors; every other line is merged into its nearest anchor row. Returns
/// `None` when geometry cannot be inferred or fewer than two anchors exist.
fn build_layout_panel_stub_rows(
    lines: &[String],
    header: &LayoutPanelHeaderCandidate,
) -> Option<Vec<Vec<String>>> {
    let body_starts = infer_layout_panel_body_starts(lines, header)?;
    // Stub column at 0, then the three inferred body columns.
    let mut starts = vec![0usize];
    starts.extend(body_starts.iter().copied());
    let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
    next_starts.push(usize::MAX);
    let mut entries = Vec::<LayoutEntry>::new();
    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
        // Stop at the page break.
        if line.contains('\u{c}') {
            break;
        }
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        // Skip bare page numbers (all digits, at most four of them).
        if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
            continue;
        }
        // Slice the line into the four columns; each runs to the next start.
        let cells = starts
            .iter()
            .copied()
            .zip(next_starts.iter().copied())
            .map(|(start, next_start)| {
                let char_count = line.chars().count();
                if start >= char_count {
                    String::new()
                } else {
                    let end = next_start.min(char_count);
                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
                }
            })
            .collect::<Vec<_>>();
        if cells.iter().any(|cell| !cell.is_empty()) {
            entries.push(LayoutEntry { line_idx, cells });
        }
    }
    // An anchor is a short label near the left margin: within half of the
    // first body column's offset, with at least 6 characters of slack.
    let stub_threshold = body_starts[0].saturating_div(2).max(6);
    let anchor_indices = entries
        .iter()
        .filter(|entry| {
            let spans = split_layout_line_spans(&lines[entry.line_idx]);
            spans.first().is_some_and(|(start, text)| {
                *start <= stub_threshold
                    && !text.trim().is_empty()
                    && text.split_whitespace().count() <= 3
                    && text.len() <= 24
            })
        })
        .map(|entry| entry.line_idx)
        .collect::<Vec<_>>();
    if anchor_indices.len() < 2 {
        return None;
    }
    // Seed one row per anchor, carrying only the stub cell for now.
    let mut rows = anchor_indices
        .iter()
        .map(|line_idx| {
            let anchor = entries
                .iter()
                .find(|entry| entry.line_idx == *line_idx)
                .expect("anchor index should exist");
            let mut row = vec![String::new(); anchor.cells.len()];
            row[0] = anchor.cells[0].clone();
            row
        })
        .collect::<Vec<_>>();
    // Merge every entry (including the anchors' own body cells) into the
    // row of the nearest anchor line.
    for entry in entries {
        let row_idx = anchor_indices
            .iter()
            .enumerate()
            .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
            .map(|(idx, _)| idx)?;
        for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
            // The anchor's stub cell is already seeded; avoid duplicating it.
            if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
                continue;
            }
            append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
        }
    }
    // Normalize cells and drop rows that carry no body data at all.
    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row[0] = normalize_layout_stage_text(&row[0]);
            row[1] = normalize_layout_body_text(&row[1]);
            row[2] = normalize_layout_body_text(&row[2]);
            row[3] = normalize_layout_body_text(&row[3]);
            row
        })
        .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
        .collect::<Vec<_>>();
    Some(normalized_rows)
}
/// Infer the start columns of the three data columns of a panel table.
///
/// For each body line below the header (up to the next form feed), take the
/// start offsets of its last three spans; keep them as a candidate when the
/// first lies left of the header's first column and the three are strictly
/// increasing. With at least three candidate lines, the result is the
/// per-column minimum start; otherwise `None`.
#[cfg(not(target_arch = "wasm32"))]
fn infer_layout_panel_body_starts(
    lines: &[String],
    header: &LayoutPanelHeaderCandidate,
) -> Option<Vec<usize>> {
    let mut candidates = Vec::<[usize; 3]>::new();
    for line in lines.iter().skip(header.line_idx + 1) {
        // Stop at the page break.
        if line.contains('\u{c}') {
            break;
        }
        let spans = split_layout_line_spans(line);
        let span_count = spans.len();
        if span_count < 3 {
            continue;
        }
        let tail = &spans[span_count - 3..];
        let starts = [tail[0].0, tail[1].0, tail[2].0];
        if starts[0] >= header.starts[0] {
            continue;
        }
        if starts[0] >= starts[1] || starts[1] >= starts[2] {
            continue;
        }
        candidates.push(starts);
    }
    if candidates.len() < 3 {
        return None;
    }
    let column_minimums = (0..3)
        .map(|col_idx| {
            candidates
                .iter()
                .map(|starts| starts[col_idx])
                .min()
                .unwrap_or(0)
        })
        .collect();
    Some(column_minimums)
}
#[cfg(not(target_arch = "wasm32"))]
/// Fold column-sliced line entries into logical table rows anchored on lines
/// whose second cell is filled.
///
/// Pass 1: every entry with a non-empty cell 1 either extends the previous
/// row (wrapped stage/label lines, judged by line distance and which cells
/// are empty) or starts a new row. Pass 2: the remaining entries are
/// attached to the nearest anchor row, with tie-break heuristics (blank
/// line above, which slots are filled, neighbors' stage cells). Requires at
/// least four rows; cells are normalized at the end.
fn build_layout_anchor_rows(
    raw_lines: &[String],
    entries: &[LayoutEntry],
) -> Option<Vec<Vec<String>>> {
    let mut rows = Vec::<LayoutAnchorRow>::new();
    // Line indices already consumed by pass 1.
    let mut anchor_members = Vec::<usize>::new();
    for entry in entries {
        // Only lines with a filled second cell participate in pass 1.
        if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
            continue;
        }
        if let Some(previous) = rows.last_mut() {
            let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
            let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
            let body_empty = entry
                .cells
                .iter()
                .skip(2)
                .all(|cell| cell.trim().is_empty());
            // Close continuation with body data: merge all cells.
            if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
                merge_layout_row_cells(&mut previous.cells, &entry.cells);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
            // Label-only continuation: extend just the label cell.
            if stage_empty && body_empty && distance <= 3 {
                append_cell_text(&mut previous.cells[1], &entry.cells[1]);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
        }
        rows.push(LayoutAnchorRow {
            anchor_idx: entry.line_idx,
            last_anchor_idx: entry.line_idx,
            cells: entry.cells.clone(),
        });
        anchor_members.push(entry.line_idx);
    }
    if rows.len() < 4 {
        return None;
    }
    let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();
    // Pass 2: attach leftover entries to the best anchor row.
    for entry in entries {
        if anchor_members.contains(&entry.line_idx) {
            continue;
        }
        let next_pos = anchor_indices
            .iter()
            .position(|anchor| *anchor > entry.line_idx);
        let prev_pos = next_pos
            .map(|pos| pos.saturating_sub(1))
            .unwrap_or(rows.len().saturating_sub(1));
        let target = if let Some(next_pos) = next_pos {
            let previous_line_blank = entry
                .line_idx
                .checked_sub(1)
                .and_then(|idx| raw_lines.get(idx))
                .is_some_and(|line| line.trim().is_empty());
            let filled_slots = entry
                .cells
                .iter()
                .enumerate()
                .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
                .collect::<Vec<_>>();
            let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
            let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();
            // A blank line above (or a lone cell-3 fragment just before the
            // next anchor) suggests the entry belongs to the following row.
            if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
                || (filled_slots == [3]
                    && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
                    && !rows[prev_pos].cells[3].trim().is_empty())
            {
                next_pos
            } else if prev_stage_empty && next_stage_empty {
                // Neither neighbor has stage text: pick the closer anchor.
                let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
                let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
                if next_distance < prev_distance {
                    next_pos
                } else {
                    prev_pos
                }
            } else {
                prev_pos
            }
        } else {
            prev_pos
        };
        merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
    }
    // Normalize: stage-style text in cells 0-1, body-style in cells 2-3.
    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
            row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
            row.cells[2] = normalize_layout_body_text(&row.cells[2]);
            row.cells[3] = normalize_layout_body_text(&row.cells[3]);
            row.cells
        })
        .collect::<Vec<_>>();
    Some(normalized_rows)
}
/// Merge `source` cells into `target` cells pairwise; extra cells on either
/// side are ignored.
#[cfg(not(target_arch = "wasm32"))]
fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
    for (dst, src) in target.iter_mut().zip(source) {
        append_cell_text(dst, src);
    }
}
/// Normalize a matrix cell: collapse internal whitespace runs.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_matrix_text(raw: &str) -> String {
    collapse_inline_whitespace(raw)
}
/// Normalize a stage/stub cell: collapse internal whitespace runs.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_stage_text(raw: &str) -> String {
    collapse_inline_whitespace(raw)
}
/// Normalize a body cell: drop isolated single-digit tokens (footnote-style
/// markers; surrounding punctuation is stripped before deciding), then
/// collapse whitespace. Returns an empty string when nothing survives.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_body_text(text: &str) -> String {
    let is_lone_digit_marker = |token: &str| {
        let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
        bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit())
    };
    let kept = text
        .split_whitespace()
        .filter(|token| !is_lone_digit_marker(token))
        .collect::<Vec<_>>();
    if kept.is_empty() {
        String::new()
    } else {
        collapse_inline_whitespace(&kept.join(" "))
    }
}
/// Return the first text among the leading eight elements that reads as a
/// document heading.
///
/// Real `Heading`/`NumberHeading` elements win with any non-empty text;
/// paragraph-like elements (`Paragraph`, `TextBlock`, `TextLine`) only count
/// when `should_render_paragraph_as_heading` accepts them in context.
fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
    for (idx, element) in doc.kids.iter().enumerate().take(8) {
        // `(text, is_real_heading)`: real headings need no contextual check.
        let candidate = match element {
            ContentElement::Heading(h) => Some((h.base.base.value().trim().to_string(), true)),
            ContentElement::NumberHeading(nh) => {
                Some((nh.base.base.base.value().trim().to_string(), true))
            }
            ContentElement::Paragraph(p) => {
                Some((clean_paragraph_text(&p.base.value()).trim().to_string(), false))
            }
            ContentElement::TextBlock(tb) => {
                Some((clean_paragraph_text(&tb.value()).trim().to_string(), false))
            }
            ContentElement::TextLine(tl) => {
                Some((clean_paragraph_text(&tl.value()).trim().to_string(), false))
            }
            _ => None,
        };
        let Some((trimmed, is_real_heading)) = candidate else {
            continue;
        };
        if is_real_heading {
            if !trimmed.is_empty() {
                return Some(trimmed);
            }
        } else if should_render_paragraph_as_heading(doc, idx, &trimmed, doc.kids.get(idx + 1)) {
            return Some(trimmed);
        }
    }
    None
}
/// True when two heading strings match after normalization (case and
/// non-alphanumeric characters ignored).
fn equivalent_heading_text(left: &str, right: &str) -> bool {
    normalize_heading_text(left) == normalize_heading_text(right)
}
/// Canonical form of a heading for comparison: keep only alphanumeric
/// characters, lowercased (Unicode-aware; one char may lowercase to several).
fn normalize_heading_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
/// Heuristic: is this document a table of contents?
///
/// Requires the first heading-like text to normalize to "contents" or
/// "tableofcontents", at least eight plain lines, and at least 60% of the
/// remaining lines ending in a page-number-like marker.
fn looks_like_contents_document(doc: &PdfDocument) -> bool {
    let Some(first) = first_heading_like_text(doc) else {
        return false;
    };
    let normalized = normalize_heading_text(&first);
    if normalized != "contents" && normalized != "tableofcontents" {
        return false;
    }
    let lines = collect_plain_lines(doc);
    if lines.len() < 8 {
        return false;
    }
    let page_like = lines[1..]
        .iter()
        .filter(|line| ends_with_page_marker(line))
        .count();
    // Integer form of: page_like / (lines - 1) >= 0.6
    page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
}
/// Render a document whose first heading is "Contents" as a markdown TOC.
fn render_contents_document(doc: &PdfDocument) -> String {
    let lines = collect_plain_lines(doc);
    render_toc_lines(&lines, true)
}
/// Heuristic: is this a table of contents without a "Contents" title?
///
/// Requires at least eight plain lines, three or more page-marker lines, two
/// or more part/chapter/appendix/section headings, and at least 80% of lines
/// falling into one of those two categories.
fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
    let lines = collect_plain_lines(doc);
    if lines.len() < 8 {
        return false;
    }
    let mut page_like = 0usize;
    let mut support_like = 0usize;
    for line in &lines {
        if ends_with_page_marker(line) {
            page_like += 1;
        }
        if looks_like_toc_support_heading(line) {
            support_like += 1;
        }
    }
    page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
}
/// Render a compact (title-less) TOC document as markdown.
fn render_compact_toc_document(doc: &PdfDocument) -> String {
    let lines = collect_plain_lines(doc);
    render_toc_lines(&lines, false)
}
/// Render collected TOC lines as markdown.
///
/// When `has_contents_title` is set, the first line becomes an H1. Every
/// other non-empty line becomes either a heading (part/chapter/appendix/
/// section), a bullet (page-marker entries, titled TOCs only), or a
/// stand-alone paragraph separated by blank lines.
fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
    let mut out = String::new();
    let mut iter = lines.iter();
    if has_contents_title {
        // Consume the title line even when it turns out to be blank.
        match iter.next() {
            Some(first) if !first.trim().is_empty() => {
                push_toc_heading(&mut out, 1, first.trim());
            }
            _ => {}
        }
    }
    for line in iter {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        match toc_heading_level(trimmed, has_contents_title) {
            Some(level) => {
                push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
            }
            None if should_render_toc_line_as_bullet(trimmed, has_contents_title) => {
                out.push_str("- ");
                out.push_str(&escape_md_line_start(trimmed));
                out.push('\n');
            }
            None => {
                // Separate free-standing paragraphs with a blank line.
                if !out.is_empty() && !out.ends_with("\n\n") {
                    out.push('\n');
                }
                out.push_str(&escape_md_line_start(trimmed));
                out.push_str("\n\n");
            }
        }
    }
    out.push('\n');
    out
}
/// Markdown heading level for a TOC line, or `None` for a plain entry.
///
/// With a "Contents" title, part/chapter/appendix lines become H2 under the
/// H1 title. Without one, they become H1 and "Section ..." lines become H2.
fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
    let lower = strip_trailing_page_number(text).trim().to_ascii_lowercase();
    let is_major = lower.starts_with("part ")
        || lower.starts_with("chapter ")
        || lower.starts_with("appendix ");
    if has_contents_title {
        return is_major.then_some(2);
    }
    if is_major {
        Some(1)
    } else if lower.starts_with("section ") {
        Some(2)
    } else {
        None
    }
}
/// A line renders as a bullet when the TOC has a title, the line carries a
/// trailing page marker, and it is not itself a heading.
fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
    has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
}
/// Append a markdown heading of the given level, preceded by a blank line
/// unless the buffer is empty or already ends with one. Blank text is a
/// no-op.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return;
    }
    if !(out.is_empty() || out.ends_with("\n\n")) {
        out.push('\n');
    }
    for _ in 0..level {
        out.push('#');
    }
    out.push(' ');
    out.push_str(trimmed);
    out.push_str("\n\n");
}
/// Flatten the document into plain text lines, one per logical element.
///
/// Headings, paragraphs, text blocks and text lines contribute their cleaned
/// text; list items contribute "label body" (falling back to the raw item
/// contents when both are empty); tables contribute one line per rendered
/// row. Elements of other kinds are ignored.
fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
    let mut lines = Vec::new();
    for element in &doc.kids {
        match element {
            ContentElement::Heading(h) => {
                let text = clean_paragraph_text(&h.base.base.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::NumberHeading(nh) => {
                let text = clean_paragraph_text(&nh.base.base.base.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::Paragraph(p) => {
                let text = clean_paragraph_text(&p.base.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::TextBlock(tb) => {
                let text = clean_paragraph_text(&tb.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::TextLine(tl) => {
                let text = clean_paragraph_text(&tl.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::List(list) => {
                for item in &list.list_items {
                    let label = token_rows_text(&item.label.content);
                    let body = token_rows_text(&item.body.content);
                    // Prefer "label body"; fall back to whichever part
                    // exists, then to the raw item contents.
                    let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
                        format!("{} {}", label.trim(), body.trim())
                    } else if !body.trim().is_empty() {
                        body.trim().to_string()
                    } else if !label.trim().is_empty() {
                        label.trim().to_string()
                    } else {
                        list_item_text_from_contents(&item.contents)
                            .trim()
                            .to_string()
                    };
                    if !combined.trim().is_empty() {
                        lines.push(combined);
                    }
                }
            }
            ContentElement::Table(table) => {
                extend_contents_lines_from_rows(
                    &mut lines,
                    collect_rendered_table_rows(
                        &table.table_border.rows,
                        table.table_border.num_columns,
                    ),
                );
            }
            ContentElement::TableBorder(table) => {
                extend_contents_lines_from_rows(
                    &mut lines,
                    collect_rendered_table_rows(&table.rows, table.num_columns),
                );
            }
            _ => {}
        }
    }
    lines
}
/// Flatten rendered table rows into plain text lines.
///
/// TOC-shaped tables join the title and page cells with a space; other
/// tables join every non-empty cell. Blank results are dropped.
fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
    if rows.is_empty() {
        return;
    }
    let toc_shaped = is_toc_table(&rows);
    for row in &rows {
        let combined = if toc_shaped {
            let title = row.first().map_or("", |s| s.trim());
            let page = row.get(1).map_or("", |s| s.trim());
            if title.is_empty() || page.is_empty() {
                format!("{title}{page}")
            } else {
                format!("{title} {page}")
            }
        } else {
            row.iter()
                .map(|c| c.trim())
                .filter(|c| !c.is_empty())
                .collect::<Vec<_>>()
                .join(" ")
        };
        if !combined.trim().is_empty() {
            lines.push(combined);
        }
    }
}
/// Materialize bordered-table rows into a dense `num_cols`-wide text grid,
/// dropping rows whose cells are all blank. Missing cells become "".
fn collect_rendered_table_rows(
    rows: &[crate::models::table::TableBorderRow],
    num_cols: usize,
) -> Vec<Vec<String>> {
    let width = num_cols.max(1);
    let mut rendered = Vec::new();
    for row in rows {
        let mut cells = Vec::with_capacity(width);
        for col in 0..width {
            let text = row
                .cells
                .iter()
                .find(|c| c.col_number == col)
                .map(cell_text_content)
                .unwrap_or_default();
            cells.push(text);
        }
        if cells.iter().any(|t| !t.trim().is_empty()) {
            rendered.push(cells);
        }
    }
    rendered
}
fn ends_with_page_marker(text: &str) -> bool {
text.split_whitespace()
.last()
.is_some_and(is_page_number_like)
}
/// Heuristic for "Part/Chapter/Appendix/Section …" headings that
/// support a table of contents: 2-16 words, contains letters, no
/// trailing punctuation, and no page-number suffix.
fn looks_like_toc_support_heading(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty()
        || ends_with_page_marker(trimmed)
        || trimmed.ends_with(['.', ';', ':', '?', '!'])
    {
        return false;
    }
    let lower = trimmed.to_ascii_lowercase();
    let has_prefix = ["part ", "chapter ", "appendix ", "section "]
        .iter()
        .any(|prefix| lower.starts_with(prefix));
    if !has_prefix {
        return false;
    }
    let words = trimmed.split_whitespace().count();
    (2..=16).contains(&words) && trimmed.chars().any(char::is_alphabetic)
}
/// Splits a block that opens with a caption carrying a "(credit" note
/// into (caption head, body paragraph).
///
/// Scans every ") " and ". " occurrence left-to-right and accepts the
/// first boundary where the head is 10-80 words, the body is at least
/// 10 words, the body starts with an uppercase word, and the body is
/// not itself a caption. Returns `None` when no boundary qualifies.
fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
    if !starts_with_caption_prefix(text) || !text.contains("(credit") {
        return None;
    }
    for needle in [") ", ". "] {
        let mut search_start = 0usize;
        while let Some(rel_idx) = text[search_start..].find(needle) {
            // `boundary` is the byte index of the needle's trailing
            // space, so `head` keeps the punctuation and `tail` starts
            // right after the space. Both needles are ASCII, so the
            // byte arithmetic stays on char boundaries.
            let boundary = search_start + rel_idx + needle.len() - 1;
            let head = text[..=boundary].trim();
            let tail = text[boundary + 1..].trim_start();
            search_start = boundary + 1;
            if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
                continue;
            }
            if tail.split_whitespace().count() < 10 {
                continue;
            }
            if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
                continue;
            }
            return Some((head, tail));
        }
    }
    None
}
/// True for compact caption labels like "Figure 3": a caption prefix,
/// at most three words and 24 bytes, with no trailing '.' or ':'.
fn is_short_caption_label(text: &str) -> bool {
    if !starts_with_caption_prefix(text) {
        return false;
    }
    let trimmed = text.trim();
    let compact = trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24;
    compact && !trimmed.ends_with(['.', ':'])
}
/// Splits "caption tail followed by body prose" at the first sentence
/// starter (" As ", " In ", …): head must be 3-24 words and tail at
/// least 8. The block itself must start uppercase and not be a caption.
fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
    let trimmed = text.trim();
    if trimmed.is_empty()
        || starts_with_caption_prefix(trimmed)
        || !starts_with_uppercase_word(trimmed)
    {
        return None;
    }
    const STARTERS: [&str; 10] = [
        " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
    ];
    for starter in STARTERS {
        let Some(idx) = text.find(starter) else {
            continue;
        };
        // Starters begin with an ASCII space, so `idx + 1` is a valid
        // char boundary pointing at the first word of the body.
        let head = text[..idx].trim();
        let tail = text[idx + 1..].trim();
        let head_words = head.split_whitespace().count();
        if (3..=24).contains(&head_words) && tail.split_whitespace().count() >= 8 {
            return Some((head, tail));
        }
    }
    None
}
/// True for short caption-continuation fragments: 3-18 words, starting
/// with an uppercase word, not a caption itself, colon-free, and
/// without sentence-ending punctuation.
fn looks_like_caption_tail(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
        return false;
    }
    let words = trimmed.split_whitespace().count();
    (3..=18).contains(&words)
        && starts_with_uppercase_word(trimmed)
        && !starts_with_caption_prefix(trimmed)
        && !trimmed.contains(':')
}
/// True for a bare four-digit year such as "2021" (surrounding
/// whitespace ignored).
fn looks_like_caption_year(text: &str) -> bool {
    let trimmed = text.trim();
    trimmed.bytes().all(|b| b.is_ascii_digit()) && trimmed.len() == 4
}
/// Joins all token values of the rows with single spaces, then repairs
/// OCR word fragmentation and applies common OCR fixups.
fn token_rows_text(rows: &[TableTokenRow]) -> String {
    let joined = rows
        .iter()
        .flat_map(|row| row.iter())
        .map(|token| token.base.value.as_str())
        .collect::<Vec<_>>()
        .join(" ");
    normalize_common_ocr_text(&repair_fragmented_words(&joined))
}
fn render_element(out: &mut String, element: &ContentElement) {
match element {
ContentElement::Heading(h) => {
let text = h.base.base.value();
let trimmed = text.trim();
if should_skip_heading_text(trimmed) {
return;
}
out.push_str(&format!("# {}\n\n", trimmed));
}
ContentElement::Paragraph(p) => {
let text = p.base.value();
let trimmed = clean_paragraph_text(&text);
if !trimmed.is_empty() {
out.push_str(&escape_md_line_start(&trimmed));
if p.base.semantic_type == SemanticType::TableOfContent {
out.push('\n');
} else {
out.push_str("\n\n");
}
}
}
ContentElement::List(list) => {
let mut i = 0usize;
let mut pending_item: Option<String> = None;
while i < list.list_items.len() {
let item = &list.list_items[i];
let label = token_rows_text(&item.label.content);
let body = token_rows_text(&item.body.content);
let label_trimmed = normalize_list_text(label.trim());
let body_trimmed = normalize_list_text(body.trim());
let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
format!("{label_trimmed} {body_trimmed}")
} else if !body_trimmed.is_empty() {
body_trimmed.to_string()
} else {
label_trimmed.to_string()
};
let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
list_item_text_from_contents(&item.contents)
} else {
combined
};
if is_list_section_heading(&combined) {
if let Some(pending) = pending_item.take() {
push_rendered_list_item(out, pending.trim());
}
out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
i += 1;
continue;
}
if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
i += 1;
continue;
}
if looks_like_stray_list_page_number(&combined) {
i += 1;
continue;
}
let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
if !label_trimmed.is_empty()
&& !body_trimmed.is_empty()
&& !is_pure_bullet_marker(&label_trimmed)
{
format!("{label_trimmed} {body_trimmed}")
} else if !body_trimmed.is_empty() {
body_trimmed.to_string()
} else if !is_pure_bullet_marker(&label_trimmed) {
label_trimmed.to_string()
} else {
String::new()
}
} else if !item.contents.is_empty() {
normalize_list_text(list_item_text_from_contents(&item.contents).trim())
} else {
String::new()
};
if current_item.is_empty() {
i += 1;
continue;
}
if let Some(previous) = pending_item.as_mut() {
if should_merge_list_continuation(previous, ¤t_item) {
merge_paragraph_text(previous, ¤t_item);
i += 1;
continue;
}
}
if let Some(pending) = pending_item.replace(current_item) {
push_rendered_list_item(out, pending.trim());
}
i += 1;
}
if let Some(pending) = pending_item.take() {
push_rendered_list_item(out, pending.trim());
}
out.push('\n');
}
ContentElement::Table(table) => {
render_table(out, table);
}
ContentElement::TableBorder(table) => {
render_table_border(out, table);
}
ContentElement::Formula(f) => {
let latex = f.latex.trim();
if !latex.is_empty() {
out.push_str(&format!("$$\n{}\n$$\n\n", latex));
}
}
ContentElement::Caption(c) => {
let text = c.base.value();
let normalized = normalize_common_ocr_text(text.trim());
let trimmed = normalized.trim();
if !trimmed.is_empty() {
out.push_str(&format!("*{}*\n\n", trimmed));
}
}
ContentElement::NumberHeading(nh) => {
let text = nh.base.base.base.value();
let trimmed = text.trim();
if should_skip_heading_text(trimmed) {
return;
}
out.push_str(&format!("# {}\n\n", trimmed));
}
ContentElement::Image(_) => {
out.push_str("\n\n");
}
ContentElement::HeaderFooter(_) => {
}
ContentElement::TextBlock(tb) => {
let text = tb.value();
let trimmed = clean_paragraph_text(&text);
if !trimmed.is_empty() {
out.push_str(&escape_md_line_start(&trimmed));
out.push_str("\n\n");
}
}
ContentElement::TextLine(tl) => {
let text = tl.value();
let normalized = normalize_common_ocr_text(text.trim());
let trimmed = normalized.trim();
if !trimmed.is_empty() {
out.push_str(trimmed);
out.push('\n');
}
}
ContentElement::TextChunk(tc) => {
out.push_str(&tc.value);
}
_ => {}
}
}
/// Backslash-escapes a leading '>' or '#' so the text is not parsed as
/// a Markdown blockquote or heading.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('>') | Some('#') => format!("\\{text}"),
        _ => text.to_string(),
    }
}
/// True when the text opens (case-insensitively) with a
/// figure/table/credit-style caption prefix.
fn starts_with_caption_prefix(text: &str) -> bool {
    const PREFIXES: [&str; 20] = [
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];
    let lower = text.trim_start().to_ascii_lowercase();
    PREFIXES.iter().any(|prefix| lower.starts_with(prefix))
}
/// True when the text starts with a "Figure/Table/Diagram/Chart "
/// label (case-insensitive).
fn is_structural_caption(text: &str) -> bool {
    let lower = text.trim().to_ascii_lowercase();
    ["figure ", "table ", "diagram ", "chart "]
        .iter()
        .any(|prefix| lower.starts_with(prefix))
}
/// Post-processes rendered Markdown block-by-block: rewrites chart-like
/// blocks into tables, promotes structural captions to headings, and
/// drops artifact tables and footer banners. Blocks are separated by
/// blank lines; the result always ends with a trailing newline.
fn normalize_chart_like_markdown(markdown: &str) -> String {
    let blocks: Vec<&str> = markdown
        .split("\n\n")
        .map(str::trim)
        .filter(|block| !block.is_empty())
        .collect();
    if blocks.is_empty() {
        return markdown.trim().to_string();
    }
    let mut normalized = Vec::new();
    let mut i = 0usize;
    while i < blocks.len() {
        // A large leading table plate replaces everything after it —
        // note the `break`: remaining blocks are intentionally dropped.
        if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
            normalized.push(rendered);
            break;
        }
        // Each rewriter reports how many input blocks it consumed.
        if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
            normalized.push(rendered);
            i += consumed;
            continue;
        }
        if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
            normalized.push(rendered);
            i += consumed;
            continue;
        }
        if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
            normalized.push(rendered);
            i += consumed;
            continue;
        }
        if should_drop_artifact_table_block(&blocks, i) {
            i += 1;
            continue;
        }
        // Anything else passes through unless it looks like a running
        // footer banner.
        if !looks_like_footer_banner(blocks[i]) {
            normalized.push(blocks[i].to_string());
        }
        i += 1;
    }
    normalized.join("\n\n").trim().to_string() + "\n"
}
/// Detects a very large pipe table at the very start of the document
/// (>=8 body rows and >=8 columns) followed by a long "Table …"
/// caption and a later main-results/heading section; when found,
/// returns just the table so the caller truncates everything else.
fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
    if start != 0 {
        return None;
    }
    let plate = blocks.first()?.trim();
    let rows = parse_pipe_table_block(plate)?;
    let body_rows = rows.len().saturating_sub(2);
    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);
    if body_rows < 8 || widest < 8 {
        return None;
    }
    let caption = blocks.get(1)?.trim();
    if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
        return None;
    }
    // `||` binds looser than `&&`: the digit check only pairs with the
    // " Main Results" containment test.
    let has_following_section = blocks.iter().skip(2).any(|block| {
        let trimmed = block.trim();
        trimmed.starts_with("# ")
            || trimmed.starts_with("## ")
            || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
                && trimmed.contains(" Main Results")
    });
    if has_following_section {
        Some(plate.to_string())
    } else {
        None
    }
}
/// Converts a structural caption followed by a two-row pipe table of
/// "value year" cells into a proper `| Year | <value> |` Markdown
/// table, absorbing an optional trailing "Source:" block. Returns the
/// rendered Markdown and the number of input blocks consumed.
fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
    let caption = blocks.get(start)?.trim();
    if !is_structural_caption(caption) {
        return None;
    }
    let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
    if rows.len() != 2 {
        return None;
    }
    let pairs = extract_value_year_pairs_from_cells(&rows[0]);
    if pairs.len() < 4 {
        return None;
    }
    let mut consumed = 2usize;
    let mut source = String::new();
    if let Some(next) = blocks.get(start + 2).map(|block| block.trim()) {
        if next.to_ascii_lowercase().starts_with("source:") {
            source = next.to_string();
            consumed += 1;
        }
    }
    let mut out = String::new();
    // The first block of the document becomes a top-level heading.
    out.push_str(if start == 0 { "# " } else { "## " });
    out.push_str(caption);
    out.push_str("\n\n");
    out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
    out.push_str("| --- | --- |\n");
    for (year, value) in pairs {
        out.push_str(&format!("| {} | {} |\n", year, value));
    }
    out.push('\n');
    if !source.is_empty() {
        out.push('*');
        out.push_str(&escape_md_line_start(&source));
        out.push_str("*\n\n");
    }
    Some((out.trim().to_string(), consumed))
}
/// Rewrites a caption-plus-numbers chart block as a Markdown table,
/// pairing axis labels from the following block with derived series
/// values, and swallowing trailing numeric-noise blocks. Returns the
/// rendered Markdown and the number of input blocks consumed.
fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
    let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
    let mut consumed = 1usize;
    let mut labels = Vec::new();
    let mut source = String::new();
    if let Some(next_block) = blocks.get(start + 1) {
        let (found_labels, found_source) = extract_chart_labels_and_source(next_block);
        if !found_labels.is_empty() || !found_source.is_empty() {
            labels = found_labels;
            source = found_source;
            consumed += 1;
        }
    }
    // Residual all-numeric blocks are leftovers of the flattened chart.
    while blocks
        .get(start + consumed)
        .is_some_and(|block| looks_like_numeric_noise_block(block))
    {
        consumed += 1;
    }
    let values = derive_chart_series_values(&numeric_tokens, labels.len());
    let mut out = String::new();
    out.push_str("## ");
    out.push_str(caption.trim());
    out.push_str("\n\n");
    // Only emit a table when every label pairs with a value.
    if labels.len() >= 3 && labels.len() == values.len() {
        let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
            "Year"
        } else {
            "Label"
        };
        out.push_str(&format!(
            "| {} | {} |\n",
            label_header,
            chart_value_header(&caption)
        ));
        out.push_str("| --- | --- |\n");
        for (label, value) in labels.iter().zip(values.iter()) {
            out.push_str(&format!("| {} | {} |\n", label, value));
        }
        out.push('\n');
    }
    if !source.is_empty() {
        out.push('*');
        out.push_str(&escape_md_line_start(&source));
        out.push_str("*\n\n");
    }
    Some((out.trim().to_string(), consumed))
}
/// Promotes an isolated structural caption (optionally merged with a
/// short continuation block) to a `##` heading. Returns the heading
/// and the number of input blocks consumed.
fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
    let block = blocks.get(start)?.trim();
    if !is_structural_caption(block) || block.contains('|') {
        return None;
    }
    // A caption must be followed by something; a trailing caption at
    // the very end of the document is left untouched.
    let next = blocks.get(start + 1)?.trim();
    let mut caption = collapse_inline_whitespace(block);
    let mut consumed = 1usize;
    if looks_like_caption_continuation(next) {
        caption.push(' ');
        caption.push_str(next.trim_end_matches('.'));
        consumed += 1;
    } else if !looks_like_isolated_caption_context(block, next) {
        return None;
    }
    Some((format!("## {}", caption.trim()), consumed))
}
/// Splits a caption block whose tail is a run of numeric tokens into
/// (caption text, sanitized numeric tokens). Requires at least three
/// leading caption words and four surviving numeric values.
fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
    let trimmed = block.trim();
    if !is_structural_caption(trimmed) {
        return None;
    }
    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
    let first_numeric = tokens.iter().position(|token| is_numberish_token(token))?;
    if first_numeric < 3 {
        return None;
    }
    let numeric_tokens: Vec<String> = tokens[first_numeric..]
        .iter()
        .filter_map(|token| sanitize_numberish_token(token))
        .collect();
    if numeric_tokens.len() < 4 {
        return None;
    }
    Some((tokens[..first_numeric].join(" "), numeric_tokens))
}
/// Parses a Markdown pipe table into rows of cell strings, including
/// the separator row. Returns `None` unless there are at least two
/// lines, each line is a valid pipe row, and the second line is a
/// `---` separator matching the header width.
fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
    let lines: Vec<&str> = block
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect();
    let (&header_line, rest) = lines.split_first()?;
    let (&separator_line, body) = rest.split_first()?;
    let header = split_pipe_row(header_line)?;
    if !is_pipe_separator_row(separator_line, header.len()) {
        return None;
    }
    let mut rows = vec![header, split_pipe_row(separator_line).unwrap_or_default()];
    for line in body {
        rows.push(split_pipe_row(line)?);
    }
    Some(rows)
}
/// Splits a `|`-delimited Markdown table row into trimmed cell strings.
///
/// Returns `None` when the line is not wrapped in pipes. Using
/// `strip_prefix`/`strip_suffix` also fixes a panic the previous
/// slice-based version hit on a lone `"|"` line, where the opening and
/// closing pipe were the same character (`trimmed[1..0]`).
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    let inner = line.trim().strip_prefix('|')?.strip_suffix('|')?;
    Some(
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
/// True when `line` is a table separator row (`| --- | :--- |`) with
/// exactly `expected_cols` cells, each a dash run optionally flanked
/// by alignment colons.
fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
    let cells = match split_pipe_row(line) {
        Some(cells) => cells,
        None => return false,
    };
    expected_cols != 0
        && cells.len() == expected_cols
        && cells.iter().all(|cell| {
            let dashes = cell.trim_matches(':').trim();
            !dashes.is_empty() && dashes.chars().all(|ch| ch == '-')
        })
}
/// Extracts (year, value) pairs from cells containing exactly two
/// tokens in either "year value" or "value year" order; the result is
/// sorted by year.
fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    for cell in cells {
        let tokens: Vec<&str> = cell.split_whitespace().collect();
        let [first, second] = tokens.as_slice() else {
            continue;
        };
        if looks_like_year_token(first) && is_numberish_token(second) {
            if let Some(value) = sanitize_numberish_token(second) {
                pairs.push((first.to_string(), value));
            }
        } else if is_numberish_token(first) && looks_like_year_token(second) {
            if let Some(value) = sanitize_numberish_token(first) {
                pairs.push((second.to_string(), value));
            }
        }
    }
    pairs.sort_by(|left, right| left.0.cmp(&right.0));
    pairs
}
/// Heuristically decides whether a pipe-table block is layout residue
/// (URL fragments, chart-axis blobs, or sparse tables wedged between
/// captions/citations) rather than real tabular data.
fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
    let rows = match parse_pipe_table_block(blocks[start]) {
        Some(rows) => rows,
        None => return false,
    };
    let prev = start
        .checked_sub(1)
        .and_then(|idx| blocks.get(idx))
        .map_or("", |block| block.trim());
    let next = blocks.get(start + 1).map_or("", |block| block.trim());
    // A one-cell, two-row "table" is usually a stray URL or axis blob.
    if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
        let header = rows[0][0].trim();
        if looks_like_url_fragment(header) {
            return true;
        }
        if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
            return true;
        }
    }
    let stats = pipe_table_stats(&rows);
    stats.fill_ratio < 0.5
        && stats.long_cell_count == 0
        && !is_structural_caption(prev)
        && (looks_like_citation_block(next) || is_structural_caption(next))
}
/// True when the preceding prose explicitly introduces a table
/// ("… as follows:", "… the following details:").
fn previous_block_announces_table(block: &str) -> bool {
    let lower = block.trim().to_ascii_lowercase();
    ["as follows:", "following details:", "following detail:"]
        .iter()
        .any(|suffix| lower.ends_with(suffix))
        || lower.contains("the following details")
}
/// True for text that looks like a URL or URL fragment: it contains
/// "http" or "/status/", or is a single space-free token with a slash.
fn looks_like_url_fragment(text: &str) -> bool {
    let trimmed = text.trim();
    if !trimmed.is_empty() && (trimmed.contains("http") || trimmed.contains("/status/")) {
        return true;
    }
    trimmed.contains('/') && !trimmed.contains(' ')
}
/// True when a cell holds at least eight integers forming an
/// arithmetic progression plus some alphabetic text — the typical
/// flattening of a chart axis with its labels.
fn looks_like_numeric_axis_blob(text: &str) -> bool {
    let values: Vec<i64> = text
        .split_whitespace()
        .filter_map(parse_integer_token)
        .collect();
    if values.len() < 8 || !text.chars().any(char::is_alphabetic) {
        return false;
    }
    !detect_axis_progression(&values).is_empty()
}
/// True for short parenthesized citations like "(Smith et al. 2020)".
fn looks_like_citation_block(block: &str) -> bool {
    let trimmed = block.trim();
    trimmed.starts_with('(')
        && trimmed.ends_with(')')
        && trimmed.split_whitespace().count() <= 8
}
/// Sparsity metrics for a parsed pipe table; header and separator rows
/// are excluded from all body counts.
struct PipeTableStats {
    // Fraction of body cells (rows beyond the first two) that are
    // non-empty, over a body_rows x max_cols grid.
    fill_ratio: f64,
    // Number of body cells containing three or more words.
    long_cell_count: usize,
}
fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
    let cols = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
    let body_rows = rows.len().saturating_sub(2);
    let mut filled = 0usize;
    let mut long_cell_count = 0usize;
    for cell in rows.iter().skip(2).flatten() {
        if cell.trim().is_empty() {
            continue;
        }
        filled += 1;
        if cell.split_whitespace().count() >= 3 {
            long_cell_count += 1;
        }
    }
    let fill_ratio = if body_rows == 0 {
        0.0
    } else {
        filled as f64 / (body_rows * cols) as f64
    };
    PipeTableStats {
        fill_ratio,
        long_cell_count,
    }
}
fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
let trimmed = block.trim();
let lower = trimmed.to_ascii_lowercase();
let source_idx = lower.find("source:");
let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
let source = source_idx
.map(|idx| trimmed[idx..].trim().to_string())
.unwrap_or_default();
let labels = parse_chart_labels(label_region);
(labels, source)
}
/// Extracts axis labels from text: four-digit years (optionally merged
/// with a following "(…)" qualifier, e.g. "2020 (est.)") and simple
/// alphanumeric category words.
fn parse_chart_labels(text: &str) -> Vec<String> {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    let mut labels = Vec::new();
    let mut i = 0usize;
    while i < tokens.len() {
        let token = tokens[i].trim_matches(|c: char| matches!(c, ',' | ';'));
        if looks_like_year_token(token) {
            let mut label = token.to_string();
            // Absorb a parenthesized qualifier following the year.
            if let Some(next) = tokens.get(i + 1) {
                let qualifier = next.trim_matches(|c: char| matches!(c, ',' | ';'));
                if qualifier.starts_with('(') && qualifier.ends_with(')') {
                    label.push(' ');
                    label.push_str(qualifier);
                    i += 1;
                }
            }
            labels.push(label);
        } else if looks_like_category_label(token) {
            labels.push(token.to_string());
        }
        i += 1;
    }
    labels
}
/// Recovers the data series from a mixed token run by removing one
/// detected axis progression (each axis value consumed at most once).
/// Returns the tokens unchanged when the count already matches, and an
/// empty vector when the remainder does not reach `expected_count`.
fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
    if expected_count == 0 {
        return Vec::new();
    }
    if tokens.len() == expected_count {
        return tokens.to_vec();
    }
    let numeric: Vec<i64> = tokens
        .iter()
        .filter_map(|token| parse_integer_token(token))
        .collect();
    // Every token must parse as an integer for the axis split to apply.
    if numeric.len() != tokens.len() {
        return Vec::new();
    }
    let mut axis = detect_axis_progression(&numeric);
    if axis.is_empty() {
        return Vec::new();
    }
    let mut series = Vec::new();
    for token in tokens {
        let Some(value) = parse_integer_token(token) else {
            continue;
        };
        match axis.iter().position(|candidate| *candidate == value) {
            Some(pos) => {
                axis.remove(pos);
            }
            None => series.push(token.clone()),
        }
    }
    if series.len() == expected_count {
        series
    } else {
        Vec::new()
    }
}
/// Finds the longest arithmetic progression (length >= 6) hidden in
/// `values`, trying each adjacent pair of the sorted distinct values
/// as a (start, step) seed. Returns the progression, or an empty
/// vector when none qualifies.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    if values.len() < 6 {
        return Vec::new();
    }
    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < 6 {
        return Vec::new();
    }
    let mut best: Vec<i64> = Vec::new();
    for pair in sorted.windows(2) {
        let (start, step) = (pair[0], pair[1] - pair[0]);
        if step <= 0 {
            continue;
        }
        let mut series = vec![start];
        let mut current = start;
        // Extend while the next term exists among the distinct values.
        while sorted.binary_search(&(current + step)).is_ok() {
            current += step;
            series.push(current);
        }
        if series.len() > best.len() {
            best = series;
        }
    }
    if best.len() >= 6 {
        best
    } else {
        Vec::new()
    }
}
/// Derives a value-column header from a chart caption: strips the
/// "Figure N"-style prefix, drops a short trailing " in <Place>"
/// qualifier, and rewrites a trailing "(in unit)" as "(unit)". Falls
/// back to "Value" when nothing usable remains.
fn chart_value_header(caption: &str) -> String {
    let trimmed = caption.trim();
    let title = strip_structural_caption_prefix(trimmed);
    let mut base = title.to_string();
    // Drop a short capitalized locality, e.g. "… in India".
    if let Some(idx) = base.rfind(" in ") {
        let tail = base[idx + 4..].trim();
        if tail.split_whitespace().count() <= 2
            && tail.chars().next().is_some_and(char::is_uppercase)
        {
            base.truncate(idx);
        }
    }
    // A trailing parenthesized unit in the ORIGINAL title is re-attached
    // to the (possibly shortened) base, normalizing "(in X)" to "(X)".
    if let Some(start) = title.rfind('(') {
        if title.ends_with(')') {
            let unit = title[start + 1..title.len() - 1].trim();
            if let Some(idx) = base.rfind('(') {
                base.truncate(idx);
            }
            let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
            return format!("{} ({})", base.trim(), normalized_unit);
        }
    }
    let trimmed = base.trim();
    if trimmed.is_empty() {
        "Value".to_string()
    } else {
        trimmed.to_string()
    }
}
/// Removes a leading "Figure 2."/"Table 3:"-style prefix, returning
/// the remaining title. The original trimmed text is returned when the
/// first two words do not form such a prefix.
fn strip_structural_caption_prefix(text: &str) -> &str {
    let trimmed = text.trim();
    let mut pieces = trimmed.splitn(3, ' ');
    let (Some(kind), Some(number), Some(rest)) = (pieces.next(), pieces.next(), pieces.next())
    else {
        return trimmed;
    };
    let kind_matches = matches!(
        kind.to_ascii_lowercase().as_str(),
        "figure" | "table" | "diagram" | "chart"
    );
    let number_matches = number
        .chars()
        .all(|ch| ch.is_ascii_digit() || matches!(ch, '.' | ':'));
    if kind_matches && number_matches {
        rest.trim()
    } else {
        trimmed
    }
}
/// True for single-line running footers like "Journal of Botany 17":
/// 2-6 tokens ending in a bare number, preceded only by capitalized
/// words or small connectives.
fn looks_like_footer_banner(block: &str) -> bool {
    let trimmed = block.trim();
    if trimmed.contains('\n') || trimmed.len() < 8 {
        return false;
    }
    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
    if tokens.len() < 2 || tokens.len() > 6 {
        return false;
    }
    let Some((page, words)) = tokens.split_last() else {
        return false;
    };
    if !page.chars().all(|ch| ch.is_ascii_digit()) {
        return false;
    }
    words.iter().all(|word| {
        let connective = matches!(
            word.to_ascii_lowercase().as_str(),
            "of" | "and" | "the" | "for" | "in" | "on"
        );
        connective || word.chars().next().is_some_and(char::is_uppercase)
    })
}
/// True for a short, capitalized, colon-free fragment that plausibly
/// continues a caption.
fn looks_like_caption_continuation(block: &str) -> bool {
    let trimmed = block.trim();
    if trimmed.is_empty() || trimmed.contains(':') {
        return false;
    }
    trimmed.split_whitespace().count() <= 8
        && trimmed.chars().next().is_some_and(char::is_uppercase)
}
/// Collapses every run of whitespace to a single space and trims the
/// ends.
fn collapse_inline_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
/// Removes single-character noise lines stranded between substantive
/// Markdown lines, preserving the trailing newline if present.
fn drop_isolated_noise_lines(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let kept: Vec<&str> = lines
        .iter()
        .enumerate()
        .filter(|(idx, _)| !should_drop_isolated_noise_line(&lines, *idx))
        .map(|(_, line)| *line)
        .collect();
    let mut result = kept.join("\n");
    if markdown.ends_with('\n') {
        result.push('\n');
    }
    result
}
/// A line is droppable noise when it is exactly one lowercase letter or
/// digit and sits between two substantive non-empty lines.
fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
    let trimmed = lines[idx].trim();
    if trimmed.len() != 1 {
        return false;
    }
    let only = trimmed.chars().next().unwrap_or_default();
    if !(only.is_ascii_lowercase() || only.is_ascii_digit()) {
        return false;
    }
    match (
        previous_nonempty_line(lines, idx),
        next_nonempty_line(lines, idx),
    ) {
        (Some(prev), Some(next)) => {
            is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
        }
        _ => false,
    }
}
/// Nearest non-blank line strictly before `idx`, if any.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    lines[..idx]
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
/// Nearest non-blank line strictly after `idx`, if any.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    for line in &lines[idx + 1..] {
        if !line.trim().is_empty() {
            return Some(line);
        }
    }
    None
}
/// True for lines carrying real content: table rows, bullets, headings,
/// or anything with at least two words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let trimmed = line.trim();
    if trimmed.is_empty() {
        return false;
    }
    let structural =
        trimmed.starts_with('|') || trimmed.starts_with("- ") || trimmed.starts_with('#');
    structural || trimmed.split_whitespace().count() >= 2
}
/// Applies targeted OCR fixups (micro-liter and degree-sign artifacts
/// seen in lab protocols), normalizes "NN C"/"NN F" spacing, and
/// collapses whitespace runs to single spaces.
fn normalize_common_ocr_text(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }
    // Literal replacements for recurring OCR mis-reads.
    // NOTE(review): the last two `replace` calls appear byte-identical
    // here. If they were meant to cover two different micro-sign code
    // points (U+00B5 MICRO SIGN vs U+03BC GREEK SMALL MU), confirm and
    // differentiate them — otherwise one is redundant.
    let mut normalized = text
        .replace("ߤL", "μL")
        .replace(" oC", "°C")
        .replace("37 C", "37°C")
        .replace("-20 oC", "-20°C")
        .replace("1- 20-μL", "1-20-μL")
        .replace("1- 20 μL", "1-20 μL")
        .replace("1- 2 0 μL", "1-20 μL")
        .replace("1- 2 0 μL", "1-20 μL");
    normalized = normalize_degree_spacing(&normalized);
    collapse_inline_whitespace(&normalized)
}
/// Rewrites "NN C"/"NN F" (digit, space, unit letter) as "NN°C"/"NN°F".
///
/// The unit letter must not be followed by another letter, so words
/// like "20 Cats" stay untouched. Fix: the previous guard required
/// `i + 2 < chars.len()`, which skipped a trailing "NN C" at the very
/// end of the string; end-of-string now counts as a valid boundary.
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut i = 0usize;
    while i < chars.len() {
        let ch = chars[i];
        let degree_gap = ch == ' '
            && i > 0
            && i + 1 < chars.len()
            && chars[i - 1].is_ascii_digit()
            && matches!(chars[i + 1], 'C' | 'F')
            // Nothing after the unit letter, or a non-letter, ends it.
            && chars.get(i + 2).map_or(true, |next| !next.is_ascii_alphabetic());
        if degree_gap {
            out.push('°');
            out.push(chars[i + 1]);
            i += 2;
        } else {
            out.push(ch);
            i += 1;
        }
    }
    out
}
fn normalize_list_text(text: &str) -> String {
let normalized = normalize_common_ocr_text(text);
let trimmed = normalized
.trim_start_matches(|ch: char| is_bullet_like(ch))
.trim();
trimmed.to_string()
}
/// Writes one list item line: items already carrying an enumeration
/// marker ("1.", "a)") are emitted verbatim, others get a "- " bullet.
fn push_rendered_list_item(out: &mut String, item: &str) {
    if !starts_with_enumerated_marker(item) {
        out.push_str("- ");
    }
    out.push_str(item);
    out.push('\n');
}
/// Decides whether `current` continues the previous list item: either
/// it completes a hyphenated word, or it begins with a lowercase letter
/// or trailing punctuation. Headings, page numbers, numbered sections,
/// and enumerated items never merge.
fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
    let trimmed = current.trim();
    let blocked = trimmed.is_empty()
        || looks_like_stray_list_page_number(trimmed)
        || is_list_section_heading(trimmed)
        || looks_like_numbered_section(trimmed)
        || starts_with_enumerated_marker(trimmed);
    if blocked {
        return false;
    }
    // "exam-" + "ple" style hyphenated word break.
    let hyphen_break = previous.ends_with('-')
        && previous
            .chars()
            .rev()
            .nth(1)
            .is_some_and(|c| c.is_alphabetic())
        && trimmed.chars().next().is_some_and(char::is_lowercase);
    if hyphen_break {
        return true;
    }
    matches!(
        trimmed.chars().next(),
        Some(ch) if ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%')
    )
}
/// True when the trimmed text is non-empty and consists solely of
/// bullet glyphs.
fn is_pure_bullet_marker(text: &str) -> bool {
    let mut chars = text.trim().chars().peekable();
    chars.peek().is_some() && chars.all(is_bullet_like)
}
/// True for 1-4 digit tokens that are likely page numbers leaked into
/// a list.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let trimmed = text.trim();
    !trimmed.is_empty() && trimmed.len() <= 4 && trimmed.bytes().all(|b| b.is_ascii_digit())
}
/// True for characters commonly used as list-bullet glyphs (plus the
/// plain hyphen).
fn is_bullet_like(ch: char) -> bool {
    "•◦▪▸▹►▻●○■□◆◇-".contains(ch)
}
/// Decides whether `caption` plus the following block look like an
/// isolated caption with supporting context: a source/note annotation,
/// or a short key:value-style fragment.
fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
    let next = next_block.trim();
    if next.is_empty() {
        return false;
    }
    let lower = next.to_ascii_lowercase();
    let is_annotation = ["source:", "note:", "*source:", "*note:"]
        .iter()
        .any(|prefix| lower.starts_with(prefix));
    if is_annotation {
        return true;
    }
    caption.split_whitespace().count() <= 14
        && next.split_whitespace().count() <= 45
        && (next.contains(':') || next.contains('='))
}
/// True when every token in the block sanitizes to a plain digit run —
/// i.e. the block is leftover chart numbers.
fn looks_like_numeric_noise_block(block: &str) -> bool {
    let trimmed = block.trim();
    if trimmed.is_empty() {
        return false;
    }
    trimmed.split_whitespace().all(|token| {
        matches!(
            sanitize_numberish_token(token),
            Some(clean) if clean.bytes().all(|b| b.is_ascii_digit())
        )
    })
}
/// True when the label begins with an ASCII digit (year-like label).
fn looks_like_yearish_label(label: &str) -> bool {
    matches!(label.chars().next(), Some(ch) if ch.is_ascii_digit())
}
/// True for a token of exactly four ASCII digits.
fn looks_like_year_token(token: &str) -> bool {
    token.len() == 4 && token.bytes().all(|b| b.is_ascii_digit())
}
/// True for tokens built from ASCII alphanumerics plus '-', '/', '%'
/// that contain at least one letter.
fn looks_like_category_label(token: &str) -> bool {
    let charset_ok = token
        .chars()
        .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '/' | '%'));
    charset_ok && token.chars().any(|ch| ch.is_ascii_alphabetic())
}
/// True when `token` survives `sanitize_numberish_token`, i.e. it is a
/// digit run possibly carrying thousands separators or a trailing '%'.
fn is_numberish_token(token: &str) -> bool {
    sanitize_numberish_token(token).is_some()
}
/// Strips list punctuation from a numeric token and validates it.
///
/// Leading/trailing `,` `;` `:` `.` are trimmed; the token is accepted
/// when, after dropping a trailing `%` and thousands separators, only
/// ASCII digits remain. The returned string keeps its interior
/// separators and any percent sign.
///
/// Fixes: (1) the old final `trim_end_matches([',',';',':'])` was a
/// no-op (those chars were already trimmed) and is removed; (2) a lone
/// "%" no longer passes — the empty post-strip candidate previously
/// satisfied `all()` vacuously and was accepted as numeric.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));
    if trimmed.is_empty() {
        return None;
    }
    // Validate on a stripped copy; return the unstripped form.
    let digits_only = trimmed.trim_end_matches('%').replace(',', "");
    if !digits_only.is_empty() && digits_only.bytes().all(|b| b.is_ascii_digit()) {
        Some(trimmed.to_string())
    } else {
        None
    }
}
/// Parses a sanitized numeric token (commas removed) as `i64`; tokens
/// that keep a trailing '%' fail the parse and yield `None`.
fn parse_integer_token(token: &str) -> Option<i64> {
    let sanitized = sanitize_numberish_token(token)?;
    sanitized.replace(',', "").parse().ok()
}
/// True when the first alphabetic character — skipping only opening
/// quotes, parentheses, and brackets — is uppercase.
fn starts_with_uppercase_word(text: &str) -> bool {
    for ch in text.trim_start().chars() {
        if ch.is_alphabetic() {
            return ch.is_uppercase();
        }
        // Any other non-letter before the first letter disqualifies.
        if !matches!(ch, '"' | '\'' | '(' | '[') {
            return false;
        }
    }
    false
}
/// Trims the text, collapses runs of spaces/tabs (newlines untouched)
/// to single spaces, and applies common OCR fixups.
fn clean_paragraph_text(text: &str) -> String {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return String::new();
    }
    let mut collapsed = String::with_capacity(trimmed.len());
    let mut last_was_space = false;
    for ch in trimmed.chars() {
        if matches!(ch, ' ' | '\t') {
            if !last_was_space {
                collapsed.push(' ');
            }
            last_was_space = true;
        } else {
            collapsed.push(ch);
            last_was_space = false;
        }
    }
    normalize_common_ocr_text(&collapsed)
}
/// Returns the cleaned text of a paragraph/text-block/text-line element
/// when it can be merged into preceding prose — i.e. it is non-empty
/// and would not itself be rendered as a heading.
fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
    let element = element?;
    let text = match element {
        ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
        ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
        ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
        _ => return None,
    };
    let trimmed = text.trim();
    if trimmed.is_empty() || should_render_element_as_heading(element, trimmed, None) {
        None
    } else {
        Some(trimmed.to_string())
    }
}
/// Decides whether the paragraph at `idx` should be promoted to a
/// Markdown heading, combining explicit element signals with rescue
/// heuristics for documents that lack (or underuse) real headings.
fn should_render_paragraph_as_heading(
    doc: &PdfDocument,
    idx: usize,
    text: &str,
    next: Option<&ContentElement>,
) -> bool {
    // Running headers in the top margin never become headings.
    if looks_like_top_margin_running_header(doc, idx, text) {
        return false;
    }
    if looks_like_hyphenated_table_title_continuation(doc, idx, text, next) {
        return true;
    }
    if should_render_element_as_heading(&doc.kids[idx], text, next) {
        return true;
    }
    let body_font_size = compute_body_font_size(doc);
    // Text smaller than the body font cannot be a heading.
    if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
        return false;
    }
    // No explicit headings at all: rely entirely on rescue heuristics.
    if !doc_has_explicit_headings(doc) {
        if should_rescue_as_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_allcaps_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_numbered_heading(doc, idx, text) {
            return true;
        }
        return false;
    }
    // Sparse headings (<10% of elements): rescue conservatively, also
    // accepting fonts noticeably larger than the body size.
    if heading_density(doc) < 0.10 {
        if should_rescue_allcaps_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_numbered_heading(doc, idx, text) {
            return true;
        }
        if body_font_size > 0.0 {
            if let ContentElement::Paragraph(p) = &doc.kids[idx] {
                if let Some(fs) = p.base.font_size {
                    // 15% larger than body text, heading-shaped, and
                    // followed by substantive prose within 4 elements.
                    if fs >= 1.15 * body_font_size
                        && is_heading_rescue_candidate(doc, idx, text)
                        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
                    {
                        return true;
                    }
                }
            }
        }
    }
    false
}
/// True when the document already contains explicit heading elements.
fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
    doc.kids
        .iter()
        .any(|kid| matches!(kid, ContentElement::Heading(_) | ContentElement::NumberHeading(_)))
}
/// Median font size over paragraphs longer than ten words (the upper
/// middle element for even counts); 0.0 when no such paragraph reports
/// a size.
fn compute_body_font_size(doc: &PdfDocument) -> f64 {
    let mut sizes: Vec<f64> = doc
        .kids
        .iter()
        .filter_map(|kid| match kid {
            ContentElement::Paragraph(p)
                if p.base.value().split_whitespace().count() > 10 =>
            {
                p.base.font_size
            }
            _ => None,
        })
        .collect();
    if sizes.is_empty() {
        return 0.0;
    }
    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    sizes[sizes.len() / 2]
}
/// True when the paragraph at `idx` reports a font size below 95% of
/// the body font size, disqualifying it as a heading. Unknown sizes
/// and non-paragraphs are never "too small".
fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
    if body_font_size <= 0.0 {
        return false;
    }
    match &doc_kids[idx] {
        ContentElement::Paragraph(p) => p
            .base
            .font_size
            .is_some_and(|fs| fs < 0.95 * body_font_size),
        _ => false,
    }
}
/// Fraction of document elements that are explicit headings; 0.0 for
/// an empty document.
fn heading_density(doc: &PdfDocument) -> f64 {
    if doc.kids.is_empty() {
        return 0.0;
    }
    let headings = doc
        .kids
        .iter()
        .filter(|kid| {
            matches!(
                kid,
                ContentElement::Heading(_) | ContentElement::NumberHeading(_)
            )
        })
        .count();
    headings as f64 / doc.kids.len() as f64
}
/// A paragraph qualifies as a rescued heading when it looks heading-like
/// and substantive body content follows within four elements.
fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let words = text.split_whitespace().count();
    is_heading_rescue_candidate(doc, idx, text) && has_substantive_follow_up(doc, idx, words, 4)
}
/// Heuristic filter for short, title-cased fragments that can safely be
/// promoted to headings in documents without explicit heading markup.
fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    // Must contain letters and must not read like the end of a sentence.
    if !trimmed.chars().any(char::is_alphabetic) {
        return false;
    }
    if trimmed.ends_with(['.', '!', '?', ';', ',']) {
        return false;
    }
    // Formula-like or percentage text is chart/maths content, not a title.
    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
        return false;
    }
    // Fully parenthesised fragments are asides, not headings.
    if trimmed.starts_with('(') && trimmed.ends_with(')') {
        return false;
    }
    // Captions and chart labels are excluded explicitly.
    if starts_with_caption_prefix(trimmed)
        || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
    {
        return false;
    }
    // Keep it short: at most six words and sixty bytes.
    if trimmed.split_whitespace().count() > 6 || trimmed.len() > 60 {
        return false;
    }
    // Pure numbering like "1.2 3" carries no title text.
    if trimmed
        .chars()
        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
    {
        return false;
    }
    // The first letter must not be lowercase.
    match trimmed.chars().find(|c| c.is_alphabetic()) {
        Some(first) if first.is_lowercase() => false,
        _ => true,
    }
}
/// Scans up to `max_lookahead` following elements for "real" content: a
/// text run noticeably longer than the candidate heading (at least three
/// times its word count, or over 15 words), or a structural element
/// (list, table, image, figure).
fn has_substantive_follow_up(
    doc: &PdfDocument,
    idx: usize,
    word_count: usize,
    max_lookahead: usize,
) -> bool {
    let is_long_enough = |text: &str| {
        let nw = text.split_whitespace().count();
        nw >= word_count * 3 || nw > 15
    };
    for element in doc.kids.iter().skip(idx + 1).take(max_lookahead) {
        match element {
            ContentElement::Paragraph(p) => {
                if is_long_enough(&p.base.value()) {
                    return true;
                }
            }
            ContentElement::TextBlock(tb) => {
                if is_long_enough(&tb.value()) {
                    return true;
                }
            }
            ContentElement::TextLine(tl) => {
                if is_long_enough(&tl.value()) {
                    return true;
                }
            }
            ContentElement::List(_)
            | ContentElement::Table(_)
            | ContentElement::TableBorder(_)
            | ContentElement::Image(_)
            | ContentElement::Figure(_) => return true,
            _ => {}
        }
    }
    false
}
/// Rescues "1.2 Title"-style fragments as headings when a reasonably long
/// text run (over ten words) or a structural element follows within three
/// elements.
fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() || trimmed.len() > 100 || !looks_like_numbered_section(trimmed) {
        return false;
    }
    if trimmed.ends_with(['!', '?', ';', ',']) {
        return false;
    }
    // A trailing period is only tolerated for keyword sections ("Step 3.").
    if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
        return false;
    }
    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
        return false;
    }
    let followed_by_body = |text: &str| text.split_whitespace().count() > 10;
    for candidate in doc.kids.iter().skip(idx + 1).take(3) {
        match candidate {
            ContentElement::Paragraph(p) => {
                if followed_by_body(&p.base.value()) {
                    return true;
                }
            }
            ContentElement::TextBlock(tb) => {
                if followed_by_body(&tb.value()) {
                    return true;
                }
            }
            ContentElement::TextLine(tl) => {
                if followed_by_body(&tl.value()) {
                    return true;
                }
            }
            ContentElement::List(_)
            | ContentElement::Table(_)
            | ContentElement::TableBorder(_)
            | ContentElement::Image(_)
            | ContentElement::Figure(_) => return true,
            _ => {}
        }
    }
    false
}
/// Recognises "1", "1.2", "1.2.3"-style section prefixes followed by a
/// separator (space/tab and/or hyphen) and a title whose first letter is
/// uppercase; text that does not start with a digit is delegated to
/// `looks_like_keyword_numbered_section` ("Chapter 3" etc.).
///
/// Fix: the original bound `rest` as `&text[idx..].trim()`, a needless
/// borrow producing `&&str` (clippy `needless_borrow`); the extra `&` is
/// dropped here.
fn looks_like_numbered_section(text: &str) -> bool {
    let bytes = text.as_bytes();
    if bytes.is_empty() {
        return false;
    }
    let mut idx = 0;
    if bytes[0].is_ascii_digit() {
        // Consume the leading number.
        while idx < bytes.len() && bytes[idx].is_ascii_digit() {
            idx += 1;
        }
        if idx >= bytes.len() {
            return false;
        }
        // Consume dotted sub-levels: ".2", ".3", ...
        while idx < bytes.len() && bytes[idx] == b'.' {
            idx += 1;
            let start = idx;
            while idx < bytes.len() && bytes[idx].is_ascii_digit() {
                idx += 1;
            }
            if idx == start {
                break;
            }
        }
        if idx >= bytes.len() {
            return false;
        }
        // Accept " ", "\t", "- ", " - " as number/title separators.
        if bytes[idx] == b' ' || bytes[idx] == b'\t' {
            idx += 1;
            if idx < bytes.len() && bytes[idx] == b'-' {
                idx += 1;
                if idx < bytes.len() && bytes[idx] == b' ' {
                    idx += 1;
                }
            }
        } else if bytes[idx] == b'-' {
            idx += 1;
            if idx < bytes.len() && bytes[idx] == b' ' {
                idx += 1;
            }
        } else {
            return false;
        }
        // `idx` always sits just after an ASCII byte, so this slice is on a
        // char boundary even for non-ASCII titles.
        let rest = text[idx..].trim();
        if rest.is_empty() {
            return false;
        }
        // The first alphabetic character of the title must be uppercase.
        return rest
            .chars()
            .find(|c| c.is_alphabetic())
            .is_some_and(char::is_uppercase);
    }
    looks_like_keyword_numbered_section(text)
}
/// Keywords that introduce keyword-numbered sections such as "Chapter 3"
/// or "Step 2"; stored lowercase (callers compare case-insensitively)
/// and kept in alphabetical order.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];
/// Matches headings of the form "<keyword> <number>" (e.g. "Chapter 3",
/// "Part IV", "Exercise #2"); the keyword comparison is case-insensitive.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();
    let Some(space_pos) = trimmed.find(' ') else {
        return false;
    };
    let (keyword, tail) = trimmed.split_at(space_pos);
    let is_keyword = SECTION_KEYWORDS
        .iter()
        .any(|k| keyword.eq_ignore_ascii_case(k));
    if !is_keyword {
        return false;
    }
    let rest = tail[1..].trim_start();
    if rest.is_empty() {
        return false;
    }
    // Allow an optional "#" before the number ("Exercise #2").
    let rest = rest.strip_prefix('#').unwrap_or(rest);
    // Accept digits or Roman-numeral initials (I, V, X, L).
    matches!(
        rest.chars().next(),
        Some(c) if c.is_ascii_digit() || matches!(c, 'I' | 'V' | 'X' | 'L')
    )
}
/// Rescues short ALL-CAPS fragments as headings when followed (within four
/// elements) by a text run of more than six words or a structural element.
fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    if trimmed.split_whitespace().count() > 8 || trimmed.len() > 80 {
        return false;
    }
    // Require at least two letters, every one of them uppercase.
    let mut alpha_count = 0usize;
    for c in trimmed.chars().filter(|c| c.is_alphabetic()) {
        if !c.is_uppercase() {
            return false;
        }
        alpha_count += 1;
    }
    if alpha_count < 2 {
        return false;
    }
    if trimmed.ends_with(['.', ';', ',']) {
        return false;
    }
    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
        return false;
    }
    if starts_with_caption_prefix(trimmed) {
        return false;
    }
    if trimmed
        .chars()
        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
    {
        return false;
    }
    let has_body = |text: &str| text.split_whitespace().count() > 6;
    for candidate in doc.kids.iter().skip(idx + 1).take(4) {
        match candidate {
            ContentElement::Paragraph(p) => {
                if has_body(&p.base.value()) {
                    return true;
                }
            }
            ContentElement::TextBlock(tb) => {
                if has_body(&tb.value()) {
                    return true;
                }
            }
            ContentElement::TextLine(tl) => {
                if has_body(&tl.value()) {
                    return true;
                }
            }
            ContentElement::List(_)
            | ContentElement::Table(_)
            | ContentElement::TableBorder(_)
            | ContentElement::Image(_)
            | ContentElement::Figure(_) => return true,
            _ => {}
        }
    }
    false
}
/// Decides whether a short, title-like text fragment directly preceding a
/// list should be rendered as a heading. "Contents" headers are always
/// accepted; attributions ("Source:", "Photo by ...") and chart labels
/// are rejected.
///
/// Fix: the lowercased copy of the text was computed twice (a second
/// `to_ascii_lowercase()` allocation inside the attribution check); the
/// first `lower` is reused.
fn should_render_element_as_heading(
    element: &ContentElement,
    text: &str,
    next: Option<&ContentElement>,
) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    let lower = trimmed.to_ascii_lowercase();
    // A table-of-contents header is a heading regardless of what follows.
    if matches!(lower.as_str(), "contents" | "table of contents")
        && trimmed.starts_with(|c: char| c.is_uppercase())
    {
        return true;
    }
    let word_count = trimmed.split_whitespace().count();
    let has_alpha = trimmed.chars().any(char::is_alphabetic);
    let title_like = has_alpha
        && word_count <= 4
        && trimmed.len() <= 40
        && !trimmed.ends_with(['.', '!', '?', ';', ':']);
    // Attribution lines look title-like but must stay plain text.
    let is_attribution = lower.starts_with("source:")
        || lower.starts_with("credit:")
        || lower.starts_with("photo by ")
        || lower.starts_with("photo credit")
        || lower.starts_with("image by ")
        || lower.starts_with("image credit");
    title_like
        && matches!(next, Some(ContentElement::List(_)))
        && !looks_like_chart_label_heading(element, trimmed)
        && !is_attribution
}
/// Detects a table-title fragment ("... Summary:") that continues a
/// hyphen-terminated previous element, directly before a table.
fn looks_like_hyphenated_table_title_continuation(
    doc: &PdfDocument,
    idx: usize,
    text: &str,
    next: Option<&ContentElement>,
) -> bool {
    let next_is_table = matches!(
        next,
        Some(ContentElement::Table(_)) | Some(ContentElement::TableBorder(_))
    );
    if !next_is_table {
        return false;
    }
    let trimmed = text.trim();
    if trimmed.is_empty() || !trimmed.ends_with(':') {
        return false;
    }
    if starts_with_caption_prefix(trimmed)
        || looks_like_numbered_section(trimmed)
        || looks_like_keyword_numbered_section(trimmed)
    {
        return false;
    }
    let word_count = trimmed.split_whitespace().count();
    if !(3..=5).contains(&word_count) || trimmed.len() > 60 {
        return false;
    }
    // Must contain a letter, and the first letter must not be lowercase.
    let Some(first_alpha) = trimmed.chars().find(|ch| ch.is_alphabetic()) else {
        return false;
    };
    if first_alpha.is_lowercase() {
        return false;
    }
    // The previous element must end with a hyphen (a split title).
    if idx == 0 {
        return false;
    }
    let prev_text = extract_element_text(&doc.kids[idx - 1]);
    let prev_trimmed = prev_text.trim();
    !prev_trimmed.is_empty() && prev_trimmed.ends_with('-')
}
/// Detects a heading that merely repeats the header row of the table
/// immediately above it (a layout artifact), in a context where the rest
/// of the page holds only short, partly numeric fragments.
fn looks_like_table_header_duplicate_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    // Captions and numbered sections are legitimate headings.
    if trimmed.is_empty()
        || starts_with_caption_prefix(trimmed)
        || looks_like_numbered_section(trimmed)
        || looks_like_keyword_numbered_section(trimmed)
    {
        return false;
    }
    let word_count = trimmed.split_whitespace().count();
    if !(3..=10).contains(&word_count) || trimmed.len() > 96 {
        return false;
    }
    // The immediately preceding element must be a reasonably large table.
    let Some(prev_idx) = idx.checked_sub(1) else {
        return false;
    };
    let Some(previous_table) = table_border_from_element(&doc.kids[prev_idx]) else {
        return false;
    };
    if previous_table.num_columns < 3 || previous_table.rows.len() < 3 {
        return false;
    }
    // Reconstruct the table's effective header row the same way the
    // renderer does (merging continuation rows, dropping carry-over rows).
    let mut rendered_rows = collect_table_border_rows(previous_table);
    if rendered_rows.is_empty() {
        return false;
    }
    merge_continuation_rows(&mut rendered_rows);
    trim_leading_table_carryover_rows(&mut rendered_rows);
    let Some(header_row) = rendered_rows.first() else {
        return false;
    };
    // Join the non-empty header cells into one comparable string.
    let header_text = header_row
        .iter()
        .map(|cell| cell.trim())
        .filter(|cell| !cell.is_empty())
        .collect::<Vec<_>>()
        .join(" ");
    if !equivalent_heading_text(trimmed, &header_text) {
        return false;
    }
    // After the candidate, the remainder of the page (up to the next
    // table) must be short fragments, at least one containing a digit —
    // the signature of table residue rather than body text.
    let page_number = doc.kids[idx].page_number();
    let mut short_fragments = 0usize;
    let mut numeric_fragments = 0usize;
    for candidate in doc.kids.iter().skip(idx + 1) {
        if candidate.page_number() != page_number {
            break;
        }
        if matches!(
            candidate,
            ContentElement::Table(_) | ContentElement::TableBorder(_)
        ) {
            break;
        }
        let fragment = extract_element_text(candidate);
        let fragment_trimmed = fragment.trim();
        // Margin page numbers and blanks are ignored entirely.
        if fragment_trimmed.is_empty()
            || looks_like_margin_page_number(doc, candidate, fragment_trimmed)
        {
            continue;
        }
        let fragment_words = fragment_trimmed.split_whitespace().count();
        // Any long fragment means real prose follows: not a duplicate.
        if fragment_words > 6 {
            return false;
        }
        short_fragments += 1;
        if fragment_trimmed.chars().any(|ch| ch.is_ascii_digit()) {
            numeric_fragments += 1;
        }
        if short_fragments >= 3 {
            break;
        }
    }
    short_fragments >= 2 && numeric_fragments >= 1
}
fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
return false;
}
let element = &doc.kids[idx];
let bbox = element.bbox();
if bbox.height() > 24.0 {
return false;
}
let Some(page) = element.page_number() else {
return false;
};
let mut page_tops = std::collections::HashMap::<u32, f64>::new();
for candidate in &doc.kids {
if let Some(p) = candidate.page_number() {
let top = page_tops.entry(p).or_insert(f64::MIN);
*top = top.max(candidate.bbox().top_y);
}
}
let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
if bbox.top_y < page_top - 24.0 {
return false;
}
let trimmed_lower = trimmed.to_lowercase();
for other_elem in &doc.kids {
let Some(other_page) = other_elem.page_number() else {
continue;
};
if other_page == page {
continue;
}
let other_bbox = other_elem.bbox();
if other_bbox.height() > 24.0 {
continue;
}
let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
if other_bbox.top_y < other_top - 24.0 {
continue;
}
let other_text = match other_elem {
ContentElement::Paragraph(p) => p.base.value(),
ContentElement::TextBlock(tb) => tb.value(),
ContentElement::TextLine(tl) => tl.value(),
ContentElement::Heading(h) => h.base.base.value(),
_ => continue,
};
if other_text.trim().to_lowercase() == trimmed_lower {
return true;
}
}
false
}
/// Chart labels are short boxes (height <= 40pt) containing percentages
/// or only ALL-CAPS words; they must not be promoted to headings.
fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
    if element.bbox().height() > 40.0 {
        return false;
    }
    let trimmed = text.trim();
    if trimmed.contains('%') {
        return true;
    }
    trimmed
        .split_whitespace()
        .filter(|word| word.chars().any(char::is_alphabetic))
        .all(|word| {
            word.chars()
                .filter(|ch| ch.is_alphabetic())
                .all(|ch| ch.is_uppercase())
        })
}
/// Demotes a heading back to a paragraph when the following text starts
/// lowercase and the heading itself ends in a connective word ("of",
/// "the", ...) — i.e. it is really a broken-off sentence.
fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
    let continues_lowercase = next
        .trim()
        .chars()
        .next()
        .is_some_and(char::is_lowercase);
    if !continues_lowercase {
        return false;
    }
    // Well-known structural headings are never demoted.
    let normalized = normalize_heading_text(text);
    if matches!(
        normalized.as_str(),
        "contents" | "tableofcontents" | "introduction" | "conclusion"
    ) {
        return false;
    }
    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() < 3 {
        return false;
    }
    words
        .last()
        .is_some_and(|word| is_sentence_fragment_tail(word))
}
/// True when `word` (stripped of surrounding punctuation, compared
/// case-insensitively) is a connective English word that cannot
/// legitimately end a heading.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const TAIL_WORDS: &[&str] = &[
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or",
        "that", "the", "to", "with",
    ];
    let cleaned = word
        .trim_matches(|c: char| !c.is_alphanumeric())
        .to_ascii_lowercase();
    TAIL_WORDS.contains(&cleaned.as_str())
}
/// A short, colon-terminated lead-in line ("Key benefits:") that
/// introduces a list; must not start with a digit or bullet glyph.
fn is_list_section_heading(text: &str) -> bool {
    let trimmed = text.trim();
    if !trimmed.ends_with(':') || trimmed.len() > 80 {
        return false;
    }
    if trimmed.split_whitespace().count() > 8 {
        return false;
    }
    if !trimmed.chars().any(char::is_alphabetic) {
        return false;
    }
    if trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) {
        return false;
    }
    !trimmed.starts_with(|c: char| "•‣◦●○◆◇▪▫–—-".contains(c))
}
/// Decides whether the next fragment continues the previous paragraph:
/// lowercase continuations (covering hyphenated word breaks), URLs/DOIs,
/// common citation lead-ins, or a previous line with no terminal
/// punctuation. Page numbers and enumerated items never merge.
fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
    let next_trimmed = next.trim();
    if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
        return false;
    }
    if starts_with_enumerated_marker(next_trimmed) {
        return false;
    }
    let starts_lowercase = next_trimmed.chars().next().is_some_and(char::is_lowercase);
    let hyphen_break = prev.ends_with('-')
        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic());
    // A lowercase start always merges, with or without a hyphen break.
    if starts_lowercase && hyphen_break {
        return true;
    }
    if starts_lowercase {
        return true;
    }
    let lower = next_trimmed.to_ascii_lowercase();
    let reference_like = lower.starts_with("http://")
        || lower.starts_with("https://")
        || lower.starts_with("arxiv")
        || lower.starts_with("doi:");
    if reference_like {
        return true;
    }
    // Typical bibliography continuations ("In Proceedings of ...").
    if matches!(
        next_trimmed.split_whitespace().next(),
        Some("In" | "Proceedings" | "Advances" | "Learning")
    ) {
        return true;
    }
    !prev.ends_with(['.', '!', '?', ':'])
}
/// Semantic-paragraph merge rule: only lowercase continuations (optionally
/// across a hyphenated word break) are merged; enumerated items never are.
fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
    let next_trimmed = next.trim();
    if next_trimmed.is_empty() || starts_with_enumerated_marker(next_trimmed) {
        return false;
    }
    let starts_lowercase = next_trimmed.chars().next().is_some_and(char::is_lowercase);
    if starts_lowercase
        && prev.ends_with('-')
        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
    {
        return true;
    }
    starts_lowercase
}
/// Detects enumeration markers at the start of a line: "1.", "(2)",
/// "a)", "iv:", "[3]." etc. — a digit run, a single ASCII letter, or a
/// short Roman numeral, terminated by '.', ')' or ':'.
fn starts_with_enumerated_marker(text: &str) -> bool {
    let Some(token) = text.split_whitespace().next() else {
        return false;
    };
    let token = token.trim_start_matches(['(', '[']);
    if !token.ends_with(['.', ')', ':']) {
        return false;
    }
    let marker = token.trim_end_matches(['.', ')', ':']);
    if marker.is_empty() {
        return false;
    }
    let all_digits = marker.chars().all(|c| c.is_ascii_digit());
    let single_letter = marker.len() == 1 && marker.chars().all(|c| c.is_ascii_alphabetic());
    let roman = marker.len() <= 8
        && marker
            .to_ascii_lowercase()
            .chars()
            .all(|c| "ivxlcdm".contains(c));
    all_digits || single_letter || roman
}
/// Detects a "Figure N ..." caption fragment carried over to the top of a
/// page from the previous page's figure: it must sit near the top of the
/// page, be preceded only by ignorable content, and be followed shortly
/// by a numbered section heading rather than body text.
fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
        return false;
    }
    let element = &doc.kids[idx];
    let Some(page) = element.page_number() else {
        return false;
    };
    // Topmost y among the page's text-like elements.
    let mut page_top = f64::MIN;
    for candidate in &doc.kids {
        if candidate.page_number() == Some(page)
            && matches!(
                candidate,
                ContentElement::Paragraph(_)
                    | ContentElement::TextBlock(_)
                    | ContentElement::TextLine(_)
                    | ContentElement::Heading(_)
                    | ContentElement::NumberHeading(_)
                    | ContentElement::Caption(_)
            )
        {
            page_top = page_top.max(candidate.bbox().top_y);
        }
    }
    // The fragment must lie within 72pt (one inch) of the page's top text.
    if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
        return false;
    }
    // Everything before it (in reading order) must be ignorable: page
    // numbers, footer banners, captions, running headers, or headings
    // that would be skipped anyway.
    for prior_idx in 0..idx {
        let prior = &doc.kids[prior_idx];
        let prior_text = extract_element_text(prior);
        let prior_trimmed = prior_text.trim();
        if prior_trimmed.is_empty()
            || is_standalone_page_number(prior_trimmed)
            || looks_like_footer_banner(prior_trimmed)
        {
            continue;
        }
        match prior {
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                if !starts_with_caption_prefix(prior_trimmed)
                    && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
                {
                    return false;
                }
            }
            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
                if !should_skip_heading_text(prior_trimmed) {
                    return false;
                }
            }
            _ => return false,
        }
    }
    // Within the next few elements on the same page, a numbered section
    // heading must appear before any substantial (>= 5 word) text.
    for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
        let next = &doc.kids[lookahead_idx];
        if next.page_number() != Some(page) {
            break;
        }
        let next_text = extract_element_text(next);
        let next_trimmed = next_text.trim();
        if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
            continue;
        }
        let is_numbered_heading = match next {
            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
                looks_like_numbered_section(next_trimmed)
                    || looks_like_keyword_numbered_section(next_trimmed)
            }
            // Plain text counts only if it would itself be promoted to a
            // heading AND carries section numbering.
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                should_render_paragraph_as_heading(
                    doc,
                    lookahead_idx,
                    next_trimmed,
                    doc.kids.get(lookahead_idx + 1),
                ) && (looks_like_numbered_section(next_trimmed)
                    || looks_like_keyword_numbered_section(next_trimmed))
            }
            _ => false,
        };
        if is_numbered_heading {
            return true;
        }
        // Real prose before any numbered heading: not a carry-over.
        if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
        {
            return false;
        }
    }
    false
}
/// Appends `next` to `target`, gluing hyphenated word breaks together by
/// removing the trailing hyphen, otherwise separating with one space.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let next_trimmed = next.trim();
    let hyphen_break = target.ends_with('-')
        && target.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
        && next_trimmed.chars().next().is_some_and(char::is_lowercase);
    if hyphen_break {
        // Drop the hyphen and join the word halves directly.
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(next_trimmed);
}
/// A bare page number: one to four ASCII digits and nothing else.
fn is_standalone_page_number(text: &str) -> bool {
    let trimmed = text.trim();
    (1..=4).contains(&trimmed.len()) && trimmed.bytes().all(|b| b.is_ascii_digit())
}
/// True when `text` is a bare page number positioned in the top or bottom
/// margin band (within 24pt of the page's extreme elements).
fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
    if !is_standalone_page_number(text) {
        return false;
    }
    let bbox = element.bbox();
    if bbox.height() > 24.0 {
        return false;
    }
    let Some(page) = element.page_number() else {
        return false;
    };
    // Vertical extent of this page's content.
    let mut page_top = f64::MIN;
    let mut page_bottom = f64::MAX;
    for candidate in &doc.kids {
        if candidate.page_number() != Some(page) {
            continue;
        }
        let candidate_bbox = candidate.bbox();
        page_top = page_top.max(candidate_bbox.top_y);
        page_bottom = page_bottom.min(candidate_bbox.bottom_y);
    }
    if !page_top.is_finite() || !page_bottom.is_finite() {
        return false;
    }
    // Near the very top or the very bottom of the page content.
    bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
}
/// True when the element at `idx` is a short box (height <= 30pt) sitting
/// within 24pt of the lowest element on its page (the bottom margin).
fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
    let element = &doc.kids[idx];
    if element.bbox().height() > 30.0 {
        return false;
    }
    let Some(page) = element.page_number() else {
        return false;
    };
    let page_bottom = doc
        .kids
        .iter()
        .filter(|candidate| candidate.page_number() == Some(page))
        .fold(f64::MAX, |acc, candidate| acc.min(candidate.bbox().bottom_y));
    page_bottom.is_finite() && element.bbox().bottom_y <= page_bottom + 24.0
}
/// A heading of at most two words ending in a period is a sentence
/// fragment — demote it, unless it is a numbered/keyword section.
fn should_demote_period_heading(text: &str) -> bool {
    let trimmed = text.trim();
    if !trimmed.ends_with('.') {
        return false;
    }
    if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
        return false;
    }
    trimmed.trim_end_matches('.').split_whitespace().count() <= 2
}
/// Headings never end with a comma; such text is a sentence fragment.
fn should_demote_comma_heading(text: &str) -> bool {
    text.trim_end().ends_with(',')
}
/// Headings containing mathematical symbols or ligature/encoding
/// artifacts are formula fragments, not titles.
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_CHARS: &[char] = &[
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];
    text.chars().any(|c| MATH_CHARS.contains(&c))
}
/// Percent signs indicate chart/data labels rather than headings.
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|c| c == '%')
}
/// Detects bibliography year stubs like "2021." or "2021. Author ..."
/// (four ASCII digits, a period, then end-of-text or a space) so they are
/// not rendered as headings.
///
/// Fix: the original guard required `t.len() >= 6`, which made its own
/// `t.len() == 5` arm (a bare "NNNN.") unreachable dead code; the minimum
/// is now 5 so that case is actually matched.
fn should_demote_bibliography_heading(text: &str) -> bool {
    let t = text.trim();
    if t.len() < 5 {
        return false;
    }
    let bytes = t.as_bytes();
    bytes[0..4].iter().all(|b| b.is_ascii_digit())
        && bytes[4] == b'.'
        // Length is checked first: `bytes[5]` only exists when len > 5.
        && (t.len() == 5 || bytes[5] == b' ')
}
/// Removes a trailing short number ("Chapter One 12" -> "Chapter One")
/// when the remaining text still has at least three words; otherwise
/// returns the trimmed input unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    let Some(last_space) = trimmed.rfind(' ') else {
        return trimmed;
    };
    let (head, suffix) = (&trimmed[..last_space], &trimmed[last_space + 1..]);
    let suffix_is_page = !suffix.is_empty()
        && suffix.len() <= 4
        && suffix.bytes().all(|b| b.is_ascii_digit());
    if suffix_is_page && head.split_whitespace().count() >= 3 {
        head.trim()
    } else {
        trimmed
    }
}
/// Finds the byte offset where a heading has a second section marker
/// fused on ("Intro 1.2 Methods" -> offset of "1.2"; "Text A.1 ..." ->
/// offset of "A.1"), or `None` when no split point exists.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    for i in 3..bytes.len() {
        // Split points only occur right after a space (which also keeps
        // `i` on a char boundary for the slice below).
        if bytes[i - 1] != b' ' {
            continue;
        }
        // Case 1: "N...N" — a digit, then a dot followed by another digit.
        if bytes[i].is_ascii_digit() {
            if let Some(dot_pos) = text[i..].find('.') {
                let after_dot = i + dot_pos + 1;
                if after_dot < bytes.len() && bytes[after_dot].is_ascii_digit() {
                    return Some(i);
                }
            }
        }
        // Case 2: "A.N" — an uppercase letter, a dot, then a digit.
        if bytes[i].is_ascii_uppercase()
            && i + 2 < bytes.len()
            && bytes[i + 1] == b'.'
            && bytes[i + 2].is_ascii_digit()
        {
            return Some(i);
        }
    }
    None
}
/// Filters headings that are pure noise: empty, bare page numbers,
/// running headers like "Chapter 3 | ...", or text dominated by digits.
fn should_skip_heading_text(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() || is_standalone_page_number(trimmed) {
        return true;
    }
    let lower = trimmed.to_ascii_lowercase();
    let chapter_like =
        lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit());
    if chapter_like && trimmed.contains('|') {
        return true;
    }
    let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
    if alpha_count == 0 {
        return true;
    }
    let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
    // Mostly-numeric text (letters under a third of alphanumerics) is
    // noise, unless it contains a colon (e.g. "Figure 3: ...").
    alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':')
}
/// Re-joins words split by extraction artifacts ("inf ormation" ->
/// "information"). Two adjacent alphabetic tokens are merged when one
/// side is short (<= 4 letters), the combination is long enough (>= 6),
/// the right half does not start uppercase, and neither half is a common
/// stopword.
///
/// Fix: the original cloned both tokens into fresh `String`s on every
/// iteration purely to appease the borrow checker (clippy
/// `redundant_clone`); the immutable borrows are scoped instead.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];
    let mut parts: Vec<String> = text.split_whitespace().map(str::to_string).collect();
    if parts.len() < 2 {
        return text.to_string();
    }
    let mut i = 0usize;
    while i + 1 < parts.len() {
        // Borrow the two candidates in a scope so `parts` can be mutated
        // afterwards without cloning either token.
        let should_join = {
            let left_clean = parts[i].trim_matches(|c: char| !c.is_alphabetic());
            let right_clean = parts[i + 1].trim_matches(|c: char| !c.is_alphabetic());
            let left_lower = left_clean.to_ascii_lowercase();
            let right_lower = right_clean.to_ascii_lowercase();
            !left_clean.is_empty()
                && !right_clean.is_empty()
                && left_clean.chars().all(char::is_alphabetic)
                && right_clean.chars().all(char::is_alphabetic)
                && (left_clean.len() <= 4 || right_clean.len() <= 4)
                && left_clean.len() + right_clean.len() >= 6
                && !right_clean.chars().next().is_some_and(char::is_uppercase)
                && !STOPWORDS.contains(&left_lower.as_str())
                && !STOPWORDS.contains(&right_lower.as_str())
        };
        if should_join {
            let next = parts.remove(i + 1);
            parts[i].push_str(&next);
        } else {
            i += 1;
        }
    }
    parts.join(" ")
}
/// Concatenates the text of all text-bearing children of a list item,
/// separated by single spaces; non-text children contribute nothing.
fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
    let mut text = String::new();
    for elem in contents {
        let part = match elem {
            ContentElement::Paragraph(p) => p.base.value(),
            ContentElement::TextBlock(tb) => tb.value(),
            ContentElement::TextLine(tl) => tl.value(),
            ContentElement::TextChunk(tc) => tc.value.clone(),
            _ => continue,
        };
        if part.is_empty() {
            continue;
        }
        if !text.is_empty() {
            text.push(' ');
        }
        text.push_str(&part);
    }
    text
}
/// True when a header row has an empty cell strictly between two filled
/// cells — i.e. the filled cells do not form one contiguous run.
fn has_internal_header_gap(row: &[String]) -> bool {
    let filled: Vec<usize> = row
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();
    match (filled.first(), filled.last()) {
        // A gap exists iff the span of filled indices exceeds their count.
        (Some(&first), Some(&last)) => last - first + 1 > filled.len(),
        _ => false,
    }
}
/// Fills the empty cells of a grouped (spanning) header row by copying the
/// nearest non-empty parent cell into every column where the child header
/// row has content. Ties in distance prefer the anchor to the right.
///
/// Fix: the original indexed `expanded[col_idx]` unconditionally while
/// iterating `child`, panicking when the child row is wider than the
/// parent (row widths are not guaranteed equal — see
/// `infer_leading_key_column_count`, which takes the max width). Columns
/// of `child` beyond the parent's width are now ignored.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();
    if anchor_cols.is_empty() {
        return parent.to_vec();
    }
    let mut expanded = parent.to_vec();
    // Only columns that exist in `parent` can be expanded.
    for (col_idx, child_cell) in child.iter().enumerate().take(expanded.len()) {
        if !expanded[col_idx].trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }
        let mut best_anchor = anchor_cols[0];
        let mut best_distance = usize::abs_diff(anchor_cols[0], col_idx);
        for &anchor_idx in &anchor_cols[1..] {
            let distance = usize::abs_diff(anchor_idx, col_idx);
            if distance < best_distance || (distance == best_distance && anchor_idx > best_anchor) {
                best_anchor = anchor_idx;
                best_distance = distance;
            }
        }
        expanded[col_idx] = parent[best_anchor].trim().to_string();
    }
    expanded
}
/// Detects the two-row grouped-header pattern (a sparse parent row with
/// internal gaps above a denser child row) and expands the parent in
/// place. Returns true when an expansion happened.
fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
    if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
        return false;
    }
    // The parent row must start filled; the child row must start empty.
    if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
        return false;
    }
    if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
        return false;
    }
    let filled = |row: &[String]| row.iter().filter(|cell| !cell.trim().is_empty()).count();
    let first_filled = filled(&rows[0]);
    let second_filled = filled(&rows[1]);
    if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
        return false;
    }
    rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
    true
}
/// Merges leading "continuation" rows (rows whose first cell is empty and
/// whose cells are all short) into the header row above them, then drops
/// them. Grouped two-row headers are expanded in place instead and left
/// untouched.
fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
    if rows.len() < 2 {
        return;
    }
    // A grouped header (sparse parent + dense child) must not be merged.
    if preserve_grouped_header_rows(rows) {
        return;
    }
    // The header row itself must start with a filled cell.
    if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
        return;
    }
    // Count how many rows after the header are continuation rows: first
    // cell empty and every cell at most 30 characters long.
    let mut merge_count = 0usize;
    for (i, row_i) in rows.iter().enumerate().skip(1) {
        let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
        if !first_empty {
            break; }
        let all_short = row_i
            .iter()
            .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
        if !all_short {
            break;
        }
        merge_count = i;
    }
    if merge_count == 0 {
        return;
    }
    // Fold each continuation row cell-by-cell into the header row.
    for i in 1..=merge_count {
        // split_at_mut lets us mutate the header (head[0] == rows[0])
        // while reading row i (tail[0] == rows[i]).
        let (head, tail) = rows.split_at_mut(i);
        let ncols = head[0].len().min(tail[0].len());
        for (target, src) in head[0]
            .iter_mut()
            .take(ncols)
            .zip(tail[0].iter().take(ncols))
        {
            let fragment = src.trim().to_string();
            if !fragment.is_empty() {
                let target_str = target.trim().to_string();
                *target = if target_str.is_empty() {
                    fragment
                } else {
                    format!("{} {}", target_str, fragment)
                };
            }
        }
    }
    // Remove the now-merged continuation rows.
    rows.drain(1..=merge_count);
}
/// Repeatedly drops the first body row while it looks like cell content
/// carried over from a previous page's table.
fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
    loop {
        if !first_body_row_looks_like_carryover(rows) {
            break;
        }
        rows.remove(1);
    }
}
/// The first body row is a carry-over when its key columns are all empty,
/// exactly one later column holds a long (>= 4 word) fragment, and the
/// next row has every key column filled.
fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    let key_col_count = infer_leading_key_column_count(&rows[1..]);
    if key_col_count == 0 {
        return false;
    }
    let candidate = &rows[1];
    let key_cols_empty = candidate
        .iter()
        .take(key_col_count)
        .all(|cell| cell.trim().is_empty());
    if !key_cols_empty {
        return false;
    }
    // Exactly one populated column, and it must lie past the key columns.
    let mut filled = candidate
        .iter()
        .enumerate()
        .filter(|(_, cell)| !cell.trim().is_empty());
    let Some((only_col, cell)) = filled.next() else {
        return false;
    };
    if filled.next().is_some() || only_col < key_col_count {
        return false;
    }
    if cell.split_whitespace().count() < 4 {
        return false;
    }
    rows[2]
        .iter()
        .take(key_col_count)
        .all(|cell| !cell.trim().is_empty())
}
/// Counts how many leading columns behave like "key" columns: occupied in
/// at least 60% of rows with a median of at most three words per cell.
/// Scanning stops at the first column failing either criterion.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }
    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    let mut key_cols = 0usize;
    for col_idx in 0..num_cols {
        // Word counts of the non-empty cells in this column.
        let mut word_counts: Vec<usize> = Vec::new();
        for row in rows {
            let trimmed = row.get(col_idx).map(String::as_str).unwrap_or("").trim();
            if !trimmed.is_empty() {
                word_counts.push(trimmed.split_whitespace().count());
            }
        }
        if word_counts.is_empty() {
            break;
        }
        word_counts.sort_unstable();
        let median_words = word_counts[word_counts.len() / 2];
        let occupancy_ratio = word_counts.len() as f64 / rows.len() as f64;
        if occupancy_ratio < 0.6 || median_words > 3 {
            break;
        }
        key_cols += 1;
    }
    key_cols
}
/// Renders a semantic table by delegating to its underlying bordered
/// table representation.
fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
    render_table_border(out, &table.table_border);
}
/// A contiguous run of document elements that is rendered as one
/// pre-built block of output.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    start_idx: usize, // first covered element index (inclusive)
    end_idx: usize,   // last covered element index (inclusive)
    rendered: String, // pre-rendered output replacing the covered elements
}
/// A group of text chunks with an associated bounding box — presumably
/// one visual line of chunks (NOTE(review): usage is outside this chunk;
/// confirm).
#[derive(Clone)]
struct ChunkLine {
    bbox: BoundingBox,      // bounding box covering the grouped chunks
    chunks: Vec<TextChunk>, // the chunks in the group
}
/// A text fragment associated with a slot index and its position
/// (NOTE(review): slot semantics are defined by code outside this chunk;
/// confirm).
#[derive(Clone)]
struct SlotFragment {
    slot_idx: usize,   // index of the slot this fragment is assigned to
    bbox: BoundingBox, // fragment bounding box
    text: String,      // fragment text
}
/// Collects non-overlapping table-like regions: bordered-table regions
/// first, then footnote/citation regions that do not collide with them,
/// sorted by start index.
fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
    let mut regions: Vec<GeometricTableRegion> = Vec::new();
    let mut occupied_until = 0usize;
    for (idx, element) in doc.kids.iter().enumerate() {
        if idx < occupied_until {
            continue;
        }
        if let Some(table) = table_border_from_element(element) {
            if let Some(region) = build_geometric_table_region(doc, idx, table) {
                occupied_until = region.end_idx.saturating_add(1);
                regions.push(region);
            }
        }
    }
    // Track indices already claimed so footnote regions cannot overlap.
    let mut occupied: HashSet<usize> = regions
        .iter()
        .flat_map(|region| region.start_idx..=region.end_idx)
        .collect();
    for region in detect_footnote_citation_regions(doc) {
        let overlaps = (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx));
        if !overlaps {
            occupied.extend(region.start_idx..=region.end_idx);
            regions.push(region);
        }
    }
    regions.sort_by_key(|region| region.start_idx);
    regions
}
/// Scans the document for footnote/citation blocks, jumping past each
/// detected region so regions never overlap. Requires a known body font
/// size to distinguish footnote-sized text.
fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
    let body_font_size = compute_running_body_font_size(doc);
    if body_font_size <= 0.0 {
        return Vec::new();
    }
    let mut regions = Vec::new();
    let mut idx = 0usize;
    while idx < doc.kids.len() {
        match build_footnote_citation_region(doc, idx, body_font_size) {
            Some(region) => {
                idx = region.end_idx.saturating_add(1);
                regions.push(region);
            }
            None => idx += 1,
        }
    }
    regions
}
/// The largest font size used by any paragraph with more than ten words;
/// 0.0 when no qualifying paragraph exists.
fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
    let mut max_size = 0.0_f64;
    for element in &doc.kids {
        if let ContentElement::Paragraph(paragraph) = element {
            if paragraph.base.value().split_whitespace().count() > 10 {
                if let Some(size) = paragraph.base.font_size {
                    max_size = max_size.max(size);
                }
            }
        }
    }
    max_size
}
/// Attempts to recognise, starting at `start_idx`, a run of small-font
/// footnote/citation lines and pre-render it as a two-column
/// Footnote/Citation HTML table (with an optional lead-in paragraph).
///
/// Returns `None` when the shape does not match: markers must be numeric
/// and essentially sequential, at least three rows, all in one column of
/// one page.
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }
    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }
    // "Small" font: at most 92% of body size and at least 0.8 below it
    // (min of the two bounds), clamped to non-negative.
    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    let mut lead_prefix = None;
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;
    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // Case 1: the start element itself is a small-font footnote line.
        // Optionally attach a lead-in found just above it.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // Case 2: the start element is normal text whose tail contains
        // the first footnote; the next element must then be a small-font
        // line in the same column on the same page.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }
    // Extend with consecutive small-font lines in the same column/page.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }
        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }
        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }
        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }
    // A lead prefix with no small lines after it is not a footnote block.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }
    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }
    // Every marker must parse as a number...
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // ...and the sequence must be nearly consecutive: at most one step
    // that is not exactly +1.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }
    // Build the two-column table with a fixed header row.
    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );
    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));
    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
/// Scans backwards (at most 6 elements) from `start_idx` for the body
/// paragraph whose trailing sentence introduces the footnote block, collecting
/// any small-font lines passed along the way.
///
/// Returns `(attach_idx, lead_prefix, fragments_in_document_order)`.
fn leading_footnote_attachment(
    doc: &PdfDocument,
    start_idx: usize,
    page_number: u32,
    column_bbox: &BoundingBox,
    small_font_threshold: f64,
) -> Option<(usize, String, Vec<String>)> {
    let mut idx = start_idx.checked_sub(1)?;
    let mut leading_fragments = Vec::new();
    let mut scanned = 0usize;
    loop {
        let candidate = doc.kids.get(idx)?;
        scanned += 1;
        // Give up after 6 elements or when leaving the page.
        if scanned > 6 || candidate.page_number() != Some(page_number) {
            return None;
        }
        // Skip non-text elements.
        if !is_geometric_text_candidate(candidate) {
            if idx == 0 {
                return None;
            }
            idx -= 1;
            continue;
        }
        let text = extract_element_text(candidate);
        let trimmed = text.trim();
        // Skip blank text elements.
        if trimmed.is_empty() {
            if idx == 0 {
                return None;
            }
            idx -= 1;
            continue;
        }
        if !same_column_region(candidate.bbox(), column_bbox) {
            return None;
        }
        // Small-font lines belong to the footnote block; keep walking upwards.
        if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
            leading_fragments.push(footnote_fragment_text(candidate));
            if idx == 0 {
                return None;
            }
            idx -= 1;
            continue;
        }
        // First body-font element: split off its trailing footnote lead.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
        leading_fragments.push(first_tail);
        // Fragments were collected bottom-up; restore document order.
        leading_fragments.reverse();
        return Some((idx, prefix, leading_fragments));
    }
}
/// Splits collected footnote fragments into `(marker, citation)` rows by
/// locating numeric markers inside each fragment; text between markers is
/// merged into the citation of the preceding marker.
fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
    let mut rows = Vec::new();
    let mut current_marker = None::<String>;
    let mut current_citation = String::new();
    for fragment in fragments {
        let markers = find_footnote_marker_positions(fragment);
        if markers.is_empty() {
            // Continuation line: append to the citation in progress, if any.
            if current_marker.is_some() {
                merge_paragraph_text(&mut current_citation, fragment.trim());
            }
            continue;
        }
        let mut cursor = 0usize;
        for (pos, marker, skip_len) in markers {
            // Text before this marker belongs to the previous citation.
            let prefix = fragment[cursor..pos].trim();
            if current_marker.is_some() && !prefix.is_empty() {
                merge_paragraph_text(&mut current_citation, prefix);
            }
            // Flush the finished row before starting the next marker.
            if let Some(marker_value) = current_marker.take() {
                let trimmed = current_citation.trim();
                if !trimmed.is_empty() {
                    rows.push((marker_value, trimmed.to_string()));
                }
                current_citation.clear();
            }
            current_marker = Some(marker);
            cursor = pos + skip_len;
        }
        // Remainder of the fragment after the last marker.
        let tail = fragment[cursor..].trim();
        if current_marker.is_some() && !tail.is_empty() {
            merge_paragraph_text(&mut current_citation, tail);
        }
    }
    // Flush the final in-progress row.
    if let Some(marker_value) = current_marker {
        let trimmed = current_citation.trim();
        if !trimmed.is_empty() {
            rows.push((marker_value, trimmed.to_string()));
        }
    }
    rebalance_adjacent_footnote_citations(&mut rows);
    rows
}
/// Repairs citations that were split mid-sentence across adjacent rows: when a
/// citation ends with a comma and the next row starts with a short stub (e.g.
/// "Ibid.,"), the first sentence after the stub is moved back to the previous
/// row.
fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
    for idx in 0..rows.len().saturating_sub(1) {
        if !rows[idx].1.trim_end().ends_with(',') {
            continue;
        }
        let next = rows[idx + 1].1.trim().to_string();
        let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
            continue;
        };
        let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
            continue;
        };
        // A single-word "sentence" is more likely an abbreviation; leave it.
        if first_sentence.split_whitespace().count() < 2 {
            continue;
        }
        merge_paragraph_text(&mut rows[idx].1, first_sentence);
        rows[idx + 1].1 = if trailing.is_empty() {
            stub.to_string()
        } else {
            format!("{stub} {trailing}")
        };
    }
}
/// Splits text such as "Ibid., rest…" into a short leading stub (at most 8
/// bytes before the comma, comma included) and the non-empty remainder.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let comma_idx = text.find(',')?;
    if comma_idx > 8 {
        return None;
    }
    let (head, rest) = text.split_at(comma_idx + 1);
    let stub = head.trim();
    let remainder = rest.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
/// Cuts at the first ". " boundary; the first sentence keeps its period and
/// both halves are trimmed. Returns `None` without a ". " or an empty head.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let idx = text.find(". ")?;
    let (head, rest) = text.split_at(idx + 1);
    let first = head.trim();
    let trailing = rest[1..].trim();
    if first.is_empty() {
        None
    } else {
        Some((first, trailing))
    }
}
/// Finds footnote markers inside `text`: a 1–2 digit number at a word
/// boundary, followed by whitespace and then an uppercase ASCII letter, `(`,
/// `[` or `*`.
///
/// Returns `(byte_pos, digits, skip_len)` triples where `skip_len` covers the
/// digits plus the whitespace up to the first character of the citation body.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    let chars = text.char_indices().collect::<Vec<_>>();
    let mut markers = Vec::new();
    let mut idx = 0usize;
    while idx < chars.len() {
        let (byte_idx, ch) = chars[idx];
        if !ch.is_ascii_digit() {
            idx += 1;
            continue;
        }
        // The digits must start the text or follow whitespace / closing
        // punctuation.
        let at_boundary = idx == 0
            || chars[idx - 1].1.is_whitespace()
            || matches!(
                chars[idx - 1].1,
                '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”'
            );
        if !at_boundary {
            idx += 1;
            continue;
        }
        // Consume the full digit run.
        let mut end_idx = idx;
        while end_idx < chars.len() && chars[end_idx].1.is_ascii_digit() {
            end_idx += 1;
        }
        let digits = &text[byte_idx
            ..chars
                .get(end_idx)
                .map(|(pos, _)| *pos)
                .unwrap_or(text.len())];
        // Markers are at most two digits and must be followed by whitespace.
        if digits.len() > 2 || end_idx >= chars.len() || !chars[end_idx].1.is_whitespace() {
            idx += 1;
            continue;
        }
        // Skip the whitespace run after the digits.
        let mut lookahead = end_idx;
        while lookahead < chars.len() && chars[lookahead].1.is_whitespace() {
            lookahead += 1;
        }
        let Some((_, next_ch)) = chars.get(lookahead) else {
            idx += 1;
            continue;
        };
        // The citation itself must begin with an uppercase letter or bracket.
        if !(next_ch.is_ascii_uppercase() || matches!(*next_ch, '(' | '[' | '*')) {
            idx += 1;
            continue;
        }
        let skip_end = chars
            .get(lookahead)
            .map(|(pos, _)| *pos)
            .unwrap_or(text.len());
        markers.push((byte_idx, digits.to_string(), skip_end - byte_idx));
        idx = lookahead;
    }
    markers
}
/// Splits a body paragraph at its LAST footnote marker: the text before the
/// marker becomes the lead paragraph (must be ≥ 6 words) and the marker plus
/// its short tail (≤ 6 words) becomes the start of the footnote.
fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
    let (pos, marker, skip_len) = find_footnote_marker_positions(text).pop()?;
    let prefix = text[..pos].trim();
    let tail = text[pos + skip_len..].trim();
    let lead_is_long_enough = prefix.split_whitespace().count() >= 6;
    let tail_is_short_enough = tail.split_whitespace().count() <= 6;
    if lead_is_long_enough && tail_is_short_enough {
        Some((prefix.to_string(), format!("{marker} {tail}")))
    } else {
        None
    }
}
/// True when the first detected footnote marker sits at byte offset zero.
fn starts_with_footnote_marker(text: &str) -> bool {
    match find_footnote_marker_positions(text).first() {
        Some((pos, _, _)) => *pos == 0,
        None => false,
    }
}
/// Treats two boxes as the same column when their horizontal overlap covers at
/// least 35% of the narrower box, or their left edges differ by ≤ 28 units.
fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
    let overlap_left = left.left_x.max(right.left_x);
    let overlap_right = left.right_x.min(right.right_x);
    let overlap = (overlap_right - overlap_left).max(0.0);
    let min_width = left.width().min(right.width()).max(1.0);
    if overlap / min_width >= 0.35 {
        return true;
    }
    (left.left_x - right.left_x).abs() <= 28.0
}
/// Extracts an element's text, wrapping it in markdown emphasis markers when
/// its font name contains "italic" (case-insensitive).
fn footnote_fragment_text(element: &ContentElement) -> String {
    let text = extract_element_text(element);
    let is_italic = element_font_name(element)
        .as_deref()
        .is_some_and(|name| name.to_ascii_lowercase().contains("italic"));
    match is_italic {
        true => format!("*{}*", text.trim()),
        false => text,
    }
}
/// Returns the element's font size for the element kinds that record one.
fn element_font_size(element: &ContentElement) -> Option<f64> {
    match element {
        ContentElement::Paragraph(p) => p.base.font_size,
        ContentElement::Heading(h) => h.base.base.font_size,
        ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
        ContentElement::TextBlock(tb) => Some(tb.font_size),
        ContentElement::TextLine(tl) => Some(tl.font_size),
        _ => None,
    }
}
/// Returns the element's font name; only paragraph-like kinds expose one here
/// (text blocks/lines are not consulted).
fn element_font_name(element: &ContentElement) -> Option<String> {
    match element {
        ContentElement::Paragraph(p) => p.base.font_name.clone(),
        ContentElement::Heading(h) => h.base.base.font_name.clone(),
        ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
        _ => None,
    }
}
/// Returns the `TableBorder` carried by a table-like element, if any.
fn table_border_from_element(
    element: &ContentElement,
) -> Option<&crate::models::table::TableBorder> {
    match element {
        ContentElement::TableBorder(table) => Some(table),
        ContentElement::Table(table) => Some(&table.table_border),
        _ => None,
    }
}
/// Builds a markdown rendering for a bordered table plus the loose text
/// elements around it that belong to the table: header rows above, footer rows
/// below, and optionally a synthetic left "stub" column for row labels that
/// sit outside the table borders.
fn build_geometric_table_region(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Option<GeometricTableRegion> {
    let mut table_rows = collect_table_border_rows(table);
    // Only wide tables (≥ 3 columns) are considered.
    if table_rows.is_empty() || table.num_columns < 3 {
        return None;
    }
    merge_continuation_rows(&mut table_rows);
    let column_ranges = table_column_ranges(table)?;
    // Text elements directly above the table are header candidates.
    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
    if candidate_indices.is_empty() {
        return None;
    }
    let needs_external_stub =
        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
    let supports_embedded_stub_header =
        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
    if !needs_external_stub && !supports_embedded_stub_header {
        return None;
    }
    // With an external stub, widen the slot list by one synthetic left column.
    let slot_ranges = if needs_external_stub {
        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
    } else {
        column_ranges.clone()
    };
    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
    if header_rows.is_empty() {
        return None;
    }
    if needs_external_stub {
        normalize_leading_stub_header(&mut header_rows);
    } else {
        promote_embedded_stub_header(&mut header_rows, &table_rows);
    }
    // Require at least one header row that fills (almost) every slot.
    let slot_count = slot_ranges.len();
    let dense_header_rows = header_rows
        .iter()
        .filter(|row| {
            row.iter().filter(|cell| !cell.trim().is_empty()).count()
                >= slot_count.saturating_sub(1).max(2)
        })
        .count();
    if dense_header_rows == 0 {
        return None;
    }
    let mut combined_rows = Vec::new();
    combined_rows.extend(header_rows);
    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
        // Panel layout: collapse the bordered rows (shifted right past the
        // stub column) and the trailing rows into a single merged body row.
        let trailing_rows =
            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
        vec![merge_panel_body_row(
            &table_rows,
            &trailing_rows,
            slot_count,
        )]
    } else if needs_external_stub {
        // Shift every body row right by one to leave the stub column empty.
        table_rows
            .iter()
            .map(|row| {
                let mut shifted = vec![String::new()];
                shifted.extend(row.iter().cloned());
                shifted
            })
            .collect()
    } else {
        table_rows
    };
    if body_rows.is_empty() {
        return None;
    }
    combined_rows.extend(body_rows);
    let rendered = render_pipe_rows(&combined_rows);
    Some(GeometricTableRegion {
        start_idx: candidate_indices[0],
        end_idx: following_indices.last().copied().unwrap_or(table_idx),
        rendered,
    })
}
/// Computes the horizontal `(left, right)` extent of every table column by
/// accumulating its cells' bounding boxes. Returns `None` when the table has
/// no columns, a column never saw a cell, or a range is degenerate.
fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
    if table.num_columns == 0 {
        return None;
    }
    let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
    for cell in table.rows.iter().flat_map(|row| row.cells.iter()) {
        // Cells with out-of-range column numbers are ignored.
        if let Some(range) = ranges.get_mut(cell.col_number) {
            range.0 = range.0.min(cell.bbox.left_x);
            range.1 = range.1.max(cell.bbox.right_x);
        }
    }
    let all_valid = ranges
        .iter()
        .all(|(left, right)| left.is_finite() && right.is_finite() && right > left);
    if all_valid {
        Some(ranges)
    } else {
        None
    }
}
/// Collects indices of text elements sitting directly above the table (same
/// page, within 260 coordinate units of its top edge, at most 10 elements),
/// returned in document order.
fn collect_table_header_candidate_indices(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Vec<usize> {
    let mut indices = Vec::new();
    let table_page = table.bbox.page_number;
    let table_top = table.bbox.top_y;
    let mut cursor = table_idx;
    while let Some(prev_idx) = cursor.checked_sub(1) {
        let element = &doc.kids[prev_idx];
        if element.page_number() != table_page {
            break;
        }
        if !is_geometric_text_candidate(element) {
            break;
        }
        let bbox = element.bbox();
        // Allow slight overlap (-6.0) up to 260 units above the table's top.
        let vertical_gap = bbox.bottom_y - table_top;
        if !(-6.0..=260.0).contains(&vertical_gap) {
            break;
        }
        indices.push(prev_idx);
        cursor = prev_idx;
        if indices.len() >= 10 {
            break;
        }
    }
    // Collected bottom-up; restore document order.
    indices.reverse();
    indices
}
/// Collects indices of text elements directly below the table (same page,
/// within 28 coordinate units of its bottom edge, at most 4 elements),
/// stopping early at margin page numbers.
fn collect_table_footer_candidate_indices(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Vec<usize> {
    let mut indices = Vec::new();
    let table_page = table.bbox.page_number;
    let table_bottom = table.bbox.bottom_y;
    for idx in table_idx + 1..doc.kids.len() {
        let element = &doc.kids[idx];
        if element.page_number() != table_page {
            break;
        }
        if !is_geometric_text_candidate(element) {
            break;
        }
        if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
            break;
        }
        let bbox = element.bbox();
        // Allow slight overlap (-6.0) up to a 28-unit gap below the table.
        let gap = table_bottom - bbox.top_y;
        if !(-6.0..=28.0).contains(&gap) {
            break;
        }
        indices.push(idx);
        if indices.len() >= 4 {
            break;
        }
    }
    indices
}
/// True for the element kinds that carry flowing text usable in geometric
/// table/footnote reconstruction.
fn is_geometric_text_candidate(element: &ContentElement) -> bool {
    matches!(
        element,
        ContentElement::Paragraph(_)
            | ContentElement::Heading(_)
            | ContentElement::NumberHeading(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_)
    )
}
/// Decides whether the table needs a synthetic left "stub" column: some header
/// candidate hugs the left edge of (and is clearly narrower than) the first
/// column, while the table's own first column holds long prose (median ≥ 5
/// words), i.e. the row labels live outside the borders.
fn infer_left_stub_requirement(
    doc: &PdfDocument,
    candidate_indices: &[usize],
    table_rows: &[Vec<String>],
    column_ranges: &[(f64, f64)],
) -> bool {
    if column_ranges.is_empty() {
        return false;
    }
    let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
    // A label qualifies when its right edge ends near the first column's left
    // edge and it spans less than 45% of that column's width.
    let has_left_label = candidate_indices.iter().any(|idx| {
        let bbox = doc.kids[*idx].bbox();
        bbox.right_x <= column_ranges[0].0 + first_width * 0.12
            && bbox.width() <= first_width * 0.45
    });
    if !has_left_label {
        return false;
    }
    let mut first_col_word_counts: Vec<usize> = table_rows
        .iter()
        .filter_map(|row| row.first())
        .map(|cell| cell.split_whitespace().count())
        .collect();
    if first_col_word_counts.is_empty() {
        return false;
    }
    first_col_word_counts.sort_unstable();
    let median = first_col_word_counts[first_col_word_counts.len() / 2];
    median >= 5
}
/// Heuristic: the table's own first row already carries a short stub label in
/// its first column, while the external header rows above the table only fill
/// the non-stub columns.
///
/// Fix: the original copied `column_ranges` into a new `Vec` (shadowing the
/// `slot_ranges` function name) just to pass it as a slice; the slice is now
/// passed through directly, avoiding the redundant allocation.
fn supports_embedded_stub_header(
    table_rows: &[Vec<String>],
    column_ranges: &[(f64, f64)],
    doc: &PdfDocument,
    candidate_indices: &[usize],
) -> bool {
    if table_rows.len() < 2 || column_ranges.len() < 3 {
        return false;
    }
    let first_row = &table_rows[0];
    if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
        return false;
    }
    // The embedded stub label must be short (≤ 3 words, ≤ 24 bytes).
    if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
        return false;
    }
    // The rest of the first row must be (almost) fully populated with data.
    let data_fill = first_row
        .iter()
        .skip(1)
        .filter(|cell| !cell.trim().is_empty())
        .count();
    if data_fill + 1 < column_ranges.len() {
        return false;
    }
    // At least one later row must also carry a first-column label.
    let labeled_rows = table_rows
        .iter()
        .skip(1)
        .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
        .count();
    if labeled_rows == 0 {
        return false;
    }
    // No extra stub slot is added here, so the column ranges serve directly
    // as slot ranges.
    let header_rows = reconstruct_aligned_rows(doc, candidate_indices, column_ranges, true, 2);
    // Accept when some reconstructed header row leaves the stub cell blank
    // while filling (almost) every other slot.
    header_rows.iter().any(|row| {
        row.first().is_none_or(|cell| cell.trim().is_empty())
            && row
                .iter()
                .skip(1)
                .filter(|cell| !cell.trim().is_empty())
                .count()
                >= column_ranges.len().saturating_sub(1)
    })
}
/// Builds the slot list for row reconstruction, optionally prepending a
/// synthetic "stub" slot that spans from the leftmost header candidate to just
/// before the first table column. Returns `None` when the stub would be
/// empty or inverted.
fn slot_ranges(
    column_ranges: &[(f64, f64)],
    doc: &PdfDocument,
    candidate_indices: &[usize],
    needs_stub: bool,
) -> Option<Vec<(f64, f64)>> {
    if !needs_stub {
        return Some(column_ranges.to_vec());
    }
    let first_left = column_ranges.first()?.0;
    // Leftmost edge among the header candidates (never past the first column).
    let left_stub_start = candidate_indices
        .iter()
        .map(|idx| doc.kids[*idx].bbox().left_x)
        .fold(first_left, f64::min);
    let stub_right = first_left - 1.0;
    if stub_right <= left_stub_start {
        return None;
    }
    let mut slots = Vec::with_capacity(column_ranges.len() + 1);
    slots.push((left_stub_start, stub_right));
    slots.extend(column_ranges.iter().copied());
    Some(slots)
}
/// Rebuilds table rows from loose text elements by bucketing each line's
/// fragments into horizontal slots and grouping lines with nearby vertical
/// centers into the same row band.
///
/// * `drop_wide_singletons` — discard lines whose single fragment spans ≥ 55%
///   of the table width (likely captions, not cells).
/// * `min_filled_slots` — rows with fewer non-empty cells are dropped.
///
/// Rows are returned sorted by descending `top_y` (assumes larger `top_y` is
/// higher on the page, PDF-style — confirm against `BoundingBox` semantics).
fn reconstruct_aligned_rows(
    doc: &PdfDocument,
    candidate_indices: &[usize],
    slot_ranges: &[(f64, f64)],
    drop_wide_singletons: bool,
    min_filled_slots: usize,
) -> Vec<Vec<String>> {
    if candidate_indices.is_empty() || slot_ranges.is_empty() {
        return Vec::new();
    }
    let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
    for idx in candidate_indices {
        for line in extract_chunk_lines(&doc.kids[*idx]) {
            let fragments = split_line_into_slot_fragments(&line, slot_ranges);
            if fragments.is_empty() {
                continue;
            }
            if drop_wide_singletons && fragments.len() == 1 {
                let only = &fragments[0];
                let span_width = only.bbox.width();
                let table_width =
                    slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
                if span_width >= table_width * 0.55 {
                    continue;
                }
            }
            // Band tolerance scales with the line's largest font (min 8.0).
            let line_center = line.bbox.center_y();
            let tolerance = line
                .chunks
                .iter()
                .map(|chunk| chunk.font_size)
                .fold(8.0, f64::max)
                * 0.8;
            // Find an existing band whose center is within tolerance.
            let mut target_row = None;
            for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
                if (bbox.center_y() - line_center).abs() <= tolerance {
                    target_row = Some(row_idx);
                    break;
                }
            }
            if let Some(row_idx) = target_row {
                // Merge into the band: grow its bbox and append cell text.
                let (bbox, cells) = &mut row_bands[row_idx];
                *bbox = bbox.union(&line.bbox);
                for fragment in fragments {
                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
                }
            } else {
                // Start a new band with one empty cell per slot.
                let mut cells = vec![String::new(); slot_ranges.len()];
                for fragment in fragments {
                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
                }
                row_bands.push((line.bbox.clone(), cells));
            }
        }
    }
    row_bands.sort_by(|left, right| {
        right
            .0
            .top_y
            .partial_cmp(&left.0.top_y)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    row_bands
        .into_iter()
        .map(|(_, cells)| cells)
        .filter(|cells| {
            let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
            filled >= min_filled_slots
        })
        .collect()
}
/// Extracts positioned text lines (bbox + text chunks) from a text-bearing
/// element; returns an empty `Vec` for non-text kinds.
fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
    match element {
        ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
        ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
        ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
        ContentElement::TextBlock(tb) => tb
            .text_lines
            .iter()
            .map(|line| ChunkLine {
                bbox: line.bbox.clone(),
                chunks: line.text_chunks.clone(),
            })
            .collect(),
        ContentElement::TextLine(tl) => vec![ChunkLine {
            bbox: tl.bbox.clone(),
            chunks: tl.text_chunks.clone(),
        }],
        _ => Vec::new(),
    }
}
/// Flattens a semantic node's columns → blocks → lines into `ChunkLine`
/// records, preserving traversal order.
fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
    node.columns
        .iter()
        .flat_map(|column| column.text_blocks.iter())
        .flat_map(|block| block.text_lines.iter())
        .map(|line| ChunkLine {
            bbox: line.bbox.clone(),
            chunks: line.text_chunks.clone(),
        })
        .collect()
}
/// Groups a line's non-blank chunks into slot-aligned fragments: consecutive
/// chunks that land in the same slot and sit close together (horizontal gap
/// ≤ 2.4 × font size, min 6.0) merge into one fragment. Fragments whose
/// normalized text is blank are dropped.
fn split_line_into_slot_fragments(
    line: &ChunkLine,
    slot_ranges: &[(f64, f64)],
) -> Vec<SlotFragment> {
    let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
    for chunk in line
        .chunks
        .iter()
        .filter(|chunk| !chunk.value.trim().is_empty())
        .cloned()
    {
        let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
        if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
            // Extend the previous group when this chunk continues it.
            let gap = chunk.bbox.left_x - prev_bbox.right_x;
            if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
                *prev_bbox = prev_bbox.union(&chunk.bbox);
                prev_chunks.push(chunk);
                continue;
            }
        }
        groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
    }
    groups
        .into_iter()
        .filter_map(|(slot_idx, chunks, bbox)| {
            let text = normalize_common_ocr_text(
                &crate::models::text::TextLine::concatenate_chunks(&chunks),
            );
            if text.trim().is_empty() {
                None
            } else {
                Some(SlotFragment {
                    slot_idx,
                    bbox,
                    text,
                })
            }
        })
        .collect()
}
/// Picks the slot a chunk belongs to: prefer the slot with the largest
/// overlap fraction of the chunk's width; when nothing overlaps, fall back to
/// the nearest slot center (scored as negative distance so any positive
/// overlap always wins).
fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
    let mut best_idx = 0usize;
    let mut best_overlap = f64::NEG_INFINITY;
    let center_x = bbox.center_x();
    for (idx, (left, right)) in slot_ranges.iter().enumerate() {
        let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
        let score = if overlap > 0.0 {
            overlap / bbox.width().max(1.0)
        } else {
            -((center_x - ((*left + *right) / 2.0)).abs())
        };
        if score > best_overlap {
            best_overlap = score;
            best_idx = idx;
        }
    }
    best_idx
}
/// Appends a trimmed fragment to a cell, separating it from any existing text
/// with a single space; blank fragments are ignored.
fn append_cell_text(cell: &mut String, fragment: &str) {
    let fragment = fragment.trim();
    if fragment.is_empty() {
        return;
    }
    if cell.is_empty() {
        cell.push_str(fragment);
    } else {
        cell.push(' ');
        cell.push_str(fragment);
    }
}
/// If the first header row has a blank stub cell while the second row carries
/// the stub label, hoists that label into the first row (both rows must have
/// at least two other filled cells).
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    let [first, second, ..] = rows else {
        return;
    };
    if first.is_empty() || second.is_empty() {
        return;
    }
    if !first[0].trim().is_empty() || second[0].trim().is_empty() {
        return;
    }
    // Count non-blank cells past the stub column.
    let filled = |row: &[String]| {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    };
    if filled(first) < 2 || filled(second) < 2 {
        return;
    }
    first[0] = second[0].trim().to_string();
    second[0].clear();
}
/// When the header's stub cell is blank, pulls the short label (≤ 3 words,
/// ≤ 24 bytes) from the first body row's stub cell into the header — both
/// rows must otherwise be fully populated.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    let header_row = match header_rows.first_mut() {
        Some(row) if !row.is_empty() => row,
        _ => return,
    };
    let first_body_row = match table_rows.first() {
        Some(row) if !row.is_empty() => row,
        _ => return,
    };
    if !header_row[0].trim().is_empty() {
        return;
    }
    let promoted = first_body_row[0].trim();
    let label_too_big = promoted.split_whitespace().count() > 3 || promoted.len() > 24;
    if promoted.is_empty() || label_too_big {
        return;
    }
    // Count non-blank cells past the stub column.
    let non_empty_tail = |row: &[String]| {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    };
    if non_empty_tail(header_row) < header_row.len().saturating_sub(1)
        || non_empty_tail(first_body_row) < first_body_row.len().saturating_sub(1)
    {
        return;
    }
    header_row[0] = promoted.to_string();
}
/// True when there are at least three rows and every cell of every row holds
/// non-blank text — the shape of a "panel" whose rows should be merged.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    rows.iter()
        .all(|row| !row.is_empty() && row.iter().all(|cell| !cell.trim().is_empty()))
}
fn merge_panel_body_row(
table_rows: &[Vec<String>],
trailing_rows: &[Vec<String>],
slot_count: usize,
) -> Vec<String> {
let mut merged = vec![String::new(); slot_count];
for row in table_rows {
for (col_idx, cell) in row.iter().enumerate() {
if col_idx + 1 >= slot_count {
break;
}
append_cell_text(&mut merged[col_idx + 1], cell);
}
}
for row in trailing_rows {
for (col_idx, cell) in row.iter().enumerate() {
if col_idx >= slot_count {
break;
}
append_cell_text(&mut merged[col_idx], cell);
}
}
merged
}
/// Renders rows as a GitHub-style pipe table; the first row is treated as the
/// header and is followed by a `---` alignment separator. Short rows are
/// padded with empty cells to the widest row; output ends with a blank line.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if rows.is_empty() || num_cols == 0 {
        return String::new();
    }
    let mut out = String::new();
    for (row_idx, row) in rows.iter().enumerate() {
        out.push('|');
        for col_idx in 0..num_cols {
            let cell = row.get(col_idx).map_or("", |s| s.as_str());
            out.push(' ');
            out.push_str(cell.trim());
            out.push_str(" |");
        }
        out.push('\n');
        if row_idx == 0 {
            // Alignment separator directly under the header row.
            out.push('|');
            out.push_str(&" --- |".repeat(num_cols));
            out.push('\n');
        }
    }
    out.push('\n');
    out
}
/// Renders rows as an HTML table; the first row becomes `<th>` header cells,
/// later rows `<td>`. Short rows are padded with empty cells to the widest
/// row and every cell is HTML-escaped.
fn render_html_table(rows: &[Vec<String>]) -> String {
    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if rows.is_empty() || num_cols == 0 {
        return String::new();
    }
    let mut out = String::from("<table>\n");
    for (row_idx, row) in rows.iter().enumerate() {
        let tag = if row_idx == 0 { "th" } else { "td" };
        out.push_str("<tr>");
        for col_idx in 0..num_cols {
            let raw = row.get(col_idx).map_or("", |cell| cell.as_str());
            let cell = escape_html_text(raw.trim());
            out.push('<');
            out.push_str(tag);
            out.push('>');
            out.push_str(&cell);
            out.push_str("</");
            out.push_str(tag);
            out.push('>');
        }
        out.push_str("</tr>\n");
    }
    out.push_str("</table>\n\n");
    out
}
/// Escapes the five HTML-special characters for safe embedding in table cells.
///
/// Fix: the replacement strings had been mangled into no-op identity
/// replacements (e.g. `'&' -> "&"`, and the quote arm was not even valid
/// Rust); restored the proper HTML entities. `&` is replaced first so the
/// later entities are not double-escaped.
fn escape_html_text(text: &str) -> String {
    text.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('"', "&quot;")
        .replace('\'', "&#39;")
}
/// Extracts the ASCII digits from `text` and returns them when there are one
/// or two; anything else (no digits, or three+) yields `None`.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let mut digits = String::new();
    for ch in text.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }
    if digits.is_empty() || digits.len() > 2 {
        None
    } else {
        Some(digits)
    }
}
/// Renders a two-column "infographic card" — a 1–2 digit marker in the first
/// row's left cell with prose flowing down the right column — as a plain
/// numbered paragraph instead of a table. Returns `None` when the shape
/// doesn't match.
fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
    if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
        return None;
    }
    let marker = normalized_numeric_marker(rows[0][0].trim())?;
    // The first prose cell must carry some substance (≥ 4 words).
    if rows[0][1].split_whitespace().count() < 4 {
        return None;
    }
    // Later rows must not introduce markers of their own…
    if rows
        .iter()
        .skip(1)
        .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
    {
        return None;
    }
    // …nor carry anything longer than two characters in the marker column.
    if rows
        .iter()
        .skip(1)
        .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
    {
        return None;
    }
    let body = rows
        .iter()
        .filter_map(|row| row.get(1))
        .map(|cell| cell.trim())
        .filter(|cell| !cell.is_empty())
        .collect::<Vec<_>>()
        .join(" ");
    // Require a reasonably long combined body (≥ 8 words).
    if body.split_whitespace().count() < 8 {
        return None;
    }
    Some(format!("{marker}. {body}\n\n"))
}
/// Extracts and cleans the text of a text-bearing element; non-text kinds
/// yield an empty string.
fn extract_element_text(element: &ContentElement) -> String {
    match element {
        ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
        ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
        ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
        ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
        ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
        _ => String::new(),
    }
}
/// Converts a bordered table into a dense row × column text grid: missing
/// cells become empty strings, and rows whose cells are all blank are skipped.
fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
    let num_cols = table.num_columns.max(1);
    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
    for row in &table.rows {
        let cell_texts: Vec<String> = (0..num_cols)
            .map(|col| {
                row.cells
                    .iter()
                    .find(|c| c.col_number == col)
                    .map(cell_text_content)
                    .unwrap_or_default()
            })
            .collect();
        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
            rendered_rows.push(cell_texts);
        }
    }
    rendered_rows
}
/// Renders a bordered table into markdown, special-casing infographic cards
/// and tables of contents before falling back to a pipe table.
fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
    if table.rows.is_empty() {
        return;
    }
    let mut rendered_rows = collect_table_border_rows(table);
    if rendered_rows.is_empty() {
        return;
    }
    // Infographic cards render as a numbered paragraph, not a table.
    if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
        out.push_str(&rendered);
        return;
    }
    merge_continuation_rows(&mut rendered_rows);
    trim_leading_table_carryover_rows(&mut rendered_rows);
    // Tables of contents render as "title page" lines.
    if is_toc_table(&rendered_rows) {
        render_toc_rows(out, &rendered_rows);
        return;
    }
    out.push_str(&render_pipe_rows(&rendered_rows));
}
/// True when the trimmed text looks like a page number: up to five ASCII
/// digits, or up to ten roman-numeral letters (case-insensitive).
fn is_page_number_like(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    // Arabic page number: a short run of digits.
    if trimmed.len() <= 5 && trimmed.chars().all(|c| c.is_ascii_digit()) {
        return true;
    }
    // Roman numeral (front matter): short run of i/v/x/l/c/d/m in any case.
    let lower = trimmed.to_ascii_lowercase();
    lower.len() <= 10 && lower.chars().all(|c| "ivxlcdm".contains(c))
}
/// Heuristic for a table of contents: at least two rows of exactly two
/// columns, where the right column is overwhelmingly page-number-like (≥ 90%
/// of its non-empty cells, and at least half of all rows).
fn is_toc_table(rows: &[Vec<String>]) -> bool {
    if rows.is_empty() {
        return false;
    }
    if rows.len() < 2 {
        return false;
    }
    if !rows.iter().all(|r| r.len() == 2) {
        return false;
    }
    let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
    if non_empty_right < 2 {
        return false;
    }
    let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
    // `page_like * 10 >= non_empty_right * 9` is the integer form of ≥ 90%.
    page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
}
/// Emits each row as a "<title> <page>" line (space-separated only when both
/// halves exist), skipping rows that are entirely blank; ends with a blank
/// line.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let title = row[0].trim();
        let page = row[1].trim();
        match (title.is_empty(), page.is_empty()) {
            (true, true) => continue,
            (false, false) => {
                out.push_str(title);
                out.push(' ');
                out.push_str(page);
            }
            _ => {
                // Only one half present; emit it without a separator.
                out.push_str(title);
                out.push_str(page);
            }
        }
        out.push('\n');
    }
    out.push('\n');
}
/// Extracts the text of a bordered-table cell: prefers the chunk-level
/// `content`, falling back to concatenating nested content elements. Both
/// paths run OCR-text normalization; the fallback additionally repairs words
/// fragmented across elements.
fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
    if !cell.content.is_empty() {
        let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
        return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
            &chunks,
        ));
    }
    let mut text = String::new();
    for elem in &cell.contents {
        match elem {
            ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
            ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
            ContentElement::TextLine(tl) => text.push_str(&tl.value()),
            ContentElement::TextChunk(tc) => text.push_str(&tc.value),
            _ => {}
        }
    }
    normalize_common_ocr_text(&repair_fragmented_words(&text))
}
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
let lines: Vec<&str> = markdown.lines().collect();
if lines.len() < 4 {
return markdown.to_string();
}
fn count_pipe_cols(line: &str) -> usize {
let t = line.trim();
if !t.starts_with('|') || !t.ends_with('|') {
return 0;
}
t.split('|').count().saturating_sub(2)
}
fn is_separator(line: &str) -> bool {
let t = line.trim();
if !t.starts_with('|') || !t.ends_with('|') {
return false;
}
let cells: Vec<&str> = t.split('|').collect();
if cells.len() < 3 {
return false;
}
cells[1..cells.len() - 1].iter().all(|c| {
let s = c.trim();
!s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
})
}
fn is_pipe_row(line: &str) -> bool {
let t = line.trim();
t.starts_with('|') && t.ends_with('|') && t.len() > 2
}
fn pipe_cells(line: &str) -> Vec<String> {
let t = line.trim();
if !is_pipe_row(t) {
return Vec::new();
}
let parts = t.split('|').collect::<Vec<_>>();
parts[1..parts.len() - 1]
.iter()
.map(|cell| cell.trim().to_string())
.collect()
}
fn normalize_header_cell(cell: &str) -> String {
cell.chars()
.filter(|ch| ch.is_alphanumeric())
.flat_map(|ch| ch.to_lowercase())
.collect()
}
fn looks_like_header_row(line: &str) -> bool {
let cells = pipe_cells(line);
if cells.len() < 2 {
return false;
}
let non_empty = cells
.iter()
.filter(|cell| !cell.trim().is_empty())
.collect::<Vec<_>>();
if non_empty.len() < 2 {
return false;
}
let headerish = non_empty.iter().all(|cell| {
let trimmed = cell.trim();
let word_count = trimmed.split_whitespace().count();
let has_alpha = trimmed.chars().any(|ch| ch.is_alphabetic());
has_alpha && word_count <= 4 && trimmed.len() <= 28
});
headerish
}
fn header_overlap_ratio(left: &str, right: &str) -> f64 {
let left_cells = pipe_cells(left)
.into_iter()
.map(|cell| normalize_header_cell(&cell))
.collect::<Vec<_>>();
let right_cells = pipe_cells(right)
.into_iter()
.map(|cell| normalize_header_cell(&cell))
.collect::<Vec<_>>();
let width = left_cells.len().min(right_cells.len());
if width == 0 {
return 0.0;
}
let matches = (0..width)
.filter(|idx| {
!left_cells[*idx].is_empty()
&& !right_cells[*idx].is_empty()
&& left_cells[*idx] == right_cells[*idx]
})
.count();
matches as f64 / width as f64
}
fn header_schema_matches(left: &str, right: &str) -> bool {
let left_cells = pipe_cells(left)
.into_iter()
.map(|cell| normalize_header_cell(&cell))
.collect::<Vec<_>>();
let right_cells = pipe_cells(right)
.into_iter()
.map(|cell| normalize_header_cell(&cell))
.collect::<Vec<_>>();
if left_cells.len() != right_cells.len() || left_cells.len() < 2 {
return false;
}
let mut aligned_non_empty = 0usize;
for (left, right) in left_cells.iter().zip(right_cells.iter()) {
if left.is_empty() || right.is_empty() {
continue;
}
aligned_non_empty += 1;
if left != right {
return false;
}
}
aligned_non_empty >= 2
}
fn pad_pipe_row(line: &str, target_cols: usize) -> String {
let t = line.trim();
let current_cols = count_pipe_cols(t);
if current_cols >= target_cols {
return t.to_string();
}
let mut result = t.to_string();
for _ in current_cols..target_cols {
result.push_str(" |");
}
result
}
struct Block {
start: usize,
sep: usize,
end: usize, cols: usize,
}
// Detect every pipe-table block: a pipe header row immediately followed
// by a separator row, then all consecutive non-separator pipe rows.
let mut blocks: Vec<Block> = Vec::new();
let mut i = 0;
while i < lines.len() {
if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
let cols = count_pipe_cols(lines[i]);
let sep = i + 1;
let mut end = sep;
let mut j = sep + 1;
// Body rows end at the next separator (a new table) or a non-pipe line.
while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
end = j;
j += 1;
}
blocks.push(Block {
start: i,
sep,
end,
cols,
});
i = end + 1;
} else {
i += 1;
}
}
// With fewer than two tables there is nothing to merge; return unchanged.
if blocks.len() < 2 {
return markdown.to_string();
}
// merge_leader[bi] = Some(l) marks block bi as merged into the chain led by
// block l; group_cols[l] tracks the widest column count seen in that chain.
let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
for bi in 1..blocks.len() {
let prev = &blocks[bi - 1];
let curr = &blocks[bi];
let gap_range = prev.end + 1..curr.start;
let gap_all_blank = gap_range.clone().all(|li| lines[li].trim().is_empty());
// Chains are transitive: compare against the previous block's leader,
// using the chain's (possibly already widened) column count.
let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
let effective_prev_cols = group_cols[leader_idx];
// Merge case 1: the gap holds only one or two short `#` heading lines
// between two multi-column tables — treated as an interleaved artifact.
let gap_heading_only = if !gap_all_blank && effective_prev_cols >= 2 && curr.cols >= 2 {
let non_blank: Vec<usize> = gap_range
.clone()
.filter(|li| !lines[*li].trim().is_empty())
.collect();
!non_blank.is_empty()
&& non_blank.len() <= 2
&& non_blank.iter().all(|li| {
let t = lines[*li].trim();
t.starts_with('#') && t.len() < 100
})
} else {
false
};
// Merge case 2: the gap is a single short plain-text fragment (not a
// heading, bullet, colon-bearing label, or "TABLE" marker) — likely a
// stray cell fragment split off from the table.
let gap_short_fragment =
if !gap_all_blank && !gap_heading_only && effective_prev_cols >= 2 && curr.cols >= 2 {
let non_blank: Vec<usize> = gap_range
.clone()
.filter(|li| !lines[*li].trim().is_empty())
.collect();
non_blank.len() == 1 && {
let t = lines[non_blank[0]].trim();
t.len() < 30
&& !t.starts_with('#')
&& !t.starts_with('-')
&& !t.starts_with('*')
&& !t.contains(':')
&& !t.contains("TABLE")
}
} else {
false
};
// Veto: never merge when the second table carries its own distinct
// header (different schema, and either a different width or less than a
// full header-word overlap).
let prev_has_header = looks_like_header_row(lines[prev.start]);
let curr_has_header = curr.end >= curr.sep + 2 && looks_like_header_row(lines[curr.start]);
let curr_has_distinct_header = prev_has_header
&& curr_has_header
&& !header_schema_matches(lines[prev.start], lines[curr.start])
&& (curr.cols != prev.cols
|| header_overlap_ratio(lines[prev.start], lines[curr.start]) < 1.0);
if (gap_all_blank || gap_heading_only || gap_short_fragment)
&& prev.cols > 0
&& curr.cols > 0
&& !curr_has_distinct_header
{
merge_leader[bi] = Some(leader_idx);
// The chain adopts the widest member's column count.
if curr.cols > group_cols[leader_idx] {
group_cols[leader_idx] = curr.cols;
}
}
}
// Every block gets padded to its chain leader's final column count.
let mut pad_target: Vec<usize> = vec![0; blocks.len()];
for bi in 0..blocks.len() {
let leader = merge_leader[bi].unwrap_or(bi);
pad_target[bi] = group_cols[leader];
}
// For merged blocks: drop blank gap lines and the follower's duplicate
// separator; convert non-blank gap lines into single-cell pipe rows.
let mut skip = vec![false; lines.len()];
let mut convert_to_pipe_row = vec![false; lines.len()];
for (bi, leader) in merge_leader.iter().enumerate() {
if leader.is_none() {
continue;
}
let prev_end = blocks[bi - 1].end;
let curr = &blocks[bi];
for li in (prev_end + 1)..curr.start {
if lines[li].trim().is_empty() {
skip[li] = true;
} else {
convert_to_pipe_row[li] = true;
}
}
skip[curr.sep] = true;
}
// Map each line to its owning block so the pad target can be looked up.
let mut line_to_block: Vec<Option<usize>> = vec![None; lines.len()];
for (bi, block) in blocks.iter().enumerate() {
line_to_block[block.start..=block.end].fill(Some(bi));
}
// Converted gap lines are attributed to the preceding block.
for (bi, leader) in merge_leader.iter().enumerate() {
if leader.is_none() {
continue;
}
let prev_end = blocks[bi - 1].end;
let curr = &blocks[bi];
for li in (prev_end + 1)..curr.start {
if convert_to_pipe_row[li] {
line_to_block[li] = Some(bi - 1);
}
}
}
// Re-emit the document line by line, applying skips, gap-line conversion,
// and column padding.
let mut result = String::new();
for (li, line) in lines.iter().enumerate() {
if skip[li] {
continue;
}
if convert_to_pipe_row[li] {
// Strip heading markers and emit the text in the first cell of a
// full-width row; the remaining cells stay empty.
let text = line.trim().trim_start_matches('#').trim();
if let Some(bi) = line_to_block[li] {
let target = pad_target[bi];
if target > 0 && !text.is_empty() {
result.push_str(&format!("| {} ", text));
for _ in 1..target {
result.push_str("| ");
}
result.push_str("|\n");
continue;
}
}
result.push_str(line);
result.push('\n');
continue;
}
if let Some(bi) = line_to_block[li] {
let target = pad_target[bi];
if target > 0 && is_pipe_row(line) && !is_separator(line) {
// Pad body/header rows to the chain's final width.
result.push_str(&pad_pipe_row(line, target));
result.push('\n');
} else if target > 0 && is_separator(line) {
// Rebuild the separator with one `---` cell per padded column.
result.push('|');
for _ in 0..target {
result.push_str(" --- |");
}
result.push('\n');
} else {
result.push_str(line);
result.push('\n');
}
} else {
result.push_str(line);
result.push('\n');
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::models::bbox::BoundingBox;
use crate::models::chunks::TextChunk;
use crate::models::content::ContentElement;
use crate::models::enums::{PdfLayer, TextFormat, TextType};
use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
use crate::models::table::{
TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
};
use crate::models::text::{TextBlock, TextColumn, TextLine};
#[test]
fn test_empty_doc() {
// A document with no content should render the placeholder message.
let rendered = to_markdown(&PdfDocument::new("test.pdf".to_string())).unwrap();
assert!(rendered.contains("No content extracted"));
}
#[test]
fn test_with_title() {
// A non-empty title must become the leading H1 heading.
let mut doc = PdfDocument::new("test.pdf".to_string());
doc.title = Some(String::from("My Title"));
let rendered = to_markdown(&doc).unwrap();
assert!(rendered.starts_with("# My Title\n"));
}
#[test]
fn test_empty_title_not_rendered() {
// A whitespace-only title must not emit any heading.
let mut doc = PdfDocument::new("test.pdf".to_string());
doc.title = Some(String::from(" "));
let rendered = to_markdown(&doc).unwrap();
assert!(
!rendered.contains("# "),
"Empty/whitespace title should not produce a heading"
);
}
#[test]
fn test_repair_fragmented_words() {
// Words split by spurious OCR spaces should be rejoined.
let repaired = repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts");
assert_eq!(repaired, "Jurisdiction Foreign Requirements");
}
#[test]
fn test_normalize_common_ocr_text_repairs_units() {
// Mis-OCR'd micro/degree glyphs are normalized to the proper symbols.
let normalized = normalize_common_ocr_text("10 ߤL at 37 C and -20 oC");
assert_eq!(normalized, "10 μL at 37°C and -20°C");
}
// Fixture test: from wrapped layout-mode text lines, the anchor-row builder
// must reconstruct a 4-column matrix keyed by the detected header.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
let lines = vec![
"Key Functions by Main Service Flow".to_string(),
"".to_string(),
" Service Stage Function Name Explanation Expected Benefit".to_string(),
"".to_string(),
" 1. Project creation Project creation and Select document type to automatically run project creation, Pipeline configuration with The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
"".to_string(),
" management recommended Modelset and Endpoint deployment the entire process from project creation to deployment, improving work efficiency".to_string(),
"".to_string(),
" Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
" 2. Data labeling and Data storage management Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
" (search using image metadata, sorting, filtering, hashtags settings on image data) service".to_string(),
" fine-tuning".to_string(),
" Image data bookmark for Qualitative Evaluation".to_string(),
"".to_string(),
" Create and manage Labeling Creating a Labeling Space to manage raw data annotation, managing labeling resources Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
" (Ontology, Characters to be Recognized), data set dump, data set version management supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
" Space".to_string(),
" 3 increases both efficiency and convenience.".to_string(),
" Various basic models for each selected 5".to_string(),
" document, information comparison between".to_string(),
" Model training Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
" models, basic model training, training pause function, re-training, cancel function, and OCR model specialized to the customers’ needs".to_string(),
" configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
" modified while developing specialized models".to_string(),
];
let header = find_layout_header_candidate(&lines).unwrap();
let rows =
build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();
// Header detection must yield the four column titles in reading order.
assert_eq!(
header.headers,
vec![
"Service Stage".to_string(),
"Function Name".to_string(),
"Explanation".to_string(),
"Expected Benefit".to_string()
]
);
// Four logical rows, with wrapped fragments merged back into their cells.
assert_eq!(rows.len(), 4);
assert_eq!(rows[0][0], "1. Project creation");
assert_eq!(rows[0][1], "Project creation and management");
assert!(rows[1][0].contains("fine-tuning"));
assert_eq!(rows[2][1], "Create and manage Labeling Space");
assert_eq!(rows[3][1], "Model training");
assert!(rows[3][2].contains("Various basic models for each selected document"));
}
// Fixture test: a panel table whose row labels ("Pack", "Application",
// "Highlight") sit in a left stub column must be rebuilt as stub + 3 columns.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
let lines = vec![
"AI Pack".to_string(),
"Upstage offers 3 AI packs that process unstructured information and data".to_string(),
"".to_string(),
" OCR Recommendation Product semantic search".to_string(),
"".to_string(),
" A solution that recognizes characters in an A solution that recommends the best products and A solution that enables semantic search, analyzes and".to_string(),
" image and extracts necessary information contents organizes key information in unstructured text data".to_string(),
" Pack".to_string(),
" into a standardized form (DB)".to_string(),
"".to_string(),
" Applicable to all fields that require text extraction Applicable to all fields that use any form of Applicable to all fields that deal with various types of".to_string(),
" from standardized documents, such as receipts, recommendation including alternative products, unstructured data containing text information that".to_string(),
"Application bills, credit cards, ID cards, certificates, and medical products and contents that are likely to be require semantic search and conversion into a DB".to_string(),
" receipts purchased next".to_string(),
"".to_string(),
" Achieved 1st place in the OCR World Competition Team with specialists and technologies that Creation of the first natural language evaluation".to_string(),
" The team includes specialists who have received Kaggle’s Gold Medal recommendation system in Korean (KLUE)".to_string(),
" presented 14 papers in the world’s most (Education platform) World’s No.1 in Kaggle text embedding competition in".to_string(),
" Highlight".to_string(),
" renowned AI conferences Proven superior performance of more than 170% E-commerce subject (Shopee)".to_string(),
" compared to other global top-tier recommendation".to_string(),
" models".to_string(),
];
let header = find_layout_panel_header_candidate(&lines).unwrap();
let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();
assert_eq!(
header.headers,
vec![
"OCR".to_string(),
"Recommendation".to_string(),
"Product semantic search".to_string()
]
);
// Rows are stub label + one cell per header column (4 cells total).
assert_eq!(rows.len(), 3);
assert_eq!(rows[0][0], "Pack");
assert!(rows[0][1].contains("image and extracts necessary information"));
assert_eq!(rows[1][0], "Application");
assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
assert_eq!(rows[2][0], "Highlight");
assert!(rows[2][2].contains("top-tier recommendation models"));
}
// Fixture test: a TOC entry whose title wraps onto the next line ("Campaign")
// must be merged back into a single entry with the original page number.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_extract_layout_toc_entries_merges_wrapped_entry() {
let lines = vec![
"Table of Contents".to_string(),
"".to_string(),
"Executive Summary 4".to_string(),
"Legal Framework 6".to_string(),
"Election Administration 11".to_string(),
"Civil Society Engagement 15".to_string(),
"Political Parties, Candidates Registration and Election 18".to_string(),
"Campaign".to_string(),
"Media Freedom and Access to Information 25".to_string(),
"Voter Education and Awareness 29".to_string(),
"Participation of Marginalized Sectors 31".to_string(),
"Recommendations 39".to_string(),
];
let (title, entries) = extract_layout_toc_entries(&lines).unwrap();
assert_eq!(title, "Table of Contents");
// 10 raw lines collapse to 9 entries after merging the wrapped title.
assert_eq!(entries.len(), 9);
assert_eq!(entries[0].title, "Executive Summary");
assert_eq!(entries[0].page, "4");
assert_eq!(
entries[4].title,
"Political Parties, Candidates Registration and Election Campaign"
);
assert_eq!(entries[4].page, "18");
}
// Test fixture: build a layout line in block 0 from (text, left_x, right_x)
// word triples plus the line's bottom/top y coordinates.
#[cfg(not(target_arch = "wasm32"))]
fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
make_bbox_layout_line_in_block(0, words, bottom, top)
}
#[cfg(not(target_arch = "wasm32"))]
fn make_bbox_layout_line_in_block(
block_id: usize,
words: &[(&str, f64, f64)],
bottom: f64,
top: f64,
) -> BBoxLayoutLine {
BBoxLayoutLine {
block_id,
bbox: BoundingBox::new(
Some(1),
words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
bottom,
words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
top,
),
words: words
.iter()
.map(|(text, left, right)| BBoxLayoutWord {
bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
text: (*text).to_string(),
})
.collect(),
}
}
// Fixture test: synthetic bbox lines for a two-column "open plate" table
// (common name | scientific name) plus its caption; the detector must recover
// heading, a synthesized header row, four body rows, and the caption.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
let lines = vec![
make_bbox_layout_line(
&[
("Fish", 60.0, 76.0),
("species", 78.0, 107.0),
("on", 109.0, 119.0),
("IUCN", 121.0, 142.0),
("Red", 144.0, 159.0),
("List", 161.0, 176.0),
],
649.0,
660.0,
),
make_bbox_layout_line(
&[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
632.0,
643.0,
),
make_bbox_layout_line(
&[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
632.0,
643.0,
),
// One physical line carrying both columns' words (merged extraction).
make_bbox_layout_line(
&[
("La", 60.0, 69.0),
("Palma", 71.0, 94.0),
("Pupfish", 96.0, 124.0),
("Cyprinodon", 132.0, 176.0),
("longidorsalis", 178.0, 224.0),
],
616.0,
627.0,
),
make_bbox_layout_line(
&[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
600.0,
611.0,
),
make_bbox_layout_line(
&[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
600.0,
611.0,
),
make_bbox_layout_line(
&[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
584.0,
595.0,
),
make_bbox_layout_line(
&[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
584.0,
595.0,
),
// Caption line below the table body.
make_bbox_layout_line(
&[
("Table", 56.0, 74.0),
("6.1:", 76.0, 87.0),
("Four", 89.0, 105.0),
("fish", 107.0, 119.0),
("species", 121.0, 145.0),
("on", 147.0, 155.0),
("IUCN", 157.0, 176.0),
("Red", 178.0, 190.0),
("List", 192.0, 205.0),
("held", 279.0, 293.0),
("in", 295.0, 302.0),
("public", 304.0, 325.0),
("aquariums.", 327.0, 365.0),
],
556.0,
566.0,
),
];
let plate = detect_layout_open_plate(576.0, &lines).unwrap();
assert_eq!(plate.heading, "Fish species on IUCN Red List");
assert_eq!(
plate.header_row,
vec![
"Fish species on IUCN Red List".to_string(),
"Scientific name".to_string()
]
);
assert_eq!(plate.rows.len(), 4);
assert_eq!(
plate.rows[1],
vec![
"La Palma Pupfish".to_string(),
"Cyprinodon longidorsalis".to_string()
]
);
assert!(plate
.caption
.starts_with("Table 6.1: Four fish species on IUCN Red List"));
}
// Fixture test: below a detected plate (cutoff_top_y), left-column prose must
// be stitched into a bridge paragraph while figure captions (block 1 and the
// later "Figure 6.4" line) are deferred instead of interleaved.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
let plate = OpenPlateCandidate {
heading: "Fish species on IUCN Red List".to_string(),
header_row: vec![
"Fish species on IUCN Red List".to_string(),
"Scientific name".to_string(),
],
rows: vec![],
caption: "Table 6.1".to_string(),
cutoff_top_y: 560.0,
};
let lines = vec![
make_bbox_layout_line(
&[
("Public", 56.0, 83.0),
("aquariums,", 88.0, 135.0),
("because", 140.0, 174.0),
],
509.0,
521.0,
),
make_bbox_layout_line(
&[
("of", 180.0, 188.0),
("their", 194.0, 214.0),
("in-", 220.0, 233.0),
],
509.0,
521.0,
),
make_bbox_layout_line(
&[
("house", 56.0, 82.0),
("expertise,", 84.0, 125.0),
("can", 128.0, 143.0),
],
495.0,
507.0,
),
make_bbox_layout_line(
&[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)],
495.0,
507.0,
),
// Figure caption in a separate block (block_id 1) — must be deferred.
make_bbox_layout_line_in_block(
1,
&[
("Figure", 242.0, 265.0),
("6.3:", 267.0, 280.0),
("Photo", 282.0, 303.0),
],
355.0,
366.0,
),
make_bbox_layout_line_in_block(
1,
&[
("of", 305.0, 312.0),
("the", 314.0, 325.0),
("species.", 327.0, 360.0),
],
355.0,
366.0,
),
make_bbox_layout_line(
&[
("The", 56.0, 73.0),
("breeding", 77.0, 114.0),
("colonies", 118.0, 153.0),
],
330.0,
342.0,
),
make_bbox_layout_line(
&[
("of", 157.0, 165.0),
("the", 169.0, 183.0),
("Butterfly", 187.0, 224.0),
("Splitfin", 228.0, 258.0),
("at", 314.0, 323.0),
("the", 327.0, 341.0),
("London", 345.0, 377.0),
("Zoo", 381.0, 397.0),
("and", 401.0, 416.0),
("elsewhere", 420.0, 463.0),
("serve", 467.0, 489.0),
("as", 493.0, 502.0),
("ark", 506.0, 519.0),
],
330.0,
342.0,
),
make_bbox_layout_line(
&[
("Figure", 56.0, 79.0),
("6.4:", 81.0, 94.0),
("Lake", 96.0, 116.0),
("Sturgeon", 118.0, 158.0),
],
104.0,
116.0,
),
];
let bridge = extract_layout_narrative_bridge(576.0, &lines, &plate).unwrap();
assert!(bridge
.bridge_paragraph
.as_deref()
.is_some_and(|text| text.contains("Public aquariums") && text.contains("expertise")));
assert_eq!(bridge.deferred_captions.len(), 2);
assert!(bridge.deferred_captions[0].contains("Figure 6.3:"));
assert!(bridge.deferred_captions[0].contains("species."));
}
// Real-PDF integration test (needs the benchmark fixture and the
// pdftotext-based bbox reader; skipped on wasm and Windows): the OCR
// benchmark dashboard detector must recover both metric tables.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &lines).unwrap();
assert_eq!(
dashboard.title,
"Base Model Performance Evaluation of Upstage OCR Pack"
);
assert_eq!(dashboard.left_columns.len(), 2);
assert_eq!(
dashboard.left_columns[0],
"Scene (Photographed document image)"
);
assert_eq!(
dashboard.left_rows[0],
vec![
"Company A²".to_string(),
"70.23".to_string(),
"80.41".to_string()
]
);
assert_eq!(
dashboard.right_rows[0],
vec![
"OCR-Recall³".to_string(),
"73.2".to_string(),
"94.2".to_string(),
"94.1".to_string()
]
);
assert_eq!(dashboard.right_rows[3][0], "Parsing-F¹");
assert_eq!(dashboard.right_rows[3][1], "68.0");
assert_eq!(dashboard.right_rows[3][2], "82.65");
assert_eq!(dashboard.right_rows[3][3], "82.65");
assert!(!dashboard.definition_notes.is_empty());
assert!(!dashboard.source_notes.is_empty());
}
// Span splitting must handle multi-byte typographic quotes without
// panicking on non-char byte boundaries.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_split_layout_line_spans_handles_unicode_boundaries() {
let spans = split_layout_line_spans("Title “Podcast #EP32: SDGs dan Anak Muda” 2024");
assert_eq!(spans.len(), 3);
assert_eq!(spans[0].1, "Title");
assert!(spans[1].1.contains("Podcast #EP32: SDGs dan Anak Muda"));
assert!(spans[1].1.ends_with('”'));
assert!(spans[2].1.ends_with("24"));
}
// Real-PDF integration test: the single-caption-chart renderer must keep the
// section headings and prose while NOT emitting the chart legend as a table.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_render_layout_single_caption_chart_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
.unwrap()
.kids,
..PdfDocument::new("01030000000037.pdf".to_string())
};
let rendered = render_layout_single_caption_chart_document(&doc).unwrap();
assert!(rendered.contains("# 3. Impact on Business Operations"));
assert!(rendered.contains("## 3.1. Status of Business Operations"));
assert!(rendered.contains("As shown in Figure 3.1.1, the number of MSMEs"));
assert!(
rendered.contains("Figure 3.1.1: Status of operations during each survey phase (%)")
);
assert!(
rendered.contains("lockdown period. In the handicraft/textile sector, 30% of MSMEs")
);
// The chart legend must not be misrendered as a pipe table.
assert!(!rendered.contains("| Lockdown Period |"));
}
// Real-PDF integration test: captioned-media pages render headings, bolded
// captions, and body prose through the full to_markdown pipeline.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
.unwrap()
.kids,
..PdfDocument::new("01030000000072.pdf".to_string())
};
let md = to_markdown(&doc).unwrap();
assert!(md.contains("## Diagram 5"), "{md}");
assert!(
md.contains("**Distribution of Komnas HAM’s YouTube Content (2019-2020)**"),
"{md}"
);
assert!(
md.contains(
"As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers"
),
"{md}"
);
assert!(md.contains("**Figure 4**"), "{md}");
assert!(
md.contains("*Komnas HAM’s YouTube channel as of 1 December 2021*"),
"{md}"
);
}
// Real-PDF integration test: a social-media style page with figure, footnote
// marker (^98), and footnote URL must all survive markdown rendering.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
.unwrap()
.kids,
..PdfDocument::new("01030000000073.pdf".to_string())
};
let md = to_markdown(&doc).unwrap();
assert!(
md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
"{md}"
);
assert!(
md.contains("Examples of such greetings are as follows:"),
"{md}"
);
assert!(md.contains("*Image*"), "{md}");
assert!(md.contains("**Figure 6**"), "{md}");
assert!(md.contains("**DPN Argentina**"), "{md}");
assert!(
md.contains("**Content: World Health Day Celebration (7 April 2021).**^98"),
"{md}"
);
assert!(md.contains("**Footnote:**"), "{md}");
assert!(
md.contains("https://twitter.com/DPNArgentina/status/1379765916259483648."),
"{md}"
);
}
// Negative test: this fixture is a prose page, so the captioned-media
// renderer must decline it (return None) rather than misclassify.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
let converted =
crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: converted.kids,
..PdfDocument::new("01030000000014.pdf".to_string())
};
assert!(render_layout_captioned_media_document(&doc).is_none());
}
// Real-PDF integration test: even when no special layout renderer fires,
// body paragraphs must survive in the default markdown output.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
.unwrap()
.kids,
..PdfDocument::new("01030000000014.pdf".to_string())
};
let md = to_markdown(&doc).unwrap();
assert!(
md.contains("These images also show that different areas are used by men and by women"),
"{md}"
);
}
// Real-PDF integration test: the recommendation-infographic renderer works
// from source_path alone (kids intentionally empty) and emits metric tables.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_recommendation_infographic_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: Vec::new(),
..PdfDocument::new("01030000000183.pdf".to_string())
};
let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();
assert!(rendered.contains("# Recommendation Pack: Track Record"));
assert!(rendered.contains("## Comparison with Beauty Commerce Recommendation Models"));
assert!(rendered.contains("| Graph-RecSys | 0.4048 |"));
assert!(rendered.contains("| Current Service Recommendation Algorithm | 0.159 |"));
assert!(rendered.contains("## Education Content Platform PoC Case"));
assert!(rendered.contains("| DKT Model | 0.882 |"));
assert!(rendered.contains("Compared to regular model"));
}
// Real-PDF integration test for the stacked-bar report renderer. On failure
// (rendered is None) it first dumps intermediate detector state to stderr to
// aid debugging before unwrapping and asserting.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_stacked_bar_report_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: Vec::new(),
..PdfDocument::new("01030000000038.pdf".to_string())
};
let rendered = render_layout_stacked_bar_report_document(&doc);
// Diagnostic block: only runs when detection failed, to show which
// sub-detector (figures, narrative, per-figure parses) fell through.
if rendered.is_none() {
let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
let blocks = collect_bbox_layout_blocks(&lines);
let figures = collect_layout_figure_captions(&blocks);
let narrative = detect_layout_stacked_bar_narrative(&blocks);
eprintln!("page_width={page_width} figures={}", figures.len());
if let Some(first) = figures.first() {
eprintln!("figure1={}", bbox_layout_block_text(first));
}
if let Some(second) = figures.get(1) {
eprintln!("figure2={}", bbox_layout_block_text(second));
}
eprintln!("narrative={}", narrative.is_some());
if let Some(narrative) = &narrative {
eprintln!("heading={}", narrative.heading);
eprintln!("paragraphs={}", narrative.paragraphs.len());
eprintln!("footnote={:?}", narrative.footnote);
}
for block in &blocks {
let text = bbox_layout_block_text(block);
if text.contains("July")
|| text.contains("October")
|| text.contains("January")
|| text.contains("Will ")
|| text.contains("Don’t")
|| text.starts_with("6.2.")
|| text.starts_with("5.")
{
eprintln!(
"block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
block.bbox.top_y,
block.bbox.bottom_y,
block.bbox.left_x,
block.bbox.right_x,
text
);
}
}
if figures.len() >= 2 {
let first = detect_layout_three_month_stacked_figure(
&blocks,
&lines,
page_width,
figures[0].clone(),
figures[1].bbox.top_y,
);
eprintln!("figure_one_ok={}", first.is_some());
if let Some(narrative) = &narrative {
let second = detect_layout_sector_bar_figure(
&blocks,
&lines,
page_width,
figures[1].clone(),
narrative.top_y,
);
eprintln!("figure_two_ok={}", second.is_some());
}
}
}
let rendered = rendered.unwrap();
assert!(rendered.contains("# Figure 6.1.1:"));
assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
}
// Real-PDF integration test: both figures on the page must be rendered as
// separate data tables with their source notes preserved.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
let doc = PdfDocument {
title: None,
source_path: Some(path.to_string_lossy().to_string()),
number_of_pages: 1,
kids: Vec::new(),
..PdfDocument::new("01030000000076.pdf".to_string())
};
let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();
assert!(rendered.contains("# Figures from the Document"));
assert!(
rendered.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
);
assert!(rendered.contains("| 2016 | 3,230 |"));
assert!(rendered.contains("| 2021 | 2,693 |"));
assert!(
rendered.contains("## Figure 1.8. Singapore foreign workforce stock (in thousands)")
);
assert!(rendered.contains("| 2016 (Dec) | 1,393 |"));
assert!(rendered.contains("| 2021 (Dec) | 1,200 |"));
assert!(rendered.contains(
"Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate."
));
}
// Real-PDF integration test: the open-plate renderer reproduces the species
// table, caption, a divider, and the bridging prose paragraph.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_open_plate_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let rendered = render_layout_open_plate_document(&doc).unwrap();
assert!(rendered.contains("# Fish species on IUCN Red List"));
assert!(rendered.contains("| Potosi Pupfish | Cyprinodon alvarezi |"));
assert!(rendered.contains("| Golden Skiffia | Skiffia francesae |"));
assert!(rendered.contains("*Table 6.1: Four fish species on IUCN Red List"));
assert!(rendered.contains("---"));
assert!(rendered.contains("Public aquariums, because of their inhouse expertise"));
}
// Real-PDF integration test: the open-plate path must also be reachable via
// the public to_markdown entry point (not just the renderer directly).
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_open_plate_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.contains("# Fish species on IUCN Red List"), "{md}");
assert!(
md.contains("| Potosi Pupfish | Cyprinodon alvarezi |"),
"{md}"
);
assert!(
md.contains("| Golden Skiffia | Skiffia francesae |"),
"{md}"
);
assert!(
md.contains("*Table 6.1: Four fish species on IUCN Red List"),
"{md}"
);
assert!(
md.contains("The breeding colonies of the Butterfly Splitfin"),
"{md}"
);
}
// Negative regression test: a prose-heavy survey page must not be claimed by
// the open-plate detector (checked via the heading it would otherwise emit).
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.contains("# 2. General Profile of MSMEs"), "{md}");
assert!(
md.contains("In July 2020, the survey established a general profile"),
"{md}"
);
assert!(
md.contains(
"The tourism sub-sectors interviewed included lodging, restaurants and bars"
),
"{md}"
);
assert!(
!md.starts_with("# Business characteristics. Business size was"),
"{md}"
);
}
// Negative regression test: a figure-bearing report page keeps its prose and
// figure caption and must not start with a misclassified open-plate heading.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(
md.contains(
"Thailand, Philippines and Indonesia in particular, identifying known experts"
),
"{md}"
);
assert!(
md.contains("Figure 1: Age by gender of respondents"),
"{md}"
);
assert!(md.contains("Gender Analysis of Violent Extremism"), "{md}");
assert!(
!md.starts_with("# Thailand, Philippines and Indonesia in"),
"{md}"
);
}
// Negative regression test: a page with a real data table plus surrounding
// prose keeps both, and must not open with a misclassified plate heading.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.contains("estuarine influenced areas."), "{md}");
assert!(md.contains("| MANILA | 2454 | 6,125 |"), "{md}");
assert!(
md.contains("The port of Manila has been documented"),
"{md}"
);
assert!(!md.starts_with("# CAGAYAN DE ORO"), "{md}");
}
// Real-PDF integration test: footnote/citation regions are detected and each
// region's rendered output is an HTML table keyed by footnote number.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_detect_footnote_citation_regions_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let regions = detect_footnote_citation_regions(&doc);
assert!(!regions.is_empty(), "{regions:?}");
// One region covering footnotes 25–29 …
assert!(
regions.iter().any(|region| {
region.rendered.contains("<table>")
&& region.rendered.contains("<td>25</td>")
&& region.rendered.contains("<td>29</td>")
}),
"{regions:#?}"
);
// … and another covering footnotes 30–33.
assert!(
regions.iter().any(|region| {
region.rendered.contains("<table>")
&& region.rendered.contains("<td>30</td>")
&& region.rendered.contains("<td>33</td>")
}),
"{regions:#?}"
);
}
// Real-PDF integration test: the footnote-citation HTML tables must make it
// through the public to_markdown output, headers and individual rows intact.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.contains("<table>"), "{md}");
assert!(md.contains("<th>Footnote</th><th>Citation</th>"), "{md}");
assert!(md.contains("<td>25</td><td>Wiliam Beckford"), "{md}");
assert!(
md.contains("<td>29</td><td>Pope, The Rape of the Lock, 69.</td>"),
"{md}"
);
assert!(
md.contains("<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>"),
"{md}"
);
assert!(
md.contains("<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>"),
"{md}"
);
assert!(
md.contains("<td>33</td><td>M.M., Pharmacopoia Reformata:"),
"{md}"
);
}
// Real-PDF integration test: a projection worksheet page renders its data
// grid, figure caption, template link placeholder, and running footer.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_projection_sheet_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.contains("# Table and Figure from the Document"), "{md}");
assert!(md.contains("| A | B | C | D | E |"), "{md}");
assert!(
md.contains("| 10 | 8 | 19.73214458 | 17.99 | 21.47 |"),
"{md}"
);
assert!(
md.contains("**Figure 13.3. Graph of Projection Estimates**"),
"{md}"
);
assert!(md.contains("[Open Template in Microsoft Excel](#)"), "{md}");
assert!(
md.contains("*298 | Ch. 13. Homogeneous Investment Types*"),
"{md}"
);
}
// Real-PDF integration test: an appendix page with two labeled tables renders
// both tables with their headers, rows, and italicized source notes.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_appendix_tables_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.contains("# Appendices"), "{md}");
assert!(
md.contains("## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS"),
"{md}"
);
assert!(md.contains("| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |"), "{md}");
assert!(
md.contains("| Less than 3 months | 4,448 | 21.3% | 17.0% |"),
"{md}"
);
assert!(
md.contains("## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES"),
"{md}"
);
assert!(
md.contains(
"| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |"
),
"{md}"
);
assert!(md.contains("| Gujarat | 1469 | 15.6 | 200.4 |"), "{md}");
assert!(
md.contains("*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*"),
"{md}"
);
assert!(md.contains("*Exchange rate: Rs 75 to USD*"), "{md}");
}
// Real-PDF integration test: a titled page with two case-study tables starts
// with the document title and keeps both tables plus the footnote.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
let path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
let md = to_markdown(&doc).unwrap();
assert!(md.starts_with("# Jailed for Doing Business"), "{md}");
assert!(
md.contains("## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*"),
"{md}"
);
assert!(
md.contains("| Percentage of imprisonment clauses | 20% | 30% | 37% |"),
"{md}"
);
assert!(
md.contains("## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*"),
"{md}"
);
assert!(
md.contains("| 5 years to 10 years | 19 | 19 | 19 |"),
"{md}"
);
assert!(
md.contains("*These are real data from three NBFCs*"),
"{md}"
);
}
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_registration_report_document_on_real_pdf() {
    // End-to-end check: wide registration table must keep long party names in
    // a single row (no spurious continuation rows such as "| | Democracy Party |").
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
    let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let md = to_markdown(&doc).unwrap();
    assert!(
        md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
        "{md}"
    );
    let expected_snippets = [
        "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |",
        "| | Total | | 84,208 | | 86,092 | +1,884 |",
    ];
    for snippet in expected_snippets {
        assert!(md.contains(snippet), "{md}");
    }
    assert!(!md.contains("| | Democracy Party |"), "{md}");
}
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_dual_table_article_document_on_real_pdf() {
    // End-to-end check: both table captions render once (no duplicated
    // "Table 6:" prefix) and stray merge-candidate rows are dropped.
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
    let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let md = to_markdown(&doc).unwrap();
    assert!(
        md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
        "{md}"
    );
    assert!(
        md.contains("*Table 6*: Performance comparison amongst the merge candidates."),
        "{md}"
    );
    assert!(md.contains("# Table 7: Ablation studies on the different merge methods used for obtaining the final model"), "{md}");
    for forbidden in ["*Table 6*: Table 6:", "| Merge v1"] {
        assert!(!md.contains(forbidden), "{md}");
    }
}
#[test]
fn test_normalize_list_text_strips_redundant_bullets() {
    // A leading bullet glyph is stripped from list-item text...
    let cleaned = normalize_list_text("• Collected via surveys");
    assert_eq!(cleaned, "Collected via surveys");
    // ...and a lone bullet counts as marker-only content.
    assert!(is_pure_bullet_marker("•"));
}
#[test]
fn test_reference_continuation_detected() {
    // An arXiv identifier line is treated as a continuation of the
    // preceding bibliography entry and merged into it.
    let merged = should_merge_paragraph_text(
        "Scaling laws for transfer.",
        "arXiv preprint arXiv:2102.01293.",
    );
    assert!(merged);
}
#[test]
fn test_enumerated_markers_are_detected() {
    // Roman numerals, parenthesized digits, and lettered markers all count
    // as enumeration prefixes...
    for positive in ["iii. Third item", "1) First item", "a. Lettered item"] {
        assert!(starts_with_enumerated_marker(positive));
    }
    // ...while figure captions and plain prose do not.
    for negative in ["Figure 1. Caption", "Natural dispersal"] {
        assert!(!starts_with_enumerated_marker(negative));
    }
}
/// Builds a single-line level-1 `Heading` fixture at a fixed position on page 1.
///
/// The chunk → line → block → column hierarchy all share one bounding box
/// (x 72–300, y 700–712), so the heading behaves as a single tightly-packed
/// unit in the layout heuristics exercised by the tests below.
fn make_heading(text: &str) -> ContentElement {
    let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
    // Bold 12pt chunk: weight 700 and a "-Bold" font name mark this text as
    // heading-like for the renderer's font-based heuristics.
    let chunk = TextChunk {
        value: text.to_string(),
        bbox: bbox.clone(),
        font_name: "Lato-Bold".to_string(),
        font_size: 12.0,
        font_weight: 700.0,
        italic_angle: 0.0,
        font_color: "#000000".to_string(),
        contrast_ratio: 21.0,
        symbol_ends: vec![],
        text_format: TextFormat::Normal,
        text_type: TextType::Regular,
        pdf_layer: PdfLayer::Main,
        ocg_visible: true,
        index: None,
        page_number: Some(1),
        level: None,
        mcid: None,
    };
    // Single complete line (both line-start and line-end flags set).
    let line = TextLine {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: 12.0,
        base_line: 702.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_chunks: vec![chunk],
        is_line_start: true,
        is_line_end: true,
        is_list_line: false,
        connected_line_art_label: None,
    };
    let block = TextBlock {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: 12.0,
        base_line: 702.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_lines: vec![line],
        has_start_line: true,
        has_end_line: true,
        text_alignment: None,
    };
    let column = TextColumn {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: 12.0,
        base_line: 702.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_blocks: vec![block],
    };
    ContentElement::Heading(SemanticHeading {
        base: SemanticParagraph {
            base: SemanticTextNode {
                bbox,
                index: None,
                level: None,
                semantic_type: crate::models::enums::SemanticType::Heading,
                correct_semantic_score: None,
                columns: vec![column],
                font_weight: Some(700.0),
                font_size: Some(12.0),
                text_color: None,
                italic_angle: None,
                font_name: Some("Lato-Bold".to_string()),
                text_format: None,
                max_font_size: Some(12.0),
                background_color: None,
                is_hidden_text: false,
            },
            enclosed_top: false,
            enclosed_bottom: false,
            indentation: 0,
        },
        // Explicit level 1 — unlike make_heading_at, which leaves the level
        // for the renderer to infer.
        heading_level: Some(1),
    })
}
/// Builds a single-line `Heading` fixture at an arbitrary bounding box on page 1.
///
/// Unlike [`make_heading`], the font size is derived from the box height
/// (`top - bottom`) and `heading_level` is left `None`, so the renderer must
/// infer the level from layout/position.
fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
    let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
    let chunk = TextChunk {
        value: text.to_string(),
        bbox: bbox.clone(),
        font_name: "Lato-Bold".to_string(),
        // Font size tracks the box height so taller boxes read as larger headings.
        font_size: top - bottom,
        font_weight: 700.0,
        italic_angle: 0.0,
        font_color: "#000000".to_string(),
        contrast_ratio: 21.0,
        symbol_ends: vec![],
        text_format: TextFormat::Normal,
        text_type: TextType::Regular,
        pdf_layer: PdfLayer::Main,
        ocg_visible: true,
        index: None,
        page_number: Some(1),
        level: None,
        mcid: None,
    };
    let line = TextLine {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: top - bottom,
        // Baseline sits slightly above the bottom edge of the box.
        base_line: bottom + 2.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_chunks: vec![chunk],
        is_line_start: true,
        is_line_end: true,
        is_list_line: false,
        connected_line_art_label: None,
    };
    let block = TextBlock {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: top - bottom,
        base_line: bottom + 2.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_lines: vec![line],
        has_start_line: true,
        has_end_line: true,
        text_alignment: None,
    };
    let column = TextColumn {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: top - bottom,
        base_line: bottom + 2.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_blocks: vec![block],
    };
    ContentElement::Heading(SemanticHeading {
        base: SemanticParagraph {
            base: SemanticTextNode {
                bbox,
                index: None,
                level: None,
                semantic_type: crate::models::enums::SemanticType::Heading,
                correct_semantic_score: None,
                columns: vec![column],
                font_weight: Some(700.0),
                font_size: Some(top - bottom),
                text_color: None,
                italic_angle: None,
                font_name: Some("Lato-Bold".to_string()),
                text_format: None,
                max_font_size: Some(top - bottom),
                background_color: None,
                is_hidden_text: false,
            },
            enclosed_top: false,
            enclosed_bottom: false,
            indentation: 0,
        },
        // No explicit level: exercised by tests that rely on inferred levels.
        heading_level: None,
    })
}
/// Convenience wrapper: a paragraph fixture at the default x-range 72–300 on
/// page 1, with only the vertical extent varying between callers.
fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
    make_paragraph_at(72.0, bottom, 300.0, top, text)
}
/// Builds a single-line `Paragraph` fixture at an arbitrary bounding box on
/// page 1, using a regular-weight (400) font.
///
/// The chunk font size is clamped to at least 1.0 so degenerate (flat) boxes
/// still produce a usable fixture; note the outer node's `font_size` /
/// `max_font_size` use the raw, unclamped `top - bottom`.
fn make_paragraph_at(
    left: f64,
    bottom: f64,
    right: f64,
    top: f64,
    text: &str,
) -> ContentElement {
    let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
    let chunk = TextChunk {
        value: text.to_string(),
        bbox: bbox.clone(),
        font_name: "Lato-Regular".to_string(),
        // Clamp so a zero-height box does not yield a zero-sized font.
        font_size: (top - bottom).max(1.0),
        font_weight: 400.0,
        italic_angle: 0.0,
        font_color: "#000000".to_string(),
        contrast_ratio: 21.0,
        symbol_ends: vec![],
        text_format: TextFormat::Normal,
        text_type: TextType::Regular,
        pdf_layer: PdfLayer::Main,
        ocg_visible: true,
        index: None,
        page_number: Some(1),
        level: None,
        mcid: None,
    };
    // Line/block/column inherit font size and baseline from the inner level
    // to keep the hierarchy self-consistent.
    let line = TextLine {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: chunk.font_size,
        base_line: bottom + 2.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_chunks: vec![chunk],
        is_line_start: true,
        is_line_end: true,
        is_list_line: false,
        connected_line_art_label: None,
    };
    let block = TextBlock {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: line.font_size,
        base_line: line.base_line,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_lines: vec![line],
        has_start_line: true,
        has_end_line: true,
        text_alignment: None,
    };
    let column = TextColumn {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: block.font_size,
        base_line: block.base_line,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_blocks: vec![block],
    };
    ContentElement::Paragraph(SemanticParagraph {
        base: SemanticTextNode {
            bbox,
            index: None,
            level: None,
            semantic_type: crate::models::enums::SemanticType::Paragraph,
            correct_semantic_score: None,
            columns: vec![column],
            font_weight: Some(400.0),
            font_size: Some(top - bottom),
            text_color: None,
            italic_angle: None,
            font_name: Some("Lato-Regular".to_string()),
            text_format: None,
            max_font_size: Some(top - bottom),
            background_color: None,
            is_hidden_text: false,
        },
        enclosed_top: false,
        enclosed_bottom: false,
        indentation: 0,
    })
}
/// Builds a bulleted `PDFList` fixture where each item is a one-line paragraph.
///
/// Items are stacked top-down from y=700 at 18pt spacing (12pt tall each).
/// Labels and bodies carry no content of their own (`label_length: 0`), so the
/// renderer must fall back to the item's `contents` paragraph — hence the name.
fn make_fallback_list(items: &[&str]) -> ContentElement {
    let mut list_items = Vec::new();
    for (idx, text) in items.iter().enumerate() {
        let top = 700.0 - idx as f64 * 18.0;
        let bottom = top - 12.0;
        let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
        list_items.push(ListItem {
            bbox: bbox.clone(),
            index: None,
            level: None,
            // Empty label/body: forces the fallback path through `contents`.
            label: ListLabel {
                bbox: bbox.clone(),
                content: vec![],
                semantic_type: None,
            },
            body: ListBody {
                bbox: bbox.clone(),
                content: vec![],
                semantic_type: None,
            },
            label_length: 0,
            contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
            semantic_type: None,
        });
    }
    ContentElement::List(PDFList {
        // List bbox spans from the bottom of the last item to the top of the first.
        bbox: BoundingBox::new(
            Some(1),
            72.0,
            700.0 - items.len() as f64 * 18.0,
            320.0,
            700.0,
        ),
        index: None,
        level: None,
        list_items,
        numbering_style: Some("bullets".to_string()),
        common_prefix: None,
        previous_list_id: None,
        next_list_id: None,
    })
}
/// Builds a bordered two-column table fixture (title, page number) that mimics
/// a table-of-contents block on page 1.
///
/// Rows stack top-down from y=680 at 18pt spacing (12pt tall). Each row has a
/// wide title cell (x 72–280) and a narrow page-number cell (x 320–360).
fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
    let mut table_rows = Vec::new();
    for (ri, (title, page)) in rows.iter().enumerate() {
        let top = 680.0 - ri as f64 * 18.0;
        let bottom = top - 12.0;
        let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
        let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
        table_rows.push(TableBorderRow {
            bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
            index: None,
            level: None,
            row_number: ri,
            cells: vec![
                // Column 0: chapter/section title.
                TableBorderCell {
                    bbox: left_bbox.clone(),
                    index: None,
                    level: None,
                    row_number: ri,
                    col_number: 0,
                    row_span: 1,
                    col_span: 1,
                    content: vec![TableToken {
                        base: TextChunk {
                            value: (*title).to_string(),
                            bbox: left_bbox,
                            font_name: "Lato-Regular".to_string(),
                            font_size: 10.0,
                            font_weight: 400.0,
                            italic_angle: 0.0,
                            font_color: "#000000".to_string(),
                            contrast_ratio: 21.0,
                            symbol_ends: vec![],
                            text_format: TextFormat::Normal,
                            text_type: TextType::Regular,
                            pdf_layer: PdfLayer::Main,
                            ocg_visible: true,
                            index: None,
                            page_number: Some(1),
                            level: None,
                            mcid: None,
                        },
                        token_type: TableTokenType::Text,
                    }],
                    contents: vec![],
                    semantic_type: None,
                },
                // Column 1: page number.
                TableBorderCell {
                    bbox: right_bbox.clone(),
                    index: None,
                    level: None,
                    row_number: ri,
                    col_number: 1,
                    row_span: 1,
                    col_span: 1,
                    content: vec![TableToken {
                        base: TextChunk {
                            value: (*page).to_string(),
                            bbox: right_bbox,
                            font_name: "Lato-Regular".to_string(),
                            font_size: 10.0,
                            font_weight: 400.0,
                            italic_angle: 0.0,
                            font_color: "#000000".to_string(),
                            contrast_ratio: 21.0,
                            symbol_ends: vec![],
                            text_format: TextFormat::Normal,
                            text_type: TextType::Regular,
                            pdf_layer: PdfLayer::Main,
                            ocg_visible: true,
                            index: None,
                            page_number: Some(1),
                            level: None,
                            mcid: None,
                        },
                        token_type: TableTokenType::Text,
                    }],
                    contents: vec![],
                    semantic_type: None,
                },
            ],
            semantic_type: None,
        });
    }
    ContentElement::TableBorder(TableBorder {
        // NOTE(review): the table bbox and y_coordinates below are hardcoded
        // for roughly three 18pt rows and do not scale with `rows.len()` —
        // current callers don't appear to depend on the exact extent; confirm
        // before reusing this fixture for geometry-sensitive tests.
        bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
        index: None,
        level: Some("1".to_string()),
        x_coordinates: vec![72.0, 320.0, 360.0],
        x_widths: vec![0.0, 0.0, 0.0],
        y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
        y_widths: vec![0.0, 0.0, 0.0, 0.0],
        rows: table_rows,
        num_rows: rows.len(),
        num_columns: 2,
        is_bad_table: false,
        is_table_transformer: false,
        previous_table: None,
        next_table: None,
    })
}
#[test]
fn test_contents_document_renders_toc_table_rows() {
    // A "CONTENTS" heading followed by a title/page table should render as a
    // bulleted table of contents rather than a markdown table.
    let mut doc = PdfDocument::new("contents.pdf".to_string());
    doc.kids.push(make_heading("CONTENTS"));
    doc.kids.push(make_toc_table(&[
        ("Experiment #1: Hydrostatic Pressure", "3"),
        ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
        ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
        ("Experiment #4: Energy Loss in Pipes", "33"),
        ("Experiment #5: Impact of a Jet", "43"),
        ("Experiment #6: Orifice and Free Jet Flow", "50"),
        ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
        ("References", "101"),
    ]));
    let md = to_markdown(&doc).unwrap();
    assert!(md.starts_with("# CONTENTS\n\n"));
    for entry in [
        "- Experiment #1: Hydrostatic Pressure 3\n",
        "- Experiment #2: Bernoulli's Theorem Demonstration 13\n",
        "- Experiment #7: Osborne Reynolds' Demonstration 59\n",
        "- References 101\n",
    ] {
        assert!(md.contains(entry));
    }
}
#[test]
fn test_toc_semantic_paragraphs_render_without_blank_lines() {
    // Paragraphs explicitly tagged as TableOfContent should be emitted on
    // consecutive lines without blank-line separation.
    let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
    let entries: [(&str, f64, f64); 2] = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
    ];
    for (text, bottom, top) in entries {
        let mut element = make_paragraph(text, bottom, top);
        if let ContentElement::Paragraph(p) = &mut element {
            p.base.semantic_type = SemanticType::TableOfContent;
        }
        doc.kids.push(element);
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
    ));
}
#[test]
fn test_compact_toc_document_renders_without_blank_lines() {
    // A compact, untagged TOC-like page: chapter lines become H1, section
    // lines become H2, with plain lines (References) left as-is.
    let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
    let entries: [(&str, f64, f64); 8] = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
        (
            "Part VI. Chapter Six - Comparing Three or More Group Means",
            668.0,
            680.0,
        ),
        (
            "Section 6.1: Between Versus Within Group Analyses 49",
            652.0,
            664.0,
        ),
        (
            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
            636.0,
            648.0,
        ),
        (
            "Section 7.1: Mediation and Moderation Models 64",
            620.0,
            632.0,
        ),
        ("References 101", 604.0, 616.0),
        (
            "Section 8.1: Factor Analysis Definitions 75",
            588.0,
            600.0,
        ),
    ];
    for (text, bottom, top) in entries {
        doc.kids.push(make_paragraph(text, bottom, top));
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model"
    ));
    assert!(md.contains(
        "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses"
    ));
    assert!(md.contains("References 101\n\n## Section 8.1: Factor Analysis Definitions"));
}
#[test]
fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
    // A figure caption fused with following body text in one paragraph should
    // be split into two markdown paragraphs at the caption boundary.
    let merged_text = "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.";
    let mut doc = PdfDocument::new("caption-body.pdf".to_string());
    doc.kids.push(make_paragraph(merged_text, 500.0, 540.0));
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains("USGS)\n\nOur nearest astronomical neighbor"));
}
#[test]
fn test_short_caption_label_merges_with_following_tail_and_body() {
    // A bare "Diagram 5" label should join the caption tail on the next line,
    // while the trailing body sentence becomes its own paragraph.
    let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
    let tail_and_body = "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.";
    doc.kids.push(make_paragraph("Diagram 5", 540.0, 552.0));
    doc.kids.push(make_paragraph(tail_and_body, 520.0, 532.0));
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
    ));
}
#[test]
fn test_short_caption_label_merges_with_tail_and_year() {
    // A caption label, its tail, and a dangling year fragment should stay as
    // one caption group — the year must not start a new paragraph.
    let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
    let fragments: [(&str, f64, f64); 3] = [
        ("Figure 4", 540.0, 552.0),
        ("Komnas HAM's YouTube channel as of 1 December", 520.0, 532.0),
        ("2021", 500.0, 512.0),
    ];
    for (text, bottom, top) in fragments {
        doc.kids.push(make_paragraph(text, bottom, top));
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
    assert!(!md.contains("\n\n2021"));
}
#[test]
fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
    // A numeric chart label in the middle of the page ("100") must survive,
    // while the number at the very bottom ("36") is dropped as a page number.
    let mut doc = PdfDocument::new("chart.pdf".to_string());
    let fragments: [(&str, f64, f64); 4] = [
        ("Figure 1", 760.0, 772.0),
        ("100", 520.0, 528.0),
        ("Body text continues here.", 400.0, 412.0),
        ("36", 20.0, 28.0),
    ];
    for (text, bottom, top) in fragments {
        doc.kids.push(make_paragraph(text, bottom, top));
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains("100"));
    assert!(!md.lines().any(|line| line.trim() == "36"));
}
#[test]
fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
    // Two complete sentences already split into separate paragraphs must stay
    // separated by a blank line in the rendered markdown.
    let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
    let paragraphs: [(&str, f64, f64); 2] = [
        ("First semantic paragraph ends here.", 520.0, 532.0),
        ("Second semantic paragraph starts here.", 500.0, 512.0),
    ];
    for (text, bottom, top) in paragraphs {
        doc.kids.push(make_paragraph(text, bottom, top));
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
    ));
}
#[test]
fn test_lowercase_semantic_paragraph_continuation_is_merged() {
    // A short lowercase fragment that completes the previous sentence should
    // be merged back into it rather than rendered as its own paragraph.
    let mut doc = PdfDocument::new("continuation.pdf".to_string());
    let opener = "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference";
    doc.kids.push(make_paragraph(opener, 520.0, 532.0));
    doc.kids.push(make_paragraph("of interest.", 500.0, 512.0));
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
    ));
}
#[test]
fn test_semantic_enumerated_paragraphs_are_not_merged() {
    // Consecutive roman-numeral items must remain distinct paragraphs — the
    // enumeration marker blocks the continuation-merge heuristic.
    let mut doc = PdfDocument::new("enumerated-paragraphs.pdf".to_string());
    let items: [(&str, f64, f64); 2] = [
        (
            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
            520.0,
            532.0,
        ),
        (
            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
            500.0,
            512.0,
        ),
    ];
    for (text, bottom, top) in items {
        doc.kids.push(make_paragraph(text, bottom, top));
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented."
    ));
}
#[test]
fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
    // A figure caption carried over from the previous page, appearing above
    // the first numbered heading, must be dropped from the output entirely.
    let mut doc = PdfDocument::new("leading-figure-carryover.pdf".to_string());
    doc.number_of_pages = 1;
    let carryover_caption =
        "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay";
    doc.kids
        .push(make_paragraph_at(72.0, 742.0, 540.0, 756.0, carryover_caption));
    doc.kids
        .push(make_heading_at(72.0, 680.0, 260.0, 696.0, "5. Natural dispersal"));
    doc.kids.push(make_paragraph_at(
        72.0,
        640.0,
        540.0,
        654.0,
        "Dispersal by purely natural means is not included as a pathway of biological invasions.",
    ));
    let md = to_markdown(&doc).unwrap();
    assert!(md.starts_with("# 5. Natural dispersal"));
    assert!(!md.contains("Figure 6. Mytella strigata"));
}
#[test]
fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
    // Bullet glyphs already present in item text must not be doubled, and
    // marker-only ("•") or stray page-number ("133") items must be dropped.
    let mut doc = PdfDocument::new("bullets.pdf".to_string());
    doc.kids.push(make_fallback_list(&[
        "• First item",
        "•",
        "• Second item",
        "133",
    ]));
    let md = to_markdown(&doc).unwrap();
    for wanted in ["- First item", "- Second item"] {
        assert!(md.contains(wanted));
    }
    for unwanted in ["- • First item", "\n- •\n", "\n- 133\n"] {
        assert!(!md.contains(unwanted));
    }
}
#[test]
fn test_list_renderer_merges_wrapped_continuation_items() {
    // A list item that wrapped onto a second physical line must be rejoined
    // into one bullet instead of producing a bogus "- and down ..." item.
    let mut doc = PdfDocument::new("wrapped-list.pdf".to_string());
    doc.kids.push(make_fallback_list(&[
        "Use a micropipette to add 2 μL of loading dye",
        "and down a couple of times to mix the loading dye with the digested DNA.",
        "Use a fresh pipet tip for each reaction tube.",
    ]));
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA."
    ));
    assert!(md.contains("- Use a fresh pipet tip for each reaction tube."));
    assert!(!md.contains("\n- and down"));
}
#[test]
fn test_list_renderer_keeps_enumerated_items_separate() {
    // Roman-numeral items keep their own markers (rendered as plain lines,
    // not re-bulleted) and are not merged into each other.
    let mut doc = PdfDocument::new("enumerated-list.pdf".to_string());
    doc.kids.push(make_fallback_list(&[
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
        "iv. This business model will be operating cost-oriented not capital cost-oriented.",
        "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
    ]));
    let md = to_markdown(&doc).unwrap();
    let expected = "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate.";
    assert!(md.contains(expected));
    assert!(!md.contains("- iii."));
}
#[test]
fn test_postprocess_drops_isolated_single_char_noise_lines() {
    // Isolated one-character lines ("1", "o") are OCR/layout noise and are
    // removed, while real sentences on either side are kept intact.
    let markdown = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";
    let cleaned = drop_isolated_noise_lines(markdown);
    for noise in ["\n1\n", "\no\n"] {
        assert!(!cleaned.contains(noise));
    }
    for kept in ["To get started.", "NOTE: Keep going."] {
        assert!(cleaned.contains(kept));
    }
}
/// Builds a bordered two-column table fixture with fixed column bounds
/// (x 72–220 and 220–420) on page 1.
///
/// Rows stack top-down from y=656 at 18pt spacing (16pt tall). An empty string
/// produces a cell with no content tokens, letting tests exercise blank-cell
/// handling.
fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
    let mut table_rows = Vec::new();
    for (row_number, (left, right)) in rows.iter().enumerate() {
        let top = 656.0 - row_number as f64 * 18.0;
        let bottom = top - 16.0;
        let mut cells = Vec::new();
        // Pair each cell's text with its fixed horizontal extent.
        for (col_number, (text, left_x, right_x)) in
            [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
                .into_iter()
                .enumerate()
        {
            let content = if text.is_empty() {
                // Empty cells carry no tokens at all (not an empty-string token).
                Vec::new()
            } else {
                vec![TableToken {
                    base: TextChunk {
                        value: text.to_string(),
                        bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
                        font_name: "Test".to_string(),
                        font_size: 11.0,
                        font_weight: 400.0,
                        italic_angle: 0.0,
                        font_color: "[0.0]".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Main,
                        ocg_visible: true,
                        index: None,
                        page_number: Some(1),
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                }]
            };
            cells.push(TableBorderCell {
                bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
                index: None,
                level: None,
                row_number,
                col_number,
                row_span: 1,
                col_span: 1,
                content,
                contents: vec![],
                semantic_type: None,
            });
        }
        table_rows.push(TableBorderRow {
            bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
            index: None,
            level: None,
            row_number,
            cells,
            semantic_type: None,
        });
    }
    ContentElement::TableBorder(TableBorder {
        // Table bbox spans from below the last row to the top of the first.
        bbox: BoundingBox::new(
            Some(1),
            72.0,
            656.0 - rows.len() as f64 * 18.0 - 16.0,
            420.0,
            656.0,
        ),
        index: None,
        level: Some("1".to_string()),
        x_coordinates: vec![72.0, 220.0, 420.0],
        x_widths: vec![0.0; 3],
        y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
        y_widths: vec![0.0; rows.len() + 1],
        rows: table_rows,
        num_rows: rows.len(),
        num_columns: 2,
        is_bad_table: false,
        is_table_transformer: false,
        previous_table: None,
        next_table: None,
    })
}
/// Builds a `Paragraph` fixture whose single line is split into multiple
/// `TextChunk` segments, each with its own `(text, left, right)` extent.
///
/// The line's bounding box spans from the first segment's left edge to the
/// last segment's right edge (defaulting to 72–320 when `segments` is empty),
/// so tests can model chunk gaps within one visual line.
fn make_chunked_paragraph_line(
    segments: &[(&str, f64, f64)],
    bottom: f64,
    top: f64,
) -> ContentElement {
    let bbox = BoundingBox::new(
        Some(1),
        segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
        bottom,
        segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
        top,
    );
    // One chunk per segment, all sharing the line's vertical extent.
    let chunks = segments
        .iter()
        .map(|(text, left, right)| TextChunk {
            value: (*text).to_string(),
            bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
            font_name: "Lato-Regular".to_string(),
            font_size: top - bottom,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        })
        .collect::<Vec<_>>();
    let line = TextLine {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: top - bottom,
        base_line: bottom + 2.0,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_chunks: chunks,
        is_line_start: true,
        is_line_end: true,
        is_list_line: false,
        connected_line_art_label: None,
    };
    let block = TextBlock {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: line.font_size,
        base_line: line.base_line,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_lines: vec![line],
        has_start_line: true,
        has_end_line: true,
        text_alignment: None,
    };
    let column = TextColumn {
        bbox: bbox.clone(),
        index: None,
        level: None,
        font_size: block.font_size,
        base_line: block.base_line,
        slant_degree: 0.0,
        is_hidden_text: false,
        text_blocks: vec![block],
    };
    ContentElement::Paragraph(SemanticParagraph {
        base: SemanticTextNode {
            bbox,
            index: None,
            level: None,
            semantic_type: SemanticType::Paragraph,
            correct_semantic_score: None,
            columns: vec![column],
            font_weight: Some(400.0),
            font_size: Some(top - bottom),
            text_color: None,
            italic_angle: None,
            font_name: Some("Lato-Regular".to_string()),
            text_format: None,
            max_font_size: Some(top - bottom),
            background_color: None,
            is_hidden_text: false,
        },
        enclosed_top: false,
        enclosed_bottom: false,
        indentation: 0,
    })
}
/// Builds a bordered table fixture with caller-supplied column extents.
///
/// `column_bounds` gives each column's `(left, right)` x-range; `rows` gives
/// cell text per row (missing entries become empty cells with no tokens).
/// Rows stack top-down from y=656 at 18pt spacing (16pt tall).
fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
    let mut table_rows = Vec::new();
    for (row_number, row_values) in rows.iter().enumerate() {
        let top = 656.0 - row_number as f64 * 18.0;
        let bottom = top - 16.0;
        let mut cells = Vec::new();
        for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
            // Rows shorter than column_bounds pad out with empty cells.
            let text = row_values.get(col_number).copied().unwrap_or("");
            let content = if text.is_empty() {
                Vec::new()
            } else {
                vec![TableToken {
                    base: TextChunk {
                        value: text.to_string(),
                        bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
                        font_name: "Test".to_string(),
                        font_size: 11.0,
                        font_weight: 400.0,
                        italic_angle: 0.0,
                        font_color: "[0.0]".to_string(),
                        contrast_ratio: 21.0,
                        symbol_ends: Vec::new(),
                        text_format: TextFormat::Normal,
                        text_type: TextType::Regular,
                        pdf_layer: PdfLayer::Main,
                        ocg_visible: true,
                        index: None,
                        page_number: Some(1),
                        level: None,
                        mcid: None,
                    },
                    token_type: TableTokenType::Text,
                }]
            };
            cells.push(TableBorderCell {
                bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
                index: None,
                level: None,
                row_number,
                col_number,
                row_span: 1,
                col_span: 1,
                content,
                contents: vec![],
                semantic_type: None,
            });
        }
        table_rows.push(TableBorderRow {
            // Row bbox spans the outermost column bounds.
            bbox: BoundingBox::new(
                Some(1),
                column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
                bottom,
                column_bounds
                    .last()
                    .map(|(_, right)| *right)
                    .unwrap_or(420.0),
                top,
            ),
            index: None,
            level: None,
            row_number,
            cells,
            semantic_type: None,
        });
    }
    let left = column_bounds
        .first()
        .map(|(value, _)| *value)
        .unwrap_or(72.0);
    let right = column_bounds
        .last()
        .map(|(_, value)| *value)
        .unwrap_or(420.0);
    // Grid lines: the leftmost edge followed by every column's right edge.
    let x_coordinates = std::iter::once(left)
        .chain(column_bounds.iter().map(|(_, right)| *right))
        .collect::<Vec<_>>();
    ContentElement::TableBorder(TableBorder {
        bbox: BoundingBox::new(
            Some(1),
            left,
            656.0 - rows.len() as f64 * 18.0 - 16.0,
            right,
            656.0,
        ),
        index: None,
        level: Some("1".to_string()),
        x_coordinates,
        x_widths: vec![0.0; column_bounds.len() + 1],
        y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
        y_widths: vec![0.0; rows.len() + 1],
        rows: table_rows,
        num_rows: rows.len(),
        num_columns: column_bounds.len(),
        is_bad_table: false,
        is_table_transformer: false,
        previous_table: None,
        next_table: None,
    })
}
#[test]
fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
    // A two-column table whose right column is numeric data (not page numbers)
    // must render as a real markdown table with a separator row.
    let mut doc = PdfDocument::new("cec-table.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_two_column_table(&[
        ("Mineral or colloid type", "CEC of pure colloid"),
        ("", "cmolc/kg"),
        ("kaolinite", "10"),
        ("illite", "30"),
    ]));
    let md = to_markdown(&doc).unwrap();
    for row in ["| --- | --- |", "| kaolinite | 10 |"] {
        assert!(md.contains(row));
    }
}
#[test]
fn test_single_caption_chart_renderer_skips_documents_with_populated_tables() {
    // The chart-only renderer must refuse a document that, besides a single
    // figure caption, carries substantial body text and a populated table.
    let mut doc = PdfDocument::new("table-with-caption.pdf".to_string());
    doc.number_of_pages = 1;
    let body = "Explanatory body text that should remain outside the chart-only renderer.";
    for bottom in (0..10).map(|idx| 720.0 - idx as f64 * 18.0) {
        doc.kids.push(make_paragraph(body, bottom, bottom + 10.0));
    }
    doc.kids.push(make_paragraph(
        "Figure 7.2: Kinematic Viscosity of Water at Atmospheric Pressure.",
        150.0,
        162.0,
    ));
    doc.kids.push(make_two_column_table(&[
        ("Temperature", "Viscosity"),
        ("20", "1.004"),
        ("25", "0.893"),
    ]));
    assert!(render_layout_single_caption_chart_document(&doc).is_none());
}
#[test]
fn test_blank_right_column_table_is_not_misrendered_as_toc() {
    // A table whose right column is mostly blank must still render as a
    // markdown table (with empty cells), not collapse into a TOC list.
    let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_two_column_table(&[
        (
            "Added cation",
            "Relative Size & Settling Rates of Floccules",
        ),
        ("K+", ""),
        ("Na+", ""),
        ("Ca2+", ""),
    ]));
    let md = to_markdown(&doc).unwrap();
    for row in [
        "| Added cation | Relative Size & Settling Rates of Floccules |",
        "| K+ | |",
    ] {
        assert!(md.contains(row));
    }
}
#[test]
fn test_infographic_card_table_renders_as_numbered_item() {
    // An infographic "card" laid out as a table (number cell + text cells)
    // should flatten into a numbered list item instead of a markdown table.
    let mut doc = PdfDocument::new("infographic-card.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_two_column_table(&[
        (
            "1",
            "We're all both consumers and creators of creative work.",
        ),
        (
            "",
            "As consumers, we watch movies, listen to music, read books, and more.",
        ),
    ]));
    let md = to_markdown(&doc).unwrap();
    let flattened = "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more.";
    assert!(md.contains(flattened));
    assert!(!md.contains("| 1 |"));
}
#[test]
fn test_grouped_header_rows_are_preserved_without_flattening() {
    // A spanning group header ("Instruction" / "Alignment") must be repeated
    // across its columns on its own row, not fused with the sub-header row.
    let header_row = vec!["Properties", "", "Instruction", "", "", "Alignment", ""];
    let subheader_row = vec![
        "",
        "Alpaca-GPT4",
        "OpenOrca",
        "Synth. Math-Instruct",
        "Orca DPO Pairs",
        "Ultrafeedback Cleaned",
        "Synth. Math-Alignment",
    ];
    let data_row = vec![
        "Total # Samples",
        "52K",
        "2.91M",
        "126K",
        "12.9K",
        "60.8K",
        "126K",
    ];
    let bounds = [
        (72.0, 120.0),
        (120.0, 170.0),
        (170.0, 220.0),
        (220.0, 280.0),
        (280.0, 340.0),
        (340.0, 410.0),
        (410.0, 470.0),
    ];
    let mut doc = PdfDocument::new("grouped-header.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids
        .push(make_n_column_table(&[header_row, subheader_row, data_row], &bounds));
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains(
        "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"
    ));
    assert!(md.contains(
        "| | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |"
    ));
    assert!(!md.contains("Instruction OpenOrca"));
    assert!(!md.contains("Alignment Ultrafeedback"));
}
#[test]
fn test_top_table_plate_renderer_stops_before_article_body() {
    // The table-plate renderer should emit the table and its caption, then
    // stop before the running article body below it.
    let mut doc = PdfDocument::new("table-plate.pdf".to_string());
    doc.number_of_pages = 1;
    // Title and subtitle above the table.
    let leading: [(f64, f64, f64, f64, &str); 2] = [
        (72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"),
        (72.0, 704.0, 220.0, 716.0, "Training datasets"),
    ];
    for (left, bottom, right, top, text) in leading {
        doc.kids.push(make_paragraph_at(left, bottom, right, top, text));
    }
    doc.kids.push(make_n_column_table(
        &[
            vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
            vec![
                "",
                "Alpaca-GPT4",
                "OpenOrca",
                "Synth. Math-Instruct",
                "Orca DPO Pairs",
                "Ultrafeedback Cleaned",
                "Synth. Math-Alignment",
            ],
            vec![
                "Total # Samples",
                "52K",
                "2.91M",
                "126K",
                "12.9K",
                "60.8K",
                "126K",
            ],
            vec![
                "Maximum # Samples Used",
                "52K",
                "100K",
                "52K",
                "12.9K",
                "60.8K",
                "20.1K",
            ],
            vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
        ],
        &[
            (78.0, 125.0),
            (125.0, 175.0),
            (175.0, 225.0),
            (225.0, 285.0),
            (285.0, 345.0),
            (345.0, 415.0),
            (415.0, 490.0),
        ],
    ));
    // Caption lines below the table, then the article body that must be cut.
    let trailing: [(f64, f64, f64, f64, &str); 3] = [
        (
            72.0,
            500.0,
            310.0,
            514.0,
            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
        ),
        (
            286.0,
            484.0,
            526.0,
            498.0,
            "Open source indicates whether the dataset is open-sourced.",
        ),
        (
            72.0,
            360.0,
            290.0,
            388.0,
            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
        ),
    ];
    for (left, bottom, right, top, text) in trailing {
        doc.kids.push(make_paragraph_at(left, bottom, right, top, text));
    }
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains("Table 1: Training datasets used for the instruction"));
    assert!(md.contains("| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"));
    assert!(!md.contains("Comparison to other up-scaling methods"));
}
#[test]
// A page that opens with equation narration carried over from the previous
// section and only introduces a new section heading ("7.2. ...") far down the
// page. The renderer must start output at the late heading, stitch the wrapped
// body line back together, and drop both the carryover prose and the footer.
fn test_late_section_boundary_renderer_drops_equation_carryover() {
let mut doc = PdfDocument::new("late-section.pdf".to_string());
doc.number_of_pages = 1;
// Five equation lead-in lines from the previous section (y 460-714);
// all of these should be discarded.
doc.kids.push(make_paragraph_at(
72.0,
700.0,
540.0,
714.0,
"The horizontal distance traveled by the jet is equal to:",
));
doc.kids.push(make_paragraph_at(
72.0,
640.0,
540.0,
654.0,
"The vertical position of the jet may be calculated as:",
));
doc.kids.push(make_paragraph_at(
72.0,
580.0,
260.0,
594.0,
"Rearranging Equation (8) gives:",
));
doc.kids.push(make_paragraph_at(
72.0,
520.0,
420.0,
534.0,
"Substitution into Equation 7 results in:",
));
doc.kids.push(make_paragraph_at(
72.0,
460.0,
280.0,
474.0,
"Equations (10) can be rearranged to find Cv:",
));
// The late section heading where rendered output should begin.
doc.kids.push(make_heading_at(
72.0,
350.0,
420.0,
366.0,
"7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
));
// One logical sentence split into three fragments: two chunks on the same
// baseline (y 326-340) plus a continuation line below (y 310-324).
doc.kids.push(make_paragraph_at(
72.0,
326.0,
380.0,
340.0,
"If C_d is assumed to be constant, then a graph of Q plotted against",
));
doc.kids.push(make_paragraph_at(
400.0,
326.0,
540.0,
340.0,
"(Equation 6) will be linear, and",
));
doc.kids.push(make_paragraph_at(
72.0,
310.0,
240.0,
324.0,
"the slope of this graph will be:",
));
// Running footer at the bottom of the page (y 36-48); must be dropped.
doc.kids.push(make_paragraph_at(
360.0,
36.0,
550.0,
48.0,
"EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
));
let md = to_markdown(&doc).unwrap();
// Output begins at the heading, with the three fragments re-joined.
assert!(md.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
assert!(md.contains(
"If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:"
));
assert!(!md.contains("The horizontal distance traveled by the jet"));
assert!(!md.contains("EXPERIMENT #6"));
}
#[test]
/// A continuation-row fragment carried over from the previous page (only one
/// non-empty cell, mid-sentence text) must be trimmed from the table while the
/// genuine data rows survive intact.
fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
    let header = vec![
        "Jurisdiction",
        "GATS XVII Reservation (1994)",
        "Foreign Ownership Permitted",
        "Restrictions on Foreign Ownership",
        "Foreign Ownership Reporting Requirements",
    ];
    // Orphaned continuation text from the prior page; every other cell empty.
    let carryover = vec![
        "",
        "",
        "",
        "right required to acquire desert lands and continue the prior page",
        "",
    ];
    let rows = [
        header,
        carryover,
        vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
        vec!["France", "N", "Y", "None.", ""],
    ];
    // Column x-spans for the five columns.
    let spans = [
        (72.0, 150.0),
        (150.0, 235.0),
        (235.0, 330.0),
        (330.0, 500.0),
        (500.0, 560.0),
    ];
    let mut doc = PdfDocument::new("carryover-table.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_n_column_table(&rows, &spans));
    let rendered = to_markdown(&doc).unwrap();
    assert!(!rendered.contains("right required to acquire desert lands"));
    assert!(rendered.contains("| Finland | N | Y | Prior approval may be required. | |"));
}
#[test]
// A single-table report page: a centered title line above the table, a
// carryover row inside the table, and a page footer at the bottom. Expected:
// the title becomes the leading `#` heading, the carryover row and the footer
// are dropped, and real data rows render normally.
fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
let mut doc = PdfDocument::new("single-table-report.pdf".to_string());
doc.number_of_pages = 1;
// Centered title line (x 140-474) sitting just above the table.
doc.kids.push(make_paragraph_at(
140.0,
674.0,
474.0,
688.0,
"Restrictions on Land Ownership by Foreigners in Selected Jurisdictions",
));
doc.kids.push(make_n_column_table(
&[
vec![
"Jurisdiction",
"GATS XVII Reservation (1994)",
"Foreign Ownership Permitted",
"Restrictions on Foreign Ownership",
"Foreign Ownership Reporting Requirements",
],
// Carryover fragment from the previous page: one mid-sentence cell,
// the rest empty. Must be trimmed from the rendered table.
vec![
"",
"",
"",
"right required to acquire desert lands and continue the prior page",
"",
],
vec![
"Finland",
"N",
"Y",
"Prior approval from the Government of Aland may be required.",
"",
],
vec!["France", "N", "Y", "None.", ""],
],
// Column x-spans for the five columns.
&[
(72.0, 150.0),
(150.0, 235.0),
(235.0, 330.0),
(330.0, 500.0),
(500.0, 560.0),
],
));
// Page footer (y 36-48); must not appear in the output.
doc.kids.push(make_paragraph_at(
350.0,
36.0,
548.0,
48.0,
"The Law Library of Congress 7",
));
let md = to_markdown(&doc).unwrap();
assert!(md.starts_with(
"# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions"
));
assert!(!md.contains("right required to acquire desert lands"));
assert!(!md.contains("The Law Library of Congress 7"));
assert!(md.contains(
"| Finland | N | Y | Prior approval from the Government of Aland may be required. | |"
));
}
#[test]
/// A sentence hyphen-split across two paragraph lines ("...Eco-" then
/// "Circle's Competence Framework:") precedes a one-column table. The
/// continuation line should still be promoted to a heading.
fn test_hyphenated_table_title_continuation_renders_as_heading() {
    let mut doc = PdfDocument::new("hyphenated-table-title.pdf".to_string());
    doc.number_of_pages = 1;
    // (y0, y1, x1, text): the first line ends with a trailing hyphen that
    // continues onto the second, shorter line.
    let intro_lines = [
        (
            724.0,
            738.0,
            520.0,
            "With this in mind, here we have the 7 key competence areas selected to form a part of Eco-",
        ),
        (704.0, 718.0, 260.0, "Circle's Competence Framework:"),
    ];
    for (y0, y1, x1, text) in intro_lines {
        doc.kids.push(make_paragraph_at(72.0, y0, x1, y1, text));
    }
    let rows = [
        vec!["Eco-Circle Competence Framework"],
        vec!["#1: The 3 Rs: Recycle-Reuse-Reduce"],
        vec!["#2: Lifecycle of Circular Economy"],
    ];
    doc.kids.push(make_n_column_table(&rows, &[(140.0, 460.0)]));
    let md = to_markdown(&doc).unwrap();
    assert!(md.contains("# Circle's Competence Framework:"), "{md}");
}
#[test]
// A heading element whose text merely repeats the table's header cells
// ("Saccharometer DI Water ...") should be demoted: its text may survive as
// plain content, but it must not be rendered as a `#` markdown heading.
fn test_duplicate_table_header_heading_is_demoted() {
let mut doc = PdfDocument::new("duplicate-table-header-heading.pdf".to_string());
doc.number_of_pages = 1;
// Genuine page heading, unrelated to the table header.
doc.kids
.push(make_heading("MOHAVE COMMUNITY COLLEGE BIO181"));
// 5-column table whose header row starts with an empty stub cell.
doc.kids.push(make_n_column_table(
&[
vec![
"",
"Saccharometer",
"DI Water",
"Glucose Solution",
"Yeast Suspension",
],
vec!["1", "", "8 ml", "6 ml", "0 ml"],
vec!["2", "", "12 ml", "0 ml", "2 ml"],
vec!["3", "", "6 ml", "6 ml", "2 ml"],
],
// Column x-spans for the five columns.
&[
(72.0, 110.0),
(110.0, 210.0),
(210.0, 300.0),
(300.0, 430.0),
(430.0, 540.0),
],
));
// Heading element that duplicates the table header text — the demotion target.
doc.kids.push(make_heading_at(
72.0,
92.0,
390.0,
108.0,
"Saccharometer DI Water Glucose Solution Yeast Suspension",
));
// Trailing fragments of a second, partially-extracted table below.
doc.kids
.push(make_paragraph_at(72.0, 72.0, 120.0, 88.0, "below"));
doc.kids
.push(make_paragraph_at(72.0, 56.0, 240.0, 72.0, "1 16 ml 12 ml"));
doc.kids
.push(make_paragraph_at(296.0, 56.0, 340.0, 72.0, "0 ml"));
let md = to_markdown(&doc).unwrap();
// The duplicate text may appear, but never with a heading marker.
assert!(
md.contains("Saccharometer DI Water Glucose Solution Yeast Suspension"),
"{md}"
);
assert!(
!md.contains("# Saccharometer DI Water Glucose Solution Yeast Suspension"),
"{md}"
);
}
#[test]
// A marketing-style "panel" layout: free-floating chunked paragraph lines sit
// above (and beside) a 3-column table and act as column/row headers by pure
// x/y geometry. The renderer should fold them into the table: the three
// top-row labels become the header row (prefixed by the stub "Pack"), the
// per-panel description lines become a data row, and the trailing "models"
// chunk is appended to the cell it aligns with.
// NOTE(review): unlike sibling tests, number_of_pages is left at its
// PdfDocument::new default here — presumably intentional; confirm.
fn test_geometric_panel_headers_are_promoted_into_table() {
let mut doc = PdfDocument::new("ai-pack-panel.pdf".to_string());
// Three header labels on one baseline (y 720-732), one per panel column.
doc.kids.push(make_chunked_paragraph_line(
&[("OCR", 220.0, 250.0)],
720.0,
732.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("Recommendation", 430.0, 540.0)],
720.0,
732.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("Product semantic search", 660.0, 860.0)],
720.0,
732.0,
));
// Second baseline (y 684-696): the "Pack" stub label plus one description
// chunk per panel column.
doc.kids.push(make_chunked_paragraph_line(
&[("Pack", 72.0, 110.0)],
684.0,
696.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("A solution that recognizes characters", 140.0, 340.0)],
684.0,
696.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("A solution that recommends the best products", 390.0, 620.0)],
684.0,
696.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("A solution that enables semantic search", 650.0, 900.0)],
684.0,
696.0,
));
// The detected 3-column table body (highlights per panel).
doc.kids.push(make_n_column_table(
&[
vec![
"Achieved 1st place in the OCR World Competition",
"Team with specialists and technologies",
"Creation of the first natural language evaluation",
],
vec![
"The team includes specialists who have",
"received Kaggle's Gold Medal recommendation",
"system in Korean (KLUE)",
],
vec![
"presented 14 papers in renowned AI conferences",
"top-tier recommendation",
"Shopee subject",
],
],
&[(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)],
));
// Stray continuation chunk below the table (y 552-564), x-aligned with the
// middle column; expected to be merged into that cell's text.
doc.kids.push(make_chunked_paragraph_line(
&[("models", 430.0, 490.0)],
552.0,
564.0,
));
let md = to_markdown(&doc).unwrap();
assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
assert!(md.contains("| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |"));
assert!(md.contains(
"received Kaggle's Gold Medal recommendation top-tier recommendation models"
));
}
#[test]
// Variant of the panel layout where the stub labels ("Pack", "Application",
// "Highlight") live in the table's FIRST COLUMN and only the panel titles
// float above as chunked lines. The floating titles should be promoted into
// the table's header row (after the embedded "Pack" stub) instead of being
// emitted as standalone paragraphs.
fn test_embedded_stub_header_is_promoted_from_first_table_column() {
let mut doc = PdfDocument::new("embedded-stub-header.pdf".to_string());
// Three floating panel titles on one baseline (y 720-732).
doc.kids.push(make_chunked_paragraph_line(
&[("OCR", 220.0, 250.0)],
720.0,
732.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("Recommendation", 430.0, 540.0)],
720.0,
732.0,
));
doc.kids.push(make_chunked_paragraph_line(
&[("Product semantic search", 660.0, 860.0)],
720.0,
732.0,
));
// 4-column table: column 0 is the stub label, columns 1-3 are the panels.
doc.kids.push(make_n_column_table(
&[
vec![
"Pack",
"A solution that recognizes characters in an image and extracts necessary information",
"A solution that recommends the best products and contents",
"A solution that enables semantic search and organizes key information",
],
vec![
"Application",
"Applicable to all fields that require text extraction",
"Applicable to all fields that use any form of recommendation",
"Applicable to all fields that deal with unstructured data",
],
vec![
"Highlight",
"Achieved 1st place in the OCR World Competition",
"Received Kaggle's Gold Medal recommendation",
"Creation of the first natural language evaluation system in Korean",
],
],
&[
(72.0, 120.0),
(120.0, 360.0),
(360.0, 630.0),
(630.0, 910.0),
],
));
let md = to_markdown(&doc).unwrap();
// Floating titles become the header row behind the "Pack" stub...
assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
assert!(
md.contains("| Application | Applicable to all fields that require text extraction |")
);
assert!(md.contains("| Highlight | Achieved 1st place in the OCR World Competition |"));
// ...and are not left behind as loose paragraphs.
assert!(!md.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
}
#[test]
// Unit test for split_line_into_slot_fragments: each text chunk on a line is
// assigned to the column slot whose x-span contains it. "Properties"
// (72-145) lands in slot 0 (72-170), "Instruction" (180-255) in slot 1
// (170-280), and "Alignment" (480-545) in slot 4 (480-600); the empty slots
// in between produce no fragments.
fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
let line = make_chunked_paragraph_line(
&[
("Properties", 72.0, 145.0),
("Instruction", 180.0, 255.0),
("Alignment", 480.0, 545.0),
],
720.0,
732.0,
);
let chunk_lines = extract_chunk_lines(&line);
let fragments = split_line_into_slot_fragments(
&chunk_lines[0],
// Seven candidate column slots as (x0, x1) spans.
&[
(72.0, 170.0),
(170.0, 280.0),
(280.0, 380.0),
(380.0, 480.0),
(480.0, 600.0),
(600.0, 720.0),
(720.0, 850.0),
],
);
// Three chunks in, three fragments out — empty slots are skipped entirely.
assert_eq!(fragments.len(), 3);
assert_eq!(fragments[0].slot_idx, 0);
assert_eq!(fragments[0].text, "Properties");
assert_eq!(fragments[1].slot_idx, 1);
assert_eq!(fragments[1].text, "Instruction");
assert_eq!(fragments[2].slot_idx, 4);
assert_eq!(fragments[2].text, "Alignment");
}
#[test]
/// A heading sandwiched between two adjacent pipe tables should be absorbed
/// into the merged table as a row, losing its `#` marker, while the rows of
/// the second table are preserved.
fn test_merge_tables_across_heading() {
    // Build the markdown input line by line; join("\n") with a trailing empty
    // element reproduces the original newline-terminated string exactly.
    let source = [
        "some text",
        "",
        "| Area | Competence |",
        "| --- | --- |",
        "| Row1 | Val1 |",
        "| Row2 | Val2 |",
        "",
        "# Heading Between",
        "",
        "| Row3 | Val3 |",
        "| --- | --- |",
        "",
        "more text",
        "",
    ]
    .join("\n");
    let merged = merge_adjacent_pipe_tables(&source);
    assert!(
        merged.contains("| Heading Between |"),
        "Heading should be in pipe row: {}",
        merged
    );
    assert!(
        !merged.contains("# Heading Between"),
        "Heading marker should be removed: {}",
        merged
    );
    assert!(
        merged.contains("| Row3 |") || merged.contains("Row3"),
        "Row3 should exist: {}",
        merged
    );
}
#[test]
/// Two pipe tables separated by a caption paragraph and having DIFFERENT
/// header shapes must stay separate: the caption is kept as prose (never
/// swallowed into a pipe row) and both header rows survive.
fn test_merge_tables_does_not_cross_distinct_headers() {
    let caption = "Table 6: Performance comparison amongst the merge candidates.";
    // Line-by-line construction; trailing "" yields the final newline.
    let source = [
        "| Model | Score |",
        "| --- | --- |",
        "| A | 1 |",
        "",
        caption,
        "",
        "| Model | Method | Score |",
        "| --- | --- | --- |",
        "| B | Avg | 2 |",
        "",
    ]
    .join("\n");
    let merged = merge_adjacent_pipe_tables(&source);
    assert!(merged.contains(caption));
    assert!(merged.contains("| Model | Score |"));
    assert!(merged.contains("| Model | Method | Score |"));
    assert!(
        !merged.contains("| Table 6: Performance comparison amongst the merge candidates. |")
    );
}
#[test]
// A flattened bar-chart extraction (caption, data values, axis labels, source
// note, and a running footer all mashed into prose) should be rebuilt as a
// `## Figure ...` heading plus a year/value series table, with the source note
// italicised and the footer dropped.
fn test_normalize_chart_like_markdown_extracts_series_tables() {
let input = "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n\
2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n\
ASEAN Migration Outlook 19\n";
let normalized = normalize_chart_like_markdown(input);
assert!(
normalized.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
);
// First and last (year, value) pairs of the reconstructed series.
assert!(normalized.contains("| 2016 | 3,323 |"));
assert!(normalized.contains("| 2021 | 2,693 |"));
assert!(normalized.contains(
"*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*"
));
// The running footer must not leak into the normalized output.
assert!(!normalized.contains("ASEAN Migration Outlook 19"));
}
#[test]
// A figure caption split over two paragraphs should be rejoined and promoted
// to a `##` heading (trailing period dropped), while unrelated body text
// stays untouched.
fn test_normalize_chart_like_markdown_promotes_structural_captions() {
let input = "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n\
The Wonderful Lamp.\n\n\
Body paragraph.\n";
let normalized = normalize_chart_like_markdown(input);
assert!(normalized.contains(
"## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp"
));
assert!(normalized.contains("Body paragraph."));
}
#[test]
// A chart extracted as one garbled pipe row of "value year" pairs (plus axis
// junk like "8" and "800 200") should be rebuilt into a proper two-column
// Year/series table named after the figure caption.
fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
let input = "Figure 4.8. Domestic Wood Pellets Production\n\n\
| 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n\
| --- | --- | --- | --- | --- | --- | --- | --- |\n\n\
Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n";
let normalized = normalize_chart_like_markdown(input);
assert!(normalized.contains("# Figure 4.8. Domestic Wood Pellets Production"));
// Series header takes its name from the caption text.
assert!(normalized.contains("| Year | Domestic Wood Pellets Production |"));
assert!(normalized.contains("| 2014 | 126 |"));
assert!(normalized.contains("| 2019 | 147 |"));
// The garbled source row itself must be gone.
assert!(!normalized.contains("| 8 | 800 200 |"));
}
#[test]
/// A one-cell "table" that is really a jumble of chart axis numbers and
/// legend words must be deleted entirely, while the caption paragraph that
/// follows it is preserved.
fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
    let artifact_row =
        "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |";
    let caption = "Distribution of Komnas HAM's YouTube Content (2019-2020)";
    let source = format!("{}\n| --- |\n\n{}\n", artifact_row, caption);
    let normalized = normalize_chart_like_markdown(&source);
    // No table separator may survive — the artifact table is gone.
    assert!(!normalized.contains("| --- |"));
    assert!(normalized.contains(caption));
}
#[test]
/// A one-cell table containing only a URL fragment (a truncated tweet path)
/// must be removed, while the real heading and the footnote paragraph remain.
fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
    let heading = "## Figure 6 DPN Argentina Content: World Health Day Celebration";
    let footnote = "98 DPN Argentina, accessed on 5 December 2021.";
    let source = format!(
        "{}\n\n| na/status/1379765916259483648 |\n| --- |\n\n{}\n",
        heading, footnote
    );
    let normalized = normalize_chart_like_markdown(&source);
    assert!(!normalized.contains("/status/1379765916259483648 |"));
    assert!(normalized.contains(footnote));
}
#[test]
// A mostly-empty table of chart axis fragments (tick labels, stray OCR
// characters) immediately followed by a figure caption should be dropped
// wholesale; the caption and surrounding prose are kept.
fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
let input = "What’s unique about the growth of Alligator Gars is their fast growth.\n\n\
| in | cm | | Length | of | Gar | Fish | Age |\n\
| --- | --- | --- | --- | --- | --- | --- | --- |\n\
| 120) | 300 | | | | | | |\n\
| 100+ | 250 | | | | | | |\n\
| 80+ | 200 | | | | | | |\n\
| 20. | 50 | G | | | | | Vi |\n\
| 0 | 0 | | | | | | |\n\
| | 0 | 10 | 30 | | 40 | 50 | 60 |\n\n\
Figure 8.6: Growth in length of Alligator Gar in Texas.\n";
let normalized = normalize_chart_like_markdown(input);
// The axis-artifact table is gone; the caption survives as prose.
assert!(!normalized.contains("| in | cm |"));
assert!(normalized.contains("Figure 8.6: Growth in length of Alligator Gar in Texas."));
}
#[test]
// When a page starts with one large table (8 columns x 8 data rows here),
// the normalizer treats the page as a table plate: only the table is kept,
// and everything after it (caption, section heading, prose) is trimmed.
fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
let input = "| A | B | C | D | E | F | G | H |\n\
| --- | --- | --- | --- | --- | --- | --- | --- |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n\
Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n\
# 4.2 Main Results\n\n\
The surrounding prose should be dropped.\n";
let normalized = normalize_chart_like_markdown(input);
// The table remains (and leads the output); all trailing content is cut.
assert!(normalized.starts_with("| A | B | C | D | E | F | G | H |"));
assert!(!normalized.contains("Table 2:"));
assert!(!normalized.contains("4.2 Main Results"));
assert!(!normalized.contains("surrounding prose"));
}
}