mod assignment;
mod heading;
pub(super) mod layout_validation;
mod merge;
mod reading_order;
pub(super) mod table_recognition;
mod tables;
use crate::pdf::hierarchy::SegmentData;
use super::classify::classify_paragraphs;
use super::columns::split_segments_into_columns;
use super::lines::segments_to_lines;
use super::paragraphs::lines_to_paragraphs;
use super::types::{LayoutHint, LayoutHintClass, PdfParagraph};
pub(super) use heading::looks_like_figure_label;
#[cfg(feature = "layout-detection")]
pub(super) use table_recognition::recognize_tables_for_native_page;
#[cfg(feature = "layout-detection")]
pub(super) use table_recognition::recognize_tables_slanet;
pub(super) use tables::extract_tables_from_layout_hints;
/// A layout hint together with the text segments that were assigned to it.
struct LayoutRegion<'a> {
/// The layout hint this region is built around.
hint: &'a LayoutHint,
/// Indices of the segments assigned to this region; they index into the
/// page's segment list.
segment_indices: Vec<usize>,
/// Union bounding box as `(left, bottom, right, top)`, written when another
/// region is merged into this one. When `None`, the hint's own bounds apply.
merged_bbox: Option<(f32, f32, f32, f32)>, }
impl<'a> LayoutRegion<'a> {
    /// Returns the region bounds as `(left, bottom, right, top)`.
    ///
    /// A merged bounding box takes precedence; without one, the bounds of the
    /// underlying hint are returned.
    fn bbox(&self) -> (f32, f32, f32, f32) {
        match self.merged_bbox {
            Some(bounds) => bounds,
            None => {
                let hint = self.hint;
                (hint.left, hint.bottom, hint.right, hint.top)
            }
        }
    }
}
/// Assembles the paragraphs of one page, using layout hints to group segments.
///
/// Segments are assigned to layout regions; the regions are merged, put into
/// reading order, and each region's segments are turned into lines and
/// paragraphs that are classified according to the region's hint. Segments
/// reported as unassigned run through the fallback pipeline and are appended
/// after the region paragraphs. Segments that end up in neither a region nor
/// the unassigned list are dropped. Finally a series of post-processing passes
/// (continuation merges, code-block handling, list continuations, caption and
/// footnote association) runs over the combined list.
///
/// When segment assignment yields no region at all, the whole page is handled
/// by the fallback pipeline instead.
///
/// # Arguments
/// * `segments` - text segments of the page; consumed.
/// * `hints` - layout hints for the page.
/// * `heading_map` - forwarded to region classification and to the fallback classifier.
/// * `min_confidence` - forwarded to segment assignment.
/// * `doc_body_font_size` - forwarded to region classification.
/// * `page_index` - used in trace output and forwarded to region classification.
/// * `extracted_table_bboxes` - forwarded to segment assignment.
/// * `hint_validations` - forwarded to segment assignment.
// The parameter list mirrors what the assignment and classification steps need.
#[allow(clippy::too_many_arguments)]
pub(super) fn assemble_region_paragraphs(
    segments: Vec<SegmentData>,
    hints: &[LayoutHint],
    heading_map: &[(f32, Option<u8>)],
    min_confidence: f32,
    doc_body_font_size: Option<f32>,
    page_index: usize,
    extracted_table_bboxes: &[crate::types::BoundingBox],
    hint_validations: &[layout_validation::RegionValidation],
) -> Vec<PdfParagraph> {
    // Sentinel destinations used while distributing segments: a segment is
    // either routed to a region index, to the fallback pipeline, or nowhere.
    const DEST_NONE: usize = usize::MAX;
    const DEST_UNASSIGNED: usize = usize::MAX - 1;

    let (mut regions, unassigned_indices) = assignment::assign_segments_to_regions_refined(
        &segments,
        hints,
        min_confidence,
        extracted_table_bboxes,
        hint_validations,
    );
    if regions.is_empty() {
        tracing::trace!(page = page_index, "no layout regions — using fallback pipeline");
        return assemble_fallback(segments, heading_map);
    }
    tracing::trace!(
        page = page_index,
        regions = regions.len(),
        unassigned = unassigned_indices.len(),
        "layout regions assigned"
    );

    // Highest top edge among all segments, used as the page height.
    let page_height = segments.iter().map(|s| s.y + s.height).fold(0.0_f32, f32::max);
    merge_fragmented_regions(&mut regions);
    reading_order::order_regions_reading_order(&mut regions, page_height);

    let seg_count = segments.len();
    let num_regions = regions.len();

    // Map every segment index to its destination. Out-of-range indices are
    // ignored; when a segment appears in several regions the last one wins.
    let mut seg_destination = vec![DEST_NONE; seg_count];
    for (region_idx, region) in regions.iter().enumerate() {
        for &seg_idx in &region.segment_indices {
            if let Some(slot) = seg_destination.get_mut(seg_idx) {
                *slot = region_idx;
            }
        }
    }
    for &idx in &unassigned_indices {
        // A region assignment takes precedence over the unassigned list.
        if idx < seg_count && seg_destination[idx] == DEST_NONE {
            seg_destination[idx] = DEST_UNASSIGNED;
        }
    }

    // Move the owned segments into their buckets, preserving input order.
    let mut region_segments: Vec<Vec<SegmentData>> = (0..num_regions).map(|_| Vec::new()).collect();
    let mut unassigned_segments: Vec<SegmentData> = Vec::new();
    for (seg, &dest) in segments.into_iter().zip(&seg_destination) {
        if dest < num_regions {
            region_segments[dest].push(seg);
        } else if dest == DEST_UNASSIGNED {
            unassigned_segments.push(seg);
        }
    }

    let mut all_paragraphs: Vec<PdfParagraph> = Vec::new();
    for (region, region_segs) in regions.iter().zip(region_segments) {
        if region_segs.is_empty() {
            continue;
        }
        let lines = segments_to_lines(region_segs);
        let mut paragraphs = lines_to_paragraphs(lines);

        // A list-item region is emitted as one paragraph.
        if region.hint.class == LayoutHintClass::ListItem && paragraphs.len() > 1 {
            let mut merged_lines = Vec::new();
            for para in paragraphs.drain(..) {
                merged_lines.extend(para.lines);
            }
            paragraphs.push(super::paragraphs::finalize_paragraph(merged_lines));
        }

        // Outside the listed textual classes, skip regions whose text is
        // almost entirely non-alphanumeric, non-whitespace characters.
        if !matches!(
            region.hint.class,
            LayoutHintClass::Text
                | LayoutHintClass::SectionHeader
                | LayoutHintClass::Title
                | LayoutHintClass::Code
                | LayoutHintClass::Formula
        ) {
            // Count characters directly instead of building a joined string.
            let (total, alnum) = paragraphs
                .iter()
                .flat_map(|p| p.lines.iter())
                .flat_map(|l| l.segments.iter())
                .flat_map(|s| s.text.chars())
                .fold((0usize, 0usize), |(total, alnum), c| {
                    let readable = c.is_alphanumeric() || c.is_whitespace();
                    (total + 1, alnum + usize::from(readable))
                });
            if total >= 10 && (alnum as f32 / total as f32) < 0.15 {
                tracing::trace!(
                    class = ?region.hint.class,
                    total_chars = total,
                    alnum_chars = alnum,
                    "skipping garbled region"
                );
                continue;
            }
        }

        heading::apply_region_class(
            &mut paragraphs,
            region.hint,
            heading_map,
            doc_body_font_size,
            page_height,
            page_index,
        );
        all_paragraphs.extend(paragraphs);
    }

    if !unassigned_segments.is_empty() {
        all_paragraphs.extend(assemble_fallback(unassigned_segments, heading_map));
    }

    // Post-processing passes; the order is kept as-is.
    merge::merge_continuation_paragraphs_region_aware(&mut all_paragraphs);
    merge_cross_column_paragraphs(&mut all_paragraphs);
    merge::merge_consecutive_code_blocks(&mut all_paragraphs);
    merge::demote_non_code_blocks(&mut all_paragraphs);
    merge::merge_list_continuations(&mut all_paragraphs);
    associate_captions(&mut all_paragraphs);
    associate_footnotes(&mut all_paragraphs);
    all_paragraphs
}
/// Joins body paragraphs that appear to continue across a column break.
///
/// For each body-text paragraph, the next body-text paragraph (anything that
/// is not body text is skipped over) is merged into it when all of the
/// following hold:
/// * the right-most segment edge of the current paragraph is strictly left of
///   the left-most segment edge of the next one,
/// * the current text ends with an ASCII lowercase letter, `,` or `-`,
/// * the next text starts with an ASCII lowercase letter.
///
/// On a merge the lines of the next paragraph are appended to the current one
/// and the current paragraph is examined again, so chains collapse into one.
fn merge_cross_column_paragraphs(paragraphs: &mut Vec<PdfParagraph>) {
    let mut i = 0;
    while i + 1 < paragraphs.len() {
        // Only body text can absorb a continuation; check before scanning ahead.
        if !is_body_text(&paragraphs[i]) {
            i += 1;
            continue;
        }
        // No body paragraph ahead of `i` means none ahead of any later index.
        let Some(j) = (i + 1..paragraphs.len()).find(|&k| is_body_text(&paragraphs[k])) else {
            break;
        };

        let current = &paragraphs[i];
        let next = &paragraphs[j];
        let current_right_x = current
            .lines
            .iter()
            .flat_map(|l| l.segments.iter())
            .map(|s| s.x + s.width)
            .fold(f32::NEG_INFINITY, f32::max);
        let next_left_x = next
            .lines
            .iter()
            .flat_map(|l| l.segments.iter())
            .map(|s| s.x)
            .fold(f32::INFINITY, f32::min);
        let strictly_left = current_right_x < next_left_x;

        let current_text = paragraph_text(current);
        let next_text = paragraph_text(next);
        let ends_with_continuation = current_text
            .trim_end()
            .chars()
            .last()
            .is_some_and(|c| c.is_ascii_lowercase() || c == ',' || c == '-');
        let starts_with_lowercase = next_text
            .trim_start()
            .chars()
            .next()
            .is_some_and(|c| c.is_ascii_lowercase());

        if strictly_left && ends_with_continuation && starts_with_lowercase {
            let merged = paragraphs.remove(j);
            paragraphs[i].lines.extend(merged.lines);
            // `i` is deliberately not advanced: the grown paragraph may
            // continue into yet another one.
        } else {
            i += 1;
        }
    }
}
/// Returns `true` when the paragraph carries none of the special markers:
/// no heading level and not a list item, code block, formula or page furniture.
fn is_body_text(p: &PdfParagraph) -> bool {
    let is_special =
        p.heading_level.is_some() || p.is_list_item || p.is_code_block || p.is_formula || p.is_page_furniture;
    !is_special
}
/// Concatenates the text of every segment in the paragraph, in line and
/// segment order, separated by single spaces.
fn paragraph_text(p: &PdfParagraph) -> String {
    let mut text = String::new();
    // Track the first piece explicitly: an empty first segment must still be
    // followed by a separator.
    let mut is_first = true;
    for line in &p.lines {
        for segment in &line.segments {
            if !is_first {
                text.push(' ');
            }
            is_first = false;
            text.push_str(segment.text.as_str());
        }
    }
    text
}
/// Largest horizontal or vertical gap at which `merge_fragmented_regions`
/// still treats two regions as adjacent. It is compared against differences
/// of layout hint coordinates, so it is expressed in the same units as those.
const MERGE_GAP_THRESHOLD: f32 = 5.0;
/// Merges `Title` / `SectionHeader` regions of the same class that lie next
/// to each other into a single region.
///
/// Two regions are merged when both their horizontal and vertical gaps are at
/// most [`MERGE_GAP_THRESHOLD`], or when they sit in the same horizontal band
/// (vertical centres within half of the smaller height, with a floor of 1.0
/// for that height) and at least one of the gaps is within the threshold.
/// The surviving region takes over the other's segment indices and records
/// the union of both bounds in `merged_bbox`.
///
/// Geometry is read through [`LayoutRegion::bbox`], so once a region has
/// absorbed a fragment its grown bounds are used for later comparisons. After
/// every merge the candidates are rescanned from the start, which lets a
/// chain of fragments (A touches B, B touches C) collapse regardless of the
/// order in which the fragments appear.
///
/// NOTE(review): for non-negative heights, two boxes in the same band always
/// have a vertical gap within the threshold, so the band rule effectively
/// merges same-class regions on one line regardless of their horizontal
/// distance. Confirm that this is intended.
fn merge_fragmented_regions(regions: &mut Vec<LayoutRegion>) {
    if regions.len() < 2 {
        return;
    }
    let mut merged_count = 0;
    let mut i = 0;
    while i < regions.len() {
        if !matches!(
            regions[i].hint.class,
            LayoutHintClass::Title | LayoutHintClass::SectionHeader
        ) {
            i += 1;
            continue;
        }
        let mut j = i + 1;
        while j < regions.len() {
            if regions[j].hint.class != regions[i].hint.class {
                j += 1;
                continue;
            }
            // Use the effective bounds: region `i` may already have grown.
            let (i_left, i_bottom, i_right, i_top) = regions[i].bbox();
            let (j_left, j_bottom, j_right, j_top) = regions[j].bbox();

            // Gaps are clamped to zero when the boxes overlap on that axis.
            let h_gap = (j_left - i_right).max(i_left - j_right).max(0.0);
            let v_gap = (j_bottom - i_top).max(i_bottom - j_top).max(0.0);

            let min_height = (i_top - i_bottom).min(j_top - j_bottom);
            let i_cy = (i_top + i_bottom) / 2.0;
            let j_cy = (j_top + j_bottom) / 2.0;
            let in_same_band = (i_cy - j_cy).abs() <= min_height.max(1.0) * 0.5;

            let h_close = h_gap <= MERGE_GAP_THRESHOLD;
            let v_close = v_gap <= MERGE_GAP_THRESHOLD;
            if (h_close && v_close) || (in_same_band && (h_close || v_close)) {
                let absorbed = regions.remove(j);
                regions[i].segment_indices.extend(absorbed.segment_indices);
                regions[i].merged_bbox = Some((
                    i_left.min(j_left),
                    i_bottom.min(j_bottom),
                    i_right.max(j_right),
                    i_top.max(j_top),
                ));
                merged_count += 1;
                // Region `i` grew; earlier candidates may now be adjacent.
                j = i + 1;
            } else {
                j += 1;
            }
        }
        i += 1;
    }
    if merged_count > 0 {
        tracing::trace!(merged = merged_count, "merged fragmented Title/SectionHeader regions");
    }
}
/// Links every `Caption` paragraph to the nearest parent found by
/// `find_parent_backward` / `find_parent_forward`.
///
/// The parent index is stored in `caption_for`. When a parent exists in both
/// directions, the one fewer positions away wins and a tie goes to the
/// preceding one. Captions without a parent are left untouched.
fn associate_captions(paragraphs: &mut [PdfParagraph]) {
    for cap_idx in 0..paragraphs.len() {
        if paragraphs[cap_idx].layout_class != Some(LayoutHintClass::Caption) {
            continue;
        }
        let before = find_parent_backward(paragraphs, cap_idx);
        let after = find_parent_forward(paragraphs, cap_idx);
        let parent = match (before, after) {
            (Some(b), Some(f)) => Some(if cap_idx - b <= f - cap_idx { b } else { f }),
            // At most one side has a candidate here.
            (b, f) => b.or(f),
        };
        if let Some(parent_idx) = parent {
            paragraphs[cap_idx].caption_for = Some(parent_idx);
        }
    }
}
/// Looks before `cap_idx` for a caption parent (`Table`, `Picture` or `Code`).
///
/// Other captions directly before the position are skipped; the first
/// paragraph of any other class decides the result. Returns its index when
/// it is a parent class and `None` otherwise.
fn find_parent_backward(paragraphs: &[PdfParagraph], cap_idx: usize) -> Option<usize> {
    let nearest = paragraphs[..cap_idx]
        .iter()
        .rposition(|p| p.layout_class != Some(LayoutHintClass::Caption))?;
    matches!(
        paragraphs[nearest].layout_class,
        Some(LayoutHintClass::Table | LayoutHintClass::Picture | LayoutHintClass::Code)
    )
    .then_some(nearest)
}
/// Looks after `cap_idx` for a caption parent (`Table`, `Picture` or `Code`).
///
/// Other captions directly after the position are skipped; the first
/// paragraph of any other class decides the result. Returns its index when
/// it is a parent class and `None` otherwise.
fn find_parent_forward(paragraphs: &[PdfParagraph], cap_idx: usize) -> Option<usize> {
    let start = cap_idx + 1;
    let following = &paragraphs[start..];
    let offset = following
        .iter()
        .position(|p| p.layout_class != Some(LayoutHintClass::Caption))?;
    matches!(
        following[offset].layout_class,
        Some(LayoutHintClass::Table | LayoutHintClass::Picture | LayoutHintClass::Code)
    )
    .then_some(start + offset)
}
/// Links footnotes to the `Table` or `Picture` they directly follow.
///
/// Every `Footnote` paragraph in the unbroken run right after a table or
/// picture gets that paragraph's index stored in `caption_for`. Any other
/// class ends the run.
fn associate_footnotes(paragraphs: &mut [PdfParagraph]) {
    // Index of the table/picture whose footnote run is currently open.
    let mut open_parent: Option<usize> = None;
    for (idx, para) in paragraphs.iter_mut().enumerate() {
        match para.layout_class {
            Some(LayoutHintClass::Table) | Some(LayoutHintClass::Picture) => {
                open_parent = Some(idx);
            }
            Some(LayoutHintClass::Footnote) => {
                if let Some(parent_idx) = open_parent {
                    para.caption_for = Some(parent_idx);
                }
            }
            _ => open_parent = None,
        }
    }
}
/// Hint-free assembly: runs the standard pipeline over `segments` and then
/// classifies the resulting paragraphs with `heading_map`.
fn assemble_fallback(segments: Vec<SegmentData>, heading_map: &[(f32, Option<u8>)]) -> Vec<PdfParagraph> {
    let mut assembled = assemble_standard_pipeline(segments);
    classify_paragraphs(&mut assembled, heading_map);
    assembled
}
/// Turns segments into paragraphs without layout hints.
///
/// The segments are first split into columns. With at most one column they are
/// converted to lines and paragraphs in one go. Otherwise each column is
/// converted on its own and the paragraphs are concatenated in column order.
/// Segments not listed in any column group are placed in the first column.
pub(super) fn assemble_standard_pipeline(segments: Vec<SegmentData>) -> Vec<PdfParagraph> {
    let column_groups = split_segments_into_columns(&segments);
    if column_groups.len() <= 1 {
        return lines_to_paragraphs(segments_to_lines(segments));
    }

    // Column index per segment; defaults to column 0.
    let mut column_of = vec![0usize; segments.len()];
    for (col, members) in column_groups.iter().enumerate() {
        for &seg_idx in members {
            if let Some(slot) = column_of.get_mut(seg_idx) {
                *slot = col;
            }
        }
    }

    let mut buckets: Vec<Vec<SegmentData>> = std::iter::repeat_with(Vec::new)
        .take(column_groups.len())
        .collect();
    for (seg, &col) in segments.into_iter().zip(&column_of) {
        buckets[col].push(seg);
    }

    buckets
        .into_iter()
        .flat_map(|column| lines_to_paragraphs(segments_to_lines(column)))
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::hierarchy::SegmentData;

    /// Builds a plain (non-bold, non-italic, non-monospace) segment whose font
    /// size equals its height and whose baseline equals `y`.
    fn make_segment(text: &str, x: f32, y: f32, width: f32, height: f32) -> SegmentData {
        SegmentData {
            text: text.to_string(),
            x,
            y,
            width,
            height,
            font_size: height,
            is_bold: false,
            is_italic: false,
            is_monospace: false,
            baseline_y: y,
        }
    }

    /// Builds a layout hint from its class, confidence and bounds.
    fn make_hint(class: LayoutHintClass, confidence: f32, left: f32, bottom: f32, right: f32, top: f32) -> LayoutHint {
        LayoutHint {
            class,
            confidence,
            left,
            bottom,
            right,
            top,
        }
    }

    /// Wraps every hint in a region with no segments and no merged bounds.
    fn regions_from_hints(hints: &[LayoutHint]) -> Vec<LayoutRegion<'_>> {
        hints
            .iter()
            .map(|hint| LayoutRegion {
                hint,
                segment_indices: Vec::new(),
                merged_bbox: None,
            })
            .collect()
    }

    #[test]
    fn test_assign_segments_single_region() {
        let segments = vec![
            make_segment("Hello", 10.0, 700.0, 40.0, 12.0),
            make_segment("world", 55.0, 700.0, 40.0, 12.0),
        ];
        let hints = vec![make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].segment_indices.len(), 2);
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_assign_segments_two_columns() {
        let segments = vec![
            make_segment("Left", 10.0, 700.0, 40.0, 12.0),
            make_segment("Right", 300.0, 700.0, 40.0, 12.0),
        ];
        let hints = vec![
            make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0),
            make_hint(LayoutHintClass::Text, 0.9, 250.0, 690.0, 500.0, 720.0),
        ];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert_eq!(regions.len(), 2);
        assert_eq!(regions[0].segment_indices.len(), 1);
        assert_eq!(regions[1].segment_indices.len(), 1);
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_assign_segments_unassigned() {
        let segments = vec![
            make_segment("Inside", 10.0, 700.0, 40.0, 12.0),
            make_segment("Outside", 500.0, 100.0, 40.0, 12.0),
        ];
        let hints = vec![make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert_eq!(regions[0].segment_indices.len(), 1);
        assert_eq!(unassigned.len(), 1);
    }

    #[test]
    fn test_assign_segments_smallest_area_wins() {
        let segments = vec![make_segment("text", 50.0, 700.0, 40.0, 12.0)];
        // The first hint spans a much larger box than the second one.
        let hints = vec![
            make_hint(LayoutHintClass::Text, 0.9, 0.0, 0.0, 600.0, 800.0),
            make_hint(LayoutHintClass::Code, 0.9, 30.0, 690.0, 200.0, 720.0),
        ];
        let (regions, _) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert!(regions[0].segment_indices.is_empty());
        assert_eq!(regions[1].segment_indices.len(), 1);
    }

    #[test]
    fn test_overlap_partial_assigns() {
        let segments = vec![make_segment("straddling", 180.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert_eq!(regions[0].segment_indices.len(), 1);
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_overlap_below_threshold_unassigned() {
        let segments = vec![make_segment("barely", 195.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert!(regions[0].segment_indices.is_empty());
        assert_eq!(unassigned.len(), 1);
    }

    #[test]
    fn test_highest_overlap_wins() {
        let segments = vec![make_segment("overlapping", 90.0, 700.0, 40.0, 12.0)];
        let hints = vec![
            make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 110.0, 720.0),
            make_hint(LayoutHintClass::Text, 0.9, 100.0, 690.0, 300.0, 720.0),
        ];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert!(regions[0].segment_indices.is_empty());
        assert_eq!(regions[1].segment_indices.len(), 1);
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_center_point_regression() {
        let segments = vec![make_segment("centered", 50.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert_eq!(regions[0].segment_indices.len(), 1);
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_reading_order_two_columns() {
        // The right-hand hint is listed first.
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 300.0, 400.0, 550.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 400.0, 250.0, 700.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        assert!(regions[0].hint.left < regions[1].hint.left);
    }

    #[test]
    fn test_reading_order_vertical() {
        // The lower hint is listed first.
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 500.0, 300.0),
            make_hint(LayoutHintClass::Title, 0.9, 10.0, 600.0, 500.0, 750.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        assert_eq!(regions[0].hint.class, LayoutHintClass::Title);
    }

    #[test]
    fn test_assemble_code_region() {
        let segments = vec![
            make_segment("fn main() {", 10.0, 700.0, 80.0, 12.0),
            make_segment("println!(\"hi\");", 10.0, 685.0, 100.0, 12.0),
            make_segment("}", 10.0, 670.0, 10.0, 12.0),
        ];
        let hints = vec![make_hint(LayoutHintClass::Code, 0.9, 0.0, 660.0, 200.0, 720.0)];
        let paragraphs = assemble_region_paragraphs(segments, &hints, &[], 0.5, None, 0, &[], &[]);
        assert!(!paragraphs.is_empty());
        assert!(paragraphs[0].is_code_block);
    }

    #[test]
    fn test_assemble_heading_region() {
        let segments = vec![make_segment("1 Introduction", 10.0, 700.0, 120.0, 18.0)];
        let hints = vec![make_hint(LayoutHintClass::SectionHeader, 0.9, 0.0, 690.0, 200.0, 725.0)];
        let paragraphs = assemble_region_paragraphs(segments, &hints, &[], 0.5, None, 0, &[], &[]);
        assert_eq!(paragraphs.len(), 1);
        assert_eq!(paragraphs[0].heading_level, Some(2));
    }

    #[test]
    fn test_low_confidence_hints_ignored() {
        let segments = vec![make_segment("text", 10.0, 700.0, 40.0, 12.0)];
        // Hint confidence 0.3 is below the 0.5 minimum passed in.
        let hints = vec![make_hint(LayoutHintClass::Code, 0.3, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert!(regions.is_empty());
        assert_eq!(unassigned.len(), 1);
    }

    #[test]
    fn test_table_segments_suppressed_when_extracted() {
        let segments = vec![make_segment("table text", 10.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(LayoutHintClass::Table, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let extracted_bbox = crate::types::BoundingBox {
            x0: 0.0,
            y0: 690.0,
            x1: 200.0,
            y1: 720.0,
        };
        let (regions, unassigned) =
            assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[extracted_bbox], &[]);
        assert!(regions.is_empty());
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_table_segments_recovered_when_not_extracted() {
        let segments = vec![make_segment("table text", 10.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(LayoutHintClass::Table, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert!(regions.is_empty());
        assert_eq!(unassigned.len(), 1);
    }

    #[test]
    fn test_picture_regions_suppress_text() {
        let segments = vec![make_segment("ab", 10.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(LayoutHintClass::Picture, 0.9, 0.0, 690.0, 200.0, 720.0)];
        let (regions, unassigned) = assignment::assign_segments_to_regions(&segments, &hints, 0.5, &[], &[]);
        assert!(regions.is_empty());
        assert!(unassigned.is_empty());
    }

    #[test]
    fn test_picture_text_suppressed_caption_preserved() {
        let segments = vec![
            make_segment("Parse PDF", 20.0, 700.0, 60.0, 12.0),
            make_segment("OCR", 100.0, 700.0, 30.0, 12.0),
            make_segment("Layout Analysis", 20.0, 680.0, 80.0, 12.0),
            make_segment("Figure 1. Pipeline overview.", 20.0, 640.0, 150.0, 10.0),
        ];
        let hints = vec![
            make_hint(LayoutHintClass::Picture, 0.9, 0.0, 670.0, 200.0, 720.0),
            make_hint(LayoutHintClass::Caption, 0.9, 0.0, 630.0, 200.0, 655.0),
        ];
        let paragraphs = assemble_region_paragraphs(segments, &hints, &[], 0.5, None, 0, &[], &[]);
        assert_eq!(paragraphs.len(), 1);
        assert_eq!(paragraphs[0].layout_class, Some(LayoutHintClass::Caption));
        let text = paragraph_text(&paragraphs[0]);
        assert!(text.contains("Figure 1"));
    }

    #[test]
    fn test_assemble_mixed_regions() {
        let segments = vec![
            make_segment("Title Text", 10.0, 750.0, 100.0, 18.0),
            make_segment("Body paragraph here.", 10.0, 700.0, 150.0, 12.0),
            make_segment("let x = 1;", 10.0, 650.0, 80.0, 12.0),
        ];
        let hints = vec![
            make_hint(LayoutHintClass::Title, 0.9, 0.0, 740.0, 200.0, 775.0),
            make_hint(LayoutHintClass::Text, 0.9, 0.0, 690.0, 200.0, 720.0),
            make_hint(LayoutHintClass::Code, 0.9, 0.0, 640.0, 200.0, 665.0),
        ];
        let paragraphs = assemble_region_paragraphs(segments, &hints, &[], 0.5, None, 0, &[], &[]);
        assert_eq!(paragraphs.len(), 3);
        assert_eq!(paragraphs[0].heading_level, Some(1));
        assert_eq!(paragraphs[0].layout_class, Some(LayoutHintClass::Title));
        assert!(paragraphs[1].heading_level.is_none());
        assert!(paragraphs[2].is_code_block);
    }

    #[test]
    fn test_reading_order_fullwidth_interleaved_with_columns() {
        // Full-width title, two side-by-side columns, full-width text below.
        let hints = [
            make_hint(LayoutHintClass::Title, 0.9, 10.0, 700.0, 550.0, 780.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 300.0, 260.0, 690.0),
            make_hint(LayoutHintClass::Text, 0.9, 280.0, 300.0, 550.0, 690.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 550.0, 290.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        assert_eq!(regions[0].hint.class, LayoutHintClass::Title);
        assert!(regions[1].hint.right < 280.0, "Second should be left column");
        assert!(regions[2].hint.left >= 270.0, "Third should be right column");
        assert!(regions[3].hint.top < 300.0, "Fourth should be bottom full-width");
    }

    #[test]
    fn test_reading_order_all_narrow_regression() {
        // Two columns with two stacked regions each, right column listed first.
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 300.0, 400.0, 550.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 400.0, 250.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 300.0, 100.0, 550.0, 390.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 250.0, 390.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        assert!(regions[0].hint.left < 260.0);
        assert!(regions[1].hint.left < 260.0);
        assert!(regions[2].hint.left >= 260.0);
        assert!(regions[3].hint.left >= 260.0);
    }

    #[test]
    fn test_dag_reading_order_three_columns() {
        // Three columns with two stacked regions each.
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 400.0, 100.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 150.0, 400.0, 240.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 290.0, 400.0, 380.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 100.0, 390.0),
            make_hint(LayoutHintClass::Text, 0.9, 150.0, 100.0, 240.0, 390.0),
            make_hint(LayoutHintClass::Text, 0.9, 290.0, 100.0, 380.0, 390.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        let left_positions: Vec<usize> = regions
            .iter()
            .enumerate()
            .filter(|(_, r)| r.hint.left < 120.0)
            .map(|(i, _)| i)
            .collect();
        let mid_positions: Vec<usize> = regions
            .iter()
            .enumerate()
            .filter(|(_, r)| r.hint.left >= 120.0 && r.hint.left < 260.0)
            .map(|(i, _)| i)
            .collect();
        let right_positions: Vec<usize> = regions
            .iter()
            .enumerate()
            .filter(|(_, r)| r.hint.left >= 260.0)
            .map(|(i, _)| i)
            .collect();
        assert_eq!(left_positions.len(), 2);
        assert_eq!(mid_positions.len(), 2);
        assert_eq!(right_positions.len(), 2);
        assert!(left_positions.iter().all(|&lp| mid_positions.iter().all(|&mp| lp < mp)));
        assert!(
            mid_positions
                .iter()
                .all(|&mp| right_positions.iter().all(|&rp| mp < rp))
        );
    }

    #[test]
    fn test_dag_reading_order_header_footer_separation() {
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 200.0, 550.0, 600.0),
            make_hint(LayoutHintClass::PageFooter, 0.9, 10.0, 10.0, 550.0, 80.0),
            make_hint(LayoutHintClass::PageHeader, 0.9, 10.0, 720.0, 550.0, 790.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 550.0, 190.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        assert_eq!(regions[0].hint.class, LayoutHintClass::PageHeader);
        assert_eq!(regions[3].hint.class, LayoutHintClass::PageFooter);
        assert_eq!(regions[1].hint.class, LayoutHintClass::Text);
        assert_eq!(regions[2].hint.class, LayoutHintClass::Text);
    }

    #[test]
    fn test_dag_reading_order_asymmetric_columns() {
        // Narrow column on the left (two regions), wide column on the right (three).
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 500.0, 80.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 120.0, 500.0, 500.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 120.0, 300.0, 500.0, 490.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 300.0, 80.0, 490.0),
            make_hint(LayoutHintClass::Text, 0.9, 120.0, 100.0, 500.0, 290.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        let sidebar_pos: Vec<usize> = regions
            .iter()
            .enumerate()
            .filter(|(_, r)| r.hint.left < 100.0)
            .map(|(i, _)| i)
            .collect();
        let body_pos: Vec<usize> = regions
            .iter()
            .enumerate()
            .filter(|(_, r)| r.hint.left >= 100.0)
            .map(|(i, _)| i)
            .collect();
        assert_eq!(sidebar_pos.len(), 2);
        assert_eq!(body_pos.len(), 3);
        assert!(sidebar_pos.iter().all(|&sp| body_pos.iter().all(|&bp| sp < bp)));
    }

    #[test]
    fn test_dag_reading_order_single_column() {
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 550.0, 550.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 400.0, 550.0, 540.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 250.0, 550.0, 390.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 550.0, 240.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        // Every region must sit above the one that follows it.
        for (i, pair) in regions.windows(2).enumerate() {
            assert!(
                pair[0].hint.top > pair[1].hint.top,
                "region {} top={} should be above region {} top={}",
                i,
                pair[0].hint.top,
                i + 1,
                pair[1].hint.top
            );
        }
    }

    #[test]
    fn test_reading_order_few_regions_fallback() {
        // Three full-width regions listed out of vertical order.
        let hints = [
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 100.0, 550.0, 300.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 500.0, 550.0, 700.0),
            make_hint(LayoutHintClass::Text, 0.9, 10.0, 310.0, 550.0, 490.0),
        ];
        let mut regions = regions_from_hints(&hints);
        reading_order::order_regions_reading_order(&mut regions, 800.0);
        assert!(regions[0].hint.bottom > regions[1].hint.bottom);
        assert!(regions[1].hint.bottom > regions[2].hint.bottom);
    }

    #[test]
    fn test_bbox_refinement_shrinks_oversized_region() {
        let segments = vec![
            make_segment("inside_large", 50.0, 700.0, 40.0, 12.0),
            make_segment("near_small", 250.0, 700.0, 60.0, 12.0),
        ];
        let hints = vec![
            make_hint(LayoutHintClass::Text, 0.9, 0.0, 0.0, 600.0, 800.0),
            make_hint(LayoutHintClass::Code, 0.9, 200.0, 695.0, 400.0, 715.0),
        ];
        let (regions, _) = assignment::assign_segments_to_regions_refined(&segments, &hints, 0.5, &[], &[]);
        let code_region = regions.iter().find(|r| r.hint.class == LayoutHintClass::Code);
        assert!(code_region.is_some(), "Code region should exist");
        assert!(
            !code_region.unwrap().segment_indices.is_empty(),
            "Code region should contain at least one segment"
        );
    }

    #[test]
    fn test_bbox_refinement_preserves_original_class() {
        let segments = vec![make_segment("text", 50.0, 700.0, 40.0, 12.0)];
        let hints = vec![make_hint(
            LayoutHintClass::SectionHeader,
            0.85,
            0.0,
            690.0,
            200.0,
            720.0,
        )];
        let (regions, _) = assignment::assign_segments_to_regions_refined(&segments, &hints, 0.5, &[], &[]);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].hint.class, LayoutHintClass::SectionHeader);
        assert!((regions[0].hint.confidence - 0.85).abs() < 1e-4);
    }

    /// Builds an empty paragraph that only carries a layout class.
    fn make_paragraph(layout_class: Option<LayoutHintClass>) -> PdfParagraph {
        PdfParagraph {
            lines: vec![],
            dominant_font_size: 12.0,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class,
            caption_for: None,
            block_bbox: None,
        }
    }

    #[test]
    fn test_caption_association_below_table() {
        let mut paragraphs = vec![
            make_paragraph(Some(LayoutHintClass::Text)),
            make_paragraph(Some(LayoutHintClass::Table)),
            make_paragraph(Some(LayoutHintClass::Caption)),
            make_paragraph(Some(LayoutHintClass::Text)),
        ];
        associate_captions(&mut paragraphs);
        assert_eq!(paragraphs[2].caption_for, Some(1));
        assert_eq!(paragraphs[0].caption_for, None);
        assert_eq!(paragraphs[1].caption_for, None);
        assert_eq!(paragraphs[3].caption_for, None);
    }

    #[test]
    fn test_caption_association_above_figure() {
        let mut paragraphs = vec![
            make_paragraph(Some(LayoutHintClass::Caption)),
            make_paragraph(Some(LayoutHintClass::Picture)),
            make_paragraph(Some(LayoutHintClass::Text)),
        ];
        associate_captions(&mut paragraphs);
        assert_eq!(paragraphs[0].caption_for, Some(1));
    }

    #[test]
    fn test_caption_no_parent() {
        let mut paragraphs = vec![
            make_paragraph(Some(LayoutHintClass::Text)),
            make_paragraph(Some(LayoutHintClass::Caption)),
            make_paragraph(Some(LayoutHintClass::Text)),
        ];
        associate_captions(&mut paragraphs);
        assert_eq!(paragraphs[1].caption_for, None);
    }

    #[test]
    fn test_caption_ambiguous_prefers_closer() {
        // The table is separated from the caption by text; the picture is adjacent.
        let mut paragraphs = vec![
            make_paragraph(Some(LayoutHintClass::Table)),
            make_paragraph(Some(LayoutHintClass::Text)),
            make_paragraph(Some(LayoutHintClass::Text)),
            make_paragraph(Some(LayoutHintClass::Caption)),
            make_paragraph(Some(LayoutHintClass::Picture)),
        ];
        associate_captions(&mut paragraphs);
        assert_eq!(paragraphs[3].caption_for, Some(4));
    }

    #[test]
    fn test_footnote_after_table() {
        let mut paragraphs = vec![
            make_paragraph(Some(LayoutHintClass::Text)),
            make_paragraph(Some(LayoutHintClass::Table)),
            make_paragraph(Some(LayoutHintClass::Footnote)),
            make_paragraph(Some(LayoutHintClass::Footnote)),
            make_paragraph(Some(LayoutHintClass::Text)),
        ];
        associate_footnotes(&mut paragraphs);
        assert_eq!(paragraphs[2].caption_for, Some(1));
        assert_eq!(paragraphs[3].caption_for, Some(1));
        assert_eq!(paragraphs[0].caption_for, None);
        assert_eq!(paragraphs[4].caption_for, None);
    }

    #[test]
    fn test_footnote_stops_at_non_footnote() {
        let mut paragraphs = vec![
            make_paragraph(Some(LayoutHintClass::Table)),
            make_paragraph(Some(LayoutHintClass::Footnote)),
            make_paragraph(Some(LayoutHintClass::Text)),
            make_paragraph(Some(LayoutHintClass::Footnote)),
        ];
        associate_footnotes(&mut paragraphs);
        assert_eq!(paragraphs[1].caption_for, Some(0));
        assert_eq!(paragraphs[3].caption_for, None);
    }

    /// Builds a one-line, one-segment `Text` paragraph at `y = 700`.
    fn make_body_para(text: &str, x: f32, width: f32) -> PdfParagraph {
        use super::super::types::PdfLine;
        PdfParagraph {
            lines: vec![PdfLine {
                segments: vec![make_segment(text, x, 700.0, width, 12.0)],
                baseline_y: 700.0,
                dominant_font_size: 12.0,
                is_bold: false,
                is_monospace: false,
            }],
            dominant_font_size: 12.0,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: Some(LayoutHintClass::Text),
            caption_for: None,
            block_bbox: None,
        }
    }

    /// Like `make_body_para`, but marked as a level-2 section header.
    fn make_heading_para(text: &str, x: f32, width: f32) -> PdfParagraph {
        let mut p = make_body_para(text, x, width);
        p.heading_level = Some(2);
        p.layout_class = Some(LayoutHintClass::SectionHeader);
        p
    }

    #[test]
    fn test_cross_column_merge_basic() {
        let mut paragraphs = vec![
            make_body_para("word-", 0.0, 100.0),
            make_body_para("continued", 200.0, 100.0),
        ];
        merge_cross_column_paragraphs(&mut paragraphs);
        assert_eq!(paragraphs.len(), 1);
        let merged_text = paragraph_text(&paragraphs[0]);
        assert!(merged_text.contains("word-"));
        assert!(merged_text.contains("continued"));
    }

    #[test]
    fn test_cross_column_no_merge_uppercase() {
        let mut paragraphs = vec![
            make_body_para("sentence.", 0.0, 100.0),
            make_body_para("New", 200.0, 100.0),
        ];
        merge_cross_column_paragraphs(&mut paragraphs);
        assert_eq!(paragraphs.len(), 2);
    }

    #[test]
    fn test_cross_column_no_merge_heading() {
        let mut paragraphs = vec![
            make_heading_para("Introduction", 0.0, 100.0),
            make_body_para("continued", 200.0, 100.0),
        ];
        merge_cross_column_paragraphs(&mut paragraphs);
        assert_eq!(paragraphs.len(), 2);
    }
}