use crate::extractors::text::{ArtifactType, PaginationSubtype};
use crate::geometry::Rect;
use crate::layout::TextSpan;
/// One page of classified text regions, as produced by
/// `build_structured_page_full`.
///
/// The page index and dimensions are copied through from the caller
/// unchanged; only `regions` is computed. Field names serialize as camelCase
/// when the `wasm` feature is enabled.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct StructuredPage {
/// Index of the page as supplied by the caller (the tests in this file pass
/// 0; presumably zero-based — confirm against the caller).
pub page_index: usize,
/// Page width as supplied by the caller. Gutter detection compares span
/// x-coordinates against it, so it must be in the same units as span
/// bounding boxes (presumably PDF points — confirm).
pub page_width: f32,
/// Page height as supplied by the caller; stored but not used when the
/// regions are built.
pub page_height: f32,
/// Regions in the order they were first created while walking the spans.
pub regions: Vec<StructuredRegion>,
}
/// A group of spans that share the same role, column and section.
///
/// Field names serialize as camelCase when the `wasm` feature is enabled.
#[derive(Debug, Clone, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub struct StructuredRegion {
/// Role shared by every span merged into this region.
pub kind: RegionRole,
/// Trimmed text of each merged span, joined with single spaces.
pub text: String,
/// Union of the bounding boxes of all merged spans.
pub bbox: Rect,
/// The spans merged into this region, in the order they were encountered.
pub spans: Vec<TextSpan>,
/// `Some(0)` when the span centre lies left of the gutter, `Some(1)`
/// otherwise. `None` when no gutter is in effect, when the region is not a
/// body block or marginal label, or when the span is wider than the bridge
/// width and therefore treated as spanning both columns.
pub column_index: Option<usize>,
/// Section id looked up from `McidStructInfo::section` via the span's MCID.
/// Omitted from the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub section_id: Option<usize>,
}
/// Per-MCID structure information consulted while building regions.
///
/// The default value (both collections empty) leaves span classification and
/// section ids untouched.
#[derive(Debug, Default, Clone)]
pub(crate) struct McidStructInfo {
/// MCIDs whose spans are always classified as `RegionRole::MarginalLabel`,
/// overriding the role derived from the span itself.
/// NOTE(review): the name suggests these come from `Lbl` structure
/// elements — confirm with the code that populates this set.
pub lbl: std::collections::HashSet<u32>,
/// Maps an MCID to the section id recorded on the resulting region.
pub section: std::collections::HashMap<u32, usize>,
}
/// Role assigned to a span, and to the region it is merged into.
///
/// Variant names serialize as camelCase when the `wasm` feature is enabled.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[cfg_attr(feature = "wasm", serde(rename_all = "camelCase"))]
pub enum RegionRole {
/// Default role for a span that matches none of the other rules.
BodyBlock,
/// Span that carries a heading level.
StructuralHeading {
/// Heading level copied from the span's `heading_level`.
level: u8,
},
/// Short numeral-like text (see `is_marginal_label`), or a span whose MCID
/// is listed in `McidStructInfo::lbl`.
MarginalLabel,
/// Span tagged as a pagination artifact with the header subtype.
Header,
/// Span tagged as a pagination artifact with the footer subtype.
Footer,
/// Span tagged as a pagination artifact with the page-number subtype.
PageNumber,
/// Span tagged with any other artifact type.
Artifact,
}
/// Classifies a single span from its own attributes.
///
/// Precedence, highest first:
/// 1. an artifact tag (pagination header / footer / page number, otherwise a
///    generic artifact),
/// 2. a heading level,
/// 3. text that looks like a marginal label,
/// 4. plain body text.
fn role_for_span(span: &TextSpan) -> RegionRole {
    match (&span.artifact_type, span.heading_level) {
        // Any artifact tag wins, regardless of heading level or text.
        (Some(ArtifactType::Pagination(PaginationSubtype::Header)), _) => RegionRole::Header,
        (Some(ArtifactType::Pagination(PaginationSubtype::Footer)), _) => RegionRole::Footer,
        (Some(ArtifactType::Pagination(PaginationSubtype::PageNumber)), _) => {
            RegionRole::PageNumber
        }
        (Some(_), _) => RegionRole::Artifact,
        // Not an artifact: a heading level takes priority over the text heuristic.
        (None, Some(level)) => RegionRole::StructuralHeading { level },
        (None, None) if is_marginal_label(&span.text) => RegionRole::MarginalLabel,
        (None, None) => RegionRole::BodyBlock,
    }
}
/// Heuristic test for short numeral-like text such as verse or page labels.
///
/// After trimming surrounding whitespace, the text qualifies when it is one to
/// four characters long and consists either entirely of ASCII digits or
/// entirely of the lowercase letters `i v x l c d m`. The letters are not
/// checked for being a well-formed Roman numeral, and uppercase letters do
/// not qualify.
fn is_marginal_label(text: &str) -> bool {
    let candidate = text.trim();

    // Empty text and anything longer than four characters is never a label.
    if !(1..=4).contains(&candidate.chars().count()) {
        return false;
    }

    // One pass over the (non-empty) text tracking both character classes.
    let mut all_digits = true;
    let mut all_roman = true;
    for ch in candidate.chars() {
        all_digits &= ch.is_ascii_digit();
        all_roman &= matches!(ch, 'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm');
    }
    all_digits || all_roman
}
/// Returns the smallest rectangle that contains both `a` and `b`.
fn rect_union(a: &Rect, b: &Rect) -> Rect {
    // Merge two 1-D intervals given as (origin, length) and return the merged
    // interval in the same (origin, length) form.
    let merge_axis = |origin_a: f32, len_a: f32, origin_b: f32, len_b: f32| -> (f32, f32) {
        let start = origin_a.min(origin_b);
        let end = (origin_a + len_a).max(origin_b + len_b);
        (start, end - start)
    };

    let (x, width) = merge_axis(a.x, a.width, b.x, b.width);
    let (y, height) = merge_axis(a.y, a.height, b.y, b.height);
    Rect::new(x, y, width, height)
}
/// Fraction of the body content width above which a span is treated as
/// bridging both columns: such spans are ignored by gutter detection and are
/// not assigned a column index.
const COLUMN_BRIDGE_FRACTION: f32 = 0.6;
/// Looks for a vertical gutter that splits the body spans into two columns.
///
/// Only the horizontal extents of the spans are considered. Spans with a
/// non-positive or non-finite width, a non-finite x, or blank text are
/// ignored, as are spans wider than `COLUMN_BRIDGE_FRACTION` of the overall
/// content width. The remaining extents are swept left to right and the
/// widest uncovered gap is taken as the gutter candidate.
///
/// Returns the x-coordinate of the middle of that gap, or `None` when
/// * fewer than four usable spans remain at any stage,
/// * `page_width` is not positive,
/// * the content is narrower than a quarter of the page,
/// * the gap is narrower than `MIN_GUTTER_PT`,
/// * the gap centre lies outside the middle 30%–70% of the page width, or
/// * either side of the gap holds fewer than `MIN_SIDE_SPANS` spans.
fn detect_gutter_x(body: &[&TextSpan], page_width: f32) -> Option<f32> {
    const MIN_GUTTER_PT: f32 = 8.0;
    const MIN_SIDE_SPANS: usize = 2;

    if page_width <= 0.0 || body.len() < 4 {
        return None;
    }

    // (left, right) extent of every usable span.
    let mut extents: Vec<(f32, f32)> = Vec::new();
    for s in body {
        let (x, w) = (s.bbox.x, s.bbox.width);
        if w > 0.0 && x.is_finite() && w.is_finite() && !s.text.trim().is_empty() {
            extents.push((x, x + w));
        }
    }
    if extents.len() < 4 {
        return None;
    }

    let (content_min, content_max) = extents.iter().fold(
        (f32::INFINITY, f32::NEG_INFINITY),
        |(lo, hi), &(left, right)| (lo.min(left), hi.max(right)),
    );
    let content_w = content_max - content_min;
    if content_w < page_width * 0.25 {
        return None;
    }

    // Drop spans wide enough to bridge both columns; they would hide the gap.
    let bridge_w = content_w * COLUMN_BRIDGE_FRACTION;
    extents.retain(|&(left, right)| right - left <= bridge_w);
    if extents.len() < 4 {
        return None;
    }
    extents.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

    // Sweep left to right, tracking the right-most x covered so far. Because
    // the extents are sorted by left edge, every extent before the current one
    // lies entirely to the left of any gap found at the current one.
    let (&(_, first_right), rest) = extents.split_first()?;
    let mut covered_to = first_right;
    let mut widest_gap = 0.0_f32;
    let mut gap_center = 0.0_f32;
    let mut spans_left = 0usize;
    for (offset, &(left, right)) in rest.iter().enumerate() {
        let gap = left - covered_to;
        if gap > widest_gap {
            widest_gap = gap;
            gap_center = (covered_to + left) * 0.5;
            spans_left = offset + 1;
        }
        covered_to = covered_to.max(right);
    }

    let spans_right = extents.len() - spans_left;
    let relative_x = gap_center / page_width;
    let plausible = widest_gap >= MIN_GUTTER_PT
        && (0.3..=0.7).contains(&relative_x)
        && spans_left >= MIN_SIDE_SPANS
        && spans_right >= MIN_SIDE_SPANS;
    plausible.then_some(gap_center)
}
/// Controls how `build_structured_page_full` decides on a column gutter.
///
/// Serde support is derived only when the `wasm` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "wasm", derive(serde::Serialize, serde::Deserialize))]
pub enum ColumnMode {
/// Use the detected gutter if there is one; otherwise assign no columns.
#[default]
Auto,
/// Use the detected gutter, falling back to the horizontal middle of the
/// page when detection finds none.
Two,
/// Never assign column indices.
Single,
}
/// Test-only shortcut for `build_structured_page_full` using automatic column
/// detection and no MCID structure information.
#[cfg(test)]
pub(crate) fn build_structured_page(
    page_index: usize,
    page_width: f32,
    page_height: f32,
    spans: Vec<TextSpan>,
) -> StructuredPage {
    let no_struct_info = McidStructInfo::default();
    let mode = ColumnMode::Auto;
    build_structured_page_full(page_index, page_width, page_height, spans, mode, &no_struct_info)
}
/// Test-only shortcut for `build_structured_page_full` with an explicit
/// column mode and no MCID structure information.
#[cfg(test)]
pub(crate) fn build_structured_page_with_mode(
    page_index: usize,
    page_width: f32,
    page_height: f32,
    spans: Vec<TextSpan>,
    column_mode: ColumnMode,
) -> StructuredPage {
    let no_struct_info = McidStructInfo::default();
    build_structured_page_full(
        page_index,
        page_width,
        page_height,
        spans,
        column_mode,
        &no_struct_info,
    )
}
/// Groups the spans of one page into classified regions.
///
/// Steps:
/// 1. Pick a column gutter according to `column_mode`, using only spans whose
///    own attributes classify them as body text or marginal labels.
/// 2. Walk the spans in order, skipping blank ones. Each span gets a role (the
///    `struct_info.lbl` override wins over `role_for_span`), a column index
///    (body / marginal-label spans only) and a section id looked up from
///    `struct_info.section`.
/// 3. Merge the span into an existing region with the same role, column and
///    section, or start a new region. A span with a column merges into the
///    first matching region anywhere on the page; a span without a column
///    merges only into the immediately preceding region.
///
/// `page_index`, `page_width` and `page_height` are copied into the result.
pub(crate) fn build_structured_page_full(
    page_index: usize,
    page_width: f32,
    page_height: f32,
    spans: Vec<TextSpan>,
    column_mode: ColumnMode,
    struct_info: &McidStructInfo,
) -> StructuredPage {
    // Column geometry comes from body-like spans only. The MCID label override
    // is deliberately not consulted here, matching the per-span role function.
    let body_like: Vec<&TextSpan> = spans
        .iter()
        .filter(|s| matches!(role_for_span(s), RegionRole::BodyBlock | RegionRole::MarginalLabel))
        .collect();

    let gutter = match column_mode {
        ColumnMode::Single => None,
        ColumnMode::Auto => detect_gutter_x(&body_like, page_width),
        // Forced two-column mode falls back to the page midline.
        ColumnMode::Two => {
            Some(detect_gutter_x(&body_like, page_width).unwrap_or(page_width * 0.5))
        }
    };

    // Horizontal extent of the usable body spans. With no usable span the
    // extent stays (+inf, -inf), making the bridge width -inf so that every
    // span counts as bridging and receives no column.
    let mut content_min = f32::INFINITY;
    let mut content_max = f32::NEG_INFINITY;
    for s in &body_like {
        let bbox = &s.bbox;
        let usable = bbox.width > 0.0
            && bbox.x.is_finite()
            && bbox.width.is_finite()
            && !s.text.trim().is_empty();
        if usable {
            content_min = content_min.min(bbox.x);
            content_max = content_max.max(bbox.x + bbox.width);
        }
    }
    let bridge_w = (content_max - content_min) * COLUMN_BRIDGE_FRACTION;

    // Column of a span: none without a gutter or for gutter-bridging spans,
    // otherwise decided by which side of the gutter the span centre falls on.
    let column_of = |span: &TextSpan| -> Option<usize> {
        match gutter {
            Some(_) if span.bbox.width > bridge_w => None,
            Some(g) => {
                let center = span.bbox.x + span.bbox.width * 0.5;
                Some(if center < g { 0 } else { 1 })
            }
            None => None,
        }
    };

    let mut regions: Vec<StructuredRegion> = Vec::new();
    for span in spans {
        if span.text.trim().is_empty() {
            continue;
        }

        // An MCID listed as a label overrides whatever the span itself says.
        let kind = match span.mcid {
            Some(m) if struct_info.lbl.contains(&m) => RegionRole::MarginalLabel,
            _ => role_for_span(&span),
        };
        let column_index = if matches!(kind, RegionRole::BodyBlock | RegionRole::MarginalLabel) {
            column_of(&span)
        } else {
            None
        };
        let section_id = span
            .mcid
            .and_then(|m| struct_info.section.get(&m))
            .copied();

        let same_group = |r: &StructuredRegion| {
            r.kind == kind && r.column_index == column_index && r.section_id == section_id
        };
        let target = match column_index {
            // Column text gathers into the first matching region on the page.
            Some(_) => regions.iter().position(same_group),
            // Everything else only continues the immediately preceding region.
            None => regions
                .len()
                .checked_sub(1)
                .filter(|&last| same_group(&regions[last])),
        };

        match target {
            Some(i) => {
                let region = &mut regions[i];
                region.text.push(' ');
                region.text.push_str(span.text.trim());
                region.bbox = rect_union(&region.bbox, &span.bbox);
                region.spans.push(span);
            }
            None => regions.push(StructuredRegion {
                kind,
                text: span.text.trim().to_string(),
                bbox: span.bbox,
                column_index,
                section_id,
                // Moved last: `text` and `bbox` above still read from `span`.
                spans: vec![span],
            }),
        }
    }

    StructuredPage {
        page_index,
        page_width,
        page_height,
        regions,
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Builds a span with the given text and a bounding box at (x, y) that is `w`
// wide and 12.0 high; every other field takes its default value.
fn span(text: &str, x: f32, y: f32, w: f32) -> TextSpan {
TextSpan {
text: text.to_string(),
bbox: Rect::new(x, y, w, 12.0),
..Default::default()
}
}
// Same as `span`, but with the MCID set.
fn span_mcid(text: &str, x: f32, y: f32, w: f32, mcid: u32) -> TextSpan {
TextSpan {
mcid: Some(mcid),
..span(text, x, y, w)
}
}
// Regression fixture (named after issue 734): a 432-wide page with a wide
// title, a short numeral near the middle, and two columns of line pairs
// starting at x = 57.6 and x = 230.5. Both columns must receive an index and
// the title, being wider than the bridge width, must not.
#[test]
fn issue_734_kjf_reference_columns_get_indices() {
let pw = 432.0;
let mut spans = Vec::new();
spans.push(span("Le Troisième Livre de Moïse Appelé GENÈSE", 57.6, 720.0, 339.0));
spans.push(span("1", 224.6, 706.0, 8.0));
// Left column: six two-line entries, each pair sharing MCID 4 + v.
let mut y = 690.0;
for v in 1..=6u32 {
spans.push(span_mcid(
&format!("{v} Au commencement Dieu créa le ciel et la"),
57.6,
y,
139.0,
4 + v, ));
spans.push(span_mcid("terre.", 57.6, y - 14.0, 28.0, 4 + v));
y -= 30.0;
}
// Right column: six two-line entries at the same heights as the left column.
y = 690.0;
for v in 14..=19u32 {
spans.push(span_mcid(
&format!("{v} ¶ Et Dieu dit, Qu'il y ait des lumières dans le"),
230.5,
y,
165.0,
4 + v, ));
spans.push(span_mcid(
"firmament du ciel pour séparer le jour.",
230.5,
y - 14.0,
160.0,
4 + v,
));
y -= 30.0;
}
// `build_structured_page` passes an empty `McidStructInfo`, so the MCIDs set
// above do not change roles or section ids in this test.
let page = build_structured_page(0, pw, 792.0, spans);
let cols: Vec<Option<usize>> = page.regions.iter().map(|r| r.column_index).collect();
assert!(
cols.contains(&Some(0)),
"left column (0) must be assigned for the KJF layout: {cols:?}"
);
assert!(
cols.contains(&Some(1)),
"right column (1) must be assigned for the KJF layout: {cols:?}"
);
let title = page
.regions
.iter()
.find(|r| r.text.contains("Troisième Livre"))
.expect("title region present");
assert_eq!(title.column_index, None, "gutter-bridging title must not be assigned a column");
}
// Short digit strings and lowercase roman letters count as labels; ordinary
// words and strings longer than four characters do not.
#[test]
fn marginal_label_detects_short_numerals() {
assert!(is_marginal_label("12"));
assert!(is_marginal_label("iv"));
assert!(!is_marginal_label("Genesis"));
assert!(!is_marginal_label("12345")); }
// A span with a heading level and a plain span end up in separate regions
// with the heading and body roles respectively.
#[test]
fn heading_and_body_roles_assigned() {
let mut h = span("Title", 100.0, 700.0, 80.0);
h.heading_level = Some(1);
let b = span("Body text here", 100.0, 680.0, 120.0);
let page = build_structured_page(0, 612.0, 792.0, vec![h, b]);
assert_eq!(page.regions.len(), 2);
assert_eq!(page.regions[0].kind, RegionRole::StructuralHeading { level: 1 });
assert_eq!(page.regions[1].kind, RegionRole::BodyBlock);
}
// Two spans on each side of a wide gap: automatic detection must assign both
// column indices.
#[test]
fn two_column_body_gets_column_indices() {
let spans = vec![
span("left one", 60.0, 700.0, 120.0),
span("left two", 60.0, 680.0, 120.0),
span("right one", 360.0, 700.0, 120.0),
span("right two", 360.0, 680.0, 120.0),
];
let page = build_structured_page(0, 612.0, 792.0, spans);
let cols: Vec<Option<usize>> = page.regions.iter().map(|r| r.column_index).collect();
assert!(cols.contains(&Some(0)), "a left column (0) must be assigned: {cols:?}");
assert!(cols.contains(&Some(1)), "a right column (1) must be assigned: {cols:?}");
}
// Six rows of short, word-sized spans (a number and two words per column) on
// a 432-wide page: both columns must still be detected.
#[test]
fn narrow_gutter_short_line_columns_get_indices() {
let pw = 432.0;
let mut spans = Vec::new();
let mut y = 700.0;
for row in 0..6 {
spans.push(span(&format!("{}", row + 1), 36.0, y, 8.0));
spans.push(span("Au", 52.0, y, 26.0));
spans.push(span("commencement", 84.0, y, 110.0));
spans.push(span(&format!("{}", row + 14), 226.0, y, 12.0));
spans.push(span("Et", 244.0, y, 22.0));
spans.push(span("Dieu", 272.0, y, 40.0));
y -= 14.0;
}
let page = build_structured_page(0, pw, 792.0, spans);
let cols: Vec<Option<usize>> = page.regions.iter().map(|r| r.column_index).collect();
assert!(cols.contains(&Some(0)), "left column (0) not assigned: {cols:?}");
assert!(cols.contains(&Some(1)), "right column (1) not assigned: {cols:?}");
}
// With only two spans, gutter detection gives up (it needs at least four), so
// Auto assigns no columns while Two falls back to splitting at the page middle.
#[test]
fn column_mode_two_forces_split_when_auto_rejects() {
let pw = 432.0;
let spans = vec![
span("left body", 40.0, 700.0, 100.0), span("right body", 250.0, 700.0, 100.0), ];
let auto = build_structured_page_with_mode(0, pw, 792.0, spans.clone(), ColumnMode::Auto);
assert!(
auto.regions.iter().all(|r| r.column_index.is_none()),
"Auto must not split this sparse layout: {:?}",
auto.regions
.iter()
.map(|r| r.column_index)
.collect::<Vec<_>>()
);
let two = build_structured_page_with_mode(0, pw, 792.0, spans, ColumnMode::Two);
let cols: Vec<Option<usize>> = two.regions.iter().map(|r| r.column_index).collect();
assert!(cols.contains(&Some(0)), "Two must assign a left column: {cols:?}");
assert!(cols.contains(&Some(1)), "Two must assign a right column: {cols:?}");
}
// The same layout that Auto splits into two columns must get no column
// indices at all in Single mode.
#[test]
fn column_mode_single_suppresses_clear_columns() {
let pw = 612.0;
let spans = vec![
span("left one", 60.0, 700.0, 120.0),
span("left two", 60.0, 680.0, 120.0),
span("right one", 360.0, 700.0, 120.0),
span("right two", 360.0, 680.0, 120.0),
];
let auto = build_structured_page_with_mode(0, pw, 792.0, spans.clone(), ColumnMode::Auto);
assert!(auto.regions.iter().any(|r| r.column_index == Some(1)));
let single = build_structured_page_with_mode(0, pw, 792.0, spans, ColumnMode::Single);
assert!(
single.regions.iter().all(|r| r.column_index.is_none()),
"Single must suppress all columns: {:?}",
single
.regions
.iter()
.map(|r| r.column_index)
.collect::<Vec<_>>()
);
}
// Eight full-width prose lines starting at the same x: every line is wider
// than the bridge width, so no gutter is found and no columns are assigned.
#[test]
fn single_column_prose_has_no_gutter() {
let pw = 612.0;
let mut spans = Vec::new();
let mut y = 700.0;
let widths = [430.0, 460.0, 410.0, 470.0, 440.0, 455.0, 425.0, 465.0];
for w in widths {
spans.push(span("a single column prose line of body text", 80.0, y, w));
y -= 14.0;
}
let page = build_structured_page(0, pw, 792.0, spans);
let cols: Vec<Option<usize>> = page.regions.iter().map(|r| r.column_index).collect();
assert!(
cols.iter().all(|c| c.is_none()),
"single-column prose wrongly split into columns: {cols:?}"
);
}
}