use crate::ir::{BBox, Block, Char};
const GUTTER_SIZE: f32 = 2.0;
const GUTTER_MIN_PT: f32 = 12.0;
pub(super) fn detect_columns(chars: &[Char], skip: &[bool]) -> Vec<f32> {
let active: Vec<&Char> = chars.iter().zip(skip).filter(|(_, &s)| !s).map(|(c, _)| c).collect();
if active.len() < 12 {
return Vec::new();
}
let minx = active.iter().map(|c| c.bbox.x0).fold(f32::MAX, f32::min);
let maxx = active.iter().map(|c| c.bbox.x1).fold(f32::MIN, f32::max);
let miny = active.iter().map(|c| c.bbox.y0).fold(f32::MAX, f32::min);
let maxy = active.iter().map(|c| c.bbox.y1).fold(f32::MIN, f32::max);
if maxx <= minx || maxy <= miny {
return Vec::new();
}
let mut sizes: Vec<f32> = active.iter().map(|c| c.size).collect();
sizes.sort_by(f32::total_cmp);
let size = sizes[sizes.len() / 2].max(1.0);
let gutter_min = (GUTTER_SIZE * size).max(GUTTER_MIN_PT);
let band_h = (1.2 * size).max(1.0);
let nrow = ((maxy - miny) / band_h).ceil() as usize + 1;
let nx = (maxx - minx).ceil() as usize + 1;
let mut covered = vec![vec![false; nrow]; nx];
let mut used_rows = vec![false; nrow];
for c in &active {
let cy = (c.bbox.y0 + c.bbox.y1) / 2.0;
let row = (((cy - miny) / band_h) as usize).min(nrow - 1);
used_rows[row] = true;
let a = ((c.bbox.x0 - minx).floor().max(0.0) as usize).min(nx - 1);
let b = ((c.bbox.x1 - minx).ceil() as usize).min(nx - 1);
for x in covered.iter_mut().take(b + 1).skip(a) {
x[row] = true;
}
}
let total_rows = used_rows.iter().filter(|&&u| u).count().max(1);
let thresh = 0.15 * total_rows as f32;
let row_count: Vec<usize> = covered.iter().map(|rows| rows.iter().filter(|&&b| b).count()).collect();
let first = row_count.iter().position(|&c| c as f32 > thresh);
let last = row_count.iter().rposition(|&c| c as f32 > thresh);
let (Some(first), Some(last)) = (first, last) else {
return Vec::new();
};
let mut splits = Vec::new();
let mut k = first;
while k <= last {
if row_count[k] as f32 > thresh {
k += 1;
continue;
}
let start = k;
while k <= last && (row_count[k] as f32) <= thresh {
k += 1;
}
if (k - start) as f32 >= gutter_min {
splits.push(minx + (start + k) as f32 / 2.0);
}
}
splits
}
pub(super) fn column_index(cx: f32, splits: &[f32]) -> usize {
splits.iter().filter(|&&s| cx > s).count()
}
pub(super) fn block_top(b: &Block) -> f32 {
bbox(b).y0
}
pub(super) fn reading_order(mut blocks: Vec<Block>, page_width: f32) -> Vec<Block> {
if blocks.len() <= 1 {
return blocks;
}
if page_width > 0.0 {
let mid = page_width / 2.0;
let margin = 0.05 * page_width;
let spans_mid = blocks.iter().any(|b| {
let bb = bbox(b);
bb.x0 < mid - margin && bb.x1 > mid + margin
});
if !spans_mid {
let (mut left, mut right): (Vec<Block>, Vec<Block>) =
blocks.into_iter().partition(|b| center_x(bbox(b)) < mid);
if !left.is_empty() && !right.is_empty() {
left.sort_by(|a, b| top(a).total_cmp(&top(b)));
right.sort_by(|a, b| top(a).total_cmp(&top(b)));
left.extend(right);
return left;
}
blocks = left.into_iter().chain(right).collect();
}
}
blocks.sort_by(|a, b| top(a).total_cmp(&top(b)));
blocks
}
fn bbox(b: &Block) -> BBox {
match b {
Block::Paragraph(p) => p.bbox,
Block::Table(t) => t.bbox,
Block::Image(i) => i.bbox,
}
}
fn center_x(b: BBox) -> f32 {
(b.x0 + b.x1) / 2.0
}
fn top(b: &Block) -> f32 {
bbox(b).y0
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ir::Paragraph;
fn para(label: &str, x0: f32, x1: f32, y0: f32) -> Block {
Block::Paragraph(Paragraph {
bbox: BBox { x0, y0, x1, y1: y0 + 10.0 },
text: label.into(),
heading_level: None, role: None,
})
}
fn text(b: &Block) -> &str {
match b {
Block::Paragraph(p) => &p.text,
_ => "",
}
}
#[test]
fn two_columns_ordered_left_then_right() {
let blocks = vec![
para("R-top", 300.0, 500.0, 0.0),
para("L-bottom", 0.0, 200.0, 50.0),
para("L-top", 0.0, 200.0, 0.0),
para("R-bottom", 300.0, 500.0, 50.0),
];
let ordered = reading_order(blocks, 500.0);
let got: Vec<&str> = ordered.iter().map(text).collect();
assert_eq!(got, vec!["L-top", "L-bottom", "R-top", "R-bottom"]);
}
#[test]
fn full_width_block_stays_single_column() {
let blocks = vec![
para("body", 0.0, 200.0, 50.0),
para("heading", 0.0, 500.0, 0.0), ];
let ordered = reading_order(blocks, 500.0);
let got: Vec<&str> = ordered.iter().map(text).collect();
assert_eq!(got, vec!["heading", "body"]); }
#[test]
fn single_block_unchanged() {
let blocks = vec![para("only", 0.0, 200.0, 0.0)];
assert_eq!(reading_order(blocks, 500.0).len(), 1);
}
fn glyph(x: f32, baseline: f32) -> Char {
Char {
text: "x".into(),
bbox: BBox { x0: x, y0: baseline - 10.0, x1: x + 6.0, y1: baseline },
font: crate::ir::FontRef { name: "F".into() },
size: 10.0,
color: None,
}
}
#[test]
fn detects_two_columns_despite_full_width_header() {
let mut chars = Vec::new();
for x in (0..200).step_by(6) {
chars.push(glyph(x as f32, 0.0)); }
for row in 1..15 {
let y = row as f32 * 12.0;
for x in (0..80).step_by(6) {
chars.push(glyph(x as f32, y));
}
for x in (120..200).step_by(6) {
chars.push(glyph(x as f32, y));
}
}
let skip = vec![false; chars.len()];
let splits = detect_columns(&chars, &skip);
assert_eq!(splits.len(), 1, "expected one gutter, got {splits:?}");
assert!(splits[0] > 80.0 && splits[0] < 120.0, "gutter at {}", splits[0]);
}
#[test]
fn single_column_has_no_splits() {
let mut chars = Vec::new();
for row in 0..15 {
for x in (0..200).step_by(6) {
chars.push(glyph(x as f32, row as f32 * 12.0));
}
}
let skip = vec![false; chars.len()];
assert!(detect_columns(&chars, &skip).is_empty());
}
}