use crate::models::bbox::BoundingBox;
use crate::models::content::ContentElement;
pub fn xycut_sort(elements: &mut [ContentElement], page_bbox: &BoundingBox) {
if elements.len() <= 1 {
return;
}
xycut_recursive(elements, page_bbox);
}
fn xycut_recursive(elements: &mut [ContentElement], region: &BoundingBox) {
if elements.len() <= 1 {
return;
}
let h_gap = find_horizontal_gap_size(elements);
let v_gap = find_vertical_gap_size(elements);
let vertical_analysis = v_gap.and_then(|(split_x, v_size)| {
analyze_vertical_cut(
elements,
split_x,
h_gap.map(|(_, gap)| gap).unwrap_or(0.0),
v_size,
)
.map(|analysis| (split_x, analysis))
});
if std::env::var("XYCUT_DEBUG").is_ok() {
eprintln!(
"[XYCUT] n={} h_gap={:?} v_gap={:?}",
elements.len(),
h_gap.map(|(y, g)| format!("y={:.1} gap={:.1}", y, g)),
v_gap.map(|(x, g)| format!("x={:.1} gap={:.1}", x, g))
);
for e in elements.iter() {
let b = e.bbox();
eprintln!(
" el: l={:.1} r={:.1} top={:.1} bot={:.1}",
b.left_x, b.right_x, b.top_y, b.bottom_y
);
}
}
let prefer_vertical = vertical_analysis.is_some();
if prefer_vertical {
if let Some((split_x, analysis)) = vertical_analysis.as_ref() {
if reorder_vertical_bands(elements, *split_x, analysis, region) {
return;
}
}
if let Some((split_x, _)) = v_gap {
let (left, right) = partition_by_x(elements, split_x);
if !left.is_empty() && !right.is_empty() {
xycut_recursive(left, region);
xycut_recursive(right, region);
return;
}
}
if let Some((split_y, _)) = h_gap {
let (top, bottom) = partition_by_y(elements, split_y);
if !top.is_empty() && !bottom.is_empty() {
xycut_recursive(top, region);
xycut_recursive(bottom, region);
return;
}
}
} else {
if let Some((split_y, _)) = h_gap {
let (top, bottom) = partition_by_y(elements, split_y);
if !top.is_empty() && !bottom.is_empty() {
xycut_recursive(top, region);
xycut_recursive(bottom, region);
return;
}
}
if let Some((split_x, _)) = v_gap {
let (left, right) = partition_by_x(elements, split_x);
if !left.is_empty() && !right.is_empty() {
xycut_recursive(left, region);
xycut_recursive(right, region);
return;
}
}
}
const BUCKET_PT: f64 = 4.0;
elements.sort_by_key(|e| {
let y_bucket = -((e.bbox().top_y / BUCKET_PT).round() as i64); let x_key = (e.bbox().left_x * 10.0).round() as i64; (y_bucket, x_key)
});
}
#[derive(Debug, Clone, Copy)]
struct VerticalCutAnalysis {
shared_top: f64,
shared_bottom: f64,
}
fn analyze_vertical_cut(
elements: &[ContentElement],
split_x: f64,
horizontal_gap: f64,
vertical_gap: f64,
) -> Option<VerticalCutAnalysis> {
if elements.len() < 4 {
return None;
}
let min_x = elements
.iter()
.map(|e| e.bbox().left_x)
.fold(f64::INFINITY, f64::min);
let max_x = elements
.iter()
.map(|e| e.bbox().right_x)
.fold(f64::NEG_INFINITY, f64::max);
let content_width = (max_x - min_x).max(1.0);
let spanning_width = content_width * 0.55;
let split_margin = 6.0;
let band_tolerance = 8.0;
let mut left_count = 0usize;
let mut right_count = 0usize;
let mut left_top = f64::NEG_INFINITY;
let mut left_bottom = f64::INFINITY;
let mut right_top = f64::NEG_INFINITY;
let mut right_bottom = f64::INFINITY;
let mut crossing_bands: Vec<(f64, f64)> = Vec::new();
for elem in elements {
let bbox = elem.bbox();
let width = bbox.right_x - bbox.left_x;
let crosses_split =
bbox.left_x < split_x - split_margin && bbox.right_x > split_x + split_margin;
if crosses_split && width >= spanning_width {
crossing_bands.push((bbox.top_y, bbox.bottom_y));
continue;
}
if bbox.center_x() < split_x {
left_count += 1;
left_top = left_top.max(bbox.top_y);
left_bottom = left_bottom.min(bbox.bottom_y);
} else {
right_count += 1;
right_top = right_top.max(bbox.top_y);
right_bottom = right_bottom.min(bbox.bottom_y);
}
}
if left_count < 2 || right_count < 2 {
return None;
}
let left_height = (left_top - left_bottom).max(0.0);
let right_height = (right_top - right_bottom).max(0.0);
if left_height <= 0.0 || right_height <= 0.0 {
return None;
}
let overlap = (left_top.min(right_top) - left_bottom.max(right_bottom)).max(0.0);
let overlap_ratio = overlap / left_height.min(right_height);
if overlap_ratio < 0.35 || vertical_gap < horizontal_gap * 0.5 {
return None;
}
let shared_top = left_top.min(right_top);
let shared_bottom = left_bottom.max(right_bottom);
if shared_top <= shared_bottom {
return None;
}
let ambiguous_crossing = crossing_bands
.iter()
.filter(|(top_y, bottom_y)| {
*bottom_y < shared_top - band_tolerance && *top_y > shared_bottom + band_tolerance
})
.count();
if ambiguous_crossing > 0 {
return None;
}
Some(VerticalCutAnalysis {
shared_top,
shared_bottom,
})
}
fn reorder_vertical_bands(
elements: &mut [ContentElement],
split_x: f64,
analysis: &VerticalCutAnalysis,
region: &BoundingBox,
) -> bool {
const SPLIT_MARGIN: f64 = 6.0;
const BAND_TOLERANCE: f64 = 8.0;
let mut top_spanning = Vec::new();
let mut left = Vec::new();
let mut right = Vec::new();
let mut bottom_spanning = Vec::new();
for element in elements.iter() {
let bbox = element.bbox();
let crosses_split =
bbox.left_x < split_x - SPLIT_MARGIN && bbox.right_x > split_x + SPLIT_MARGIN;
if crosses_split {
if bbox.bottom_y >= analysis.shared_top - BAND_TOLERANCE {
top_spanning.push(element.clone());
continue;
}
if bbox.top_y <= analysis.shared_bottom + BAND_TOLERANCE {
bottom_spanning.push(element.clone());
continue;
}
return false;
}
if bbox.center_x() < split_x {
left.push(element.clone());
} else {
right.push(element.clone());
}
}
if left.is_empty() || right.is_empty() {
return false;
}
if top_spanning.len() > 1 {
xycut_recursive(top_spanning.as_mut_slice(), region);
}
xycut_recursive(left.as_mut_slice(), region);
xycut_recursive(right.as_mut_slice(), region);
if bottom_spanning.len() > 1 {
xycut_recursive(bottom_spanning.as_mut_slice(), region);
}
let mut ordered = Vec::with_capacity(elements.len());
ordered.extend(top_spanning);
ordered.extend(left);
ordered.extend(right);
ordered.extend(bottom_spanning);
if ordered.len() != elements.len() {
return false;
}
for (dst, src) in elements.iter_mut().zip(ordered.into_iter()) {
*dst = src;
}
true
}
const MIN_GAP_THRESHOLD: f64 = 5.0;
fn find_horizontal_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
if elements.len() < 2 {
return None;
}
let mut sorted: Vec<(f64, f64)> = elements
.iter()
.map(|e| (e.bbox().bottom_y, e.bbox().top_y))
.collect();
sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let mut best_gap = 0.0f64;
let mut best_y = None;
let mut prev_bottom = sorted[0].0;
for &(bottom, top) in &sorted[1..] {
if prev_bottom > top {
let gap = prev_bottom - top;
if gap > best_gap && gap > MIN_GAP_THRESHOLD {
best_gap = gap;
best_y = Some((prev_bottom + top) / 2.0);
}
}
prev_bottom = prev_bottom.min(bottom); }
best_y.map(|y| (y, best_gap))
}
fn find_vertical_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
if elements.len() < 2 {
return None;
}
let min_x = elements
.iter()
.map(|e| e.bbox().left_x)
.fold(f64::INFINITY, f64::min);
let max_x = elements
.iter()
.map(|e| e.bbox().right_x)
.fold(f64::NEG_INFINITY, f64::max);
let content_width = (max_x - min_x).max(1.0);
let span_threshold = content_width * 0.70;
const BBOX_SHRINK: f64 = 3.0;
let mut intervals: Vec<(f64, f64)> = elements
.iter()
.filter_map(|e| {
let b = e.bbox();
let w = b.right_x - b.left_x;
if w > span_threshold {
None
} else {
let eff_left = b.left_x + BBOX_SHRINK;
let eff_right = b.right_x - BBOX_SHRINK;
if eff_right > eff_left {
Some((eff_left, eff_right))
} else {
None }
}
})
.collect();
if intervals.len() < 2 {
intervals = elements
.iter()
.filter_map(|e| {
let b = e.bbox();
let eff_left = b.left_x + BBOX_SHRINK;
let eff_right = b.right_x - BBOX_SHRINK;
if eff_right > eff_left {
Some((eff_left, eff_right))
} else {
None
}
})
.collect();
}
intervals.sort_by(|a, b| {
a.0.partial_cmp(&b.0)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
});
let mut merged: Vec<(f64, f64)> = Vec::new();
let merge_tolerance = 0.5; for (left, right) in intervals {
if let Some(last) = merged.last_mut() {
if left <= last.1 + merge_tolerance {
last.1 = last.1.max(right);
continue;
}
}
merged.push((left, right));
}
let mut best_gap = 0.0f64;
let mut best_x = None;
for w in merged.windows(2) {
let gap = w[1].0 - w[0].1;
if gap > best_gap && gap > MIN_GAP_THRESHOLD {
best_gap = gap;
best_x = Some((w[0].1 + w[1].0) / 2.0);
}
}
if best_x.is_none() {
const MIN_CENTER_GAP: f64 = 20.0;
const MIN_COL_ELEMENT_WIDTH: f64 = 50.0;
let mut centers: Vec<f64> = elements
.iter()
.filter_map(|e| {
let b = e.bbox();
let w = b.right_x - b.left_x;
if w > span_threshold || w < MIN_COL_ELEMENT_WIDTH {
None } else {
Some((b.left_x + b.right_x) * 0.5)
}
})
.collect();
if centers.len() >= 2 {
centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mut candidate_x: Option<f64> = None;
let mut candidate_gap = 0.0f64;
for pair in centers.windows(2) {
let gap = pair[1] - pair[0];
if gap > candidate_gap && gap >= MIN_CENTER_GAP {
candidate_gap = gap;
candidate_x = Some((pair[0] + pair[1]) * 0.5);
}
}
if let Some(sx) = candidate_x {
let right_min_left_x = elements
.iter()
.filter(|e| {
let b = e.bbox();
(b.left_x + b.right_x) * 0.5 >= sx
})
.map(|e| e.bbox().left_x)
.fold(f64::INFINITY, f64::min);
if std::env::var("XYCUT_DEBUG").is_ok() {
eprintln!("[XYCUT] center_x candidate_x={:.1} candidate_gap={:.1} right_min_left_x={:.1} threshold={:.1} pass={}",
sx, candidate_gap, right_min_left_x, sx * 0.85, right_min_left_x >= sx * 0.85);
}
if right_min_left_x >= sx * 0.85 {
best_x = candidate_x;
best_gap = candidate_gap;
}
}
}
}
best_x.map(|x| (x, best_gap))
}
fn partition_by_y(
elements: &mut [ContentElement],
split_y: f64,
) -> (&mut [ContentElement], &mut [ContentElement]) {
let pivot = itertools_partition(elements, |e| e.bbox().center_y() >= split_y);
elements.split_at_mut(pivot)
}
fn partition_by_x(
elements: &mut [ContentElement],
split_x: f64,
) -> (&mut [ContentElement], &mut [ContentElement]) {
let pivot = itertools_partition(elements, |e| e.bbox().center_x() < split_x);
elements.split_at_mut(pivot)
}
fn itertools_partition<T, F>(data: &mut [T], pred: F) -> usize
where
F: Fn(&T) -> bool,
{
let mut pivot = 0;
for i in 0..data.len() {
if pred(&data[i]) {
data.swap(i, pivot);
pivot += 1;
}
}
pivot
}
#[cfg(test)]
mod tests {
use super::*;
use crate::models::chunks::ImageChunk;
fn make_element(left: f64, bottom: f64, right: f64, top: f64) -> ContentElement {
ContentElement::Image(ImageChunk {
bbox: BoundingBox::new(Some(1), left, bottom, right, top),
index: None,
level: None,
})
}
#[test]
fn test_xycut_sort_single() {
let mut elements = vec![make_element(0.0, 0.0, 100.0, 50.0)];
xycut_sort(
&mut elements,
&BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0),
);
assert_eq!(elements.len(), 1);
}
#[test]
fn test_xycut_sort_two_rows() {
let mut elements = vec![
make_element(50.0, 100.0, 200.0, 150.0), make_element(50.0, 500.0, 200.0, 550.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
}
#[test]
fn test_xycut_sort_two_columns_left_first() {
let mut elements = vec![
make_element(300.0, 500.0, 500.0, 550.0), make_element(50.0, 500.0, 250.0, 550.0), make_element(300.0, 400.0, 500.0, 450.0), make_element(50.0, 400.0, 250.0, 450.0), make_element(300.0, 300.0, 500.0, 350.0), make_element(50.0, 300.0, 250.0, 350.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(
elements[0].bbox().left_x < 260.0,
"First element should be left column"
);
assert!(
elements[1].bbox().left_x < 260.0,
"Second element should be left column"
);
assert!(
elements[2].bbox().left_x < 260.0,
"Third element should be left column"
);
assert!(
elements[3].bbox().left_x > 260.0,
"Fourth element should be right column"
);
assert!(
elements[4].bbox().left_x > 260.0,
"Fifth element should be right column"
);
assert!(
elements[5].bbox().left_x > 260.0,
"Sixth element should be right column"
);
assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
assert!(elements[1].bbox().top_y > elements[2].bbox().top_y);
assert!(elements[3].bbox().top_y > elements[4].bbox().top_y);
assert!(elements[4].bbox().top_y > elements[5].bbox().top_y);
}
#[test]
fn test_xycut_sort_header_then_two_columns() {
let mut elements = vec![
make_element(50.0, 700.0, 500.0, 750.0), make_element(300.0, 500.0, 500.0, 550.0), make_element(50.0, 500.0, 250.0, 550.0), make_element(300.0, 400.0, 500.0, 450.0), make_element(50.0, 400.0, 250.0, 450.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(
elements[0].bbox().top_y >= 750.0,
"Header should come first"
);
assert!(
elements[1].bbox().left_x < 260.0,
"Left col should come after header"
);
assert!(elements[2].bbox().left_x < 260.0);
assert!(
elements[3].bbox().left_x > 260.0,
"Right col should come last"
);
assert!(elements[4].bbox().left_x > 260.0);
}
#[test]
fn test_projection_gap_with_overlapping_elements() {
let mut elements = vec![
make_element(50.0, 500.0, 250.0, 550.0), make_element(60.0, 400.0, 240.0, 450.0), make_element(310.0, 500.0, 500.0, 550.0), make_element(310.0, 400.0, 500.0, 450.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(elements[0].bbox().left_x < 260.0);
assert!(elements[1].bbox().left_x < 260.0);
assert!(elements[2].bbox().left_x > 260.0);
assert!(elements[3].bbox().left_x > 260.0);
}
#[test]
fn test_xycut_prefers_columns_for_staggered_two_column_layout() {
let mut elements = vec![
make_element(50.0, 680.0, 250.0, 740.0), make_element(50.0, 420.0, 250.0, 660.0), make_element(320.0, 600.0, 520.0, 760.0), make_element(320.0, 360.0, 520.0, 580.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(elements[0].bbox().left_x < 260.0);
assert!(elements[1].bbox().left_x < 260.0);
assert!(elements[2].bbox().left_x > 260.0);
assert!(elements[3].bbox().left_x > 260.0);
}
#[test]
fn test_xycut_keeps_spanning_header_and_footer_outside_columns() {
let mut elements = vec![
make_element(40.0, 760.0, 540.0, 810.0), make_element(50.0, 640.0, 250.0, 700.0), make_element(50.0, 520.0, 250.0, 620.0), make_element(320.0, 640.0, 520.0, 700.0), make_element(320.0, 520.0, 520.0, 620.0), make_element(40.0, 430.0, 540.0, 480.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(
elements[0].bbox().top_y >= 800.0,
"header should stay first"
);
assert!(elements[1].bbox().left_x < 260.0);
assert!(elements[2].bbox().left_x < 260.0);
assert!(elements[3].bbox().left_x > 260.0);
assert!(elements[4].bbox().left_x > 260.0);
assert!(elements[5].bbox().top_y <= 480.0, "footer should stay last");
}
#[test]
fn test_xycut_rejects_vertical_cut_when_spanning_band_sits_between_columns() {
let mut elements = vec![
make_element(50.0, 700.0, 250.0, 760.0), make_element(320.0, 700.0, 520.0, 760.0), make_element(40.0, 610.0, 540.0, 680.0), make_element(50.0, 500.0, 250.0, 580.0), make_element(320.0, 500.0, 520.0, 580.0), ];
let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
xycut_sort(&mut elements, &page);
assert!(elements[0].bbox().top_y >= 760.0);
assert!(elements[1].bbox().top_y >= 760.0);
assert!(elements[2].bbox().top_y >= 680.0 && elements[2].bbox().bottom_y <= 610.0);
assert!(elements[3].bbox().top_y <= 580.0);
assert!(elements[4].bbox().top_y <= 580.0);
}
}