use super::{ReadingOrderContext, ReadingOrderStrategy};
use crate::error::Result;
use crate::layout::TextSpan;
use crate::pipeline::{OrderedTextSpan, ReadingOrderInfo};
const MAX_PROJECTION_SIZE: usize = 100_000;
pub struct XYCutStrategy {
pub min_spans_for_split: usize,
pub valley_threshold: f32,
pub min_valley_width: f32,
pub prefer_horizontal: bool,
}
impl Default for XYCutStrategy {
fn default() -> Self {
Self {
min_spans_for_split: 5,
valley_threshold: 0.3,
min_valley_width: 15.0,
prefer_horizontal: true,
}
}
}
impl XYCutStrategy {
pub fn new() -> Self {
Self::default()
}
pub fn with_valley_threshold(mut self, threshold: f32) -> Self {
self.valley_threshold = threshold.clamp(0.0, 1.0);
self
}
pub fn with_min_valley_width(mut self, width: f32) -> Self {
self.min_valley_width = width.max(1.0);
self
}
pub fn partition_region(&self, spans: &[TextSpan]) -> Vec<Vec<TextSpan>> {
let indices: Vec<usize> = (0..spans.len()).collect();
let index_groups = self.partition_indexed(spans, &indices);
index_groups
.into_iter()
.map(|group| group.into_iter().map(|i| spans[i].clone()).collect())
.collect()
}
fn partition_indexed(&self, all_spans: &[TextSpan], indices: &[usize]) -> Vec<Vec<usize>> {
if indices.is_empty() {
return Vec::new();
}
if indices.len() < self.min_spans_for_split {
return vec![self.sort_indices(all_spans, indices)];
}
if let Some((left, right)) = self.find_horizontal_split_indexed(all_spans, indices) {
let mut result = self.partition_indexed(all_spans, &left);
result.extend(self.partition_indexed(all_spans, &right));
return result;
}
if let Some((top, bottom)) = self.find_vertical_split_indexed(all_spans, indices) {
let mut result = self.partition_indexed(all_spans, &top);
result.extend(self.partition_indexed(all_spans, &bottom));
return result;
}
vec![self.sort_indices(all_spans, indices)]
}
fn find_horizontal_split_indexed(
&self,
all_spans: &[TextSpan],
indices: &[usize],
) -> Option<(Vec<usize>, Vec<usize>)> {
let profile = self.horizontal_projection_indexed(all_spans, indices)?;
let (valley_start, valley_end, valley_width) = self.find_valley(&profile)?;
if valley_width < self.min_valley_width {
return None;
}
let split_x = profile.x_min + (valley_start + valley_end) as f32 / 2.0;
let (left, right): (Vec<usize>, Vec<usize>) = indices
.iter()
.partition(|&&i| all_spans[i].bbox.right() <= split_x);
if left.is_empty() || right.is_empty() {
return None;
}
Some((left, right))
}
fn find_vertical_split_indexed(
&self,
all_spans: &[TextSpan],
indices: &[usize],
) -> Option<(Vec<usize>, Vec<usize>)> {
let profile = self.vertical_projection_indexed(all_spans, indices)?;
let (valley_start, valley_end, valley_width) = self.find_valley(&profile)?;
if valley_width < self.min_valley_width {
return None;
}
let split_y = profile.y_min + (valley_start + valley_end) as f32 / 2.0;
let (top, bottom): (Vec<usize>, Vec<usize>) = indices
.iter()
.partition(|&&i| all_spans[i].bbox.top() <= split_y);
if top.is_empty() || bottom.is_empty() {
return None;
}
Some((top, bottom))
}
fn horizontal_projection_indexed(
&self,
all_spans: &[TextSpan],
indices: &[usize],
) -> Option<ProjectionProfile> {
if indices.is_empty() {
return None;
}
let mut x_min = f32::MAX;
let mut x_max = f32::MIN;
let mut y_min = f32::MAX;
let mut y_max = f32::MIN;
for &i in indices {
let span = &all_spans[i];
x_min = x_min.min(span.bbox.left());
x_max = x_max.max(span.bbox.right());
y_min = y_min.min(span.bbox.top());
y_max = y_max.max(span.bbox.bottom());
}
let width = (x_max - x_min).ceil() as usize;
if width > MAX_PROJECTION_SIZE {
log::warn!(
"XY-cut: horizontal projection width {} exceeds MAX_PROJECTION_SIZE {}, skipping region (degenerate CTM?)",
width,
MAX_PROJECTION_SIZE
);
return None;
}
let mut density = vec![0.0; width];
for &i in indices {
let span = &all_spans[i];
let x_start = (span.bbox.left() - x_min).max(0.0).ceil() as usize;
let x_end = (span.bbox.right() - x_min).ceil() as usize;
let height = span.bbox.bottom() - span.bbox.top();
for j in x_start..x_end.min(width) {
density[j] += height;
}
}
Some(ProjectionProfile {
density,
x_min,
y_min,
})
}
fn vertical_projection_indexed(
&self,
all_spans: &[TextSpan],
indices: &[usize],
) -> Option<ProjectionProfile> {
if indices.is_empty() {
return None;
}
let mut x_min = f32::MAX;
let mut x_max = f32::MIN;
let mut y_min = f32::MAX;
let mut y_max = f32::MIN;
for &i in indices {
let span = &all_spans[i];
x_min = x_min.min(span.bbox.left());
x_max = x_max.max(span.bbox.right());
y_min = y_min.min(span.bbox.top());
y_max = y_max.max(span.bbox.bottom());
}
let height = (y_max - y_min).ceil() as usize;
if height > MAX_PROJECTION_SIZE {
log::warn!(
"XY-cut: vertical projection height {} exceeds MAX_PROJECTION_SIZE {}, skipping region (degenerate CTM?)",
height,
MAX_PROJECTION_SIZE
);
return None;
}
let mut density = vec![0.0; height];
for &i in indices {
let span = &all_spans[i];
let y_start = (span.bbox.top() - y_min).max(0.0).ceil() as usize;
let y_end = (span.bbox.bottom() - y_min).ceil() as usize;
let w = span.bbox.right() - span.bbox.left();
for j in y_start..y_end.min(height) {
density[j] += w;
}
}
Some(ProjectionProfile {
density,
x_min,
y_min,
})
}
fn find_valley(&self, profile: &ProjectionProfile) -> Option<(usize, usize, f32)> {
if profile.density.is_empty() {
return None;
}
let peak = profile.density.iter().copied().fold(0.0, f32::max);
if peak == 0.0 {
return None;
}
let threshold = peak * self.valley_threshold;
let mut valleys = Vec::new();
let mut in_valley = false;
let mut valley_start = 0;
for (i, &density) in profile.density.iter().enumerate() {
if density < threshold {
if !in_valley {
valley_start = i;
in_valley = true;
}
} else if in_valley {
valleys.push((valley_start, i));
in_valley = false;
}
}
if in_valley {
valleys.push((valley_start, profile.density.len()));
}
valleys
.into_iter()
.map(|(start, end)| (start, end, (end - start) as f32))
.max_by(|a, b| crate::utils::safe_float_cmp(a.2, b.2))
}
#[cfg(test)]
fn horizontal_projection(&self, spans: &[TextSpan]) -> Option<ProjectionProfile> {
let indices: Vec<usize> = (0..spans.len()).collect();
self.horizontal_projection_indexed(spans, &indices)
}
#[cfg(test)]
fn vertical_projection(&self, spans: &[TextSpan]) -> Option<ProjectionProfile> {
let indices: Vec<usize> = (0..spans.len()).collect();
self.vertical_projection_indexed(spans, &indices)
}
#[cfg(test)]
fn sort_spans<'a>(&self, spans: &'a [TextSpan]) -> Vec<&'a TextSpan> {
let mut sorted: Vec<_> = spans.iter().collect();
sorted.sort_by(|a, b| {
let y_cmp = crate::utils::safe_float_cmp(b.bbox.top(), a.bbox.top());
if y_cmp != std::cmp::Ordering::Equal {
return y_cmp;
}
crate::utils::safe_float_cmp(a.bbox.left(), b.bbox.left())
});
sorted
}
fn sort_indices(&self, all_spans: &[TextSpan], indices: &[usize]) -> Vec<usize> {
let mut sorted: Vec<usize> = indices.to_vec();
sorted.sort_by(|&a, &b| {
let y_cmp =
crate::utils::safe_float_cmp(all_spans[b].bbox.top(), all_spans[a].bbox.top());
if y_cmp != std::cmp::Ordering::Equal {
return y_cmp;
}
crate::utils::safe_float_cmp(all_spans[a].bbox.left(), all_spans[b].bbox.left())
});
sorted
}
}
struct ProjectionProfile {
density: Vec<f32>,
x_min: f32,
y_min: f32,
}
impl ReadingOrderStrategy for XYCutStrategy {
fn apply(
&self,
spans: Vec<TextSpan>,
_context: &ReadingOrderContext,
) -> Result<Vec<OrderedTextSpan>> {
let indices: Vec<usize> = (0..spans.len()).collect();
let index_groups = self.partition_indexed(&spans, &indices);
let mut ordered = Vec::with_capacity(spans.len());
let mut span_slots: Vec<Option<TextSpan>> = spans.into_iter().map(Some).collect();
let mut order_index = 0usize;
for (group_idx, group) in index_groups.iter().enumerate() {
for &i in group {
if let Some(span) = span_slots[i].take() {
ordered.push(
OrderedTextSpan::with_info(span, order_index, ReadingOrderInfo::xycut())
.with_group(group_idx),
);
order_index += 1;
}
}
}
Ok(ordered)
}
fn name(&self) -> &'static str {
"XYCutStrategy"
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::geometry::Rect;
fn make_span(x: f32, y: f32, width: f32, height: f32) -> TextSpan {
use crate::layout::{Color, FontWeight};
TextSpan {
artifact_type: None,
text: "test".to_string(),
bbox: Rect::new(x, y, width, height),
font_size: 12.0,
font_name: "Arial".to_string(),
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: Color {
r: 0.0,
g: 0.0,
b: 0.0,
},
mcid: None,
sequence: 0,
split_boundary_before: false,
offset_semantic: false,
char_spacing: 0.0,
word_spacing: 0.0,
horizontal_scaling: 100.0,
primary_detected: false,
char_widths: vec![],
}
}
#[test]
fn test_single_column_no_split() {
let strategy = XYCutStrategy::new();
let spans = vec![
make_span(10.0, 100.0, 50.0, 10.0), make_span(10.0, 85.0, 50.0, 10.0), make_span(10.0, 70.0, 50.0, 10.0), ];
let groups = strategy.partition_region(&spans);
assert_eq!(groups.len(), 1); assert_eq!(groups[0].len(), 3);
}
#[test]
fn test_two_column_split() {
let mut strategy = XYCutStrategy::new();
strategy.min_spans_for_split = 2;
let spans = vec![
make_span(10.0, 100.0, 50.0, 10.0),
make_span(10.0, 85.0, 50.0, 10.0),
make_span(100.0, 100.0, 50.0, 10.0),
make_span(100.0, 85.0, 50.0, 10.0),
];
let groups = strategy.partition_region(&spans);
assert!(!groups.is_empty(), "Expected at least 1 group");
let total_spans: usize = groups.iter().map(|g| g.len()).sum();
assert_eq!(total_spans, 4, "Expected all 4 spans to be preserved");
}
#[test]
fn test_three_column_layout() {
let strategy = XYCutStrategy::new();
let spans = vec![
make_span(10.0, 100.0, 30.0, 10.0),
make_span(10.0, 85.0, 30.0, 10.0),
make_span(70.0, 100.0, 30.0, 10.0),
make_span(70.0, 85.0, 30.0, 10.0),
make_span(130.0, 100.0, 30.0, 10.0),
make_span(130.0, 85.0, 30.0, 10.0),
];
let groups = strategy.partition_region(&spans);
assert!(groups.len() >= 2, "Expected at least 2 groups, got {}", groups.len());
}
#[test]
fn test_small_region_no_split() {
let strategy = XYCutStrategy::new();
let spans = vec![make_span(10.0, 100.0, 50.0, 10.0)];
let groups = strategy.partition_region(&spans);
assert_eq!(groups.len(), 1); assert_eq!(groups[0].len(), 1);
}
#[test]
fn test_sort_order() {
let strategy = XYCutStrategy::new();
let spans = vec![
make_span(100.0, 70.0, 50.0, 10.0), make_span(10.0, 100.0, 50.0, 10.0), make_span(100.0, 100.0, 50.0, 10.0), make_span(10.0, 70.0, 50.0, 10.0), ];
let sorted = strategy.sort_spans(&spans);
assert_eq!(sorted[0].bbox.top(), 100.0); assert_eq!(sorted[0].bbox.left(), 10.0); assert_eq!(sorted[1].bbox.top(), 100.0); assert_eq!(sorted[1].bbox.left(), 100.0); }
#[test]
fn test_horizontal_projection() {
let strategy = XYCutStrategy::new();
let spans = vec![
make_span(10.0, 100.0, 30.0, 10.0), make_span(100.0, 100.0, 30.0, 10.0), ];
if let Some(profile) = strategy.horizontal_projection(&spans) {
assert!(!profile.density.is_empty());
assert!(profile.density.len() >= 120);
let gap_start = 30;
let gap_end = 90;
if gap_end <= profile.density.len() {
let gap_region = &profile.density[gap_start..gap_end];
let gap_density: f32 = gap_region.iter().sum();
assert!(gap_density < 1.0); }
}
}
#[test]
fn test_vertical_projection() {
let strategy = XYCutStrategy::new();
let spans = vec![
make_span(10.0, 100.0, 50.0, 20.0), make_span(10.0, 50.0, 50.0, 20.0), ];
if let Some(profile) = strategy.vertical_projection(&spans) {
assert!(!profile.density.is_empty());
assert!(profile.density.len() > 50);
}
}
#[test]
fn test_narrow_gap_rejected() {
let strategy = XYCutStrategy::new();
let spans = vec![
make_span(10.0, 100.0, 30.0, 10.0), make_span(45.0, 100.0, 30.0, 10.0), ];
let groups = strategy.partition_region(&spans);
assert_eq!(groups.len(), 1);
}
#[test]
fn test_degenerate_ctm_horizontal_projection_returns_none() {
let strategy = XYCutStrategy::new();
let degenerate_x: f32 = 99_992_777_785_344.0;
let spans = vec![
make_span(10.0, 100.0, 30.0, 10.0),
make_span(degenerate_x, 100.0, 30.0, 10.0),
];
let result = strategy.horizontal_projection(&spans);
assert!(
result.is_none(),
"expected None for projection spanning ~100 trillion points, got Some"
);
}
#[test]
fn test_degenerate_ctm_vertical_projection_returns_none() {
let strategy = XYCutStrategy::new();
let degenerate_y: f32 = 99_992_777_785_344.0;
let spans = vec![
make_span(10.0, 100.0, 30.0, 10.0),
make_span(10.0, degenerate_y, 30.0, 10.0),
];
let result = strategy.vertical_projection(&spans);
assert!(
result.is_none(),
"expected None for projection spanning ~100 trillion points, got Some"
);
}
#[test]
fn test_xycut_group_id_two_column_layout() {
use crate::pipeline::reading_order::{ReadingOrderContext, ReadingOrderStrategy};
let mut strategy = XYCutStrategy::new();
strategy.min_spans_for_split = 2;
let make = |text: &str, x: f32, y: f32, w: f32| {
let mut s = make_span(x, y, w, 12.0);
s.text = text.to_string();
s
};
let spans = vec![
make("Description", 50.0, 100.0, 150.0),
make("Amount", 400.0, 100.0, 150.0),
make("Widget A", 50.0, 120.0, 150.0),
make("$150.00", 400.0, 120.0, 150.0),
make("Widget B", 50.0, 140.0, 150.0),
make("Discount", 400.0, 140.0, 150.0),
make("$25.00", 400.0, 160.0, 150.0),
];
let context = ReadingOrderContext::new();
let ordered = strategy.apply(spans, &context).unwrap();
assert!(
ordered.iter().all(|s| s.group_id.is_some()),
"all spans should have group_id set by XYCut"
);
let left_groups: Vec<usize> = ordered
.iter()
.filter(|s| s.span.bbox.left() < 300.0)
.map(|s| s.group_id.unwrap())
.collect();
let right_groups: Vec<usize> = ordered
.iter()
.filter(|s| s.span.bbox.left() >= 300.0)
.map(|s| s.group_id.unwrap())
.collect();
assert!(
left_groups.windows(2).all(|w| w[0] == w[1]),
"left column spans should share the same group_id: {:?}",
left_groups
);
assert!(
right_groups.windows(2).all(|w| w[0] == w[1]),
"right column spans should share the same group_id: {:?}",
right_groups
);
assert_ne!(
left_groups[0], right_groups[0],
"left and right columns should have different group_ids"
);
let left_orders: Vec<usize> = ordered
.iter()
.filter(|s| s.span.bbox.left() < 300.0)
.map(|s| s.reading_order)
.collect();
let right_orders: Vec<usize> = ordered
.iter()
.filter(|s| s.span.bbox.left() >= 300.0)
.map(|s| s.reading_order)
.collect();
let left_max = *left_orders.iter().max().unwrap();
let right_min = *right_orders.iter().min().unwrap();
let left_min = *left_orders.iter().min().unwrap();
let right_max = *right_orders.iter().max().unwrap();
assert!(
left_max < right_min || right_max < left_min,
"columns must be contiguous in reading order: left={:?} right={:?}",
left_orders,
right_orders
);
}
#[test]
fn test_group_id_plain_text_no_interleave() {
use crate::pipeline::converters::OutputConverter;
use crate::pipeline::converters::PlainTextConverter;
use crate::pipeline::reading_order::{ReadingOrderContext, ReadingOrderStrategy};
use crate::pipeline::TextPipelineConfig;
let mut strategy = XYCutStrategy::new();
strategy.min_spans_for_split = 2;
let make = |text: &str, x: f32, y: f32, w: f32| {
let mut s = make_span(x, y, w, 12.0);
s.text = text.to_string();
s
};
let spans = vec![
make("Description", 50.0, 100.0, 150.0),
make("Amount", 400.0, 100.0, 150.0),
make("Widget A", 50.0, 120.0, 150.0),
make("$150.00", 400.0, 120.0, 150.0),
make("Widget B", 50.0, 140.0, 150.0),
make("Discount", 400.0, 140.0, 150.0),
make("$25.00", 400.0, 160.0, 150.0),
];
let context = ReadingOrderContext::new();
let ordered = strategy.apply(spans, &context).unwrap();
let converter = PlainTextConverter::new();
let config = TextPipelineConfig::default();
let text = converter.convert(&ordered, &config).unwrap();
assert!(text.contains("Description"), "missing Description:\n{text}");
assert!(text.contains("Amount"), "missing Amount:\n{text}");
assert!(text.contains("Widget A"), "missing Widget A:\n{text}");
assert!(text.contains("$150.00"), "missing $150.00:\n{text}");
for line in text.lines() {
if line.contains("Description") {
assert!(
line.contains("Amount"),
"Description and Amount should be on same line:\n{text}"
);
}
}
}
#[test]
fn test_degenerate_ctm_partition_region_does_not_abort() {
let strategy = XYCutStrategy::new();
let degenerate_x: f32 = 99_992_777_785_344.0;
let spans = vec![
make_span(10.0, 100.0, 30.0, 10.0),
make_span(10.0, 85.0, 30.0, 10.0),
make_span(10.0, 70.0, 30.0, 10.0),
make_span(10.0, 55.0, 30.0, 10.0),
make_span(10.0, 40.0, 30.0, 10.0),
make_span(degenerate_x, 100.0, 30.0, 10.0),
];
let groups = strategy.partition_region(&spans);
let total: usize = groups.iter().map(|g| g.len()).sum();
assert_eq!(total, spans.len(), "all spans must be preserved");
}
}