use super::types::{BoundingBox, ColumnBoundary, ColumnSection, StructuredDataConfig};
use crate::text::extraction::TextFragment;
pub fn detect_column_layout(
fragments: &[TextFragment],
config: &StructuredDataConfig,
) -> Vec<ColumnSection> {
if fragments.is_empty() {
return vec![];
}
let boundaries = detect_column_boundaries(fragments, config.min_column_gap);
if boundaries.is_empty() {
return vec![create_single_column_section(fragments)];
}
assign_to_columns(fragments, &boundaries)
}
fn detect_column_boundaries(fragments: &[TextFragment], min_gap: f64) -> Vec<ColumnBoundary> {
if fragments.is_empty() {
return vec![];
}
let mut x_ranges: Vec<(f64, f64)> = fragments.iter().map(|f| (f.x, f.x + f.width)).collect();
x_ranges.sort_by(|a, b| a.0.total_cmp(&b.0));
let mut boundaries = Vec::new();
for i in 0..x_ranges.len() - 1 {
let current_end = x_ranges[i].1;
let next_start = x_ranges[i + 1].0;
let gap = next_start - current_end;
if gap >= min_gap {
let boundary_x = current_end + gap / 2.0; boundaries.push(ColumnBoundary::new(boundary_x, gap));
}
}
boundaries
}
fn assign_to_columns(
fragments: &[TextFragment],
boundaries: &[ColumnBoundary],
) -> Vec<ColumnSection> {
let num_columns = boundaries.len() + 1;
let mut columns: Vec<Vec<&TextFragment>> = vec![vec![]; num_columns];
for fragment in fragments {
let column_idx = find_column_index(fragment.x, boundaries);
columns[column_idx].push(fragment);
}
columns
.into_iter()
.enumerate()
.filter(|(_, frags)| !frags.is_empty())
.map(|(idx, frags)| {
let mut sorted = frags.clone();
sorted.sort_by(|a, b| b.y.total_cmp(&a.y).then_with(|| a.x.total_cmp(&b.x)));
let text = sorted
.iter()
.map(|f| f.text.as_str())
.collect::<Vec<_>>()
.join(" ");
let bbox = calculate_column_bbox(&sorted);
ColumnSection::new(idx, text, bbox)
})
.collect()
}
fn find_column_index(x: f64, boundaries: &[ColumnBoundary]) -> usize {
for (idx, boundary) in boundaries.iter().enumerate() {
if x < boundary.x_position {
return idx;
}
}
boundaries.len() }
fn create_single_column_section(fragments: &[TextFragment]) -> ColumnSection {
let mut sorted = fragments.to_vec();
sorted.sort_by(|a, b| b.y.total_cmp(&a.y).then_with(|| a.x.total_cmp(&b.x)));
let text = sorted
.iter()
.map(|f| f.text.as_str())
.collect::<Vec<_>>()
.join(" ");
let bbox = calculate_column_bbox(&sorted.iter().collect::<Vec<_>>());
ColumnSection::new(0, text, bbox)
}
fn calculate_column_bbox(fragments: &[&TextFragment]) -> BoundingBox {
if fragments.is_empty() {
return BoundingBox::new(0.0, 0.0, 0.0, 0.0);
}
let min_x = fragments.iter().map(|f| f.x).fold(f64::INFINITY, f64::min);
let max_x = fragments
.iter()
.map(|f| f.x + f.width)
.fold(f64::NEG_INFINITY, f64::max);
let min_y = fragments.iter().map(|f| f.y).fold(f64::INFINITY, f64::min);
let max_y = fragments
.iter()
.map(|f| f.y + f.height)
.fold(f64::NEG_INFINITY, f64::max);
BoundingBox::new(min_x, min_y, max_x - min_x, max_y - min_y)
}
#[cfg(test)]
mod tests {
use super::*;
fn create_fragment(text: &str, x: f64, y: f64, width: f64) -> TextFragment {
TextFragment {
text: text.to_string(),
x,
y,
width,
height: 12.0,
font_size: 12.0,
font_name: None,
is_bold: false,
is_italic: false,
color: None,
space_decisions: Vec::new(),
}
}
#[test]
fn test_detect_no_boundaries() {
let fragments = vec![
create_fragment("Line 1", 100.0, 700.0, 50.0),
create_fragment("Line 2", 100.0, 680.0, 50.0),
];
let boundaries = detect_column_boundaries(&fragments, 20.0);
assert_eq!(boundaries.len(), 0); }
#[test]
fn test_detect_two_columns() {
let fragments = vec![
create_fragment("Left 1", 100.0, 700.0, 50.0),
create_fragment("Left 2", 100.0, 680.0, 50.0),
create_fragment("Right 1", 250.0, 700.0, 50.0),
create_fragment("Right 2", 250.0, 680.0, 50.0),
];
let boundaries = detect_column_boundaries(&fragments, 20.0);
assert_eq!(boundaries.len(), 1); assert!((boundaries[0].x_position - 200.0).abs() < 5.0); assert!((boundaries[0].gap_width - 100.0).abs() < 1.0);
}
#[test]
fn test_detect_three_columns() {
let fragments = vec![
create_fragment("Col1", 100.0, 700.0, 40.0),
create_fragment("Col2", 200.0, 700.0, 40.0),
create_fragment("Col3", 300.0, 700.0, 40.0),
];
let boundaries = detect_column_boundaries(&fragments, 20.0);
assert_eq!(boundaries.len(), 2); }
#[test]
fn test_find_column_index() {
let boundaries = vec![
ColumnBoundary::new(200.0, 50.0),
ColumnBoundary::new(400.0, 50.0),
];
assert_eq!(find_column_index(150.0, &boundaries), 0); assert_eq!(find_column_index(250.0, &boundaries), 1); assert_eq!(find_column_index(450.0, &boundaries), 2); }
#[test]
fn test_assign_to_columns_two_columns() {
let fragments = vec![
create_fragment("Left 1", 100.0, 700.0, 50.0),
create_fragment("Left 2", 100.0, 680.0, 50.0),
create_fragment("Right 1", 250.0, 700.0, 50.0),
create_fragment("Right 2", 250.0, 680.0, 50.0),
];
let boundaries = vec![ColumnBoundary::new(200.0, 100.0)];
let sections = assign_to_columns(&fragments, &boundaries);
assert_eq!(sections.len(), 2);
assert_eq!(sections[0].column_index, 0);
assert_eq!(sections[1].column_index, 1);
assert!(sections[0].text.contains("Left 1"));
assert!(sections[0].text.contains("Left 2"));
assert!(sections[1].text.contains("Right 1"));
assert!(sections[1].text.contains("Right 2"));
}
#[test]
fn test_column_layout_single_column() {
let config = StructuredDataConfig::default();
let fragments = vec![
create_fragment("Line 1", 100.0, 700.0, 50.0),
create_fragment("Line 2", 100.0, 680.0, 50.0),
];
let sections = detect_column_layout(&fragments, &config);
assert_eq!(sections.len(), 1);
assert_eq!(sections[0].column_index, 0);
assert!(sections[0].text.contains("Line 1"));
assert!(sections[0].text.contains("Line 2"));
}
#[test]
fn test_column_layout_two_columns() {
let mut config = StructuredDataConfig::default();
config.min_column_gap = 50.0;
let fragments = vec![
create_fragment("Left 1", 100.0, 700.0, 40.0),
create_fragment("Left 2", 100.0, 680.0, 40.0),
create_fragment("Right 1", 250.0, 700.0, 40.0),
create_fragment("Right 2", 250.0, 680.0, 40.0),
];
let sections = detect_column_layout(&fragments, &config);
assert_eq!(sections.len(), 2);
assert!(sections[0].text.contains("Left"));
assert!(sections[1].text.contains("Right"));
}
#[test]
fn test_column_layout_empty() {
let config = StructuredDataConfig::default();
let sections = detect_column_layout(&[], &config);
assert_eq!(sections.len(), 0);
}
#[test]
fn test_calculate_column_bbox() {
let fragments = vec![
create_fragment("A", 100.0, 700.0, 50.0),
create_fragment("B", 110.0, 680.0, 60.0),
];
let refs: Vec<&TextFragment> = fragments.iter().collect();
let bbox = calculate_column_bbox(&refs);
assert_eq!(bbox.x, 100.0); assert_eq!(bbox.width, 70.0); assert_eq!(bbox.y, 680.0); assert_eq!(bbox.height, 32.0); }
#[test]
fn test_column_sorting_reading_order() {
let config = StructuredDataConfig::default();
let fragments = vec![
create_fragment("Bottom", 100.0, 650.0, 50.0),
create_fragment("Top", 100.0, 700.0, 50.0),
create_fragment("Middle", 100.0, 675.0, 50.0),
];
let sections = detect_column_layout(&fragments, &config);
assert_eq!(sections.len(), 1);
let words: Vec<&str> = sections[0].text.split_whitespace().collect();
assert_eq!(words[0], "Top");
assert_eq!(words[1], "Middle");
assert_eq!(words[2], "Bottom");
}
}