use super::config::SegmentParams;
use super::gaps::{bbox_of, bucket_by_column, partition_by_x_center};
use super::rows::{group_rows_by_anchor, row_groups_by_drawn_anchors, row_groups_by_y_gap};
use super::segment::segment;
use super::types::{BBox, Bounded, SplitDir, Zone};
pub(super) fn build_table_zone<T: Bounded>(
items: Vec<T>,
v_gaps: &[(f32, f32)],
parent_bbox: BBox,
p: &SegmentParams,
forced_row_ys: &[f32],
) -> Zone<T> {
if items.is_empty() {
return Zone::Leaf {
bbox: parent_bbox,
items,
};
}
let p_no_regions = SegmentParams {
table_regions: Vec::new(),
..p.clone()
};
let bboxes: Vec<BBox> = items.iter().map(|it| it.bbox()).collect();
let row_groups: Vec<Vec<usize>> = if forced_row_ys.len() >= 2 {
row_groups_by_drawn_anchors(&bboxes, forced_row_ys)
} else {
let col_buckets = bucket_by_column(&bboxes, v_gaps);
let nonzero: Vec<usize> = col_buckets
.iter()
.map(|b| b.len())
.filter(|&c| c > 0)
.collect();
let uniform_counts = nonzero.len() < 2 || nonzero.iter().all(|&c| c == nonzero[0]);
if uniform_counts {
row_groups_by_y_gap(&bboxes, p)
} else {
group_rows_by_anchor(&bboxes, &col_buckets)
.unwrap_or_else(|| row_groups_by_y_gap(&bboxes, p))
}
};
if row_groups.len() < 2 {
let groups = partition_by_x_center(items, v_gaps);
let children: Vec<Zone<T>> = groups
.into_iter()
.filter(|g| !g.is_empty())
.map(|g| segment(g, &p_no_regions))
.collect();
if children.len() < 2 {
let merged: Vec<T> = children
.into_iter()
.flat_map(|c| match c {
Zone::Leaf { items, .. } => items,
_ => Vec::new(),
})
.collect();
return Zone::Leaf {
bbox: parent_bbox,
items: merged,
};
}
return Zone::Split {
dir: SplitDir::Vertical,
bbox: parent_bbox,
children,
};
}
let mut item_row: Vec<Option<usize>> = vec![None; items.len()];
for (row_i, row) in row_groups.iter().enumerate() {
for &idx in row {
item_row[idx] = Some(row_i);
}
}
let mut rows_items: Vec<Vec<T>> = (0..row_groups.len()).map(|_| Vec::new()).collect();
for (i, item) in items.into_iter().enumerate() {
if let Some(r) = item_row[i] {
rows_items[r].push(item);
}
}
let mut rows: Vec<Vec<Zone<T>>> = Vec::with_capacity(rows_items.len());
for row_items in rows_items {
let cells = partition_by_x_center(row_items, v_gaps);
let mut row: Vec<Zone<T>> = Vec::with_capacity(cells.len());
let mut row_has_content = false;
for cell_items in cells {
if cell_items.is_empty() {
row.push(Zone::Leaf {
bbox: BBox::empty(),
items: Vec::new(),
});
} else {
row_has_content = true;
let bbox = bbox_of(&cell_items);
row.push(Zone::Leaf {
bbox,
items: cell_items,
});
}
}
if row_has_content {
rows.push(row);
}
}
if rows.len() < 2 {
let merged: Vec<T> = rows
.into_iter()
.flat_map(|r| {
r.into_iter().flat_map(|c| match c {
Zone::Leaf { items, .. } => items,
_ => Vec::new(),
})
})
.collect();
return Zone::Leaf {
bbox: parent_bbox,
items: merged,
};
}
let n_cols = rows[0].len();
tracing::debug!(
v_gaps_len = v_gaps.len(),
forced_row_ys_len = forced_row_ys.len(),
rows_len = rows.len(),
n_cols,
"build_table_zone shape"
);
let is_filled = |c: &Zone<T>| -> bool {
match c {
Zone::Leaf { items, .. } => !items.is_empty(),
_ => true,
}
};
let fully_filled_rows = rows
.iter()
.filter(|r| r.len() == n_cols && r.iter().all(is_filled))
.count();
if fully_filled_rows * 5 < rows.len() * 3 {
let merged: Vec<T> = rows
.into_iter()
.flat_map(|r| {
r.into_iter().flat_map(|c| match c {
Zone::Leaf { items, .. } => items,
_ => Vec::new(),
})
})
.collect();
let groups = partition_by_x_center(merged, v_gaps);
let children: Vec<Zone<T>> = groups
.into_iter()
.filter(|g| !g.is_empty())
.map(|g| segment(g, &p_no_regions))
.collect();
if children.len() < 2 {
let merged: Vec<T> = children
.into_iter()
.flat_map(|c| match c {
Zone::Leaf { items, .. } => items,
_ => Vec::new(),
})
.collect();
return Zone::Leaf {
bbox: parent_bbox,
items: merged,
};
}
return Zone::Split {
dir: SplitDir::Vertical,
bbox: parent_bbox,
children,
};
}
Zone::Table {
bbox: parent_bbox,
rows,
}
}
// Unit tests for this module: compiled only under `cfg(test)` and loaded from
// `table_tests.rs` through the `#[path]` attribute.
#[cfg(test)]
#[path = "table_tests.rs"]
mod tests;