use std::cmp::Ordering;
use super::zone::BBox;
type ColumnAcc = std::collections::HashMap<usize, (BBox, usize, usize, Vec<f32>, Vec<f32>)>;
#[derive(Debug, Clone, Copy)]
pub struct HLine {
pub left: f32,
pub right: f32,
pub y: f32,
}
#[derive(Debug, Clone, Copy)]
pub struct VLine {
pub x: f32,
pub top: f32,
pub bottom: f32,
}
const LINE_AXIS_TOLERANCE: f32 = 2.0;
pub const LINE_MIN_LENGTH: f32 = 4.0;
const REGION_CLUSTER_GAP: f32 = 4.0;
const REGION_MIN_LINES: usize = 4;
const REGION_MIN_PER_AXIS: usize = 2;
const FIGURE_MIN_PATHS: usize = 1;
const FIGURE_MIN_DIM: f32 = 24.0;
const FIGURE_MIN_AREA: f32 = 1500.0;
pub fn classify_segment(x0: f32, y0: f32, x1: f32, y1: f32) -> Option<Line> {
let dx = (x1 - x0).abs();
let dy = (y1 - y0).abs();
if dy <= LINE_AXIS_TOLERANCE && dx >= LINE_MIN_LENGTH {
Some(Line::H(HLine {
left: x0.min(x1),
right: x0.max(x1),
y: (y0 + y1) / 2.0,
}))
} else if dx <= LINE_AXIS_TOLERANCE && dy >= LINE_MIN_LENGTH {
Some(Line::V(VLine {
x: (x0 + x1) / 2.0,
top: y0.max(y1),
bottom: y0.min(y1),
}))
} else {
None
}
}
#[derive(Debug, Clone, Copy)]
pub enum Line {
H(HLine),
V(VLine),
}
fn h_bbox(h: &HLine) -> BBox {
BBox {
left: h.left,
right: h.right,
top: h.y + LINE_AXIS_TOLERANCE,
bottom: h.y - LINE_AXIS_TOLERANCE,
}
}
fn v_bbox(v: &VLine) -> BBox {
BBox {
left: v.x - LINE_AXIS_TOLERANCE,
right: v.x + LINE_AXIS_TOLERANCE,
top: v.top,
bottom: v.bottom,
}
}
fn bboxes_touch(a: &BBox, b: &BBox) -> bool {
let x_overlap =
a.left <= b.right + REGION_CLUSTER_GAP && b.left <= a.right + REGION_CLUSTER_GAP;
let y_overlap =
a.bottom <= b.top + REGION_CLUSTER_GAP && b.bottom <= a.top + REGION_CLUSTER_GAP;
x_overlap && y_overlap
}
fn merge_bbox(acc: BBox, other: BBox) -> BBox {
BBox {
left: acc.left.min(other.left),
right: acc.right.max(other.right),
top: acc.top.max(other.top),
bottom: acc.bottom.min(other.bottom),
}
}
#[derive(Debug, Clone)]
pub struct TableRegion {
pub bbox: BBox,
pub row_ys: Vec<f32>,
pub col_xs: Vec<f32>,
}
pub fn detect_table_regions(h_lines: &[HLine], v_lines: &[VLine]) -> Vec<TableRegion> {
let mut entries: Vec<(BBox, bool)> = Vec::with_capacity(h_lines.len() + v_lines.len());
for h in h_lines {
entries.push((h_bbox(h), true));
}
for v in v_lines {
entries.push((v_bbox(v), false));
}
if entries.is_empty() {
return Vec::new();
}
let n = entries.len();
let mut parent: Vec<usize> = (0..n).collect();
fn find(parent: &mut [usize], i: usize) -> usize {
if parent[i] == i {
i
} else {
let r = find(parent, parent[i]);
parent[i] = r;
r
}
}
for i in 0..n {
for j in (i + 1)..n {
if bboxes_touch(&entries[i].0, &entries[j].0) {
let ri = find(&mut parent, i);
let rj = find(&mut parent, j);
if ri != rj {
parent[ri] = rj;
}
}
}
}
let mut acc: ColumnAcc = std::collections::HashMap::new();
let h_count = h_lines.len();
for (i, (bbox, is_h)) in entries.iter().enumerate() {
let root = find(&mut parent, i);
let entry = acc
.entry(root)
.or_insert((*bbox, 0, 0, Vec::new(), Vec::new()));
entry.0 = merge_bbox(entry.0, *bbox);
if *is_h {
entry.1 += 1;
entry.3.push(h_lines[i].y);
} else {
entry.2 += 1;
entry.4.push(v_lines[i - h_count].x);
}
}
acc.into_values()
.filter(|(_, n_h, n_v, _, _)| {
n_h + n_v >= REGION_MIN_LINES
&& *n_h >= REGION_MIN_PER_AXIS
&& *n_v >= REGION_MIN_PER_AXIS
})
.map(|(bbox, _, _, mut ys, mut xs)| {
ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
let mut dedup_ys: Vec<f32> = Vec::with_capacity(ys.len());
for y in ys {
if dedup_ys
.last()
.is_none_or(|&prev: &f32| (prev - y).abs() > LINE_AXIS_TOLERANCE)
{
dedup_ys.push(y);
}
}
xs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
let mut dedup_xs: Vec<f32> = Vec::with_capacity(xs.len());
for x in xs {
if dedup_xs
.last()
.is_none_or(|&prev: &f32| (prev - x).abs() > LINE_AXIS_TOLERANCE)
{
dedup_xs.push(x);
}
}
TableRegion {
bbox,
row_ys: dedup_ys,
col_xs: dedup_xs,
}
})
.collect()
}
pub fn detect_figure_regions(path_bboxes: &[BBox]) -> Vec<BBox> {
if path_bboxes.is_empty() {
return Vec::new();
}
let n = path_bboxes.len();
let mut parent: Vec<usize> = (0..n).collect();
fn find(parent: &mut [usize], i: usize) -> usize {
if parent[i] == i {
i
} else {
let r = find(parent, parent[i]);
parent[i] = r;
r
}
}
for i in 0..n {
for j in (i + 1)..n {
if bboxes_touch(&path_bboxes[i], &path_bboxes[j]) {
let ri = find(&mut parent, i);
let rj = find(&mut parent, j);
if ri != rj {
parent[ri] = rj;
}
}
}
}
let mut acc: std::collections::HashMap<usize, (BBox, usize)> = std::collections::HashMap::new();
for (i, bbox) in path_bboxes.iter().enumerate() {
let root = find(&mut parent, i);
let entry = acc.entry(root).or_insert((*bbox, 0));
entry.0 = merge_bbox(entry.0, *bbox);
entry.1 += 1;
}
acc.into_values()
.filter(|(bbox, count)| {
let width = bbox.right - bbox.left;
let height = bbox.top - bbox.bottom;
*count >= FIGURE_MIN_PATHS
&& width >= FIGURE_MIN_DIM
&& height >= FIGURE_MIN_DIM
&& width * height >= FIGURE_MIN_AREA
})
.map(|(bbox, _)| bbox)
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
fn h(left: f32, right: f32, y: f32) -> HLine {
HLine { left, right, y }
}
fn v(x: f32, bottom: f32, top: f32) -> VLine {
VLine { x, top, bottom }
}
#[test]
fn classify_horizontal_line() {
match classify_segment(10.0, 100.0, 200.0, 100.5) {
Some(Line::H(line)) => {
assert_eq!(line.left, 10.0);
assert_eq!(line.right, 200.0);
}
other => panic!("expected H line, got {:?}", other),
}
}
#[test]
fn classify_vertical_line() {
match classify_segment(50.0, 100.0, 50.5, 200.0) {
Some(Line::V(line)) => {
assert_eq!(line.top, 200.0);
assert_eq!(line.bottom, 100.0);
}
other => panic!("expected V line, got {:?}", other),
}
}
#[test]
fn classify_diagonal_rejected() {
assert!(classify_segment(0.0, 0.0, 100.0, 100.0).is_none());
}
#[test]
fn classify_too_short_rejected() {
assert!(classify_segment(0.0, 0.0, 2.0, 0.0).is_none());
}
#[test]
fn detect_simple_rectangle_table() {
let h_lines = vec![
h(0.0, 100.0, 100.0), h(0.0, 100.0, 50.0), ];
let v_lines = vec![
v(0.0, 50.0, 100.0), v(100.0, 50.0, 100.0), ];
let regions = detect_table_regions(&h_lines, &v_lines);
assert_eq!(regions.len(), 1);
let r = ®ions[0];
assert!((r.bbox.left - 0.0).abs() < 3.0);
assert!((r.bbox.right - 100.0).abs() < 3.0);
assert_eq!(r.row_ys.len(), 2, "expected 2 row Y positions");
}
#[test]
fn detect_grid_table() {
let h_lines = vec![
h(0.0, 300.0, 100.0),
h(0.0, 300.0, 60.0),
h(0.0, 300.0, 20.0),
];
let v_lines = vec![
v(0.0, 20.0, 100.0),
v(100.0, 20.0, 100.0),
v(200.0, 20.0, 100.0),
v(300.0, 20.0, 100.0),
];
let regions = detect_table_regions(&h_lines, &v_lines);
assert_eq!(regions.len(), 1);
}
#[test]
fn no_region_for_isolated_underline() {
let h_lines = vec![h(0.0, 200.0, 50.0)];
let v_lines: Vec<VLine> = Vec::new();
let regions = detect_table_regions(&h_lines, &v_lines);
assert!(regions.is_empty());
}
#[test]
fn no_region_when_below_min_lines() {
let h_lines = vec![h(0.0, 100.0, 50.0)];
let v_lines = vec![v(50.0, 0.0, 100.0)];
let regions = detect_table_regions(&h_lines, &v_lines);
assert!(regions.is_empty());
}
#[test]
fn two_distinct_tables_two_regions() {
let h_lines = vec![
h(0.0, 100.0, 100.0),
h(0.0, 100.0, 50.0),
h(200.0, 300.0, 100.0),
h(200.0, 300.0, 50.0),
];
let v_lines = vec![
v(0.0, 50.0, 100.0),
v(100.0, 50.0, 100.0),
v(200.0, 50.0, 100.0),
v(300.0, 50.0, 100.0),
];
let regions = detect_table_regions(&h_lines, &v_lines);
assert_eq!(regions.len(), 2);
}
fn bb(left: f32, right: f32, bottom: f32, top: f32) -> BBox {
BBox {
left,
right,
top,
bottom,
}
}
#[test]
fn figure_cluster_merges_touching_boxes() {
let boxes = vec![
bb(100.0, 160.0, 100.0, 160.0),
bb(150.0, 210.0, 120.0, 180.0),
bb(120.0, 200.0, 150.0, 200.0),
bb(110.0, 170.0, 90.0, 140.0),
];
let regions = detect_figure_regions(&boxes);
assert_eq!(regions.len(), 1);
let r = ®ions[0];
assert!((r.left - 100.0).abs() < 1.0);
assert!((r.right - 210.0).abs() < 1.0);
assert!((r.bottom - 90.0).abs() < 1.0);
assert!((r.top - 200.0).abs() < 1.0);
}
#[test]
fn two_separated_diagrams_two_regions() {
let boxes = vec![
bb(0.0, 40.0, 0.0, 40.0),
bb(20.0, 60.0, 10.0, 50.0),
bb(10.0, 50.0, 20.0, 60.0),
bb(300.0, 340.0, 0.0, 40.0),
bb(320.0, 360.0, 10.0, 50.0),
bb(310.0, 350.0, 20.0, 60.0),
];
let regions = detect_figure_regions(&boxes);
assert_eq!(regions.len(), 2);
}
#[test]
fn single_large_path_is_a_figure() {
let boxes = vec![bb(0.0, 100.0, 0.0, 100.0)];
let regions = detect_figure_regions(&boxes);
assert_eq!(regions.len(), 1);
}
#[test]
fn no_figure_for_single_small_box() {
let boxes = vec![bb(0.0, 10.0, 0.0, 10.0)];
let regions = detect_figure_regions(&boxes);
assert!(regions.is_empty());
}
#[test]
fn no_figure_for_thin_strip() {
let boxes = vec![
bb(0.0, 80.0, 0.0, 3.0),
bb(60.0, 140.0, 0.0, 3.0),
bb(120.0, 200.0, 0.0, 3.0),
];
let regions = detect_figure_regions(&boxes);
assert!(regions.is_empty());
}
}