use super::LayoutRegion;
use crate::pdf::markdown::constants::REGION_SAME_ROW_FRACTION;
use crate::pdf::markdown::types::LayoutHintClass;
const STRICTLY_ABOVE_EPS: f32 = 1e-3;
const HORIZONTAL_OVERLAP_PADDING: f32 = 0.1;
const MAX_DILATION_FRACTION: f32 = 0.15;
const MIN_REGIONS_FOR_DAG: usize = 4;
pub(in crate::pdf::markdown) fn order_regions_reading_order(regions: &mut [LayoutRegion], page_height: f32) {
if regions.len() < 2 {
return;
}
let mut header_indices: Vec<usize> = Vec::new();
let mut footer_indices: Vec<usize> = Vec::new();
let mut body_indices: Vec<usize> = Vec::new();
for (i, r) in regions.iter().enumerate() {
match r.hint.class {
LayoutHintClass::PageHeader => header_indices.push(i),
LayoutHintClass::PageFooter => footer_indices.push(i),
_ => body_indices.push(i),
}
}
header_indices.sort_by(|&a, &b| {
let (_, a_bot, _, a_top) = regions[a].bbox();
let (_, b_bot, _, b_top) = regions[b].bbox();
let a_cy = (a_top + a_bot) / 2.0;
let b_cy = (b_top + b_bot) / 2.0;
b_cy.total_cmp(&a_cy)
});
footer_indices.sort_by(|&a, &b| {
let (_, a_bot, _, a_top) = regions[a].bbox();
let (_, b_bot, _, b_top) = regions[b].bbox();
let a_cy = (a_top + a_bot) / 2.0;
let b_cy = (b_top + b_bot) / 2.0;
b_cy.total_cmp(&a_cy)
});
let body_order = if body_indices.len() < MIN_REGIONS_FOR_DAG {
tracing::trace!(
body_regions = body_indices.len(),
"reading order: simple Y-sort (below DAG threshold)"
);
sort_simple(&body_indices, regions, page_height)
} else {
tracing::trace!(body_regions = body_indices.len(), "reading order: DAG-based");
dag_reading_order(&body_indices, regions, page_height)
};
let mut final_order: Vec<usize> = Vec::with_capacity(regions.len());
final_order.extend(&header_indices);
final_order.extend(&body_order);
final_order.extend(&footer_indices);
reorder_by_indices(regions, &final_order);
}
fn sort_simple(indices: &[usize], regions: &[LayoutRegion], page_height: f32) -> Vec<usize> {
let y_tolerance = page_height * REGION_SAME_ROW_FRACTION;
let mut sorted = indices.to_vec();
if y_tolerance > 0.0 {
sorted.sort_by(|&a, &b| {
let (a_left, a_bot, _, a_top) = regions[a].bbox();
let (b_left, b_bot, _, b_top) = regions[b].bbox();
let a_cy = (a_top + a_bot) / 2.0;
let b_cy = (b_top + b_bot) / 2.0;
let a_row = (a_cy / y_tolerance).round() as i64;
let b_row = (b_cy / y_tolerance).round() as i64;
b_row.cmp(&a_row).then_with(|| a_left.total_cmp(&b_left))
});
}
sorted
}
fn dag_reading_order(body_indices: &[usize], regions: &[LayoutRegion], _page_height: f32) -> Vec<usize> {
let n = body_indices.len();
let bboxes: Vec<RegionBbox> = body_indices
.iter()
.map(|&idx| {
let (left, bottom, right, top) = regions[idx].bbox();
RegionBbox {
left,
bottom,
right,
top,
}
})
.collect();
let page_left = bboxes.iter().map(|b| b.left).fold(f32::MAX, f32::min);
let page_right = bboxes.iter().map(|b| b.right).fold(f32::MIN, f32::max);
let page_width = page_right - page_left;
let (up_map, dn_map) = build_dag(&bboxes);
let dilated_bboxes = apply_dilation(&bboxes, &up_map, &dn_map, page_width);
let (_up_map_d, dn_map_d) = build_dag(&dilated_bboxes);
let up_map_d = {
let mut up: Vec<Vec<usize>> = vec![Vec::new(); n];
for (i, succs) in dn_map_d.iter().enumerate() {
for &j in succs {
up[j].push(i);
}
}
up
};
let mut heads: Vec<usize> = (0..n).filter(|&i| up_map_d[i].is_empty()).collect();
heads.sort_by(|&a, &b| {
let a_left = bboxes[a].left;
let a_right = bboxes[a].right;
let b_left = bboxes[b].left;
let b_right = bboxes[b].right;
let h_overlap = a_right.min(b_right) - a_left.max(b_left);
let min_width = (a_right - a_left).min(b_right - b_left);
let different_columns = h_overlap < min_width * 0.3;
if different_columns {
a_left.total_cmp(&b_left)
} else {
let a_cy = (bboxes[a].top + bboxes[a].bottom) / 2.0;
let b_cy = (bboxes[b].top + bboxes[b].bottom) / 2.0;
b_cy.total_cmp(&a_cy).then_with(|| a_left.total_cmp(&b_left))
}
});
let mut sorted_dn: Vec<Vec<usize>> = dn_map_d;
for succs in &mut sorted_dn {
succs.sort_by(|&a, &b| {
let a_left = bboxes[a].left;
let a_right = bboxes[a].right;
let b_left = bboxes[b].left;
let b_right = bboxes[b].right;
let h_overlap = a_right.min(b_right) - a_left.max(b_left);
let min_width = (a_right - a_left).min(b_right - b_left);
let different_columns = h_overlap < min_width * 0.3;
if different_columns {
a_left.total_cmp(&b_left)
} else {
let a_cy = (bboxes[a].top + bboxes[a].bottom) / 2.0;
let b_cy = (bboxes[b].top + bboxes[b].bottom) / 2.0;
b_cy.total_cmp(&a_cy).then_with(|| a_left.total_cmp(&b_left))
}
});
}
let order = dfs_reading_order(&heads, &sorted_dn, &up_map_d, n);
order.iter().map(|&local| body_indices[local]).collect()
}
#[derive(Clone)]
struct RegionBbox {
left: f32,
bottom: f32,
right: f32,
top: f32,
}
fn build_dag(bboxes: &[RegionBbox]) -> (Vec<Vec<usize>>, Vec<Vec<usize>>) {
let n = bboxes.len();
let mut up_map: Vec<Vec<usize>> = vec![Vec::new(); n];
let mut dn_map: Vec<Vec<usize>> = vec![Vec::new(); n];
for j in 0..n {
for i in 0..n {
if i == j {
continue;
}
if bboxes[i].bottom + STRICTLY_ABOVE_EPS <= bboxes[j].top {
continue;
}
let x_overlap = (bboxes[i].right + HORIZONTAL_OVERLAP_PADDING)
.min(bboxes[j].right + HORIZONTAL_OVERLAP_PADDING)
- (bboxes[i].left - HORIZONTAL_OVERLAP_PADDING).max(bboxes[j].left - HORIZONTAL_OVERLAP_PADDING);
if x_overlap <= 0.0 {
continue;
}
if has_interruption(bboxes, i, j) {
continue;
}
if !up_map[j].contains(&i) {
up_map[j].push(i);
}
if !dn_map[i].contains(&j) {
dn_map[i].push(j);
}
}
}
(up_map, dn_map)
}
fn has_interruption(bboxes: &[RegionBbox], i: usize, j: usize) -> bool {
let corridor_bottom = bboxes[j].top; let corridor_top = bboxes[i].bottom;
if corridor_bottom >= corridor_top {
return false; }
for (w, bbox_w) in bboxes.iter().enumerate() {
if w == i || w == j {
continue;
}
let overlaps_i = bbox_w.right > bboxes[i].left && bbox_w.left < bboxes[i].right;
let overlaps_j = bbox_w.right > bboxes[j].left && bbox_w.left < bboxes[j].right;
if !(overlaps_i || overlaps_j) {
continue;
}
let below_i = bboxes[i].bottom + STRICTLY_ABOVE_EPS > bbox_w.top;
let above_j = bbox_w.bottom + STRICTLY_ABOVE_EPS > bboxes[j].top;
if below_i && above_j {
return true;
}
}
false
}
fn apply_dilation(
bboxes: &[RegionBbox],
up_map: &[Vec<usize>],
dn_map: &[Vec<usize>],
page_width: f32,
) -> Vec<RegionBbox> {
let threshold = page_width * MAX_DILATION_FRACTION;
let mut dilated = bboxes.to_vec();
for i in 0..bboxes.len() {
let mut target_left = bboxes[i].left;
let mut target_right = bboxes[i].right;
if let Some(&pred) = up_map[i].first() {
target_left = target_left.min(bboxes[pred].left);
target_right = target_right.max(bboxes[pred].right);
}
if let Some(&succ) = dn_map[i].first() {
target_left = target_left.min(bboxes[succ].left);
target_right = target_right.max(bboxes[succ].right);
}
let left_dilation = bboxes[i].left - target_left;
let right_dilation = target_right - bboxes[i].right;
if left_dilation > threshold || right_dilation > threshold {
continue; }
let candidate = RegionBbox {
left: target_left,
bottom: bboxes[i].bottom,
right: target_right,
top: bboxes[i].top,
};
let overlaps_other = bboxes.iter().enumerate().any(|(j, other)| {
if j == i {
return false;
}
candidate.left < other.right
&& candidate.right > other.left
&& candidate.bottom < other.top
&& candidate.top > other.bottom
});
if !overlaps_other {
dilated[i] = candidate;
}
}
dilated
}
fn dfs_reading_order(heads: &[usize], dn_map: &[Vec<usize>], up_map: &[Vec<usize>], n: usize) -> Vec<usize> {
let mut visited = vec![false; n];
let mut order: Vec<usize> = Vec::with_capacity(n);
for &head in heads {
if visited[head] {
continue;
}
dfs_visit(head, dn_map, up_map, &mut visited, &mut order);
}
for i in 0..n {
if !visited[i] {
dfs_visit(i, dn_map, up_map, &mut visited, &mut order);
}
}
order
}
fn dfs_visit(start: usize, dn_map: &[Vec<usize>], up_map: &[Vec<usize>], visited: &mut [bool], order: &mut Vec<usize>) {
let root = find_topmost_unvisited(start, up_map, visited);
if visited[root] {
return;
}
visited[root] = true;
order.push(root);
let mut stack: Vec<(&[usize], usize)> = vec![(dn_map[root].as_slice(), 0)];
while let Some(frame) = stack.last_mut() {
if frame.1 >= frame.0.len() {
stack.pop();
continue;
}
let candidate = frame.0[frame.1];
frame.1 += 1;
let target = find_topmost_unvisited(candidate, up_map, visited);
if !visited[target] {
visited[target] = true;
order.push(target);
stack.push((dn_map[target].as_slice(), 0));
}
}
}
fn find_topmost_unvisited(start: usize, up_map: &[Vec<usize>], visited: &[bool]) -> usize {
let mut current = start;
let mut changed = true;
while changed {
changed = false;
for &pred in &up_map[current] {
if !visited[pred] {
current = pred;
changed = true;
break;
}
}
}
current
}
fn reorder_by_indices<T>(slice: &mut [T], order: &[usize]) {
debug_assert_eq!(slice.len(), order.len());
let mut perm: Vec<usize> = vec![0; slice.len()];
for (new_pos, &old_pos) in order.iter().enumerate() {
perm[new_pos] = old_pos;
}
let mut visited = vec![false; slice.len()];
for i in 0..slice.len() {
if visited[i] || perm[i] == i {
visited[i] = true;
continue;
}
let mut j = i;
loop {
let next = perm[j];
visited[j] = true;
if next == i {
break;
}
slice.swap(j, next);
perm[j] = j;
j = next;
}
}
}