use crate::types::{GraphicPrimitive, ProjectedLine, Rect, TextItem};
use super::blocks::Block;
use super::paragraphs::collapse_whitespace;
use crate::projection::is_bold_item;
/// Minimum column count: `try_detect_table` needs at least this many cells in its seed row and
/// `try_detect_table_inferred` needs at least this many inferred tracks.
pub(super) const TABLE_MIN_COLUMNS: usize = 3;
/// Minimum number of rows a detected run must contain.
const TABLE_MIN_ROWS: usize = 2;
/// Multiplied by the line's font size to obtain the horizontal gap above which `split_cells`
/// starts a new cell.
const TABLE_CELL_GAP_FONT_MULTIPLIER: f32 = 1.0;
/// Horizontal slack applied when matching cells and spans against column tracks
/// (presumably in points, going by the `_PT` suffix).
const TABLE_TRACK_TOLERANCE_PT: f32 = 6.0;
/// Font-size multiplier for the minimum spacing allowed between neighbouring inferred tracks.
const TABLE_MIN_TRACK_GAP_FONT_MULT: f32 = 1.5;
/// Lower bound on the minimum inferred-track spacing, regardless of font size.
const TABLE_MIN_TRACK_GAP_FLOOR_PT: f32 = 12.0;
/// NOTE(review): not referenced in the part of the file reviewed here; presumably a minimum
/// fraction of populated cells per row — confirm against its users.
const TABLE_ROW_MIN_FILL: f32 = 0.9;
/// Fraction of the line height that the gap below the previous row must reach before a row
/// with fewer cells than columns is accepted as a sparse row of its own.
const TABLE_SPARSE_ROW_MIN_BOTTOM_GAP_FRAC: f32 = 0.5;
/// Largest vertical gap between two rows, in multiples of the line height, that
/// `table_rows_adjacent` still treats as adjacent.
const TABLE_ROW_GAP_MULTIPLIER: f32 = 2.5;
/// Largest coefficient of variation of the row spacing accepted for a regular table.
const TABLE_ROW_SPACING_MAX_CV: f32 = 0.5;
/// One cell of a candidate table row: a horizontal extent plus its text.
#[derive(Debug, Clone)]
pub(super) struct TableCell {
    /// Left edge of the cell (the `x` of its first span when built by `split_cells`).
    pub(super) start_x: f32,
    /// Right edge of the cell (`x + width` of its last span when built by `split_cells`).
    pub(super) end_x: f32,
    /// Cell text with whitespace collapsed.
    pub(super) text: String,
    /// Whether the cell counts as bold.
    pub(super) bold: bool,
}
/// A detected run of lines that should be rendered as a single block.
#[derive(Debug, Clone)]
pub(super) struct TableRun {
    /// Index of the first line covered by the run (includes absorbed header lines).
    pub(super) start: usize,
    /// Index one past the last covered line; the detectors store the loop index they stopped at.
    pub(super) end: usize,
    /// Index of the line the detector treated as the start of the body.
    pub(super) body_start: usize,
    /// The block produced for this run.
    pub(super) block: Block,
}
/// Splits a projected line into cells by grouping its non-blank spans left to right.
///
/// A new cell starts whenever the horizontal gap between the previous span's right edge and
/// the next span's left edge exceeds the font size times `TABLE_CELL_GAP_FONT_MULTIPLIER`.
/// A cell is bold when more than half of its characters come from bold spans.
/// Returns an empty vector when the line has no non-blank span.
pub(super) fn split_cells(line: &ProjectedLine) -> Vec<TableCell> {
    let mut items: Vec<&TextItem> = Vec::new();
    for item in line.spans.iter() {
        if !item.text.trim().is_empty() {
            items.push(item);
        }
    }
    items.sort_by(|l, r| l.x.total_cmp(&r.x));
    let Some(first_x) = items.first().map(|item| item.x) else {
        return Vec::new();
    };

    // Fall back to the line height when no dominant font size is known.
    let reference_size = if line.dominant_font_size > 0.0 {
        line.dominant_font_size
    } else {
        line.bbox.height.max(1.0)
    };
    let threshold = reference_size * TABLE_CELL_GAP_FONT_MULTIPLIER;

    // Builds a finished cell from the accumulator state.
    let make_cell = |left: f32, right: f32, raw: &str, bold_n: usize, total_n: usize| TableCell {
        start_x: left,
        end_x: right,
        text: collapse_whitespace(raw.trim()),
        bold: total_n > 0 && bold_n * 2 > total_n,
    };

    let mut out: Vec<TableCell> = Vec::new();
    let mut buffer = String::new();
    let mut cell_left = first_x;
    let mut right_edge = first_x;
    let mut bold_n: usize = 0;
    let mut total_n: usize = 0;

    for (idx, item) in items.iter().enumerate() {
        // The first span can never open a second cell.
        if idx != 0 && item.x - right_edge > threshold {
            out.push(make_cell(cell_left, right_edge, &buffer, bold_n, total_n));
            buffer.clear();
            cell_left = item.x;
            bold_n = 0;
            total_n = 0;
        }
        if !(buffer.is_empty() || buffer.ends_with(' ')) {
            buffer.push(' ');
        }
        buffer.push_str(&item.text);
        let len = item.text.chars().count();
        total_n += len;
        if is_bold_item(item) {
            bold_n += len;
        }
        right_edge = item.x + item.width.max(0.0);
    }

    if !buffer.trim().is_empty() {
        out.push(make_cell(cell_left, right_edge, &buffer, bold_n, total_n));
    }
    out
}
/// Tries to bring a row with too few cells up to `tracks.len()` cells by splitting cells that
/// span several tracks.
///
/// Each round picks the first cell covering the largest number of tracks (at least two, within
/// `TABLE_TRACK_TOLERANCE_PT`) and splits its text at the covered track positions.
/// Returns `None` when the row already has enough cells, when no cell covers two tracks, when
/// a split fails or yields an empty piece, or when the final cell count is not exactly the
/// track count.
fn recover_merged_cell(mut cells: Vec<TableCell>, tracks: &[f32]) -> Option<Vec<TableCell>> {
    let wanted = tracks.len();
    if cells.len() >= wanted {
        return None;
    }
    while cells.len() < wanted {
        // (cell index, tracks inside that cell); only strictly better candidates replace it.
        let mut pick: Option<(usize, Vec<f32>)> = None;
        for (pos, candidate) in cells.iter().enumerate() {
            let inside: Vec<f32> = tracks
                .iter()
                .copied()
                .filter(|t| {
                    *t >= candidate.start_x - TABLE_TRACK_TOLERANCE_PT
                        && *t <= candidate.end_x + TABLE_TRACK_TOLERANCE_PT
                })
                .collect();
            let to_beat = pick.as_ref().map_or(1, |(_, best)| best.len());
            if inside.len() > to_beat {
                pick = Some((pos, inside));
            }
        }
        let (pos, anchors) = pick?;
        let merged = cells[pos].clone();

        // The first covered track is the cell's own column, so only the rest are split points.
        let pieces = split_text_at_x_anchors(
            merged.text.trim(),
            merged.start_x,
            merged.end_x - merged.start_x,
            &anchors[1..],
        )?;
        if pieces.iter().any(|piece| piece.is_empty()) {
            return None;
        }

        let replacement: Vec<TableCell> = pieces
            .into_iter()
            .enumerate()
            .map(|(p, text)| {
                let start_x = if p == 0 { merged.start_x } else { anchors[p] };
                let end_x = match anchors.get(p + 1) {
                    Some(next) => (*next - 1.0).max(start_x),
                    None => merged.end_x,
                };
                TableCell {
                    start_x,
                    end_x,
                    text,
                    bold: merged.bold,
                }
            })
            .collect();
        // Swap the merged cell for its pieces in place.
        drop(cells.splice(pos..=pos, replacement));
    }
    (cells.len() == wanted).then_some(cells)
}
/// Returns whether `cur` follows `prev` closely enough to belong to the same table.
///
/// The vertical gap from the bottom of `prev` to the top of `cur` may be negative by at most
/// one line height and positive by at most `TABLE_ROW_GAP_MULTIPLIER` line heights.
fn table_rows_adjacent(prev: &ProjectedLine, cur: &ProjectedLine) -> bool {
    let tallest = prev.bbox.height.max(cur.bbox.height).max(1.0);
    let vertical_gap = cur.bbox.y - (prev.bbox.y + prev.bbox.height);
    (-tallest..=tallest * TABLE_ROW_GAP_MULTIPLIER).contains(&vertical_gap)
}
/// Coefficient of variation (standard deviation over mean) of the distances between the `y`
/// positions of consecutive rows.
///
/// Returns `0.0` for fewer than three rows and `f32::INFINITY` when the mean distance is not
/// positive.
fn row_spacing_cv(rows: &[(usize, &ProjectedLine, Vec<TableCell>)]) -> f32 {
    if rows.len() < 3 {
        return 0.0;
    }
    let mut deltas: Vec<f32> = Vec::with_capacity(rows.len() - 1);
    for pair in rows.windows(2) {
        let (upper, lower) = (pair[0].1, pair[1].1);
        deltas.push((lower.bbox.y - upper.bbox.y).abs());
    }
    let count = deltas.len() as f32;
    let mean = deltas.iter().sum::<f32>() / count;
    if mean <= 0.0 {
        return f32::INFINITY;
    }
    // Population variance: divide by the number of gaps, not by n - 1.
    let variance = deltas.iter().map(|d| (d - mean).powi(2)).sum::<f32>() / count;
    variance.sqrt() / mean
}
/// Returns whether `cell` lines up with a track given as `(start, end)`.
///
/// The cell aligns when its center lies inside the track widened by
/// `TABLE_TRACK_TOLERANCE_PT`, or when its left edge is within tolerance of the track start,
/// or its right edge within tolerance of the track end.
fn cell_aligns_track(cell: &TableCell, track_range: (f32, f32)) -> bool {
    let (track_lo, track_hi) = track_range;
    let slack = TABLE_TRACK_TOLERANCE_PT;
    let mid = (cell.start_x + cell.end_x) * 0.5;

    let centered = mid >= track_lo - slack && mid <= track_hi + slack;
    let left_aligned = (cell.start_x - track_lo).abs() <= slack;
    let right_aligned = (cell.end_x - track_hi).abs() <= slack;
    centered || left_aligned || right_aligned
}
/// Finds the index of the track that `cell` belongs to, if any.
///
/// Three strategies are tried in order and the first one that produces a hit wins:
/// 1. the cell center falls inside a track widened by the tolerance (closest track center),
/// 2. the cell's left edge is within tolerance of a track start (closest start),
/// 3. the cell's right edge is within tolerance of a track end (closest end).
fn match_track_idx(cell: &TableCell, track_ranges: &[(f32, f32)]) -> Option<usize> {
    /// Index of the range with the smallest score; ranges scoring `None` are skipped and the
    /// earliest range wins ties.
    fn nearest<F>(ranges: &[(f32, f32)], score: F) -> Option<usize>
    where
        F: Fn(f32, f32) -> Option<f32>,
    {
        ranges
            .iter()
            .enumerate()
            .filter_map(|(idx, &(lo, hi))| score(lo, hi).map(|d| (idx, d)))
            .min_by(|l, r| l.1.total_cmp(&r.1))
            .map(|(idx, _)| idx)
    }

    let slack = TABLE_TRACK_TOLERANCE_PT;
    let (left, right) = (cell.start_x, cell.end_x);
    let mid = (left + right) * 0.5;

    nearest(track_ranges, |lo, hi| {
        (mid >= lo - slack && mid <= hi + slack).then(|| (mid - (lo + hi) * 0.5).abs())
    })
    .or_else(|| {
        nearest(track_ranges, |lo, _| {
            let d = (left - lo).abs();
            (d <= slack).then_some(d)
        })
    })
    .or_else(|| {
        nearest(track_ranges, |_, hi| {
            let d = (right - hi).abs();
            (d <= slack).then_some(d)
        })
    })
}
/// Maximum number of consecutive lines examined when inferring column tracks and when
/// searching for the first strong body row.
const TABLE_TRACK_INFERENCE_MAX_ROWS: usize = 12;
/// Infers column track positions from the left edges of raw spans.
///
/// Starting at `start_idx`, up to `TABLE_TRACK_INFERENCE_MAX_ROWS` mutually adjacent lines
/// contribute the `x` of every non-blank span, but only lines with at least two such spans.
/// The sorted positions are clustered with a running mean and `TABLE_TRACK_TOLERANCE_PT`.
/// When some cluster has three or more members, clusters with a single member are dropped.
/// Returns the cluster means in ascending order.
fn infer_tracks_from_raw_items(lines: &[ProjectedLine], start_idx: usize) -> Vec<f32> {
    /// Groups ascending positions into `(mean, member count)` clusters.
    fn cluster_sorted(sorted: &[f32], tolerance: f32) -> Vec<(f32, usize)> {
        let mut groups: Vec<(f32, usize)> = Vec::new();
        let mut sum = 0.0f32;
        let mut members = 0usize;
        let mut mean = f32::NEG_INFINITY;
        for &x in sorted {
            if members == 0 || (x - mean).abs() <= tolerance {
                sum += x;
                members += 1;
                mean = sum / members as f32;
            } else {
                groups.push((sum / members as f32, members));
                sum = x;
                members = 1;
                mean = x;
            }
        }
        if members > 0 {
            groups.push((sum / members as f32, members));
        }
        groups
    }

    // Find the end of the window: adjacent lines only, capped by the row limit.
    let mut stop = start_idx + 1;
    while stop < lines.len()
        && stop - start_idx < TABLE_TRACK_INFERENCE_MAX_ROWS
        && table_rows_adjacent(&lines[stop - 1], &lines[stop])
    {
        stop += 1;
    }

    let mut positions: Vec<f32> = Vec::new();
    for line in &lines[start_idx..stop] {
        let row: Vec<f32> = line
            .spans
            .iter()
            .filter(|s| !s.text.trim().is_empty())
            .map(|s| s.x)
            .collect();
        // Single-span lines say nothing about columns.
        if row.len() >= 2 {
            positions.extend(row);
        }
    }
    positions.sort_by(f32::total_cmp);

    let mut groups = cluster_sorted(&positions, TABLE_TRACK_TOLERANCE_PT);
    let strongest = groups.iter().map(|g| g.1).max().unwrap_or(0);
    if strongest >= 3 {
        groups.retain(|g| g.1 >= 2);
    }
    groups.into_iter().map(|g| g.0).collect()
}
/// Assigns the raw spans of `line` to the given track positions, producing one cell per track.
///
/// A span covers a track when the track lies within the span's horizontal extent widened by
/// `TABLE_TRACK_TOLERANCE_PT`. A span covering exactly one track is appended to that cell; a
/// span covering several tracks must start on its leftmost track and is split at the others.
/// Returns `None` when the line has fewer than two non-blank spans, when a span covers no
/// track, when a multi-track span is not left-aligned, or when splitting fails or yields an
/// empty piece.
fn cells_from_raw_items_with_tracks(
    line: &ProjectedLine,
    tracks: &[f32],
) -> Option<Vec<TableCell>> {
    /// Appends trimmed `src` to `dst`, separated by one space; blank input is ignored.
    fn append(dst: &mut String, src: &str) {
        let src = src.trim();
        if src.is_empty() {
            return;
        }
        if !(dst.is_empty() || dst.ends_with(' ')) {
            dst.push(' ');
        }
        dst.push_str(src);
    }

    let mut items: Vec<&TextItem> = line
        .spans
        .iter()
        .filter(|s| !s.text.trim().is_empty())
        .collect();
    items.sort_by(|l, r| l.x.total_cmp(&r.x));
    if items.len() < 2 {
        return None;
    }

    let slack = TABLE_TRACK_TOLERANCE_PT;
    let mut out: Vec<TableCell> = Vec::with_capacity(tracks.len());
    for &t in tracks {
        out.push(TableCell {
            start_x: t,
            end_x: t,
            text: String::new(),
            bold: false,
        });
    }

    for item in &items {
        let left = item.x;
        let right = item.x + item.width.max(0.0);
        let hits: Vec<usize> = (0..tracks.len())
            .filter(|&i| tracks[i] >= left - slack && tracks[i] <= right + slack)
            .collect();
        match hits.as_slice() {
            [] => return None,
            [only] => {
                let target = &mut out[*only];
                append(&mut target.text, &item.text);
                target.end_x = target.end_x.max(right);
                if is_bold_item(item) {
                    target.bold = true;
                }
            }
            several => {
                // A merged span must begin on its leftmost covered track.
                if (left - tracks[several[0]]).abs() > slack {
                    return None;
                }
                let pieces = split_span_at_anchors(item, several, tracks)?;
                let bold = is_bold_item(item);
                for (slot, piece) in several.iter().zip(pieces.iter()) {
                    if piece.is_empty() {
                        return None;
                    }
                    append(&mut out[*slot].text, piece);
                    if bold {
                        out[*slot].bold = true;
                    }
                }
            }
        }
    }

    for cell in out.iter_mut() {
        cell.text = collapse_whitespace(cell.text.trim());
    }
    Some(out)
}
/// Returns whether `text` contains strictly more alphabetic characters than ASCII digits.
fn is_alpha_dominant(text: &str) -> bool {
    let mut alpha = 0usize;
    let mut numeric = 0usize;
    for ch in text.chars() {
        if ch.is_alphabetic() {
            alpha += 1;
        }
        if ch.is_ascii_digit() {
            numeric += 1;
        }
    }
    alpha > numeric
}
/// Returns whether `text` looks like a table value rather than prose.
///
/// True for text made only of dash characters, text containing `$`, `%` or `±`, and text
/// containing a digit, then `.` or `,`, then a digit. Blank text is never value-like.
fn is_value_like(text: &str) -> bool {
    let value = text.trim();
    if value.is_empty() {
        return false;
    }
    let only_dashes = value.chars().all(|c| matches!(c, '-' | '—' | '–'));
    let has_unit_sign = value.contains('$') || value.contains('%') || value.contains('±');
    if only_dashes || has_unit_sign {
        return true;
    }
    // Look for a decimal or thousands separator between two digits.
    let glyphs: Vec<char> = value.chars().collect();
    for triple in glyphs.windows(3) {
        let separator = triple[1] == '.' || triple[1] == ',';
        if triple[0].is_ascii_digit() && separator && triple[2].is_ascii_digit() {
            return true;
        }
    }
    false
}
/// Splits `text` into `anchors.len() + 1` pieces at whitespace characters chosen to lie closest
/// to the given x positions.
///
/// Character positions are estimated by spreading the characters evenly over
/// `[x0, x0 + max(width, 1))`. Each anchor claims the nearest whitespace character not already
/// claimed by an earlier anchor. Pieces are trimmed and may be empty.
/// Returns `None` when `text` or `anchors` is empty, or when an anchor finds no free
/// whitespace character.
fn split_text_at_x_anchors(
    text: &str,
    x0: f32,
    width: f32,
    anchors: &[f32],
) -> Option<Vec<String>> {
    let glyphs: Vec<char> = text.chars().collect();
    let total = glyphs.len();
    if total == 0 || anchors.is_empty() {
        return None;
    }
    let extent = width.max(1.0);

    let mut cuts: Vec<usize> = Vec::with_capacity(anchors.len());
    for &anchor in anchors {
        let mut nearest: Option<(usize, f32)> = None;
        for (pos, glyph) in glyphs.iter().enumerate() {
            if glyph.is_whitespace() && !cuts.contains(&pos) {
                let estimated_x = x0 + (pos as f32 / total as f32) * extent;
                let dist = (estimated_x - anchor).abs();
                // Strict comparison keeps the leftmost candidate on ties.
                if nearest.map_or(true, |(_, best)| dist < best) {
                    nearest = Some((pos, dist));
                }
            }
        }
        cuts.push(nearest?.0);
    }
    cuts.sort();

    let piece = |chunk: &[char]| chunk.iter().collect::<String>().trim().to_string();
    let mut out: Vec<String> = Vec::with_capacity(cuts.len() + 1);
    let mut from = 0usize;
    for cut in cuts {
        out.push(piece(&glyphs[from..cut]));
        from = cut;
    }
    out.push(piece(&glyphs[from..]));
    Some(out)
}
/// Splits the text of `span` at the positions of every covered track except the first.
///
/// `covered` holds indices into `tracks`. Returns `None` when fewer than two tracks are
/// covered or when `split_text_at_x_anchors` cannot split the text.
fn split_span_at_anchors(
    span: &TextItem,
    covered: &[usize],
    tracks: &[f32],
) -> Option<Vec<String>> {
    let (_, later) = covered.split_first()?;
    if later.is_empty() {
        return None;
    }
    let split_points: Vec<f32> = later.iter().map(|&track_idx| tracks[track_idx]).collect();
    split_text_at_x_anchors(&span.text, span.x, span.width, &split_points)
}
/// Turns collected body rows into a `TableRun`, choosing a header on the way.
///
/// Header choice, in order: lines absorbed from above the body by `absorb_header_lines`;
/// otherwise the first row when `bold_first_row_eligible` is set (that row then leaves the
/// body); otherwise no header. Without a header the body must have at least
/// `TABLE_MIN_ROWS` rows, else `None` is returned.
///
/// `rows` must be non-empty. `end` is stored unchanged as the run's end index.
fn finalize_table_run(
    lines: &[ProjectedLine],
    start_idx: usize,
    floor: usize,
    rows: &[(usize, &ProjectedLine, Vec<TableCell>)],
    track_ranges: &[(f32, f32)],
    column_count: usize,
    end: usize,
    bold_first_row_eligible: bool,
) -> Option<TableRun> {
    /// Copies out the text of every cell in a row.
    fn texts(cells: &[TableCell]) -> Vec<String> {
        cells.iter().map(|c| c.text.clone()).collect()
    }

    let absorbed = absorb_header_lines(lines, start_idx, track_ranges, column_count, floor);
    let first_row = &rows[0].2;

    let (run_start, header, row_start) = if let Some((hstart, header_texts)) = absorbed {
        (hstart, Some(header_texts), 0)
    } else if bold_first_row_eligible {
        // The bold first row becomes the header and is skipped in the body.
        (start_idx, Some(texts(first_row)), 1)
    } else {
        (start_idx, None, 0)
    };

    let mut body_rows: Vec<Vec<String>> = Vec::with_capacity(rows.len() - row_start);
    for (_, _, cells) in &rows[row_start..] {
        body_rows.push(texts(cells));
    }
    if header.is_none() && body_rows.len() < TABLE_MIN_ROWS {
        return None;
    }

    if *super::flags::DEBUG_TABLE {
        eprintln!(
            "[tbl-detect @{start_idx}..{end}] cols={column_count} header={header:?} rows={}",
            body_rows.len()
        );
    }

    let block = Block::Table {
        header,
        rows: body_rows,
    };
    Some(TableRun {
        start: run_start,
        end,
        body_start: start_idx,
        block,
    })
}
/// Tries to detect a table starting at `start_idx` whose columns are inferred from raw span
/// positions rather than from cell gaps.
///
/// Steps:
/// 1. infer tracks from the spans of the following lines; require at least
///    `TABLE_MIN_COLUMNS` tracks, more tracks than `split_cells` finds on the seed line, and a
///    minimum spacing between tracks;
/// 2. find the first "strong" row (every span covers exactly one track and there are at least
///    as many spans as tracks) within `TABLE_TRACK_INFERENCE_MAX_ROWS` adjacent lines;
/// 3. collect following adjacent rows whose spans can all be assigned to tracks;
/// 4. reject runs that are too short or too irregularly spaced, then hand over to
///    `finalize_table_run`.
///
/// `floor` is the lowest line index header absorption may reach. Returns `None` when any
/// check fails; with `DEBUG_TABLE` set the reason is written to stderr.
fn try_detect_table_inferred(
    lines: &[ProjectedLine],
    start_idx: usize,
    floor: usize,
) -> Option<TableRun> {
    let dbgt = *super::flags::DEBUG_TABLE;
    // The seed text only appears in debug output. This function runs for every candidate
    // line, so skip the allocation unless debugging is on.
    let seed_txt: String = if dbgt {
        lines[start_idx]
            .spans
            .iter()
            .map(|s| s.text.trim())
            .collect::<Vec<_>>()
            .join("|")
    } else {
        String::new()
    };
    // Logs the reason (debug only) and returns `None` from the enclosing function.
    macro_rules! bail {
        ($($a:tt)*) => {{
            if dbgt {
                eprintln!("[tbl-inferred bail @{start_idx} \"{:.40}\"] {}", seed_txt, format!($($a)*));
            }
            return None;
        }};
    }
    let baseline_cells = split_cells(&lines[start_idx]);
    let tracks = infer_tracks_from_raw_items(lines, start_idx);
    if dbgt {
        eprintln!(
            "[tbl-inferred try @{start_idx} \"{:.40}\"] tracks={} baseline={} xs=[{}]",
            seed_txt,
            tracks.len(),
            baseline_cells.len(),
            tracks
                .iter()
                .map(|t| format!("{t:.0}"))
                .collect::<Vec<_>>()
                .join(",")
        );
    }
    if tracks.len() < TABLE_MIN_COLUMNS {
        bail!("tracks {} < MIN_COLUMNS", tracks.len());
    }
    // Inference is only worthwhile when it finds more columns than gap-based splitting.
    if tracks.len() <= baseline_cells.len() {
        bail!(
            "tracks {} <= baseline {}",
            tracks.len(),
            baseline_cells.len()
        );
    }
    let font_size = if lines[start_idx].dominant_font_size > 0.0 {
        lines[start_idx].dominant_font_size
    } else {
        lines[start_idx].bbox.height.max(1.0)
    };
    let min_track_gap =
        (font_size * TABLE_MIN_TRACK_GAP_FONT_MULT).max(TABLE_MIN_TRACK_GAP_FLOOR_PT);
    let min_gap = tracks
        .windows(2)
        .map(|w| w[1] - w[0])
        .fold(f32::INFINITY, f32::min);
    if min_gap < min_track_gap {
        bail!("min_gap {min_gap:.1} < {min_track_gap:.1}");
    }
    let column_count = tracks.len();
    // Inferred tracks are points, so each range is degenerate.
    let track_ranges: Vec<(f32, f32)> = tracks.iter().map(|&t| (t, t)).collect();
    // `tracks` has at least TABLE_MIN_COLUMNS entries here, so `last()` cannot fail.
    let tracks_right_edge = *tracks.last().unwrap() + TABLE_TRACK_TOLERANCE_PT.max(8.0);
    let tol = TABLE_TRACK_TOLERANCE_PT;
    // A strong row has at least one span per track and every span covers exactly one track.
    let is_strong_row = |line: &ProjectedLine| -> bool {
        let spans: Vec<&TextItem> = line
            .spans
            .iter()
            .filter(|s| !s.text.trim().is_empty())
            .collect();
        if spans.len() < tracks.len() {
            return false;
        }
        spans.iter().all(|s| {
            let x0 = s.x;
            let x1 = s.x + s.width.max(0.0);
            tracks
                .iter()
                .filter(|&&t| t >= x0 - tol && t <= x1 + tol)
                .count()
                == 1
        })
    };
    let mut body_start = None;
    {
        let mut k = start_idx;
        let mut used = 0;
        while k < lines.len() && used < TABLE_TRACK_INFERENCE_MAX_ROWS {
            if k > start_idx && !table_rows_adjacent(&lines[k - 1], &lines[k]) {
                break;
            }
            if is_strong_row(&lines[k]) {
                body_start = Some(k);
                break;
            }
            k += 1;
            used += 1;
        }
    }
    let Some(body_start) = body_start else {
        bail!("no strong body row in window");
    };
    let Some(first) = cells_from_raw_items_with_tracks(&lines[body_start], &tracks) else {
        bail!("body row cells unassignable");
    };
    if first.iter().filter(|c| !c.text.is_empty()).count() < TABLE_MIN_COLUMNS {
        bail!("body populated cells < MIN_COLUMNS");
    }
    let mut rows: Vec<(usize, &ProjectedLine, Vec<TableCell>)> =
        vec![(body_start, &lines[body_start], first)];
    let mut j = body_start + 1;
    while j < lines.len() {
        // Lines lying entirely to the right of the last track are skipped, not treated as
        // the end of the table.
        if lines[j].bbox.x > tracks_right_edge {
            j += 1;
            continue;
        }
        if !table_rows_adjacent(rows.last().unwrap().1, &lines[j]) {
            break;
        }
        let Some(cells) = cells_from_raw_items_with_tracks(&lines[j], &tracks) else {
            if dbgt {
                let rt: String = lines[j]
                    .spans
                    .iter()
                    .map(|s| s.text.trim())
                    .collect::<Vec<_>>()
                    .join("|");
                eprintln!("[tbl-inferred trunc @{j} \"{:.40}\"] row unassignable", rt);
            }
            break;
        };
        if cells.iter().all(|c| c.text.is_empty()) {
            break;
        }
        rows.push((j, &lines[j], cells));
        j += 1;
    }
    if rows.len() < TABLE_MIN_ROWS {
        bail!("rows {} < MIN_ROWS", rows.len());
    }
    // Skipping past the seed line is only trusted when it is backed by a longer body.
    if body_start > start_idx && rows.len() < 3 {
        bail!("advanced body_start but only {} rows", rows.len());
    }
    let cv = row_spacing_cv(&rows);
    if cv > TABLE_ROW_SPACING_MAX_CV {
        bail!("row spacing cv {cv:.2} > {TABLE_ROW_SPACING_MAX_CV}");
    }
    let end = j;
    let bold_eligible = rows[0].2.iter().all(|c| c.bold && !c.text.is_empty());
    finalize_table_run(
        lines,
        body_start,
        floor,
        &rows,
        &track_ranges,
        column_count,
        end,
        bold_eligible,
    )
}
/// Tries to detect a table starting at `start_idx` using the gap-based cells of the seed line
/// as column tracks.
///
/// The seed line must split into at least `TABLE_MIN_COLUMNS` cells. Each following adjacent
/// line is then handled as one of:
/// * a full row (cell count equals the column count, at most one misaligned cell);
/// * a row with a merged cell, repaired by `recover_merged_cell`;
/// * a sparse row (fewer cells, all on distinct tracks, far enough below the previous row),
///   padded with empty cells;
/// * a continuation line (fewer cells, close to the previous row), appended to that row;
/// * a row with extra cells, reduced to the cells that match a track.
/// Anything else ends the run.
///
/// Runs with fewer than `TABLE_MIN_ROWS` rows yield `None`. Runs whose row spacing is too
/// irregular are returned as `Block::GridFallback` carrying the raw line text. All other runs
/// go through `finalize_table_run`. `floor` is the lowest line index header absorption may
/// reach.
fn try_detect_table(lines: &[ProjectedLine], start_idx: usize, floor: usize) -> Option<TableRun> {
    let first_cells = split_cells(&lines[start_idx]);
    if first_cells.len() < TABLE_MIN_COLUMNS {
        return None;
    }
    // Derive everything needed from the seed cells first, so the vector can be moved into
    // `rows` below without cloning it.
    let column_count = first_cells.len();
    let tracks: Vec<f32> = first_cells.iter().map(|c| c.start_x).collect();
    let track_ranges: Vec<(f32, f32)> = first_cells.iter().map(|c| (c.start_x, c.end_x)).collect();
    let track_max_x = first_cells
        .iter()
        .map(|c| c.end_x.max(c.start_x))
        .fold(f32::NEG_INFINITY, f32::max);
    let tracks_right_edge = track_max_x + TABLE_TRACK_TOLERANCE_PT.max(8.0);
    let mut rows: Vec<(usize, &ProjectedLine, Vec<TableCell>)> =
        vec![(start_idx, &lines[start_idx], first_cells)];
    let mut j = start_idx + 1;
    while j < lines.len() {
        // Lines lying entirely to the right of the table are skipped, not treated as its end.
        if lines[j].bbox.x > tracks_right_edge {
            j += 1;
            continue;
        }
        if !table_rows_adjacent(rows.last().unwrap().1, &lines[j]) {
            break;
        }
        let mut cells = split_cells(&lines[j]);
        // Too few cells but still table-like: maybe two columns were glued together.
        if cells.len() < column_count && cells.len() >= TABLE_MIN_COLUMNS {
            // The clone is needed because the unpatched cells stay in use when recovery fails.
            if let Some(patched) = recover_merged_cell(cells.clone(), &tracks) {
                cells = patched;
            }
        }
        if cells.len() < column_count && !cells.is_empty() {
            let prev_line = rows.last().unwrap().1;
            let prev_y_top = prev_line.bbox.y;
            let prev_bottom = prev_line.bbox.y + prev_line.bbox.height;
            let line_height = prev_line.bbox.height.max(lines[j].bbox.height).max(1.0);
            let centroid_dy = lines[j].bbox.y - prev_y_top;
            let bottom_gap = lines[j].bbox.y - prev_bottom;
            let mapping: Vec<usize> = cells
                .iter()
                .filter_map(|c| match_track_idx(c, &track_ranges))
                .collect();
            // `mapping` pairs up with `cells` only when every cell found a track.
            let all_align_track = mapping.len() == cells.len();
            // Sparse row: clearly separated from the previous row, each cell on its own track.
            if all_align_track
                && cells.len() >= 2
                && bottom_gap >= line_height * TABLE_SPARSE_ROW_MIN_BOTTOM_GAP_FRAC
            {
                let mut distinct = mapping.clone();
                distinct.sort_unstable();
                distinct.dedup();
                if distinct.len() == mapping.len() {
                    let mut padded: Vec<TableCell> = (0..column_count)
                        .map(|i| TableCell {
                            start_x: tracks[i],
                            end_x: tracks[i],
                            text: String::new(),
                            bold: false,
                        })
                        .collect();
                    for (c, &idx) in cells.iter().zip(&mapping) {
                        padded[idx] = c.clone();
                    }
                    rows.push((j, &lines[j], padded));
                    j += 1;
                    continue;
                }
            }
            // Continuation line: close to the previous row, so its text is appended there.
            if centroid_dy <= line_height * 1.5 && all_align_track {
                let prev_cells = &mut rows.last_mut().unwrap().2;
                for (c, &idx) in cells.iter().zip(&mapping) {
                    if !prev_cells[idx].text.is_empty() && !c.text.is_empty() {
                        prev_cells[idx].text.push(' ');
                    }
                    prev_cells[idx].text.push_str(&c.text);
                }
                j += 1;
                continue;
            }
        }
        // Too many cells: keep only those matching a track, and only if that fixes the count.
        if cells.len() > column_count {
            let kept: Vec<TableCell> = cells
                .iter()
                .filter(|c| match_track_idx(c, &track_ranges).is_some())
                .cloned()
                .collect();
            if kept.len() == column_count {
                cells = kept;
            } else {
                break;
            }
        }
        if cells.len() != column_count {
            break;
        }
        // Cells are compared with tracks by position; one misaligned cell is tolerated.
        let misaligned = cells
            .iter()
            .zip(track_ranges.iter())
            .filter(|(c, r)| !cell_aligns_track(c, **r))
            .count();
        if misaligned > 1 {
            break;
        }
        rows.push((j, &lines[j], cells));
        j += 1;
    }
    if rows.len() < TABLE_MIN_ROWS {
        return None;
    }
    let cv = row_spacing_cv(&rows);
    let end = j;
    // Irregular spacing: still claim the lines, but emit them as a raw grid, not a table.
    if cv > TABLE_ROW_SPACING_MAX_CV {
        let raw: Vec<String> = rows
            .iter()
            .map(|(_, line, _)| line.text.trim_end().to_string())
            .collect();
        return Some(TableRun {
            start: start_idx,
            end,
            body_start: start_idx,
            block: Block::GridFallback { lines: raw },
        });
    }
    let bold_eligible = rows[0].2.iter().all(|c| c.bold);
    finalize_table_run(
        lines,
        start_idx,
        floor,
        &rows,
        &track_ranges,
        column_count,
        end,
        bold_eligible,
    )
}
/// Walks upward from `start_idx` and absorbs lines that look like header rows of the table.
///
/// A line is absorbed when it splits into at least two and at most `column_count` cells, is
/// adjacent to the line below it, and every cell aligns with some track. The walk stops at the
/// first line that fails and never goes below `floor`.
///
/// Returns the index of the topmost absorbed line together with one header string per column
/// (texts of stacked header lines joined top to bottom with spaces), or `None` when nothing
/// was absorbed.
fn absorb_header_lines(
    lines: &[ProjectedLine],
    start_idx: usize,
    track_ranges: &[(f32, f32)],
    column_count: usize,
    floor: usize,
) -> Option<(usize, Vec<String>)> {
    let dbgt = *super::flags::DEBUG_TABLE;
    // Collected bottom-up; read in reverse when the header is assembled.
    let mut stack: Vec<Vec<TableCell>> = Vec::new();
    let mut top = start_idx;
    for cand in (floor..start_idx).rev() {
        let cells = split_cells(&lines[cand]);
        if dbgt {
            let texts: Vec<&str> = cells.iter().map(|c| c.text.as_str()).collect();
            eprintln!(
                "[tbl-absorb cand @{cand} {:?}] cells={texts:?} extents={:?}",
                lines[cand].text.chars().take(40).collect::<String>(),
                cells
                    .iter()
                    .map(|c| (c.start_x as i32, c.end_x as i32))
                    .collect::<Vec<_>>()
            );
        }
        // Checks short-circuit in this order; the line below is `cand + 1` because every
        // accepted line moves the walk up by exactly one.
        let acceptable = cells.len() >= 2
            && table_rows_adjacent(&lines[cand], &lines[cand + 1])
            && cells.len() <= column_count
            && cells
                .iter()
                .all(|c| track_ranges.iter().any(|r| cell_aligns_track(c, *r)));
        if !acceptable {
            break;
        }
        stack.push(cells);
        top = cand;
    }
    if stack.is_empty() {
        return None;
    }

    let mut header = vec![String::new(); column_count];
    for cell in stack.iter().rev().flatten() {
        // Cells that match no track contribute nothing.
        if let Some(col) = match_track_idx(cell, track_ranges) {
            if !header[col].is_empty() && !cell.text.is_empty() {
                header[col].push(' ');
            }
            header[col].push_str(&cell.text);
        }
    }
    Some((top, header))
}
/// Scans `lines` top to bottom and returns every detected table-like run.
///
/// At each position the detectors are tried in order: inferred-track table, gap-based table,
/// description list. A hit moves the scan to the end of the run and raises the floor for
/// header absorption to that point. The raw runs are then merged pairwise, followed by up
/// to four passes of cluster merging that stop early once a pass changes nothing.
pub(super) fn detect_tables(lines: &[ProjectedLine]) -> Vec<TableRun> {
    let mut found_runs: Vec<TableRun> = Vec::new();
    let mut cursor = 0;
    let mut floor = 0;
    while cursor < lines.len() {
        let hit = try_detect_table_inferred(lines, cursor, floor)
            .or_else(|| try_detect_table(lines, cursor, floor))
            .or_else(|| try_detect_description_list(lines, cursor));
        match hit {
            Some(run) => {
                floor = run.end;
                cursor = run.end;
                found_runs.push(run);
            }
            None => cursor += 1,
        }
    }

    let mut merged = merge_consecutive_table_runs(found_runs, lines);
    let mut passes_left = 4;
    while passes_left > 0 {
        let count_before = merged.len();
        merged = merge_fragmented_table_clusters(merged, lines);
        if merged.len() == count_before {
            break;
        }
        passes_left -= 1;
    }
    merged
}
/// Minimum number of rows for a description list.
const DESC_LIST_MIN_ROWS: usize = 2;
/// Maximum character count of a label cell.
const DESC_LIST_LABEL_MAX_CHARS: usize = 40;
/// Maximum word count of a label cell.
const DESC_LIST_LABEL_MAX_WORDS: usize = 4;
/// Horizontal slack when matching a cell's left edge against the label or value column.
const DESC_LIST_TRACK_TOL_PT: f32 = 8.0;
/// Minimum horizontal gap between the end of the label cell and the start of the value cell
/// on the seed line.
const DESC_LIST_MIN_COL_GAP_PT: f32 = 12.0;
/// Returns whether `text` could be the label column of a description list.
///
/// A label is non-blank, at most `DESC_LIST_LABEL_MAX_CHARS` characters and
/// `DESC_LIST_LABEL_MAX_WORDS` words long, is not a bare bullet or enumerator, and does not
/// contain a sentence break (`". "` followed by an ASCII uppercase letter).
fn is_label_like(text: &str) -> bool {
    let label = text.trim();
    if label.is_empty()
        || label.chars().count() > DESC_LIST_LABEL_MAX_CHARS
        || is_bullet_only(label)
    {
        return false;
    }
    let words = label.split_whitespace().count();
    if words == 0 || words > DESC_LIST_LABEL_MAX_WORDS {
        return false;
    }
    // Byte-wise scanning is safe: the three bytes looked for are all ASCII.
    let has_sentence_break = label
        .as_bytes()
        .windows(3)
        .any(|w| w[0] == b'.' && w[1] == b' ' && w[2].is_ascii_uppercase());
    !has_sentence_break
}
/// Returns whether `text` looks like a page reference: either all ASCII digits, or at most six
/// characters that are all roman-numeral letters (case-insensitive). Blank text is not a
/// page reference.
fn is_page_ref(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.is_empty() {
        return false;
    }
    let arabic = candidate.chars().all(|c| c.is_ascii_digit());
    if arabic {
        return true;
    }
    let roman = candidate.chars().all(|c| {
        matches!(
            c.to_ascii_lowercase(),
            'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'
        )
    });
    roman && candidate.chars().count() <= 6
}
/// Returns whether `text` is nothing but a list marker.
///
/// Recognised markers:
/// * one or more bullet or dash glyphs (`•`, `-`, `*`, `►`, …);
/// * a parenthesised number such as `(12)`, at most five characters including the parentheses;
/// * digits or roman-numeral letters (`i`, `v`, `x`, either case) followed by `.` or `)`.
///
/// Blank text is not a marker. Empty parentheses `()` are not a marker either: the
/// parenthesised form needs at least one digit, just as the `N.` / `N)` form needs a
/// non-empty body.
fn is_bullet_only(text: &str) -> bool {
    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    let only_glyph = t.chars().all(|c| {
        matches!(
            c,
            '•' | '●'
                | '○'
                | '◦'
                | '▪'
                | '■'
                | '□'
                | '‣'
                | '⁃'
                | '*'
                | '-'
                | '–'
                | '—'
                | '⮚'
                | '►'
                | '▶'
        )
    });
    if only_glyph {
        return true;
    }
    let chars: Vec<char> = t.chars().collect();
    // "(12)": `all()` is true for an empty slice, so the digits must be checked for presence.
    if let ['(', digits @ .., ')'] = chars.as_slice() {
        if !digits.is_empty()
            && chars.len() <= 5
            && digits.iter().all(|c| c.is_ascii_digit())
        {
            return true;
        }
    }
    // "3." / "iv)": a non-empty body of digits or roman letters before the terminator.
    if let Some((last, body)) = chars.split_last() {
        if matches!(*last, '.' | ')')
            && !body.is_empty()
            && (body.iter().all(|c| c.is_ascii_digit())
                || body
                    .iter()
                    .all(|c| matches!(*c, 'i' | 'v' | 'x' | 'I' | 'V' | 'X')))
        {
            return true;
        }
    }
    false
}
/// Returns whether `text` starts like a figure or table caption: one of the known prefixes
/// (case-insensitive, leading whitespace ignored) immediately followed by an ASCII digit.
fn looks_like_caption(text: &str) -> bool {
    const PREFIXES: [&str; 6] = ["figure ", "fig. ", "fig ", "table ", "tab. ", "tab "];
    let lowered = text.trim_start().to_ascii_lowercase();
    PREFIXES.iter().any(|prefix| {
        lowered
            .strip_prefix(*prefix)
            .and_then(|rest| rest.chars().next())
            .is_some_and(|c| c.is_ascii_digit())
    })
}
/// Tries to detect a two-column description list (label / value) starting at `start_idx`.
///
/// The seed line must split into exactly two cells, separated by at least
/// `DESC_LIST_MIN_COL_GAP_PT`, with a label-like first cell. Following lines are consumed
/// while they are adjacent, are not captions, and are either another label/value pair on the
/// same columns, a continuation of the previous value, or a single merged cell that can be
/// split at the value column.
///
/// The result is a header-less `Block::Table` with two columns, or `None` when the collected
/// rows fail the final plausibility checks.
fn try_detect_description_list(lines: &[ProjectedLine], start_idx: usize) -> Option<TableRun> {
    let first = split_cells(&lines[start_idx]);
    if first.len() != 2 {
        return None;
    }
    let col0_x = first[0].start_x;
    let col0_end = first[0].end_x;
    let col1_x = first[1].start_x;
    if col1_x - col0_end < DESC_LIST_MIN_COL_GAP_PT {
        return None;
    }
    if !is_label_like(&first[0].text) {
        return None;
    }
    // Each row: (index of the line that started it, label text, value text).
    let mut rows: Vec<(usize, String, String)> =
        vec![(start_idx, first[0].text.clone(), first[1].text.clone())];
    // Counts rows that really came from a two-cell line (not from splitting a merged cell).
    let mut real_two_cell_rows: usize = 1;
    let mut j = start_idx + 1;
    while j < lines.len() {
        // Adjacency is measured against the line that started the last row, not against a
        // continuation line appended to it.
        let prev_line = &lines[rows.last().unwrap().0];
        if !table_rows_adjacent(prev_line, &lines[j]) {
            break;
        }
        if looks_like_caption(&lines[j].text) {
            break;
        }
        if rows.len() >= 2 {
            let prev_y = prev_line.bbox.y;
            let cur_y = lines[j].bbox.y;
            let cur_gap = cur_y - prev_y;
            let prior_gaps: Vec<f32> = rows
                .windows(2)
                .map(|w| lines[w[1].0].bbox.y - lines[w[0].0].bbox.y)
                .collect();
            // Stop at a vertical jump clearly larger than any row distance seen so far.
            if let Some(&max_prior) = prior_gaps.iter().max_by(|a, b| a.total_cmp(b))
                && cur_gap > max_prior * 1.6
                && cur_gap > lines[j].bbox.height.max(prev_line.bbox.height)
            {
                break;
            }
        }
        let cells = split_cells(&lines[j]);
        match cells.len() {
            2 => {
                // A regular label/value row must sit on both columns and carry a valid label.
                let c0_aligned = (cells[0].start_x - col0_x).abs() <= DESC_LIST_TRACK_TOL_PT;
                let c1_aligned = (cells[1].start_x - col1_x).abs() <= DESC_LIST_TRACK_TOL_PT;
                if c0_aligned && c1_aligned && is_label_like(&cells[0].text) {
                    rows.push((j, cells[0].text.clone(), cells[1].text.clone()));
                    real_two_cell_rows += 1;
                    j += 1;
                    continue;
                }
                break;
            }
            1 => {
                let cell = &cells[0];
                let c0_aligned = (cell.start_x - col0_x).abs() <= DESC_LIST_TRACK_TOL_PT;
                let c1_aligned = (cell.start_x - col1_x).abs() <= DESC_LIST_TRACK_TOL_PT;
                // A lone cell on the value column continues the previous row's value.
                if c1_aligned {
                    if !rows.last().unwrap().2.is_empty() {
                        rows.last_mut().unwrap().2.push(' ');
                    }
                    rows.last_mut().unwrap().2.push_str(&cell.text);
                    j += 1;
                    continue;
                }
                // A lone cell starting on the label column and reaching past the value column
                // is treated as label and value glued together.
                let straddles = c0_aligned && cell.end_x > col1_x + DESC_LIST_TRACK_TOL_PT;
                if straddles
                    && let Some((left, right)) =
                        split_merged_at_anchor(&cell.text, cell.start_x, cell.end_x, col1_x)
                    && is_label_like(&left)
                {
                    rows.push((j, left, right));
                    j += 1;
                    continue;
                }
                break;
            }
            _ => break,
        }
    }
    if rows.len() < DESC_LIST_MIN_ROWS {
        return None;
    }
    if real_two_cell_rows < 2 {
        return None;
    }
    // At least one row needs letters on both sides.
    let has_alpha_pair = rows.iter().any(|(_, c0, c1)| {
        c0.chars().any(|c| c.is_alphabetic()) && c1.chars().any(|c| c.is_alphabetic())
    });
    if !has_alpha_pair {
        return None;
    }
    // Reject runs whose value column consists only of page references.
    let all_page_refs = rows.iter().all(|(_, _, c1)| is_page_ref(c1));
    if all_page_refs {
        return None;
    }
    // A two-row list is only accepted when some value is much longer than its label.
    let asymmetric = rows
        .iter()
        .any(|(_, c0, c1)| c1.chars().count() >= c0.chars().count().saturating_mul(2).max(20));
    if rows.len() < 3 && !asymmetric {
        return None;
    }
    let body: Vec<Vec<String>> = rows
        .iter()
        .map(|(_, c0, c1)| vec![c0.clone(), c1.clone()])
        .collect();
    Some(TableRun {
        start: start_idx,
        end: j,
        body_start: start_idx,
        block: Block::Table {
            header: None,
            rows: body,
        },
    })
}
/// Splits `text` in two at the whitespace character closest to where `anchor_x` falls inside
/// the horizontal extent `[start_x, end_x]`.
///
/// The anchor is converted to a character index proportionally to its position in the extent.
/// Both halves are trimmed. Returns `None` when the text is empty, contains no whitespace,
/// or when either half would be empty.
fn split_merged_at_anchor(
    text: &str,
    start_x: f32,
    end_x: f32,
    anchor_x: f32,
) -> Option<(String, String)> {
    let glyphs: Vec<char> = text.chars().collect();
    if glyphs.is_empty() {
        return None;
    }
    let extent = (end_x - start_x).max(1.0);
    let fraction = ((anchor_x - start_x) / extent).clamp(0.0, 1.0);
    let wanted = ((glyphs.len() as f32) * fraction) as usize;

    // `min_by_key` returns the first of several equally close candidates.
    let cut = glyphs
        .iter()
        .enumerate()
        .filter(|(_, ch)| ch.is_whitespace())
        .map(|(pos, _)| pos)
        .min_by_key(|pos| pos.abs_diff(wanted))?;

    let head = glyphs[..cut].iter().collect::<String>().trim().to_string();
    let tail = glyphs[cut + 1..]
        .iter()
        .collect::<String>()
        .trim()
        .to_string();
    if head.is_empty() || tail.is_empty() {
        None
    } else {
        Some((head, tail))
    }
}
/// NOTE(review): not referenced in the part of the file reviewed here; presumably a cap on
/// the number of header rows absorbed when merging runs — confirm against its users.
const TABLE_HEADER_MAX_ABSORB_ROWS: usize = 3;
/// Largest vertical gap, in multiples of the line height, across which `cluster_adjacent`
/// still treats two runs as neighbours.
const TABLE_MERGE_MAX_Y_GAP_LINES: f32 = 2.0;
/// Merges each run into the one before it whenever `try_merge_pair` accepts the pair.
///
/// Runs are processed in order, so a merged run can itself absorb the next run. Inputs with
/// fewer than two runs are returned unchanged.
fn merge_consecutive_table_runs(runs: Vec<TableRun>, lines: &[ProjectedLine]) -> Vec<TableRun> {
    if runs.len() < 2 {
        return runs;
    }
    let mut merged_runs: Vec<TableRun> = Vec::with_capacity(runs.len());
    for next in runs {
        let combined = merged_runs
            .last()
            .and_then(|previous| try_merge_pair(previous, &next, lines));
        match combined {
            Some(pair) => {
                // Replace the previous run with the merged pair.
                merged_runs.pop();
                merged_runs.push(pair);
            }
            None => merged_runs.push(next),
        }
    }
    merged_runs
}
/// Maximum number of lines allowed between the end of one run and the start of the next for
/// the two to be clustered.
const TABLE_CLUSTER_MAX_INTERSTITIAL_LINES: usize = 2;
/// NOTE(review): not referenced in the part of the file reviewed here; presumably the largest
/// fraction of rows that may fail assignment when building a union table — confirm.
const TABLE_CLUSTER_MAX_FAILED_ROW_FRAC: f32 = 0.3;
/// Maximum number of header lines `union_header_from_above` collects.
const TABLE_CLUSTER_MAX_HEADER_LINES: usize = 4;
/// Replaces groups of neighbouring runs with a single union table where possible.
///
/// Starting at each run, the longest chain of pairwise `cluster_adjacent` runs is collected.
/// Chains of two or more are passed to `build_union_table`; on success the chain is replaced
/// by the union. On failure only the first run of the chain is kept as-is and the search
/// resumes at the next run.
fn merge_fragmented_table_clusters(runs: Vec<TableRun>, lines: &[ProjectedLine]) -> Vec<TableRun> {
    if runs.len() < 2 {
        return runs;
    }
    let verbose = *super::flags::DEBUG_TABLE;
    let mut result: Vec<TableRun> = Vec::with_capacity(runs.len());
    let mut head = 0;
    while head < runs.len() {
        // Extend the chain while each run is adjacent to its predecessor.
        let mut tail = head + 1;
        while tail < runs.len() && cluster_adjacent(&runs[tail - 1], &runs[tail], lines) {
            tail += 1;
        }
        let chain_len = tail - head;
        if chain_len >= 2 {
            // Header search for the union must not reach into the previously emitted run.
            let floor = result.last().map_or(0, |r| r.end);
            match build_union_table(&runs[head..tail], lines, floor) {
                Some(merged) => {
                    if verbose {
                        eprintln!(
                            "[tbl-cluster] merged {} runs (lines {}..{}) into one table",
                            chain_len,
                            merged.start,
                            merged.end
                        );
                    }
                    result.push(merged);
                    head = tail;
                    continue;
                }
                None => {
                    if verbose {
                        eprintln!(
                            "[tbl-cluster] union build failed for {} runs @{}..{}",
                            chain_len,
                            runs[head].start,
                            runs[tail - 1].end
                        );
                    }
                }
            }
        }
        result.push(runs[head].clone());
        head += 1;
    }
    result
}
/// Decides whether runs `a` and `b` (in that order) may belong to one fragmented table.
///
/// Both must be `Block::Table`. They must be separated by at most
/// `TABLE_CLUSTER_MAX_INTERSTITIAL_LINES` lines and by a vertical gap between minus one and
/// `TABLE_MERGE_MAX_Y_GAP_LINES` line heights. Two tables that each have a header and at
/// least three rows are kept apart when the gap exceeds one line height. When body tracks
/// are available for both runs, at least 75% of the tracks of the run with fewer tracks must
/// match a track of the other; when tracks are missing for either run the pair counts as
/// adjacent.
fn cluster_adjacent(a: &TableRun, b: &TableRun, lines: &[ProjectedLine]) -> bool {
    /// `(has header, body row count)` for table blocks, `None` for anything else.
    fn table_shape(run: &TableRun) -> Option<(bool, usize)> {
        match &run.block {
            Block::Table { header, rows } => Some((header.is_some(), rows.len())),
            _ => None,
        }
    }

    let Some((a_header, a_rows_len)) = table_shape(a) else {
        return false;
    };
    let Some((b_header, b_rows_len)) = table_shape(b) else {
        return false;
    };
    if b.start.saturating_sub(a.end) > TABLE_CLUSTER_MAX_INTERSTITIAL_LINES {
        return false;
    }
    // Guard the two line lookups below.
    if a.end == 0 || a.end > lines.len() || b.start >= lines.len() {
        return false;
    }

    let last_of_a = &lines[a.end - 1];
    let first_of_b = &lines[b.start];
    let line_height = last_of_a.bbox.height.max(first_of_b.bbox.height).max(1.0);
    let gap = first_of_b.bbox.y - (last_of_a.bbox.y + last_of_a.bbox.height);
    if gap > line_height * TABLE_MERGE_MAX_Y_GAP_LINES || gap < -line_height {
        return false;
    }
    let both_complete = a_header && b_header && a_rows_len >= 3 && b_rows_len >= 3;
    if both_complete && gap > line_height {
        return false;
    }

    let tracks_of_a = run_body_tracks(a, lines);
    let tracks_of_b = run_body_tracks(b, lines);
    let (a_tracks, b_tracks) = match (tracks_of_a, tracks_of_b) {
        (Some(left), Some(right)) => (left, right),
        _ => return true,
    };
    let (narrow, wide) = if a_tracks.len() <= b_tracks.len() {
        (&a_tracks, &b_tracks)
    } else {
        (&b_tracks, &a_tracks)
    };

    let mut matched = 0usize;
    for &t in narrow.iter() {
        let has_partner = wide
            .iter()
            .any(|&w| subset_match_score(t, w, TABLE_SUBSET_TRACK_TOLERANCE_PT).is_some());
        if has_partner {
            matched += 1;
        }
    }
    let ok = (matched as f32) >= (narrow.len() as f32) * 0.75;
    if !ok && *super::flags::DEBUG_TABLE {
        eprintln!(
            "[tbl-cluster] adjacency reject @{}..{}: tracks {}/{} matched (narrow=[{}] wide=[{}])",
            a.start,
            b.end,
            matched,
            narrow.len(),
            narrow
                .iter()
                .map(|t| format!("{:.0}-{:.0}", t.0, t.1))
                .collect::<Vec<_>>()
                .join(","),
            wide.iter()
                .map(|t| format!("{:.0}-{:.0}", t.0, t.1))
                .collect::<Vec<_>>()
                .join(",")
        );
    }
    ok
}
/// Clusters the cell start positions of all lines in `start..end` (clamped to the number of
/// lines) into tracks.
///
/// Only lines that split into at least two cells contribute. Positions are clustered with a
/// running mean and `TABLE_SUBSET_TRACK_TOLERANCE_PT`. Returns `(mean position, support)`
/// pairs in ascending order, where support is the number of contributing cells.
fn union_tracks_in_window(lines: &[ProjectedLine], start: usize, end: usize) -> Vec<(f32, usize)> {
    /// Groups ascending positions into `(mean, member count)` clusters.
    fn cluster_sorted(sorted: &[f32], tolerance: f32) -> Vec<(f32, usize)> {
        let mut groups: Vec<(f32, usize)> = Vec::new();
        let mut sum = 0.0f32;
        let mut members = 0usize;
        let mut mean = f32::NEG_INFINITY;
        for &x in sorted {
            if members == 0 || (x - mean).abs() <= tolerance {
                sum += x;
                members += 1;
                mean = sum / members as f32;
            } else {
                groups.push((sum / members as f32, members));
                sum = x;
                members = 1;
                mean = x;
            }
        }
        if members > 0 {
            groups.push((sum / members as f32, members));
        }
        groups
    }

    let window = &lines[start..end.min(lines.len())];
    let mut positions: Vec<f32> = Vec::new();
    for line in window {
        let cells = split_cells(line);
        if cells.len() < 2 {
            continue;
        }
        for cell in &cells {
            positions.push(cell.start_x);
        }
    }
    positions.sort_by(f32::total_cmp);
    cluster_sorted(&positions, TABLE_SUBSET_TRACK_TOLERANCE_PT)
}
/// Maps the cells of `line` onto `tracks` and returns one string per track, leaving tracks
/// without a cell empty.
///
/// Each cell goes to the track nearest to its left edge. Returns `None` when the line has no
/// cells, when there are no tracks, when a cell is farther than
/// `TABLE_SUBSET_TRACK_TOLERANCE_PT` from its nearest track, or when two cells land on the
/// same track.
fn sparse_row_via_cells(line: &ProjectedLine, tracks: &[f32]) -> Option<Vec<String>> {
    let cells = split_cells(line);
    if cells.is_empty() {
        return None;
    }
    let slack = TABLE_SUBSET_TRACK_TOLERANCE_PT;
    let mut row = vec![String::new(); tracks.len()];
    let mut taken = vec![false; tracks.len()];
    for cell in &cells {
        let (slot, dist) = tracks
            .iter()
            .enumerate()
            .map(|(i, &t)| (i, (cell.start_x - t).abs()))
            .min_by(|l, r| l.1.total_cmp(&r.1))?;
        if dist > slack {
            return None;
        }
        // Two cells competing for one track mean the line does not fit this grid.
        if taken[slot] {
            return None;
        }
        taken[slot] = true;
        row[slot] = cell.text.clone();
    }
    Some(row)
}
/// Tries to absorb header lines sitting directly above a table body into a per-track header.
///
/// Walks upward from `body_start` (exclusive) while the index stays above `floor` and fewer
/// than `TABLE_CLUSTER_MAX_HEADER_LINES` layers have been collected. Each accepted line
/// becomes one "layer" holding a string per entry of `tracks`.
///
/// Returns `(index of the topmost absorbed line, header cells)`, or `None` when no line could
/// be absorbed or every header cell ends up empty.
fn union_header_from_above(
lines: &[ProjectedLine],
body_start: usize,
floor: usize,
tracks: &[f32],
) -> Option<(usize, Vec<String>)> {
let tol = TABLE_TRACK_TOLERANCE_PT;
// Fallback placement: nearest track to a span's centre, accepted only when the distance is
// within half the gap to the neighbouring track (or the subset tolerance, whichever is larger).
let assign_nearest = |x_center: f32| -> Option<usize> {
let (idx, d) = tracks
.iter()
.enumerate()
.map(|(i, &t)| (i, (x_center - t).abs()))
.min_by(|a, b| a.1.total_cmp(&b.1))?;
// Gap to the next track, else to the previous one; a lone track has no gap limit.
let local_gap = if idx + 1 < tracks.len() {
tracks[idx + 1] - tracks[idx]
} else if idx > 0 {
tracks[idx] - tracks[idx - 1]
} else {
f32::INFINITY
};
if d <= (local_gap * 0.5).max(TABLE_SUBSET_TRACK_TOLERANCE_PT) {
Some(idx)
} else {
None
}
};
// Layers are collected bottom-up and reversed before flattening.
let mut layers: Vec<Vec<String>> = Vec::new();
let mut j = body_start;
while j > floor && layers.len() < TABLE_CLUSTER_MAX_HEADER_LINES {
let cand = j - 1;
// The candidate must be adjacent (per `table_rows_adjacent`) to the line below it.
if !table_rows_adjacent(&lines[cand], &lines[j]) {
break;
}
let spans: Vec<&TextItem> = lines[cand]
.spans
.iter()
.filter(|s| !s.text.trim().is_empty())
.collect();
// A header line needs at least two non-blank spans.
if spans.len() < 2 {
break;
}
let mut layer = vec![String::new(); tracks.len()];
let mut ok = true;
for s in &spans {
let x0 = s.x;
let x1 = s.x + s.width.max(0.0);
// Every track whose position falls inside the span's extent (widened by `tol`)
// receives the span text, so a wide span labels all the tracks it covers.
let covered: Vec<usize> = tracks
.iter()
.enumerate()
.filter(|&(_, &t)| t >= x0 - tol && t <= x1 + tol)
.map(|(i, _)| i)
.collect();
let targets: Vec<usize> = if !covered.is_empty() {
covered
} else if let Some(idx) = assign_nearest((x0 + x1) * 0.5) {
vec![idx]
} else {
// A span that cannot be placed disqualifies the whole line.
ok = false;
break;
};
for idx in targets {
let dst = &mut layer[idx];
if !dst.is_empty() {
dst.push(' ');
}
dst.push_str(s.text.trim());
}
}
// Stop at the first line that failed placement or labels fewer than two tracks.
if !ok || layer.iter().filter(|t| !t.is_empty()).count() < 2 {
break;
}
layers.push(layer);
j = cand;
}
if layers.is_empty() {
return None;
}
// Restore top-to-bottom order before joining.
layers.reverse();
let header: Vec<String> = (0..tracks.len())
.map(|col| {
let mut parts: Vec<&str> = Vec::new();
for layer in &layers {
let s = layer[col].as_str();
// Skip empty cells and a value identical to the previously kept one.
if s.is_empty() || parts.last() == Some(&s) {
continue;
}
parts.push(s);
}
parts.join(" ")
})
.collect();
if header.iter().all(|h| h.is_empty()) {
return None;
}
// `j` now points at the topmost absorbed line.
Some((j, header))
}
/// Rebuilds a cluster of table fragments as one table laid out on a shared set of tracks.
///
/// The window spans from the first run's `body_start` to the last run's `end` (clamped to
/// `lines.len()`). Tracks come from `union_tracks_in_window`, filtered by a minimum support of
/// `max(2, window_len / 10)` and coalesced when closer than a font-derived minimum gap.
///
/// Returns `None` when the window is empty, when fewer than `TABLE_MIN_COLUMNS` tracks remain,
/// when there are fewer tracks than the widest fragment has columns, when more than
/// `TABLE_CLUSTER_MAX_FAILED_ROW_FRAC` of the window lines cannot be binned, or when fewer
/// than `TABLE_MIN_ROWS` rows result. Lines above the window (down to `floor`) may be absorbed
/// as the header via `union_header_from_above`.
fn build_union_table(
    cluster: &[TableRun],
    lines: &[ProjectedLine],
    floor: usize,
) -> Option<TableRun> {
    let debug = *super::flags::DEBUG_TABLE;
    let window_start = cluster.first()?.body_start;
    let window_end = cluster.last()?.end.min(lines.len());
    if window_start >= window_end {
        return None;
    }
    let window = &lines[window_start..window_end];
    let window_len = window.len();

    // Keep only tracks supported by enough cell starts.
    let min_support = 2.max(window_len / 10);
    let supported: Vec<(f32, usize)> = union_tracks_in_window(lines, window_start, window_end)
        .into_iter()
        .filter(|&(_, support)| support >= min_support)
        .collect();

    // The minimum distance between two distinct tracks scales with the median font size.
    let mut font_sizes: Vec<f32> = Vec::with_capacity(window_len);
    for line in window {
        font_sizes.push(if line.dominant_font_size > 0.0 {
            line.dominant_font_size
        } else {
            line.bbox.height.max(1.0)
        });
    }
    font_sizes.sort_by(f32::total_cmp);
    let median_font = font_sizes[font_sizes.len() / 2];
    let min_track_gap =
        (median_font * TABLE_MIN_TRACK_GAP_FONT_MULT).max(TABLE_MIN_TRACK_GAP_FLOOR_PT);

    // Fold tracks that sit too close together into their support-weighted mean.
    let mut coalesced: Vec<(f32, usize)> = Vec::with_capacity(supported.len());
    for (x, support) in supported {
        if let Some((prev_x, prev_support)) = coalesced.last_mut() {
            if x - *prev_x < min_track_gap {
                *prev_x = (*prev_x * *prev_support as f32 + x * support as f32)
                    / (*prev_support + support) as f32;
                *prev_support += support;
                continue;
            }
        }
        coalesced.push((x, support));
    }
    let tracks: Vec<f32> = coalesced.into_iter().map(|(x, _)| x).collect();
    if tracks.len() < TABLE_MIN_COLUMNS {
        return None;
    }

    let max_run_cols = cluster.iter().filter_map(run_column_count).max()?;
    if tracks.len() < max_run_cols {
        if debug {
            eprintln!(
                "[tbl-cluster] reject: {} tracks < widest fragment {max_run_cols} ([{}])",
                tracks.len(),
                tracks
                    .iter()
                    .map(|t| format!("{t:.0}"))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        return None;
    }

    // Bin every window line: raw-item binning first, then cell-level sparse binning, and as a
    // last resort dump the whole line into the first column (counted as a failure).
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut failed_count = 0usize;
    for line in window {
        match cells_from_raw_items_with_tracks(line, &tracks) {
            Some(cells) => {
                if cells.iter().any(|c| !c.text.is_empty()) {
                    rows.push(cells.into_iter().map(|c| c.text).collect());
                }
            }
            None => match sparse_row_via_cells(line, &tracks) {
                Some(row) => rows.push(row),
                None => {
                    failed_count += 1;
                    let text = line.text.trim();
                    if !text.is_empty() {
                        let mut row = vec![String::new(); tracks.len()];
                        row[0] = collapse_whitespace(text);
                        rows.push(row);
                    }
                }
            },
        }
    }
    if (failed_count as f32) > (window_len as f32) * TABLE_CLUSTER_MAX_FAILED_ROW_FRAC {
        if debug {
            eprintln!("[tbl-cluster] reject: {failed_count}/{window_len} lines unbinnable");
        }
        return None;
    }
    if rows.len() < TABLE_MIN_ROWS {
        return None;
    }

    let (start, header) = match union_header_from_above(lines, window_start, floor, &tracks) {
        Some((header_start, cells)) => (header_start, Some(cells)),
        None => (window_start, None),
    };
    Some(TableRun {
        start,
        end: window_end,
        body_start: window_start,
        block: Block::Table { header, rows },
    })
}
/// Number of columns of a table run: the header width when a header exists, otherwise the
/// width of the first body row. `None` for non-table blocks and for tables with neither.
fn run_column_count(run: &TableRun) -> Option<usize> {
    let Block::Table { header, rows } = &run.block else {
        return None;
    };
    match header {
        Some(cols) => Some(cols.len()),
        None => rows.first().map(Vec::len),
    }
}
/// Computes the horizontal envelope `(min start_x, max end_x)` of every column of `run`.
///
/// Only lines in `run.start..run.end` (end clamped to `lines.len()`) whose `split_cells`
/// count equals the run's column count contribute. Returns `None` when the column count is
/// unknown or no line has a matching cell count.
fn run_body_tracks(run: &TableRun, lines: &[ProjectedLine]) -> Option<Vec<(f32, f32)>> {
    let n_cols = run_column_count(run)?;
    let mut envelope: Option<Vec<(f32, f32)>> = None;
    for line in &lines[run.start..run.end.min(lines.len())] {
        let cells = split_cells(line);
        if cells.len() != n_cols {
            continue;
        }
        if let Some(tracks) = envelope.as_mut() {
            // Widen each existing track so that it also covers this line's cell.
            for (track, cell) in tracks.iter_mut().zip(&cells) {
                track.0 = track.0.min(cell.start_x);
                track.1 = track.1.max(cell.end_x);
            }
        } else {
            envelope = Some(cells.iter().map(|c| (c.start_x, c.end_x)).collect());
        }
    }
    envelope
}
/// Returns `true` when both track lists have the same length and every pair of corresponding
/// tracks has centres no more than `TABLE_TRACK_TOLERANCE_PT` apart.
fn tracks_align_same(a: &[(f32, f32)], b: &[(f32, f32)]) -> bool {
    a.len() == b.len()
        && a.iter().zip(b).all(|(&(a_lo, a_hi), &(b_lo, b_hi))| {
            let center_a = (a_lo + a_hi) * 0.5;
            let center_b = (b_lo + b_hi) * 0.5;
            (center_a - center_b).abs() <= TABLE_TRACK_TOLERANCE_PT
        })
}
/// Scores how well track `ta` fits inside or against track `tb`.
///
/// The score is the smaller of the start-to-start and end-to-end distances, or `0.0` when the
/// centre of `ta` falls within the middle half of `tb` (between its 25% and 75% marks).
/// Returns `Some(score)` when the score does not exceed `tol`, otherwise `None`.
fn subset_match_score(ta: (f32, f32), tb: (f32, f32), tol: f32) -> Option<f32> {
    let (a_lo, a_hi) = ta;
    let (b_lo, b_hi) = tb;
    let edge_distance = (a_lo - b_lo).abs().min((a_hi - b_hi).abs());

    let quarter = (b_hi - b_lo) * 0.25;
    let a_center = (a_lo + a_hi) * 0.5;
    let center_inside = a_center >= b_lo + quarter && a_center <= b_hi - quarter;
    let center_distance = if center_inside { 0.0 } else { f32::INFINITY };

    let score = edge_distance.min(center_distance);
    (score <= tol).then_some(score)
}
/// Looser track-matching tolerance (presumably in points, per the `_PT` suffix). In this file
/// it bounds the clustering of cell starts into tracks, the cell-to-track distance when
/// binning sparse rows, the fallback nearest-track placement of header spans, and the
/// per-track score accepted by `subset_mapping`.
const TABLE_SUBSET_TRACK_TOLERANCE_PT: f32 = 12.0;
/// Finds the best order-preserving mapping of tracks `a` onto tracks `b`, where `b` has exactly
/// one more track than `a`.
///
/// Every choice of the single skipped `b` track is tried; a choice is feasible when each pair
/// scores within `TABLE_SUBSET_TRACK_TOLERANCE_PT` under `subset_match_score`. The feasible
/// choice with the lowest total score wins (the earliest one on ties). Returns, for each
/// index of `a`, the matched index in `b`, or `None` when the lengths do not differ by one or
/// no choice is feasible.
fn subset_mapping(a: &[(f32, f32)], b: &[(f32, f32)]) -> Option<Vec<usize>> {
    if b.len() != a.len() + 1 {
        return None;
    }
    let tol = TABLE_SUBSET_TRACK_TOLERANCE_PT;
    let mut winner: Option<(usize, f32)> = None;
    for gap in 0..b.len() {
        // Sum the pair scores, bailing out on the first pair that does not match.
        let cost = a.iter().enumerate().try_fold(0.0f32, |acc, (i, &track)| {
            let target = if i < gap { i } else { i + 1 };
            subset_match_score(track, b[target], tol).map(|d| acc + d)
        });
        let Some(cost) = cost else {
            continue;
        };
        let improves = match winner {
            None => true,
            Some((_, best_cost)) => cost < best_cost,
        };
        if improves {
            winner = Some((gap, cost));
        }
    }
    winner.map(|(gap, _)| {
        (0..a.len())
            .map(|i| if i < gap { i } else { i + 1 })
            .collect()
    })
}
/// Spreads `row` over a wider layout of `target_len` columns.
///
/// `mapping[i]` is the destination column of `row[i]`. Entries of `mapping` beyond the end of
/// `row`, and destinations outside `0..target_len`, are ignored. Unmapped columns stay empty.
fn pad_row_to_layout(row: &[String], mapping: &[usize], target_len: usize) -> Vec<String> {
    let mut padded = vec![String::new(); target_len];
    for (cell, &dest) in row.iter().zip(mapping) {
        if let Some(slot) = padded.get_mut(dest) {
            slot.clone_from(cell);
        }
    }
    padded
}
/// Maximum number of lines allowed between two table runs for `try_merge_pair` to still
/// consider merging them.
const TABLE_MERGE_MAX_INTERSTITIAL: usize = 1;
/// Maximum length of an in-between line's trimmed text for `is_absorbable_interstitial` to
/// accept it.
const TABLE_MERGE_MAX_INTERSTITIAL_CHARS: usize = 60;
/// Decides whether a line sitting between two table runs may be absorbed into the merged table.
///
/// A line is absorbable when `split_cells` yields at most one cell, its trimmed text has at
/// most `TABLE_MERGE_MAX_INTERSTITIAL_CHARS` characters, and it does not look like a sentence
/// (more than 6 characters ending in `.`, `!` or `?`).
///
/// Lengths are measured in characters (Unicode scalar values), not UTF-8 bytes, so accented
/// or CJK text is held to the same limits as ASCII text.
fn is_absorbable_interstitial(line: &ProjectedLine) -> bool {
    if split_cells(line).len() > 1 {
        return false;
    }
    let text = line.text.trim();
    let char_count = text.chars().count();
    if char_count > TABLE_MERGE_MAX_INTERSTITIAL_CHARS {
        return false;
    }
    let ends_like_sentence = text
        .chars()
        .last()
        .is_some_and(|last| matches!(last, '.' | '!' | '?'));
    // Very short strings such as "No." or "Qty." are kept even though they end in punctuation.
    !(ends_like_sentence && char_count > 6)
}
/// Tries to merge two neighbouring table runs `a` (above) and `b` (below) into one run.
///
/// Preconditions checked here (any failure yields `None`):
/// * at most `TABLE_MERGE_MAX_INTERSTITIAL` lines lie between the runs, and each of them
///   passes `is_absorbable_interstitial`;
/// * both blocks are tables with a known column count and measurable body tracks;
/// * the vertical gap between the last line of `a` and the first line of `b` is at most
///   `TABLE_MERGE_MAX_Y_GAP_LINES` line heights and the runs do not overlap by more than one
///   line height.
///
/// Two merge shapes are supported:
/// * **Same layout** — equal column counts and aligned tracks: the rows are concatenated.
///   The merged header is `a`'s, or `b`'s when `a` has none; when both have one, `b`'s
///   header is kept as a body row. Two complete tables (both with a header and at least 3
///   rows) separated by more than one line height are not merged.
/// * **Stacked header** — `a` has exactly one column fewer than `b` and at most
///   `TABLE_HEADER_MAX_ABSORB_ROWS` rows: all of `a` is folded into the header of `b`.
///
/// In-between lines are kept as rows with their text in the first column.
fn try_merge_pair(a: &TableRun, b: &TableRun, lines: &[ProjectedLine]) -> Option<TableRun> {
    let interstitial = b.start.saturating_sub(a.end);
    if interstitial > TABLE_MERGE_MAX_INTERSTITIAL {
        return None;
    }
    // Validate the run boundaries before any slicing or indexing of `lines` below, so that
    // stale or out-of-range runs are rejected instead of panicking.
    if a.end == 0 || a.end > lines.len() || b.start >= lines.len() {
        return None;
    }
    let interstitial_texts: Vec<String> = if interstitial == 0 {
        Vec::new()
    } else {
        let between = &lines[a.end..b.start];
        if !between.iter().all(is_absorbable_interstitial) {
            return None;
        }
        between.iter().map(|l| l.text.trim().to_string()).collect()
    };

    // Borrow the table contents; rows are cloned only on the path that actually needs them.
    let Block::Table {
        header: a_header,
        rows: a_rows,
    } = &a.block
    else {
        return None;
    };
    let Block::Table {
        header: b_header,
        rows: b_rows,
    } = &b.block
    else {
        return None;
    };
    let a_cols = run_column_count(a)?;
    let b_cols = run_column_count(b)?;
    let a_tracks = run_body_tracks(a, lines)?;
    let b_tracks = run_body_tracks(b, lines)?;

    let a_last = &lines[a.end - 1];
    let b_first = &lines[b.start];
    let line_height = a_last.bbox.height.max(b_first.bbox.height).max(1.0);
    let gap = b_first.bbox.y - (a_last.bbox.y + a_last.bbox.height);
    if gap > line_height * TABLE_MERGE_MAX_Y_GAP_LINES {
        return None;
    }
    if gap < -line_height {
        return None;
    }

    // An in-between line becomes a row with its text in the first column of `b`'s layout
    // (nothing is written if that layout has no columns).
    let spanning_row = |text: &str| -> Vec<String> {
        let mut row = vec![String::new(); b_cols];
        if let Some(first) = row.first_mut() {
            *first = text.to_owned();
        }
        row
    };

    if a_cols == b_cols && tracks_align_same(&a_tracks, &b_tracks) {
        let both_complete =
            a_header.is_some() && b_header.is_some() && a_rows.len() >= 3 && b_rows.len() >= 3;
        // Two self-contained tables more than a line apart are distinct tables.
        if both_complete && gap > line_height * 1.0 {
            return None;
        }
        let header = a_header.clone().or_else(|| b_header.clone());
        let mut rows = a_rows.clone();
        rows.extend(interstitial_texts.iter().map(|t| spanning_row(t)));
        // When both fragments carry a header, `b`'s would otherwise be lost: keep it as a row.
        if let (Some(_), Some(bh)) = (a_header, b_header) {
            rows.push(bh.clone());
        }
        rows.extend(b_rows.iter().cloned());
        return Some(TableRun {
            start: a.start,
            end: b.end,
            body_start: a.body_start,
            block: Block::Table { header, rows },
        });
    }

    if a_cols + 1 == b_cols && a_rows.len() <= TABLE_HEADER_MAX_ABSORB_ROWS {
        let mapping = subset_mapping(&a_tracks, &b_tracks)?;
        // Header layers from top to bottom: `a`'s header, `a`'s rows, then `b`'s header.
        let mut header_layers: Vec<Vec<String>> = Vec::new();
        if let Some(h) = a_header {
            header_layers.push(pad_row_to_layout(h, &mapping, b_cols));
        }
        for row in a_rows {
            header_layers.push(pad_row_to_layout(row, &mapping, b_cols));
        }
        if let Some(h) = b_header {
            header_layers.push(h.clone());
        }
        if header_layers.is_empty() {
            return None;
        }
        let merged_header: Vec<String> = (0..b_cols)
            .map(|col| {
                let mut parts: Vec<&str> = Vec::new();
                for layer in &header_layers {
                    let s = layer.get(col).map(|s| s.as_str()).unwrap_or("");
                    // Skip empty cells and a value identical to the previously kept one.
                    if s.is_empty() || parts.last() == Some(&s) {
                        continue;
                    }
                    parts.push(s);
                }
                parts.join(" ")
            })
            .collect();
        let mut merged_rows: Vec<Vec<String>> =
            Vec::with_capacity(interstitial_texts.len() + b_rows.len());
        merged_rows.extend(interstitial_texts.iter().map(|t| spanning_row(t)));
        merged_rows.extend(b_rows.iter().cloned());
        return Some(TableRun {
            start: a.start,
            end: b.end,
            body_start: b.body_start,
            block: Block::Table {
                header: Some(merged_header),
                rows: merged_rows,
            },
        });
    }
    None
}
/// A horizontal rule segment: spans `x_min..x_max` at height `y`.
///
/// Produced by `extract_h_v_segments` from near-horizontal strokes and from the top and
/// bottom edges of stroked rectangles.
#[derive(Debug, Clone, Copy)]
struct HSeg {
// Left end of the segment.
x_min: f32,
// Right end of the segment.
x_max: f32,
// Vertical position of the segment.
y: f32,
}
/// A vertical rule segment: spans `y_min..y_max` at horizontal position `x`.
///
/// Produced by `extract_h_v_segments` from near-vertical strokes and from the left and right
/// edges of stroked rectangles.
#[derive(Debug, Clone, Copy)]
struct VSeg {
// Smaller y end of the segment.
y_min: f32,
// Larger y end of the segment.
y_max: f32,
// Horizontal position of the segment.
x: f32,
}
/// Maximum deviation along the minor axis for a stroke to count as horizontal or vertical
/// in `extract_h_v_segments`.
const TABLE_AXIS_TOLERANCE_PT: f32 = 1.0;
/// Segments whose positions differ by at most this much are merged by `cluster_h_segments` /
/// `cluster_v_segments`; also used to de-duplicate row boundaries in `build_ruled_table`.
const TABLE_GRID_CLUSTER_PT: f32 = 2.0;
/// Slack added to segment extents when `find_grid_components` tests whether a horizontal and
/// a vertical segment cross.
const TABLE_CROSS_TOLERANCE_PT: f32 = 3.0;
/// Column boundaries closer than this are averaged into one by `build_ruled_table`.
const TABLE_COL_BOUNDARY_CLUSTER_PT: f32 = 6.0;
/// Fraction of empty cells up to which `passes_density_gate` accepts a grid unconditionally.
const TABLE_MAX_EMPTY_CELL_FRACTION: f32 = 0.30;
/// Minimum fill fraction for the first column to count as a "spine", and for an inner column
/// to count as dense, in `passes_density_gate`.
const TABLE_SPINE_FILL_FRACTION: f32 = 0.7;
/// Longest first-column cell still compatible with a spine column in `passes_density_gate`.
const TABLE_SPINE_MAX_CELL_CHARS: usize = 60;
/// Upper bound on the empty-cell fraction for grids that pass only thanks to the spine or
/// flattened-header exception in `passes_density_gate`.
const TABLE_MAX_EMPTY_CELL_FRACTION_WITH_SPINE: f32 = 0.75;
/// Page-coverage threshold above which a grid is rejected (see `build_ruled_table` and
/// `detect_table_rects`).
const TABLE_MAX_PAGE_COVERAGE: f32 = 0.95;
/// In the global pass of `build_ruled_table`, minimum fraction of the column extent that a
/// horizontal rule must overlap to be used as a row boundary.
const RULED_HLINE_MIN_COVERAGE: f32 = 0.5;
/// Collects the axis-aligned rule segments contained in `graphics`.
///
/// * A stroke is horizontal when its endpoints differ by at most `TABLE_AXIS_TOLERANCE_PT` in
///   y and by more than 1.0 in x; vertical is the mirrored condition. Other strokes are
///   dropped.
/// * A rectangle with a stroke contributes its top and bottom edges when wider than 1.0, and
///   its left and right edges when taller than 1.0. Rectangles without a stroke are ignored.
fn extract_h_v_segments(graphics: &[GraphicPrimitive]) -> (Vec<HSeg>, Vec<VSeg>) {
    let mut horizontals: Vec<HSeg> = Vec::new();
    let mut verticals: Vec<VSeg> = Vec::new();
    for primitive in graphics {
        match primitive {
            GraphicPrimitive::Stroke { x1, y1, x2, y2, .. } => {
                let (ax, ay, bx, by) = (*x1, *y1, *x2, *y2);
                let rise = (ay - by).abs();
                let run = (ax - bx).abs();
                let is_horizontal = rise <= TABLE_AXIS_TOLERANCE_PT && run > 1.0;
                let is_vertical = run <= TABLE_AXIS_TOLERANCE_PT && rise > 1.0;
                if is_horizontal {
                    horizontals.push(HSeg {
                        x_min: ax.min(bx),
                        x_max: ax.max(bx),
                        y: (ay + by) * 0.5,
                    });
                } else if is_vertical {
                    verticals.push(VSeg {
                        y_min: ay.min(by),
                        y_max: ay.max(by),
                        x: (ax + bx) * 0.5,
                    });
                }
            }
            GraphicPrimitive::Rect { bbox, stroke, .. } => {
                if stroke.is_some() {
                    let (left, right) = (bbox.x, bbox.x + bbox.width);
                    let (top, bottom) = (bbox.y, bbox.y + bbox.height);
                    if bbox.width > 1.0 {
                        horizontals.extend([top, bottom].map(|y| HSeg {
                            x_min: left,
                            x_max: right,
                            y,
                        }));
                    }
                    if bbox.height > 1.0 {
                        verticals.extend([left, right].map(|x| VSeg {
                            y_min: top,
                            y_max: bottom,
                            x,
                        }));
                    }
                }
            }
        }
    }
    (horizontals, verticals)
}
/// Merges horizontal segments that lie at (nearly) the same height.
///
/// Segments are sorted by `y`; each one within `TABLE_GRID_CLUSTER_PT` of the current
/// cluster's first segment is folded into it by widening the cluster's x extent. The cluster
/// keeps the `y` of its first segment.
fn cluster_h_segments(mut segs: Vec<HSeg>) -> Vec<HSeg> {
    if segs.is_empty() {
        return segs;
    }
    segs.sort_by(|lhs, rhs| lhs.y.total_cmp(&rhs.y));
    let mut merged: Vec<HSeg> = Vec::with_capacity(segs.len());
    for seg in segs {
        match merged.last_mut() {
            Some(anchor) if (anchor.y - seg.y).abs() <= TABLE_GRID_CLUSTER_PT => {
                anchor.x_min = anchor.x_min.min(seg.x_min);
                anchor.x_max = anchor.x_max.max(seg.x_max);
            }
            _ => merged.push(seg),
        }
    }
    merged
}
/// Merges vertical segments that lie at (nearly) the same horizontal position.
///
/// Segments are sorted by `x`; each one within `TABLE_GRID_CLUSTER_PT` of the current
/// cluster's first segment is folded into it by extending the cluster's y extent. The cluster
/// keeps the `x` of its first segment.
fn cluster_v_segments(mut segs: Vec<VSeg>) -> Vec<VSeg> {
    if segs.is_empty() {
        return segs;
    }
    segs.sort_by(|lhs, rhs| lhs.x.total_cmp(&rhs.x));
    let mut merged: Vec<VSeg> = Vec::with_capacity(segs.len());
    for seg in segs {
        match merged.last_mut() {
            Some(anchor) if (anchor.x - seg.x).abs() <= TABLE_GRID_CLUSTER_PT => {
                anchor.y_min = anchor.y_min.min(seg.y_min);
                anchor.y_max = anchor.y_max.max(seg.y_max);
            }
            _ => merged.push(seg),
        }
    }
    merged
}
/// Union-find lookup: returns the root of the set containing `x`.
///
/// While climbing, every visited node is re-pointed at its grandparent, which shortens later
/// lookups. Panics if `x` or any stored parent index is out of bounds.
fn uf_find(parent: &mut [usize], mut x: usize) -> usize {
    loop {
        let up = parent[x];
        if up == x {
            return x;
        }
        let grandparent = parent[up];
        parent[x] = grandparent;
        x = grandparent;
    }
}
/// Union-find merge: joins the sets containing `a` and `b` by attaching the root of `a`'s set
/// under the root of `b`'s set. Does nothing when both are already in the same set.
fn uf_union(parent: &mut [usize], a: usize, b: usize) {
    let root_a = uf_find(parent, a);
    let root_b = uf_find(parent, b);
    if root_a == root_b {
        return;
    }
    parent[root_a] = root_b;
}
/// Groups crossing horizontal and vertical segments into connected grid components.
///
/// A horizontal and a vertical segment are connected when each passes through the other's
/// extent, with `TABLE_CROSS_TOLERANCE_PT` of slack at both ends. Segments that cross nothing
/// are ignored. Each returned component is `(indices into hs, indices into vs)`, both in
/// ascending order; only components with at least two segments of each kind are kept, and
/// components are ordered by their smallest horizontal segment index.
fn find_grid_components(hs: &[HSeg], vs: &[VSeg]) -> Vec<(Vec<usize>, Vec<usize>)> {
    use std::collections::HashMap;

    let (n_h, n_v) = (hs.len(), vs.len());
    if n_h < 2 || n_v < 2 {
        return Vec::new();
    }
    // Union-find nodes: horizontals occupy 0..n_h, verticals n_h..n_h + n_v.
    let n = n_h + n_v;
    let mut parent: Vec<usize> = (0..n).collect();
    let mut connected = vec![false; n];
    let tol = TABLE_CROSS_TOLERANCE_PT;
    for (h_node, h) in hs.iter().enumerate() {
        for (v_pos, v) in vs.iter().enumerate() {
            let v_within_h_extent = v.x >= h.x_min - tol && v.x <= h.x_max + tol;
            let h_within_v_extent = h.y >= v.y_min - tol && h.y <= v.y_max + tol;
            if !(v_within_h_extent && h_within_v_extent) {
                continue;
            }
            let v_node = n_h + v_pos;
            uf_union(&mut parent, h_node, v_node);
            connected[h_node] = true;
            connected[v_node] = true;
        }
    }

    let mut groups: HashMap<usize, (Vec<usize>, Vec<usize>)> = HashMap::new();
    for node in 0..n {
        if !connected[node] {
            continue;
        }
        let root = uf_find(&mut parent, node);
        let members = groups.entry(root).or_default();
        if node < n_h {
            members.0.push(node);
        } else {
            members.1.push(node - n_h);
        }
    }

    let mut comps: Vec<(Vec<usize>, Vec<usize>)> = groups
        .into_values()
        .filter(|(h_members, v_members)| h_members.len() >= 2 && v_members.len() >= 2)
        .collect();
    // Hash-map iteration order is arbitrary; sort to make the output deterministic.
    comps.sort_by_key(|(h_members, _)| h_members[0]);
    comps
}
/// Selects which flavour of ruled-table detection `build_ruled_table` runs.
#[derive(Clone, Copy, PartialEq, Eq)]
enum RuledPass {
// Used by `detect_ruled_tables`; row boundaries come from all horizontal rules of the component.
PerRegion,
// Used by `detect_ruled_tables_global`; horizontal rules that overlap too little of the
// column extent are filtered out before row boundaries are derived.
Global,
}
/// Keeps the items whose flag at the same position in `keep` is `true`, preserving order.
///
/// Items beyond the end of `keep` are dropped; surplus flags are ignored.
fn filter_by<T>(items: Vec<T>, keep: &[bool]) -> Vec<T> {
    let mut kept = Vec::new();
    for (item, &flag) in items.into_iter().zip(keep) {
        if flag {
            kept.push(item);
        }
    }
    kept
}
/// Row-major working grid used while assigning text to the cells of a ruled table.
///
/// All two-dimensional fields are indexed `[row][col]` and are created with identical
/// dimensions by `CellGrid::new`.
struct CellGrid {
// Text accumulated per cell; pieces are joined with single spaces by `push_text`.
text: Vec<Vec<String>>,
// Starts as `true` for every cell; `assign_cells` clears it when text from a line that is
// not all-bold lands in the cell.
is_bold: Vec<Vec<bool>>,
// `true` once `push_text` has stored non-blank text in the cell.
has_text: Vec<Vec<bool>>,
// Replicated text filled through `push_repl`: a span covering several columns is written
// into every covered cell. Read by `flatten_header_band` to build a flattened header.
repl: Vec<Vec<String>>,
// Per-row flag set by `assign_cells` when an alpha-dominant span covers more than one column.
row_alpha_spanner: Vec<bool>,
}
impl CellGrid {
    /// Creates an empty `n_rows` x `n_cols` grid. Every cell starts without text and is
    /// flagged bold until non-bold text clears the flag.
    fn new(n_rows: usize, n_cols: usize) -> Self {
        let blank_row = vec![String::new(); n_cols];
        CellGrid {
            text: vec![blank_row.clone(); n_rows],
            is_bold: vec![vec![true; n_cols]; n_rows],
            has_text: vec![vec![false; n_cols]; n_rows],
            repl: vec![blank_row; n_rows],
            row_alpha_spanner: vec![false; n_rows],
        }
    }

    /// Number of rows currently in the grid.
    fn n_rows(&self) -> usize {
        self.text.len()
    }

    /// Number of columns, taken from the first row (0 for a grid without rows).
    fn n_cols(&self) -> usize {
        self.text.first().map_or(0, Vec::len)
    }

    /// Appends the trimmed `txt` to cell (`row`, `col`), space-separated from earlier text,
    /// and marks the cell as filled. Blank input is ignored.
    fn push_text(&mut self, row: usize, col: usize, txt: &str) {
        let trimmed = txt.trim();
        if trimmed.is_empty() {
            return;
        }
        let cell = &mut self.text[row][col];
        if !cell.is_empty() {
            cell.push(' ');
        }
        cell.push_str(trimmed);
        self.has_text[row][col] = true;
    }

    /// Appends the trimmed `txt` to the replicated-text cell (`row`, `col`), space-separated
    /// from earlier text. Blank input is ignored; `has_text` is not touched.
    fn push_repl(&mut self, row: usize, col: usize, txt: &str) {
        let trimmed = txt.trim();
        if trimmed.is_empty() {
            return;
        }
        let cell = &mut self.repl[row][col];
        if !cell.is_empty() {
            cell.push(' ');
        }
        cell.push_str(trimmed);
    }

    /// Drops every row whose flag in `keep` is `false`, across all per-row fields.
    fn retain_rows(&mut self, keep: &[bool]) {
        let text = std::mem::take(&mut self.text);
        self.text = filter_by(text, keep);
        let is_bold = std::mem::take(&mut self.is_bold);
        self.is_bold = filter_by(is_bold, keep);
        let has_text = std::mem::take(&mut self.has_text);
        self.has_text = filter_by(has_text, keep);
        let repl = std::mem::take(&mut self.repl);
        self.repl = filter_by(repl, keep);
        let spanner = std::mem::take(&mut self.row_alpha_spanner);
        self.row_alpha_spanner = filter_by(spanner, keep);
    }

    /// Drops every column whose flag in `keep` is `false`, in each row of the per-cell fields.
    fn retain_cols(&mut self, keep: &[bool]) {
        for row in &mut self.text {
            *row = filter_by(std::mem::take(row), keep);
        }
        for row in &mut self.is_bold {
            *row = filter_by(std::mem::take(row), keep);
        }
        for row in &mut self.has_text {
            *row = filter_by(std::mem::take(row), keep);
        }
        for row in &mut self.repl {
            *row = filter_by(std::mem::take(row), keep);
        }
    }

    /// Removes text-free rows that are much shorter than a typical row.
    ///
    /// `ys` holds the row boundaries (one more entry than there are rows). The reference is
    /// the median height of rows that contain text, or of all rows when none does. A row
    /// survives when it has text or is at least 80% of the reference height. Returns the
    /// heights of the surviving rows, in order.
    fn collapse_phantom_rows(&mut self, ys: &[f32]) -> Vec<f32> {
        let n_rows = self.n_rows();
        let heights: Vec<f32> = (0..n_rows).map(|r| ys[r + 1] - ys[r]).collect();
        let occupied: Vec<bool> = (0..n_rows)
            .map(|r| self.has_text[r].iter().any(|&filled| filled))
            .collect();

        let mut sample: Vec<f32> = (0..n_rows)
            .filter(|&r| occupied[r])
            .map(|r| heights[r])
            .collect();
        if sample.is_empty() {
            sample = heights.clone();
        }
        sample.sort_by(|lhs, rhs| lhs.total_cmp(rhs));
        let median_h = sample[sample.len() / 2];

        let keep: Vec<bool> = (0..n_rows)
            .map(|r| occupied[r] || heights[r] >= median_h * 0.8)
            .collect();
        let surviving_heights: Vec<f32> = (0..n_rows)
            .filter(|&r| keep[r])
            .map(|r| heights[r])
            .collect();
        self.retain_rows(&keep);
        surviving_heights
    }

    /// Removes text-free columns that are much narrower than a typical column.
    ///
    /// `xs` holds the column boundaries (one more entry than there are columns). The
    /// reference is the median width of columns that contain text, or of all columns when
    /// none does. A column survives when it has text or is at least 30% of the reference
    /// width.
    fn collapse_phantom_cols(&mut self, xs: &[f32]) {
        let n_rows = self.n_rows();
        let n_cols = self.n_cols();
        let widths: Vec<f32> = (0..n_cols).map(|c| xs[c + 1] - xs[c]).collect();
        let occupied: Vec<bool> = (0..n_cols)
            .map(|c| (0..n_rows).any(|r| self.has_text[r][c]))
            .collect();

        let mut sample: Vec<f32> = (0..n_cols)
            .filter(|&c| occupied[c])
            .map(|c| widths[c])
            .collect();
        if sample.is_empty() {
            sample = widths.clone();
        }
        sample.sort_by(|lhs, rhs| lhs.total_cmp(rhs));
        let median_w = sample[sample.len() / 2];

        let keep_col: Vec<bool> = (0..n_cols)
            .map(|c| occupied[c] || widths[c] >= median_w * 0.3)
            .collect();
        self.retain_cols(&keep_col);
    }
}
/// Distributes the text of `lines` over the grid whose column boundaries are `xs` and whose
/// row boundaries are `ys`.
///
/// Lines are visited in ascending `bbox.y` order. A line is skipped when its vertical centre
/// lies outside the grid, when its text is blank, or when its bounding box overhangs the grid
/// horizontally by more than `GRID_X_SLACK_PT`. Spans that fit in one column go straight into
/// that cell; spans covering several columns are replicated into `repl` for every covered
/// cell and their visible text is split via `split_span_at_anchors` or, failing that, placed
/// in the column under the span's centre.
///
/// Returns the filled grid together with the indices (into `lines`) of the consumed lines.
/// Returns `None` when no line was consumed, or when at least 6 spans were seen and more than
/// 45% of them straddle a column boundary.
///
/// Indexes `xs[0]` / `ys[0]` and computes `len() - 1`, so both slices must be non-empty; the
/// caller visible in this file (`build_ruled_table`) checks `ys.len() >= 3` and
/// `xs.len() >= 2` first.
fn assign_cells(
lines: &[ProjectedLine],
xs: &[f32],
ys: &[f32],
dbg: bool,
) -> Option<(CellGrid, Vec<usize>)> {
let n_rows = ys.len() - 1;
let n_cols = xs.len() - 1;
let mut grid = CellGrid::new(n_rows, n_cols);
let mut consumed_indices: Vec<usize> = Vec::new();
// Horizontal overhang a line may have beyond the grid before it is skipped.
const GRID_X_SLACK_PT: f32 = 6.0;
// Inset applied to both span ends before testing whether the span straddles a boundary.
const STRADDLE_MARGIN_PT: f32 = 3.0;
let mut span_total = 0usize;
let mut span_straddle = 0usize;
// Process lines top-to-bottom by bbox y so text is appended to cells in that order.
let mut line_order: Vec<usize> = (0..lines.len()).collect();
line_order.sort_by(|&a, &b| lines[a].bbox.y.total_cmp(&lines[b].bbox.y));
for idx in line_order {
let line = &lines[idx];
// The line's row is chosen by its vertical centre.
let cy = line.bbox.y + line.bbox.height * 0.5;
if cy < ys[0] || cy > ys[n_rows] {
continue;
}
let row = match find_bucket(ys, cy) {
Some(r) => r,
None => continue,
};
if line.text.trim().is_empty() {
continue;
}
let line_x0 = line.bbox.x;
let line_x1 = line.bbox.x + line.bbox.width;
// Lines sticking out of the grid horizontally are not treated as table content.
if line_x0 < xs[0] - GRID_X_SLACK_PT || line_x1 > xs[n_cols] + GRID_X_SLACK_PT {
if dbg {
eprintln!(
"[ruled] skip-overhang row={row} x={line_x0:.0}..{line_x1:.0} grid={:.0}..{:.0} text={:?}",
xs[0],
xs[n_cols],
&line.text.chars().take(60).collect::<String>()
);
}
continue;
}
let mut text_spans: Vec<&TextItem> = line
.spans
.iter()
.filter(|s| !s.text.trim().is_empty())
.collect();
text_spans.sort_by(|a, b| a.x.total_cmp(&b.x));
// No usable spans: fall back to the whole line, placed by its horizontal centre.
if text_spans.is_empty() {
let cx = line.bbox.x + line.bbox.width * 0.5;
if let Some(col) = find_bucket(xs, cx.clamp(xs[0], xs[n_cols])) {
grid.push_text(row, col, &line.text);
grid.push_repl(row, col, &line.text);
if !line.all_bold {
grid.is_bold[row][col] = false;
}
consumed_indices.push(idx);
}
continue;
}
for span in text_spans {
// Each span picks its own row from its vertical centre, falling back to the line's row.
let span_cy = span.y + span.height * 0.5;
let row = find_bucket(ys, span_cy.clamp(ys[0], ys[n_rows])).unwrap_or(row);
// Columns touched by the span's left and right edges (clamped into the grid).
let sx0 = (span.x).clamp(xs[0], xs[n_cols]);
let sx1 = (span.x + span.width).clamp(xs[0], xs[n_cols]);
let c_lo = find_bucket(xs, sx0).unwrap_or(0);
let c_hi = find_bucket(xs, sx1).unwrap_or(n_cols - 1);
span_total += 1;
{
// Straddle statistic: the span still crosses a boundary after trimming a margin off both ends.
let m0 = (span.x + STRADDLE_MARGIN_PT).clamp(xs[0], xs[n_cols]);
let m1 = (span.x + span.width - STRADDLE_MARGIN_PT).clamp(xs[0], xs[n_cols]);
if m1 > m0 && find_bucket(xs, m0) != find_bucket(xs, m1) {
span_straddle += 1;
}
}
// Common case: the span fits inside a single column.
if c_lo == c_hi {
grid.push_text(row, c_lo, &span.text);
grid.push_repl(row, c_lo, &span.text);
if !line.all_bold {
grid.is_bold[row][c_lo] = false;
}
continue;
}
// Multi-column span: replicate its text into every covered cell of `repl`.
for col in c_lo..=c_hi {
grid.push_repl(row, col, &span.text);
}
if is_alpha_dominant(&span.text) {
grid.row_alpha_spanner[row] = true;
}
let covered: Vec<usize> = (c_lo..=c_hi).collect();
// Prefer splitting the span into per-column pieces; piece `k` goes to column `c_lo + k`.
if let Some(pieces) = split_span_at_anchors(span, &covered, xs) {
for (k, piece) in pieces.iter().enumerate() {
grid.push_text(row, c_lo + k, piece);
if !line.all_bold {
grid.is_bold[row][c_lo + k] = false;
}
}
} else {
// Otherwise the whole span lands in the column under its centre.
let cx = (span.x + span.width * 0.5).clamp(xs[0], xs[n_cols]);
if let Some(col) = find_bucket(xs, cx) {
grid.push_text(row, col, &span.text);
if !line.all_bold {
grid.is_bold[row][col] = false;
}
}
}
}
consumed_indices.push(idx);
}
if consumed_indices.is_empty() {
if dbg {
eprintln!("[ruled] REJECT no-lines-consumed");
}
return None;
}
let straddle_frac = if span_total > 0 {
span_straddle as f32 / span_total as f32
} else {
0.0
};
if dbg {
eprintln!("[ruled] straddle {span_straddle}/{span_total} = {straddle_frac:.2}");
}
// Too many spans crossing column boundaries: reject the grid for this text.
if span_total >= 6 && straddle_frac > 0.45 {
if dbg {
eprintln!("[ruled] REJECT straddle-frac {straddle_frac:.2}");
}
return None;
}
Some((grid, consumed_indices))
}
/// Flattens a multi-row header band that contains column-spanning labels into one header row.
///
/// `b` is the first row whose fill fraction reaches `TABLE_ROW_MIN_FILL`. The band qualifies
/// when `b` is 1, 2 or 3, at least one row follows it, some row above it is flagged in
/// `row_alpha_spanner`, at least half of its filled cells are alpha-dominant, and none of its
/// filled cells is value-like.
///
/// The header for each column joins the replicated texts (`cells_repl`) of rows `0..=b`,
/// skipping empty strings and consecutive repeats. Returns the header and the index of the
/// first body row (`b + 1`), or `None` when the band does not qualify or the header is empty.
fn flatten_header_band(
    cells: &[Vec<String>],
    cell_has_text: &[Vec<bool>],
    cells_repl: &[Vec<String>],
    row_alpha_spanner: &[bool],
    n_rows: usize,
    n_cols: usize,
    dbg: bool,
) -> Option<(Vec<String>, usize)> {
    let filled_in = |r: usize| cell_has_text[r].iter().filter(|&&t| t).count();
    let b = (0..n_rows).find(|&r| filled_in(r) as f32 / n_cols as f32 >= TABLE_ROW_MIN_FILL)?;

    let nonempty = filled_in(b);
    let alpha_cells = (0..n_cols)
        .filter(|&c| cell_has_text[b][c] && is_alpha_dominant(&cells[b][c]))
        .count();
    let has_value_cell = (0..n_cols).any(|c| cell_has_text[b][c] && is_value_like(&cells[b][c]));

    if !(1..=3).contains(&b) || b + 1 >= n_rows {
        return None;
    }
    if !(0..b).any(|r| row_alpha_spanner[r]) {
        return None;
    }
    if alpha_cells * 2 < nonempty || has_value_cell {
        return None;
    }

    let header: Vec<String> = (0..n_cols)
        .map(|c| {
            let mut parts: Vec<&str> = cells_repl
                .iter()
                .take(b + 1)
                .map(|row| row[c].as_str())
                .filter(|s| !s.is_empty())
                .collect();
            // A spanning label repeated on consecutive rows is kept once.
            parts.dedup();
            parts.join(" ")
        })
        .collect();
    if header.iter().all(|h| h.is_empty()) {
        return None;
    }
    if dbg {
        eprintln!("[ruled] colspan header flatten: rows 0..={b} -> {header:?}");
    }
    Some((header, b + 1))
}
/// Merges a stack of short, partially filled header rows at the top of the grid into one row.
///
/// `k` is the index of the first row whose fill fraction reaches `TABLE_ROW_MIN_FILL` (0 when
/// there is none). The top `k` rows are merged only when all of the following hold:
/// * the header was not already flattened (`flattened` is `false`);
/// * `2 <= k < n_rows`;
/// * each of the top `k` rows is at most 75% as tall as the median of the rows from `k` on;
/// * the top `k` rows together fill at least 70% of the columns.
///
/// Returns `(cells, has_text, is_bold, n_rows, merged)`. When nothing is merged the inputs are
/// handed back unchanged with `merged == false`. Otherwise row 0 holds the space-joined texts
/// of the merged rows, and a merged cell is bold only if every contributing cell was.
#[allow(clippy::type_complexity)]
fn merge_stacked_header(
    cells: Vec<Vec<String>>,
    cell_has_text: Vec<Vec<bool>>,
    cell_is_bold: Vec<Vec<bool>>,
    n_rows: usize,
    n_cols: usize,
    kept_row_heights: &[f32],
    flattened: bool,
    dbg: bool,
) -> (
    Vec<Vec<String>>,
    Vec<Vec<bool>>,
    Vec<Vec<bool>>,
    usize,
    bool,
) {
    let fill_fraction =
        |r: usize| cell_has_text[r].iter().filter(|&&t| t).count() as f32 / n_cols as f32;
    let k = (0..n_rows)
        .find(|&r| fill_fraction(r) >= TABLE_ROW_MIN_FILL)
        .unwrap_or(0);
    let union_cols = (0..n_cols)
        .filter(|&c| (0..k).any(|r| cell_has_text[r][c]))
        .count();

    let mut band_tight = false;
    if k >= 2 && k < kept_row_heights.len() {
        let mut below: Vec<f32> = kept_row_heights[k..].to_vec();
        below.sort_by(|lhs, rhs| lhs.total_cmp(rhs));
        let median_below = below[below.len() / 2];
        band_tight = kept_row_heights[..k]
            .iter()
            .all(|&h| h <= 0.75 * median_below);
    }

    let covers_enough = (union_cols as f32) >= 0.7 * n_cols as f32;
    if flattened || k < 2 || k >= n_rows || !band_tight || !covers_enough {
        return (cells, cell_has_text, cell_is_bold, n_rows, false);
    }
    if dbg {
        eprintln!("[ruled] stacked-header merge: top {k} rows → 1");
    }

    let mut merged_row = vec![String::new(); n_cols];
    let mut merged_has = vec![false; n_cols];
    let mut merged_bold = vec![true; n_cols];
    let band = cells[..k]
        .iter()
        .zip(&cell_has_text[..k])
        .zip(&cell_is_bold[..k]);
    for ((texts, filled), bold) in band {
        for c in 0..n_cols {
            if !filled[c] {
                continue;
            }
            let target = &mut merged_row[c];
            if !target.is_empty() {
                target.push(' ');
            }
            target.push_str(&texts[c]);
            merged_has[c] = true;
            if !bold[c] {
                merged_bold[c] = false;
            }
        }
    }

    // Merged row first, followed by the untouched rows from `k` onwards.
    let new_cells: Vec<Vec<String>> = std::iter::once(merged_row)
        .chain(cells.into_iter().skip(k))
        .collect();
    let new_has: Vec<Vec<bool>> = std::iter::once(merged_has)
        .chain(cell_has_text.into_iter().skip(k))
        .collect();
    let new_bold: Vec<Vec<bool>> = std::iter::once(merged_bold)
        .chain(cell_is_bold.into_iter().skip(k))
        .collect();
    let new_row_count = new_cells.len();
    (new_cells, new_has, new_bold, new_row_count, true)
}
/// Decides whether a candidate grid is filled densely enough to be reported as a table.
///
/// * Grids with at most `TABLE_MAX_EMPTY_CELL_FRACTION` empty cells always pass.
/// * Sparser grids pass only with one of these excuses:
///   - a first-column "spine": column 0 filled in at least `TABLE_SPINE_FILL_FRACTION` of the
///     rows and no cell longer than `TABLE_SPINE_MAX_CELL_CHARS` characters;
///   - a "long prose" table: at least 5 rows and 3 columns, at least 3 columns covered by an
///     all-bold header band (the first up to 4 rows), and at least one non-first column
///     filled in at least `TABLE_SPINE_FILL_FRACTION` of the rows;
///   - a header that was already flattened (`flattened`).
/// * Even then, more than `TABLE_MAX_EMPTY_CELL_FRACTION_WITH_SPINE` empty cells is rejected
///   unless the long-prose rule applies.
///
/// Cell length is measured in characters (Unicode scalar values), not UTF-8 bytes, so the
/// spine limit applies equally to non-ASCII text.
///
/// `cells`, `cell_has_text` and `cell_is_bold` are indexed `[row][col]` and are expected to
/// hold `n_rows` x `n_cols` entries with at least one column. When `dbg` is set, rejections
/// are logged to stderr.
fn passes_density_gate(
    cells: &[Vec<String>],
    cell_has_text: &[Vec<bool>],
    cell_is_bold: &[Vec<bool>],
    n_rows: usize,
    n_cols: usize,
    flattened: bool,
    dbg: bool,
) -> bool {
    let total = n_rows * n_cols;
    let empty_count = cell_has_text
        .iter()
        .flatten()
        .filter(|filled| !**filled)
        .count();
    let empty_frac = (empty_count as f32) / (total as f32);
    if empty_frac <= TABLE_MAX_EMPTY_CELL_FRACTION {
        return true;
    }

    // Spine test: a well-filled first column made of short labels.
    let col0_fill = (0..n_rows).filter(|r| cell_has_text[*r][0]).count() as f32 / n_rows as f32;
    let col0_max_chars = (0..n_rows)
        .filter(|r| cell_has_text[*r][0])
        .map(|r| cells[r][0].chars().count())
        .max()
        .unwrap_or(0);
    let col0_spine =
        col0_fill >= TABLE_SPINE_FILL_FRACTION && col0_max_chars <= TABLE_SPINE_MAX_CELL_CHARS;

    // Long-prose test: inspect the header band made of the first (up to) four rows.
    let header_band = n_rows.min(4);
    let mut header_cols_covered = vec![false; n_cols];
    let mut header_all_bold = true;
    for r in 0..header_band {
        for c in 0..n_cols {
            if cell_has_text[r][c] {
                header_cols_covered[c] = true;
                if !cell_is_bold[r][c] {
                    header_all_bold = false;
                }
            }
        }
    }
    let header_coverage = header_cols_covered.iter().filter(|t| **t).count();
    let dense_inner_col = (1..n_cols).any(|c| {
        let col_fill = (0..n_rows).filter(|r| cell_has_text[*r][c]).count() as f32 / n_rows as f32;
        col_fill >= TABLE_SPINE_FILL_FRACTION
    });
    let long_prose_table =
        n_rows >= 5 && n_cols >= 3 && header_coverage >= 3 && header_all_bold && dense_inner_col;

    if !col0_spine && !long_prose_table && !flattened {
        if dbg {
            let fills: Vec<usize> = (0..n_rows)
                .map(|r| cell_has_text[r].iter().filter(|t| **t).count())
                .collect();
            eprintln!(
                "[ruled] REJECT empty-frac {empty_frac:.2} ({n_rows}x{n_cols}, no spine/long-prose) row_fills={fills:?}"
            );
        }
        return false;
    }
    if empty_frac > TABLE_MAX_EMPTY_CELL_FRACTION_WITH_SPINE && !long_prose_table {
        if dbg {
            eprintln!("[ruled] REJECT empty-frac-with-spine {empty_frac:.2}");
        }
        return false;
    }
    true
}
/// Builds a table from one connected grid component of rule segments.
///
/// `h_indices` / `v_indices` select the component's segments out of `hs` / `vs`. The steps are:
/// derive column and row boundaries, reject grids that are too small or cover too much of the
/// page, assign line text to cells, collapse phantom rows and columns, optionally flatten or
/// merge a multi-row header, apply the density gate, and finally split header from body.
///
/// Returns the table run together with the indices (into `lines`) of the consumed lines, or
/// `None` when any stage rejects the grid. `start` / `end` of the run span the smallest to
/// the largest consumed line index (end exclusive).
fn build_ruled_table(
hs: &[HSeg],
vs: &[VSeg],
h_indices: &[usize],
v_indices: &[usize],
lines: &[ProjectedLine],
page_width: f32,
page_height: f32,
pass: RuledPass,
) -> Option<(TableRun, Vec<usize>)> {
let dbg = *super::flags::DEBUG_RULED;
// Column boundaries: x positions of the vertical rules, with near-duplicates averaged.
let mut xs: Vec<f32> = v_indices.iter().map(|&i| vs[i].x).collect();
xs.sort_by(|a, b| a.total_cmp(b));
cluster_boundaries(&mut xs, TABLE_COL_BOUNDARY_CLUSTER_PT);
// Row boundaries: sorted, de-duplicated y positions of the given horizontal rules.
let raw_ys = |idxs: &[usize]| {
let mut v: Vec<f32> = idxs.iter().map(|&i| hs[i].y).collect();
v.sort_by(|a, b| a.total_cmp(b));
dedup_close(&mut v, TABLE_GRID_CLUSTER_PT);
v
};
// In the global pass, only rules overlapping enough of the column extent count as row
// boundaries, unless that leaves fewer than three of them.
let ys: Vec<f32> = if pass == RuledPass::Global && xs.len() >= 2 {
let col_lo = xs[0];
let col_hi = xs[xs.len() - 1];
let extent = (col_hi - col_lo).max(1.0);
let kept: Vec<usize> = h_indices
.iter()
.copied()
.filter(|&i| {
let h = &hs[i];
let ov = (h.x_max.min(col_hi) - h.x_min.max(col_lo)).max(0.0);
ov / extent >= RULED_HLINE_MIN_COVERAGE
})
.collect();
let filtered = raw_ys(&kept);
if filtered.len() >= 3 {
filtered
} else {
raw_ys(h_indices)
}
} else {
raw_ys(h_indices)
};
if dbg {
eprintln!(
"[ruled] component: ys={:?} xs={:?} ({} lines in scope)",
ys,
xs,
lines.len()
);
}
// Need at least two rows (three row boundaries) and one column (two column boundaries).
if ys.len() < 3 || xs.len() < 2 {
if dbg {
eprintln!(
"[ruled] REJECT grid-too-small ys={} xs={}",
ys.len(),
xs.len()
);
}
return None;
}
let n_rows = ys.len() - 1;
let n_cols = xs.len() - 1;
let bbox = crate::types::Rect {
x: xs[0],
y: ys[0],
width: xs[n_cols] - xs[0],
height: ys[n_rows] - ys[0],
};
// Reject grids whose area covers more than TABLE_MAX_PAGE_COVERAGE of the page.
if page_width > 0.0 && page_height > 0.0 {
let coverage = (bbox.width / page_width) * (bbox.height / page_height);
if coverage > TABLE_MAX_PAGE_COVERAGE {
if dbg {
eprintln!("[ruled] REJECT page-coverage {coverage:.2}");
}
return None;
}
}
let (mut grid, consumed_indices) = assign_cells(lines, &xs, &ys, dbg)?;
// From here on `n_rows` / `n_cols` are re-read from the grid after each collapse.
let kept_row_heights = grid.collapse_phantom_rows(&ys);
let n_rows = grid.n_rows();
if n_rows < 2 {
if dbg {
eprintln!("[ruled] REJECT rows-after-collapse {n_rows}");
}
return None;
}
grid.collapse_phantom_cols(&xs);
let n_cols = grid.n_cols();
if n_cols == 0 {
return None;
}
// A single-column grid needs at least three rows to be kept.
if n_cols == 1 && n_rows < 3 {
return None;
}
let CellGrid {
text: cells,
is_bold: cell_is_bold,
has_text: cell_has_text,
repl: cells_repl,
row_alpha_spanner,
} = grid;
// Header handling, attempt 1: flatten a band with column-spanning labels.
let flattened_header = flatten_header_band(
&cells,
&cell_has_text,
&cells_repl,
&row_alpha_spanner,
n_rows,
n_cols,
dbg,
);
// Header handling, attempt 2: merge stacked short header rows (skipped when flattened).
let (cells, cell_has_text, cell_is_bold, n_rows, merged_stacked_header) = merge_stacked_header(
cells,
cell_has_text,
cell_is_bold,
n_rows,
n_cols,
&kept_row_heights,
flattened_header.is_some(),
dbg,
);
if !passes_density_gate(
&cells,
&cell_has_text,
&cell_is_bold,
n_rows,
n_cols,
flattened_header.is_some(),
dbg,
) {
return None;
}
// Row 0 is a header when it came from a stacked merge, or when it has text and every
// filled cell in it is bold.
let header_qualifies = merged_stacked_header
|| (cell_has_text[0]
.iter()
.zip(cell_is_bold[0].iter())
.all(|(has, bold)| !has || *bold)
&& cell_has_text[0].iter().any(|has| *has));
// `body_start` here is a row index into `cells`, not a line index.
let (header, body_start) = match flattened_header {
Some((h, bs)) => (Some(h), bs),
None if header_qualifies => (Some(cells[0].clone()), 1),
None => (None, 0),
};
let body_rows: Vec<Vec<String>> = cells[body_start..].to_vec();
if body_rows.is_empty() {
return None;
}
// `assign_cells` returns `None` when nothing was consumed, so min/max exist here.
let start = *consumed_indices.iter().min().unwrap();
let end = *consumed_indices.iter().max().unwrap() + 1;
Some((
TableRun {
start,
end,
body_start: start,
block: Block::Table {
header,
rows: body_rows,
},
},
consumed_indices,
))
}
/// Replaces runs of nearby values in `v` (expected in ascending order) by their mean.
///
/// A value joins the current run when it is at most `tol` above the value just before it, so
/// a run can be longer than `tol` as long as each step stays within tolerance. Vectors with
/// fewer than two values are left untouched.
fn cluster_boundaries(v: &mut Vec<f32>, tol: f32) {
    let Some((&first, rest)) = v.split_first() else {
        return;
    };
    if rest.is_empty() {
        return;
    }
    let mut means: Vec<f32> = Vec::with_capacity(v.len());
    let (mut run_sum, mut run_len, mut prev) = (first, 1usize, first);
    for &x in rest {
        if x - prev <= tol {
            run_sum += x;
            run_len += 1;
        } else {
            means.push(run_sum / run_len as f32);
            run_sum = x;
            run_len = 1;
        }
        prev = x;
    }
    means.push(run_sum / run_len as f32);
    *v = means;
}
/// Drops every value of `v` that lies within `tol` of the most recently kept value.
///
/// The first value is always kept; the comparison is against the last *kept* value, not the
/// immediately preceding one. Intended for values already in ascending order.
fn dedup_close(v: &mut Vec<f32>, tol: f32) {
    // `dedup_by` passes the candidate first and the last retained element second.
    v.dedup_by(|candidate, kept| (*candidate - *kept).abs() <= tol);
}
/// Returns the index of the first interval `[boundaries[i], boundaries[i + 1]]` containing
/// `val`, with both ends inclusive.
///
/// A value sitting exactly on an inner boundary therefore belongs to the lower interval.
/// Returns `None` when there are fewer than two boundaries or `val` lies outside the range
/// from the first to the last boundary.
fn find_bucket(boundaries: &[f32], val: f32) -> Option<usize> {
    let lowest = *boundaries.first()?;
    let highest = *boundaries.last()?;
    if boundaries.len() < 2 || val < lowest || val > highest {
        return None;
    }
    boundaries
        .windows(2)
        .position(|pair| val >= pair[0] && val <= pair[1])
}
/// Returns the bounding rectangle of every ruled grid found in `graphics`.
///
/// Rule segments are extracted, clustered and grouped into connected components; each
/// component's rectangle spans the positions of its horizontal rules (vertically) and of its
/// vertical rules (horizontally). Components narrower or shorter than 5.0 are skipped, as are
/// components spanning at least `TABLE_MAX_PAGE_COVERAGE` of both page dimensions (only
/// checked when both page dimensions are positive).
pub fn detect_table_rects(
    graphics: &[GraphicPrimitive],
    page_width: f32,
    page_height: f32,
) -> Vec<Rect> {
    let (raw_hs, raw_vs) = extract_h_v_segments(graphics);
    let hs = cluster_h_segments(raw_hs);
    let vs = cluster_v_segments(raw_vs);
    if hs.len() < 2 || vs.len() < 2 {
        return Vec::new();
    }

    let page_known = page_width > 0.0 && page_height > 0.0;
    let mut rects = Vec::new();
    for (h_idx, v_idx) in find_grid_components(&hs, &vs) {
        let (y_min, y_max) = h_idx.iter().map(|&i| hs[i].y).fold(
            (f32::INFINITY, f32::NEG_INFINITY),
            |(lo, hi), y| (lo.min(y), hi.max(y)),
        );
        let (x_min, x_max) = v_idx.iter().map(|&i| vs[i].x).fold(
            (f32::INFINITY, f32::NEG_INFINITY),
            |(lo, hi), x| (lo.min(x), hi.max(x)),
        );
        let width = x_max - x_min;
        let height = y_max - y_min;
        if width < 5.0 || height < 5.0 {
            continue;
        }
        // A grid spanning (almost) the entire page in both directions is skipped.
        let spans_page = page_known
            && width / page_width >= TABLE_MAX_PAGE_COVERAGE
            && height / page_height >= TABLE_MAX_PAGE_COVERAGE;
        if spans_page {
            continue;
        }
        rects.push(Rect {
            x: x_min,
            y: y_min,
            width,
            height,
        });
    }
    rects
}
/// Shared implementation behind the ruled-table detectors.
///
/// Extracts and clusters horizontal/vertical segments from `graphics`, groups
/// them into grid components, and asks `build_ruled_table` to turn each
/// component into a table using `lines`. Components for which it returns
/// `None` are skipped. The result pairs each run with the `Vec<usize>` that
/// `build_ruled_table` produced (presumably the indices of the lines the table
/// consumed -- confirm against `build_ruled_table`), sorted by the run's
/// `start`.
///
/// Returns an empty vector when fewer than two horizontal or fewer than two
/// vertical clustered segments exist.
fn detect_ruled_tables_impl(
    lines: &[ProjectedLine],
    graphics: &[GraphicPrimitive],
    page_width: f32,
    page_height: f32,
    pass: RuledPass,
) -> Vec<(TableRun, Vec<usize>)> {
    let (raw_h, raw_v) = extract_h_v_segments(graphics);
    let hs = cluster_h_segments(raw_h);
    let vs = cluster_v_segments(raw_v);
    if hs.len() < 2 || vs.len() < 2 {
        return Vec::new();
    }
    let mut tables: Vec<(TableRun, Vec<usize>)> = find_grid_components(&hs, &vs)
        .into_iter()
        .filter_map(|(h_idx, v_idx)| {
            build_ruled_table(
                &hs,
                &vs,
                &h_idx,
                &v_idx,
                lines,
                page_width,
                page_height,
                pass,
            )
        })
        .collect();
    // Stable sort: tables sharing a start index keep their discovery order.
    tables.sort_by_key(|(run, _)| run.start);
    tables
}
/// Detects ruled tables using `RuledPass::PerRegion` and returns only the runs.
///
/// The per-table index lists produced by the shared implementation are
/// discarded; order (sorted by `start`) is preserved.
pub(super) fn detect_ruled_tables(
    lines: &[ProjectedLine],
    graphics: &[GraphicPrimitive],
    page_width: f32,
    page_height: f32,
) -> Vec<TableRun> {
    let detected = detect_ruled_tables_impl(
        lines,
        graphics,
        page_width,
        page_height,
        RuledPass::PerRegion,
    );
    let mut runs = Vec::with_capacity(detected.len());
    for (run, _consumed) in detected {
        runs.push(run);
    }
    runs
}
/// Detects ruled tables using `RuledPass::Global`.
///
/// Unlike `detect_ruled_tables`, each run is returned together with the
/// `Vec<usize>` produced alongside it (presumably the consumed line indices --
/// confirm against `build_ruled_table`).
pub(super) fn detect_ruled_tables_global(
    lines: &[ProjectedLine],
    graphics: &[GraphicPrimitive],
    page_width: f32,
    page_height: f32,
) -> Vec<(TableRun, Vec<usize>)> {
    let pass = RuledPass::Global;
    detect_ruled_tables_impl(lines, graphics, page_width, page_height, pass)
}
/// Counts the cells of a table run that contain non-whitespace text.
///
/// Header cells (when a header exists) and body cells are counted together.
/// Any block other than `Block::Table` counts as zero.
fn run_filled_cells(run: &TableRun) -> usize {
    let Block::Table { header, rows } = &run.block else {
        return 0;
    };
    header
        .iter()
        .flatten()
        .chain(rows.iter().flatten())
        .filter(|cell| !cell.trim().is_empty())
        .count()
}
/// Reconciles ruled and borderless table detections into one ordered list.
///
/// A ruled run is discarded when an overlapping borderless run wins by either
/// rule:
/// * the ruled run is a `Block::Table` whose first body row has at most one
///   cell (or it has no rows) and the borderless run is a `Block::Table` whose
///   first row has two or more cells;
/// * the borderless run has at least `2 * filled + 4` non-empty cells, where
///   `filled` is the ruled run's non-empty cell count.
///
/// Surviving ruled runs are kept. Borderless runs are then added in order,
/// each only if it overlaps nothing kept so far (including borderless runs
/// added earlier). The result is sorted by `start`.
pub(super) fn merge_table_runs(
    mut ruled: Vec<TableRun>,
    borderless: Vec<TableRun>,
) -> Vec<TableRun> {
    // True when the half-open `[start, end)` ranges of the two runs intersect.
    fn ranges_intersect(a: &TableRun, b: &TableRun) -> bool {
        a.end > b.start && a.start < b.end
    }

    // Cell count of the first body row for `Block::Table` runs (0 when there
    // are no rows); `None` for every other block kind.
    fn first_row_width(run: &TableRun) -> Option<usize> {
        match &run.block {
            Block::Table { rows, .. } => Some(rows.first().map_or(0, |row| row.len())),
            _ => None,
        }
    }

    let mut kept: Vec<TableRun> = Vec::with_capacity(ruled.len());
    for candidate in ruled.drain(..) {
        let single_column = first_row_width(&candidate).is_some_and(|width| width <= 1);
        if single_column {
            let outclassed = borderless.iter().any(|rival| {
                ranges_intersect(rival, &candidate)
                    && first_row_width(rival).is_some_and(|width| width >= 2)
            });
            if outclassed {
                continue;
            }
        }

        let ruled_density = run_filled_cells(&candidate);
        let out_dense = borderless.iter().any(|rival| {
            ranges_intersect(rival, &candidate)
                && run_filled_cells(rival) >= ruled_density * 2 + 4
        });
        if out_dense {
            continue;
        }

        kept.push(candidate);
    }

    // `kept` grows while this loop runs, so each borderless run is also
    // checked against borderless runs accepted before it.
    for candidate in borderless {
        if kept
            .iter()
            .all(|winner| !ranges_intersect(&candidate, winner))
        {
            kept.push(candidate);
        }
    }

    kept.sort_by_key(|run| run.start);
    kept
}
/// Escapes cell text so it can sit inside a single pipe-delimited table row.
///
/// * `\` becomes `\\` and `|` becomes `\|`, so neither is read as an escape
///   or a column separator.
/// * Every line ending -- `\n`, `\r\n`, or a lone `\r` -- becomes one space,
///   because any of them would otherwise terminate the row.
///
/// The input is walked once and the output is built in a single allocation.
pub(super) fn escape_table_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push(' '),
            '\r' => {
                // Treat CRLF as one line ending so it yields a single space.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                escaped.push(' ');
            }
            other => escaped.push(other),
        }
    }
    escaped
}
#[cfg(test)]
mod tests {
use super::super::test_helpers::{line, line_with_spans, rect_borders, stroke};
use super::*;
// Three spans whose x positions are 100 units apart on one line must come out
// as three separate cells, in left-to-right order.
// NOTE(review): the trailing `100.0, 10.0` arguments are presumably the line's
// y position and font size -- confirm against `test_helpers::line_with_spans`.
#[test]
fn split_cells_splits_on_wide_gaps() {
let l = line_with_spans(&[("A", 50.0), ("B", 150.0), ("C", 250.0)], 100.0, 10.0);
let cells = split_cells(&l);
assert_eq!(cells.len(), 3);
assert_eq!(cells[0].text, "A");
assert_eq!(cells[1].text, "B");
assert_eq!(cells[2].text, "C");
}
// A two-cell row checked against three column tracks: the first cell holds
// two whitespace-separated tokens and must be split so the row ends up with
// one cell per track.
#[test]
fn recover_merged_cell_splits_off_by_one() {
let row = vec![
TableCell {
start_x: 50.0,
end_x: 160.0,
text: "MEMORYBANK 5.00".into(),
bold: false,
},
TableCell {
start_x: 250.0,
end_x: 280.0,
text: "4.77".into(),
bold: false,
},
];
let tracks = vec![50.0, 150.0, 250.0];
let out = recover_merged_cell(row, &tracks).expect("recovery should succeed");
assert_eq!(out.len(), 3);
assert_eq!(out[0].text, "MEMORYBANK");
assert_eq!(out[1].text, "5.00");
assert_eq!(out[2].text, "4.77");
}
// Same scenario as the off-by-one case, but the row is two cells short of the
// four tracks: the first cell carries three tokens and must be split in three.
#[test]
fn recover_merged_cell_splits_off_by_two() {
let row = vec![
TableCell {
start_x: 50.0,
end_x: 260.0,
text: "MEMORYBANK 13.18 10.03".into(),
bold: false,
},
TableCell {
start_x: 350.0,
end_x: 380.0,
text: "7.61".into(),
bold: false,
},
];
let tracks = vec![50.0, 150.0, 250.0, 350.0];
let out = recover_merged_cell(row, &tracks).expect("recovery should succeed");
assert_eq!(out.len(), 4);
assert_eq!(out[0].text, "MEMORYBANK");
assert_eq!(out[1].text, "13.18");
assert_eq!(out[2].text, "10.03");
assert_eq!(out[3].text, "7.61");
}
// A single cell with no whitespace in its text cannot be split to fill the
// two tracks, so recovery must report failure with `None`.
#[test]
fn recover_merged_cell_bails_without_enough_whitespace() {
let row = vec![TableCell {
start_x: 50.0,
end_x: 200.0,
text: "ABC-DEF-GHI".into(),
bold: false,
}];
let tracks = vec![50.0, 150.0];
assert!(recover_merged_cell(row, &tracks).is_none());
}
// Two spans only 30 units apart in x must stay in one cell, with their texts
// joined by a single space.
#[test]
fn split_cells_keeps_close_spans_together() {
let l = line_with_spans(&[("Hello", 50.0), ("world", 80.0)], 100.0, 10.0);
let cells = split_cells(&l);
assert_eq!(cells.len(), 1);
assert_eq!(cells[0].text, "Hello world");
}
// A two-cell line directly above three 3-column rows is expected to be pulled
// into the run as its header, padded with an empty string for the column it
// does not cover.
#[test]
fn absorbs_partial_header_line_above_body() {
let lines = vec![
line_with_spans(&[("Name", 50.0), ("Scores", 150.0)], 100.0, 10.0),
line_with_spans(&[("A", 50.0), ("1", 150.0), ("2", 250.0)], 115.0, 10.0),
line_with_spans(&[("B", 50.0), ("3", 150.0), ("4", 250.0)], 130.0, 10.0),
line_with_spans(&[("C", 50.0), ("5", 150.0), ("6", 250.0)], 145.0, 10.0),
];
let runs = detect_tables(&lines);
assert_eq!(runs.len(), 1);
let run = &runs[0];
assert_eq!(run.start, 0, "header line should be absorbed into the run");
assert_eq!(run.end, 4);
match &run.block {
Block::Table { header, rows } => {
let header = header.as_ref().expect("header should be present");
assert_eq!(
header,
&vec!["Name".to_string(), "Scores".to_string(), String::new()]
);
assert_eq!(rows.len(), 3);
}
other => panic!("expected Block::Table, got {other:?}"),
}
}
// Counterpart to the partial-header case: a line with only one cell above the
// body must not be absorbed, so the run has to start at index 1.
#[test]
fn does_not_absorb_single_cell_title_above_body() {
let lines = vec![
line_with_spans(&[("Results", 50.0)], 100.0, 10.0),
line_with_spans(&[("A", 50.0), ("1", 150.0), ("2", 250.0)], 115.0, 10.0),
line_with_spans(&[("B", 50.0), ("3", 150.0), ("4", 250.0)], 130.0, 10.0),
line_with_spans(&[("C", 50.0), ("5", 150.0), ("6", 250.0)], 145.0, 10.0),
];
let runs = detect_tables(&lines);
assert_eq!(runs.len(), 1);
assert_eq!(
runs[0].start, 1,
"single-cell title must stay out of the run"
);
}
// A lone three-column line must not be reported as a table.
// NOTE(review): presumably enforced through `TABLE_MIN_ROWS` (2) -- confirm in
// `detect_tables`.
#[test]
fn rejects_table_when_row_count_too_low() {
let lines = vec![line_with_spans(
&[("A", 50.0), ("B", 150.0), ("C", 250.0)],
100.0,
10.0,
)];
let runs = detect_tables(&lines);
assert!(runs.is_empty());
}
// Two rows with only two columns each must not be reported as a table.
// NOTE(review): presumably enforced through `TABLE_MIN_COLUMNS` (3) -- confirm
// in `detect_tables`.
#[test]
fn rejects_table_when_column_count_too_low() {
let lines = vec![
line_with_spans(&[("A", 50.0), ("B", 200.0)], 100.0, 10.0),
line_with_spans(&[("C", 50.0), ("D", 200.0)], 115.0, 10.0),
];
let runs = detect_tables(&lines);
assert!(runs.is_empty());
}
// A literal pipe inside cell text must be emitted as `\|`.
#[test]
fn escapes_pipe_inside_cell() {
assert_eq!(escape_table_cell("a|b"), "a\\|b");
}
// Three horizontal strokes (y = 100, 140, 180) and three vertical strokes
// (x = 50, 150, 250) are drawn, with one text line per cell. Expected result:
// a single 2x2 table with no header, since no line is bold.
// NOTE(review): argument meanings of `stroke` and `line` are inferred from the
// values used here -- confirm against `test_helpers`.
#[test]
fn ruled_table_2x2_detected() {
let mut graphics = Vec::new();
for y in [100.0_f32, 140.0, 180.0] {
graphics.push(stroke(50.0, y, 250.0, y, 0.5));
}
for x in [50.0_f32, 150.0, 250.0] {
graphics.push(stroke(x, 100.0, x, 180.0, 0.5));
}
let lines = vec![
line("a", 90.0, 115.0, 10.0, 10.0), line("b", 190.0, 115.0, 10.0, 10.0), line("c", 90.0, 155.0, 10.0, 10.0), line("d", 190.0, 155.0, 10.0, 10.0), ];
let runs = detect_ruled_tables(&lines, &graphics, 612.0, 792.0);
assert_eq!(runs.len(), 1, "expected 1 ruled table, got {runs:?}");
match &runs[0].block {
Block::Table { header, rows } => {
assert!(header.is_none(), "no bold first row → no header");
assert_eq!(rows.len(), 2);
assert_eq!(rows[0], vec!["a", "b"]);
assert_eq!(rows[1], vec!["c", "d"]);
}
other => panic!("expected Block::Table, got {other:?}"),
}
}
// Same 2x2 layout as the stroke-based test, but the grid is built from the
// borders of four rectangles; they must still be detected as one table.
#[test]
fn ruled_table_rect_borders_detected() {
let mut graphics = Vec::new();
graphics.extend(rect_borders(50.0, 100.0, 100.0, 40.0)); graphics.extend(rect_borders(150.0, 100.0, 100.0, 40.0)); graphics.extend(rect_borders(50.0, 140.0, 100.0, 40.0)); graphics.extend(rect_borders(150.0, 140.0, 100.0, 40.0));
let lines = vec![
line("a", 90.0, 115.0, 10.0, 10.0),
line("b", 190.0, 115.0, 10.0, 10.0),
line("c", 90.0, 155.0, 10.0, 10.0),
line("d", 190.0, 155.0, 10.0, 10.0),
];
let runs = detect_ruled_tables(&lines, &graphics, 612.0, 792.0);
assert_eq!(runs.len(), 1);
}
// A single 590x770 rectangle on a 612x792 page, with body text inside it,
// must not be reported as a table.
#[test]
fn ruled_table_page_border_rejected() {
let graphics = rect_borders(10.0, 10.0, 590.0, 770.0);
let lines = vec![line("body text", 50.0, 400.0, 10.0, 10.0)];
let runs = detect_ruled_tables(&lines, &graphics, 612.0, 792.0);
assert!(
runs.is_empty(),
"page-border rect should not become a table, got {runs:?}"
);
}
// Four strokes in each direction but only one text line: a grid that is
// almost entirely empty must not be reported as a table.
#[test]
fn ruled_table_mostly_empty_rejected() {
let mut graphics = Vec::new();
for y in [100.0_f32, 130.0, 160.0, 190.0] {
graphics.push(stroke(50.0, y, 350.0, y, 0.5));
}
for x in [50.0_f32, 150.0, 250.0, 350.0] {
graphics.push(stroke(x, 100.0, x, 190.0, 0.5));
}
let lines = vec![line("only", 90.0, 115.0, 10.0, 10.0)];
let runs = detect_ruled_tables(&lines, &graphics, 612.0, 792.0);
assert!(runs.is_empty());
}
// Same 2x2 grid as `ruled_table_2x2_detected`, but both first-row lines are
// marked `all_bold`. The first row must be promoted to the header, leaving a
// single body row.
#[test]
fn ruled_table_first_row_bold_becomes_header() {
let mut graphics = Vec::new();
for y in [100.0_f32, 140.0, 180.0] {
graphics.push(stroke(50.0, y, 250.0, y, 0.5));
}
for x in [50.0_f32, 150.0, 250.0] {
graphics.push(stroke(x, 100.0, x, 180.0, 0.5));
}
let mut a = line("Name", 90.0, 115.0, 10.0, 10.0);
let mut b = line("Score", 190.0, 115.0, 10.0, 10.0);
a.all_bold = true;
b.all_bold = true;
let lines = vec![
a,
b,
line("alice", 90.0, 155.0, 10.0, 10.0),
line("99", 190.0, 155.0, 10.0, 10.0),
];
let runs = detect_ruled_tables(&lines, &graphics, 612.0, 792.0);
assert_eq!(runs.len(), 1);
match &runs[0].block {
Block::Table { header, rows } => {
assert_eq!(
header.as_deref(),
Some(&["Name".into(), "Score".into()][..])
);
assert_eq!(rows.len(), 1);
assert_eq!(rows[0], vec!["alice", "99"]);
}
other => panic!("expected Block::Table, got {other:?}"),
}
}
// A one-column ruled table overlaps a borderless `GridFallback` run. Neither
// override rule in `merge_table_runs` applies (the rival is not a
// `Block::Table` and counts zero filled cells), so the ruled run is kept and
// the overlapping borderless run is dropped.
#[test]
fn merge_prefers_ruled_when_overlapping() {
let ruled = vec![TableRun {
start: 5,
end: 10,
body_start: 5,
block: Block::Table {
header: None,
rows: vec![vec!["ruled".into()]],
},
}];
let borderless = vec![TableRun {
start: 6,
end: 11,
body_start: 6,
block: Block::GridFallback {
lines: vec!["bl".into()],
},
}];
let merged = merge_table_runs(ruled, borderless);
assert_eq!(merged.len(), 1);
assert!(matches!(&merged[0].block, Block::Table { .. }));
}
// Test fixture: a line at vertical position `y` with `label` repeated in
// three spans at x = 50, 150 and 250.
fn three_col_line(label: &str, y: f32) -> ProjectedLine {
line_with_spans(&[(label, 50.0), (label, 150.0), (label, 250.0)], y, 10.0)
}
// Test fixture: a line at vertical position `y` with `label` repeated in
// four spans at x = 50, 150, 250 and 350.
fn four_col_line(label: &str, y: f32) -> ProjectedLine {
line_with_spans(
&[
(label, 50.0),
(label, 150.0),
(label, 250.0),
(label, 350.0),
],
y,
10.0,
)
}
// Test fixture: a three-span line occupying only the last three x positions
// of `four_col_line` (x = 150, 250 and 350), leaving x = 50 empty.
fn three_col_subset_line(label: &str, y: f32) -> ProjectedLine {
line_with_spans(&[(label, 150.0), (label, 250.0), (label, 350.0)], y, 10.0)
}
// Two adjacent runs (lines 0..2 and 2..5) with the same column count must be
// merged into one table: the first run's header is kept and the body rows are
// concatenated in order (1 + 3 = 4 rows).
#[test]
fn merge_same_column_count_concatenates_rows() {
let lines = vec![
three_col_line("h1", 10.0),
three_col_line("h2", 25.0),
three_col_line("b1", 40.0),
three_col_line("b2", 55.0),
three_col_line("b3", 70.0),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: Some(vec!["A".into(), "B".into(), "C".into()]),
rows: vec![vec!["1".into(), "2".into(), "3".into()]],
},
};
let b = TableRun {
start: 2,
end: 5,
body_start: 2,
block: Block::Table {
header: None,
rows: vec![
vec!["x".into(), "y".into(), "z".into()],
vec!["p".into(), "q".into(), "r".into()],
vec!["m".into(), "n".into(), "o".into()],
],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 1, "expected single merged run");
match &merged[0].block {
Block::Table { header, rows } => {
assert_eq!(header.as_deref().map(|h| h.len()), Some(3));
assert_eq!(rows.len(), 4);
assert_eq!(rows[0], vec!["1", "2", "3"]);
assert_eq!(rows[3], vec!["m", "n", "o"]);
}
other => panic!("expected Block::Table, got {other:?}"),
}
}
// A short three-column run whose lines sit over the last three columns of the
// following four-column run must be folded into that run's header. Its two
// rows are joined per column with a space, the uncovered first column gets an
// empty header cell, and the body rows are left unchanged.
#[test]
fn merge_subset_columns_folds_into_header() {
let lines = vec![
three_col_subset_line("2011", 10.0),
three_col_subset_line("(pct)", 25.0),
four_col_line("body", 40.0),
four_col_line("body", 55.0),
four_col_line("body", 70.0),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: None,
rows: vec![
vec!["2011".into(), "2010".into(), "Avg".into()],
vec!["(pct)".into(), "(pct)".into(), "(pct)".into()],
],
},
};
let b = TableRun {
start: 2,
end: 5,
body_start: 2,
block: Block::Table {
header: None,
rows: vec![
vec!["Q3".into(), "10".into(), "20".into(), "30".into()],
vec!["Q4".into(), "11".into(), "21".into(), "31".into()],
vec!["YR".into(), "12".into(), "22".into(), "32".into()],
],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 1);
match &merged[0].block {
Block::Table { header, rows } => {
let h = header.as_deref().expect("expected header");
assert_eq!(h.len(), 4);
assert_eq!(h[0], "");
assert_eq!(h[1], "2011 (pct)");
assert_eq!(h[2], "2010 (pct)");
assert_eq!(h[3], "Avg (pct)");
assert_eq!(rows.len(), 3);
assert_eq!(rows[0], vec!["Q3", "10", "20", "30"]);
}
other => panic!("expected Block::Table, got {other:?}"),
}
}
// The runs are index-adjacent (0..2 and 2..4) but the second run's lines start
// at y = 200 while the first run ends at y = 25. With that vertical gap the
// runs must stay separate.
#[test]
fn merge_skips_distant_runs() {
let lines = vec![
three_col_line("h1", 10.0),
three_col_line("h2", 25.0),
three_col_line("b1", 200.0), three_col_line("b2", 215.0),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: Some(vec!["A".into(), "B".into(), "C".into()]),
rows: vec![vec!["1".into(), "2".into(), "3".into()]],
},
};
let b = TableRun {
start: 2,
end: 4,
body_start: 2,
block: Block::Table {
header: None,
rows: vec![
vec!["x".into(), "y".into(), "z".into()],
vec!["p".into(), "q".into(), "r".into()],
],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 2, "distant runs should not merge");
}
// A ten-row three-column run followed by a three-row four-column run: the
// long prior run must not be folded into the next run's header, so both runs
// survive.
#[test]
fn merge_skips_large_prior_run() {
let lines: Vec<ProjectedLine> = (0..10)
.map(|i| three_col_subset_line("x", 10.0 + i as f32 * 15.0))
.chain((0..3).map(|i| four_col_line("y", 160.0 + i as f32 * 15.0)))
.collect();
let a = TableRun {
start: 0,
end: 10,
body_start: 0,
block: Block::Table {
header: None,
rows: (0..10)
.map(|_| vec!["a".into(), "b".into(), "c".into()])
.collect(),
},
};
let b = TableRun {
start: 10,
end: 13,
body_start: 10,
block: Block::Table {
header: None,
rows: (0..3)
.map(|_| vec!["1".into(), "2".into(), "3".into(), "4".into()])
.collect(),
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 2, "large prior run should not be absorbed");
}
// A three-column run followed by a five-column run: a column-count difference
// of two must block the merge, so both runs survive.
#[test]
fn merge_skips_two_col_diff() {
let lines = vec![
three_col_subset_line("x", 10.0),
three_col_subset_line("y", 25.0),
line_with_spans(
&[
("a", 50.0),
("b", 150.0),
("c", 250.0),
("d", 350.0),
("e", 450.0),
],
40.0,
10.0,
),
line_with_spans(
&[
("a", 50.0),
("b", 150.0),
("c", 250.0),
("d", 350.0),
("e", 450.0),
],
55.0,
10.0,
),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: None,
rows: vec![
vec!["x".into(), "y".into(), "z".into()],
vec!["x".into(), "y".into(), "z".into()],
],
},
};
let b = TableRun {
start: 2,
end: 4,
body_start: 2,
block: Block::Table {
header: None,
rows: vec![
vec!["1".into(), "2".into(), "3".into(), "4".into(), "5".into()],
vec!["1".into(), "2".into(), "3".into(), "4".into(), "5".into()],
],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 2, "2-col difference should not merge");
}
// A `Block::Table` run followed immediately by a `Block::GridFallback` run:
// the fallback block must not be merged into the table.
#[test]
fn merge_grid_fallback_left_alone() {
let lines = vec![
three_col_line("a", 10.0),
three_col_line("b", 25.0),
three_col_line("c", 40.0),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: None,
rows: vec![
vec!["a".into(), "b".into(), "c".into()],
vec!["a".into(), "b".into(), "c".into()],
],
},
};
let b = TableRun {
start: 2,
end: 3,
body_start: 2,
block: Block::GridFallback {
lines: vec!["fallback".into()],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 2, "grid fallback should not be merged");
}
// Line index 2 belongs to neither run (0..2 and 3..5) and carries four spans.
// A multi-cell line sitting between two runs must prevent them from merging.
#[test]
fn merge_rejects_long_prose_interstitial() {
let lines = vec![
three_col_line("h", 10.0),
three_col_line("h", 25.0),
line_with_spans(
&[
("This", 50.0),
("is", 150.0),
("real", 250.0),
("content", 350.0),
],
40.0,
10.0,
),
three_col_line("b", 55.0),
three_col_line("b", 70.0),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: Some(vec!["A".into(), "B".into(), "C".into()]),
rows: vec![vec!["1".into(), "2".into(), "3".into()]],
},
};
let b = TableRun {
start: 3,
end: 5,
body_start: 3,
block: Block::Table {
header: None,
rows: vec![
vec!["x".into(), "y".into(), "z".into()],
vec!["p".into(), "q".into(), "r".into()],
],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 2, "multi-cell interstitial should not merge");
}
// Line index 2 belongs to neither run (0..2 and 3..6) and carries one span.
// The runs must still merge: the first run becomes the header and the
// single-cell line becomes the first body row, ahead of the three original
// body rows.
#[test]
fn merge_absorbs_single_cell_interstitial_as_body_row() {
let lines = vec![
three_col_subset_line("h", 10.0),
three_col_subset_line("h", 25.0),
line_with_spans(&[("Topsoil", 50.0)], 40.0, 10.0),
four_col_line("body", 55.0),
four_col_line("body", 70.0),
four_col_line("body", 85.0),
];
let a = TableRun {
start: 0,
end: 2,
body_start: 0,
block: Block::Table {
header: None,
rows: vec![
vec!["2011".into(), "2010".into(), "Avg".into()],
vec!["(pct)".into(), "(pct)".into(), "(pct)".into()],
],
},
};
let b = TableRun {
start: 3,
end: 6,
body_start: 3,
block: Block::Table {
header: None,
rows: vec![
vec!["Q3".into(), "10".into(), "20".into(), "30".into()],
vec!["Q4".into(), "11".into(), "21".into(), "31".into()],
vec!["YR".into(), "12".into(), "22".into(), "32".into()],
],
},
};
let merged = merge_consecutive_table_runs(vec![a, b], &lines);
assert_eq!(merged.len(), 1);
match &merged[0].block {
Block::Table { header, rows } => {
assert!(header.is_some());
assert_eq!(rows.len(), 4, "interstitial + 3 body rows");
assert_eq!(rows[0][0], "Topsoil");
assert_eq!(rows[1], vec!["Q3", "10", "20", "30"]);
}
other => panic!("expected Block::Table, got {other:?}"),
}
}
}