use crate::pdf::structure::text_repair::repair_broken_word_spacing;
use crate::pdf::structure::types::{LayoutHint, LayoutHintClass};
use crate::pdf::table_reconstruct::{is_well_formed_table, post_process_table, reconstruct_table, table_to_markdown};
use crate::types::Table;
use super::table_recognition::word_hint_iow;
/// Extracts tables from the page regions that layout detection flagged as tables.
///
/// Every hint of class [`LayoutHintClass::Table`] whose confidence is at least
/// `min_confidence` is processed independently:
///
/// 1. Non-blank words whose `word_hint_iow` score against the hint region is at
///    least `0.2` are collected. The hint's `top`/`bottom` are flipped against
///    `page_height` (clamped at zero) before the comparison.
/// 2. The words are turned into a cell grid by `reconstruct_table`, using the
///    column gap from `compute_adaptive_column_gap`, then cleaned by
///    `post_process_table`.
/// 3. The grid is rejected as a likely false positive when it has at most one
///    row, more than 55% blank cells, too little text, a majority of rows with
///    at most one non-blank cell, or fails `is_well_formed_table`.
///
/// Rejections are reported through `tracing::trace!` and never abort the page.
///
/// # Arguments
/// * `words` - candidate words of the page.
/// * `hints` - layout hints of the page; non-table hints are ignored.
/// * `page_index` - zero-based page index; used for logging and for the
///   one-based `page_number` of the produced tables.
/// * `page_height` - page height used to flip the hint's vertical coordinates.
/// * `min_confidence` - minimum hint confidence (inclusive) to consider a hint.
/// * `allow_single_column` - forwarded to `post_process_table`.
///
/// # Returns
/// One [`Table`] per accepted hint, in hint order. The table's bounding box is
/// the hint rectangle in the hint's own (unflipped) coordinates.
pub(in crate::pdf::structure) fn extract_tables_from_layout_hints(
    words: &[crate::pdf::table_reconstruct::HocrWord],
    hints: &[LayoutHint],
    page_index: usize,
    page_height: f32,
    min_confidence: f32,
    allow_single_column: bool,
) -> Vec<Table> {
    use crate::pdf::table_reconstruct::HocrWord;

    /// Fewer words than this cannot form a meaningful grid.
    const MIN_TABLE_WORDS: usize = 4;
    /// Share of blank cells above which the grid is considered degenerate.
    const MAX_EMPTY_CELL_RATIO: f64 = 0.55;
    /// Share of rows with at most one filled cell above which the grid is
    /// considered prose rather than a table.
    const MAX_SINGLE_CELL_ROW_RATIO: f64 = 0.5;

    let mut tables = Vec::new();

    // Iterating the filtered hints directly avoids a temporary Vec; with no
    // qualifying hint the loop simply never runs and an empty Vec is returned.
    for hint in hints
        .iter()
        .filter(|h| h.class == LayoutHintClass::Table && h.confidence >= min_confidence)
    {
        // Flip the hint's vertical extent against the page height so it can be
        // compared with the word boxes.
        let hint_img_top = (page_height - hint.top).max(0.0);
        let hint_img_bottom = (page_height - hint.bottom).max(0.0);

        let table_words: Vec<HocrWord> = words
            .iter()
            .filter(|w| {
                if w.text.trim().is_empty() {
                    return false;
                }
                word_hint_iow(w, hint.left, hint_img_top, hint.right, hint_img_bottom) >= 0.2
            })
            .cloned()
            .collect();
        if table_words.len() < MIN_TABLE_WORDS {
            continue;
        }

        let table_width = hint.right - hint.left;
        let col_gap = compute_adaptive_column_gap(&table_words, table_width);
        let table_cells = reconstruct_table(&table_words, col_gap, 0.5);
        if table_cells.is_empty() || table_cells[0].is_empty() {
            continue;
        }

        let bounding_box = Some(crate::types::BoundingBox {
            x0: hint.left as f64,
            y0: hint.bottom as f64,
            x1: hint.right as f64,
            y1: hint.top as f64,
        });

        let table_cells = match post_process_table(table_cells, true, allow_single_column) {
            Some(cleaned) => cleaned,
            None => {
                tracing::trace!(
                    page = page_index,
                    hint_left = hint.left,
                    hint_right = hint.right,
                    words = table_words.len(),
                    "table reconstruction failed — skipping false-positive Table hint"
                );
                continue;
            }
        };

        if table_cells.len() <= 1 {
            tracing::trace!(
                page = page_index,
                rows = table_cells.len(),
                "table has <=1 row — skipping likely false-positive Table hint"
            );
            continue;
        }

        let total_cells: usize = table_cells.iter().map(|row| row.len()).sum();
        let empty_cells: usize = table_cells
            .iter()
            .flat_map(|row| row.iter())
            .filter(|cell| cell.trim().is_empty())
            .count();
        if total_cells > 0 && empty_cells as f64 / total_cells as f64 > MAX_EMPTY_CELL_RATIO {
            // The message now states the threshold that is actually enforced
            // (it previously claimed 40%).
            tracing::trace!(
                page = page_index,
                total_cells,
                empty_cells,
                "table has >55% empty cells — skipping degenerate table"
            );
            continue;
        }

        // Larger grids must carry at least one byte of trimmed text per cell
        // on average.
        let total_text_len: usize = table_cells
            .iter()
            .flat_map(|row| row.iter())
            .map(|cell| cell.trim().len())
            .sum();
        if total_cells > 6 && total_text_len < total_cells {
            tracing::trace!(
                page = page_index,
                total_cells,
                total_text_len,
                "table text content too sparse — skipping degenerate table"
            );
            continue;
        }

        if table_cells.len() >= 3 {
            let single_cell_rows = table_cells
                .iter()
                .filter(|row| row.iter().filter(|c| !c.trim().is_empty()).count() <= 1)
                .count();
            if single_cell_rows as f64 / table_cells.len() as f64 > MAX_SINGLE_CELL_ROW_RATIO {
                tracing::trace!(
                    page = page_index,
                    rows = table_cells.len(),
                    single_cell_rows,
                    "table has >50% single-cell rows — skipping likely false-positive"
                );
                continue;
            }
        }

        if !is_well_formed_table(&table_cells) {
            tracing::trace!(
                page = page_index,
                rows = table_cells.len(),
                cols = table_cells.first().map_or(0, |r| r.len()),
                "table failed quality validation — skipping as prose"
            );
            continue;
        }

        // NOTE(review): the markdown is rendered from the spacing-repaired
        // copy while `cells` keeps the unrepaired text — confirm this
        // asymmetry is intended.
        let repaired_cells: Vec<Vec<String>> = table_cells
            .iter()
            .map(|row| {
                row.iter()
                    .map(|cell| repair_broken_word_spacing(cell).into_owned())
                    .collect()
            })
            .collect();
        let markdown = table_to_markdown(&repaired_cells);

        tracing::trace!(
            page = page_index,
            rows = table_cells.len(),
            total_cells,
            empty_cells,
            total_text_len,
            markdown_len = markdown.len(),
            "table accepted"
        );

        tables.push(Table {
            cells: table_cells,
            markdown,
            page_number: page_index + 1,
            bounding_box,
        });
    }

    tables
}
/// Estimates the horizontal gap that separates table columns.
///
/// With at least four words, the words are grouped into rows by vertical
/// centre (tolerance: half the median word height, but at least 3), each row
/// is ordered left to right, and the positive gaps between horizontally
/// adjacent words are collected. If at least three gaps are found, the result
/// is twice the median gap, clamped to `8..=40`.
///
/// Otherwise a fallback derived from `table_width` is returned: 10 below 200,
/// 15 below 400, 20 below 600, and 30 from there on.
///
/// The result is in the same units as the word coordinates.
fn compute_adaptive_column_gap(words: &[crate::pdf::table_reconstruct::HocrWord], table_width: f32) -> u32 {
    let mut gaps: Vec<u32> = Vec::new();

    if words.len() >= 4 {
        let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
        heights.sort_unstable();
        let median_h = heights[heights.len() / 2];
        let row_tolerance = (median_h / 2).max(3);

        // (vertical centre, left edge, right edge); saturating arithmetic keeps
        // degenerate boxes from overflowing.
        let mut sorted: Vec<(u32, u32, u32)> = words
            .iter()
            .map(|w| {
                let yc = w.top.saturating_add(w.height / 2);
                (yc, w.left, w.left.saturating_add(w.width))
            })
            .collect();
        sorted.sort_by_key(|&(yc, x, _)| (yc, x));

        let mut row_start = 0;
        while row_start < sorted.len() {
            let row_yc = sorted[row_start].0;
            let mut row_end = row_start + 1;
            while row_end < sorted.len() && sorted[row_end].0.abs_diff(row_yc) <= row_tolerance {
                row_end += 1;
            }

            // Words of one row may have slightly different vertical centres,
            // so the global (yc, x) order is not left-to-right inside the row.
            // Re-sort by x so gaps are measured between true neighbours.
            let row = &mut sorted[row_start..row_end];
            row.sort_unstable_by_key(|&(_, left, right)| (left, right));
            gaps.extend(row.windows(2).filter_map(|pair| {
                let prev_right = pair[0].2;
                let curr_left = pair[1].1;
                // Overlapping or touching words contribute no gap.
                curr_left.checked_sub(prev_right).filter(|&gap| gap > 0)
            }));

            row_start = row_end;
        }
    }

    if gaps.len() >= 3 {
        gaps.sort_unstable();
        let median_gap = gaps[gaps.len() / 2];
        return median_gap.saturating_mul(2).clamp(8, 40);
    }

    // Too few measurable gaps: fall back to a width-based heuristic.
    if table_width < 200.0 {
        10
    } else if table_width < 400.0 {
        15
    } else if table_width < 600.0 {
        20
    } else {
        30
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::table_reconstruct::HocrWord;

    /// Page height used by every test case.
    const PAGE_HEIGHT: f32 = 800.0;
    /// Confidence threshold used by every test case.
    const MIN_CONFIDENCE: f32 = 0.5;

    /// Builds a word box with a fixed confidence of 95.
    fn make_word(text: &str, left: u32, top: u32, width: u32, height: u32) -> HocrWord {
        HocrWord {
            text: text.to_string(),
            left,
            top,
            width,
            height,
            confidence: 95.0,
        }
    }

    /// Builds a layout hint of an arbitrary class.
    fn make_hint(
        class: LayoutHintClass,
        confidence: f32,
        left: f32,
        bottom: f32,
        right: f32,
        top: f32,
    ) -> LayoutHint {
        LayoutHint {
            class,
            confidence,
            left,
            bottom,
            right,
            top,
        }
    }

    /// Builds a layout hint of class `Table`.
    fn make_table_hint(confidence: f32, left: f32, bottom: f32, right: f32, top: f32) -> LayoutHint {
        make_hint(LayoutHintClass::Table, confidence, left, bottom, right, top)
    }

    /// Four single-letter words laid out as a 2x2 grid.
    fn two_by_two_grid() -> Vec<HocrWord> {
        vec![
            make_word("A", 10, 10, 50, 12),
            make_word("B", 100, 10, 50, 12),
            make_word("C", 10, 30, 50, 12),
            make_word("D", 100, 30, 50, 12),
        ]
    }

    /// Runs the extraction with the shared page height and threshold, with
    /// single-column tables disallowed.
    fn run(words: &[HocrWord], hints: &[LayoutHint], page_index: usize) -> Vec<Table> {
        extract_tables_from_layout_hints(words, hints, page_index, PAGE_HEIGHT, MIN_CONFIDENCE, false)
    }

    #[test]
    fn test_no_table_hints_returns_empty() {
        let words = vec![make_word("hello", 10, 10, 50, 12)];
        let hints = vec![make_hint(LayoutHintClass::Text, 0.9, 0.0, 0.0, 600.0, 800.0)];
        assert!(run(&words, &hints, 0).is_empty());
    }

    #[test]
    fn test_low_confidence_table_hint_filtered() {
        let words = two_by_two_grid();
        let hints = vec![make_table_hint(0.3, 0.0, 0.0, 200.0, 800.0)];
        assert!(run(&words, &hints, 0).is_empty());
    }

    #[test]
    fn test_empty_region_too_few_words() {
        let words = vec![make_word("A", 10, 10, 50, 12), make_word("B", 100, 10, 50, 12)];
        let hints = vec![make_table_hint(0.9, 0.0, 0.0, 200.0, 800.0)];
        assert!(run(&words, &hints, 0).is_empty());
    }

    #[test]
    fn test_empty_words_returns_empty() {
        let hints = vec![make_table_hint(0.9, 0.0, 0.0, 200.0, 800.0)];
        assert!(run(&[], &hints, 0).is_empty());
    }

    #[test]
    fn test_no_hints_returns_empty() {
        let words = two_by_two_grid();
        assert!(run(&words, &[], 0).is_empty());
    }

    #[test]
    fn test_words_outside_hint_bbox_excluded() {
        let words = vec![
            make_word("A", 500, 500, 50, 12),
            make_word("B", 560, 500, 50, 12),
            make_word("C", 500, 520, 50, 12),
            make_word("D", 560, 520, 50, 12),
        ];
        let hints = vec![make_table_hint(0.9, 0.0, 700.0, 100.0, 800.0)];
        assert!(run(&words, &hints, 0).is_empty());
    }

    #[test]
    fn test_whitespace_only_words_filtered() {
        // The blank word is discarded, leaving three words.
        let words = vec![
            make_word(" ", 10, 10, 50, 12),
            make_word("A", 100, 10, 50, 12),
            make_word("B", 10, 30, 50, 12),
            make_word("C", 100, 30, 50, 12),
        ];
        let hints = vec![make_table_hint(0.9, 0.0, 0.0, 200.0, 800.0)];
        assert!(run(&words, &hints, 0).is_empty());
    }

    #[test]
    fn test_page_number_is_one_indexed() {
        let words = vec![
            make_word("Header1", 10, 10, 80, 15),
            make_word("Header2", 200, 10, 80, 15),
            make_word("Cell1", 10, 40, 80, 15),
            make_word("Cell2", 200, 40, 80, 15),
            make_word("Cell3", 10, 70, 80, 15),
            make_word("Cell4", 200, 70, 80, 15),
        ];
        let hints = vec![make_table_hint(0.9, 0.0, 700.0, 400.0, 800.0)];
        // Only checks the numbering of whatever tables are produced; it does
        // not require that a table is produced.
        run(&words, &hints, 2)
            .iter()
            .for_each(|table| assert_eq!(table.page_number, 3));
    }
}