use super::types::TextLine;
/// A table recovered from positioned text lines on a single page.
///
/// The bounding-box fields are filled in by `detect_tables` from the lines
/// that make up the table; `rows` holds the extracted cell text.
#[derive(Debug, Clone)]
pub struct Table {
    /// Page the table's lines belong to (taken from the lines' `page` field).
    pub page: usize,
    /// Smallest starting `x` among the table's lines.
    pub x_min: f32,
    /// Largest right edge (`x + width` of a line's last character) among the
    /// table's lines.
    pub x_max: f32,
    /// Smallest line `y` among the table's lines.
    pub y_min: f32,
    /// Largest line `y` among the table's lines.
    pub y_max: f32,
    /// Cell text, one inner `Vec` per row. Rows may have differing lengths.
    pub rows: Vec<Vec<String>>,
}

impl Table {
    /// Renders the table as a pipe-delimited Markdown table.
    ///
    /// The first row is used as the header and is followed by a `---`
    /// separator row. Every row is padded with empty cells up to the width of
    /// the widest row. Returns an empty string when there are no rows or when
    /// every row is empty.
    ///
    /// Cell text is sanitised so it cannot break the table structure: `|` is
    /// emitted as `\|`, and line breaks (`\n`, `\r`) are emitted as spaces.
    pub fn to_markdown(&self) -> String {
        /// Appends one rendered row (including the trailing newline) to `md`,
        /// padding missing cells with empty text.
        fn push_row(md: &mut String, row: &[String], col_count: usize) {
            md.push('|');
            for col in 0..col_count {
                md.push(' ');
                if let Some(cell) = row.get(col) {
                    for ch in cell.chars() {
                        match ch {
                            // An unescaped pipe would be read as a column delimiter.
                            '|' => md.push_str("\\|"),
                            // A table row must stay on a single line.
                            '\n' | '\r' => md.push(' '),
                            other => md.push(other),
                        }
                    }
                }
                md.push_str(" |");
            }
            md.push('\n');
        }

        let (header, body) = match self.rows.split_first() {
            Some(parts) => parts,
            None => return String::new(),
        };
        let col_count = self.rows.iter().map(Vec::len).max().unwrap_or(0);
        if col_count == 0 {
            return String::new();
        }

        let mut md = String::new();
        push_row(&mut md, header, col_count);

        md.push('|');
        for _ in 0..col_count {
            md.push_str(" --- |");
        }
        md.push('\n');

        for row in body {
            push_row(&mut md, row, col_count);
        }
        md
    }
}
/// Minimum number of consecutive lines with aligned column boundaries that
/// `detect_tables` requires before it reports the run as a table.
const MIN_TABLE_ROWS: usize = 3;
/// Tolerance passed to `boundaries_align`: two boundaries match only when
/// their positions differ by strictly less than this amount (expressed in the
/// same units as the characters' `x` coordinates).
const BOUNDARY_TOLERANCE: f32 = 5.0;
/// Finds runs of consecutive lines whose column gaps line up and returns each
/// run as a [`Table`].
///
/// Lines are grouped by `page` (pages are visited in ascending order; lines
/// keep their input order within a page). Within a page, a run starts at some
/// line and extends over every following line whose column boundaries align
/// with those of the run's *first* line, within `BOUNDARY_TOLERANCE`. A run
/// becomes a table when it spans at least `MIN_TABLE_ROWS` lines and its first
/// line has at least one boundary. All rows of a table are split using the
/// first line's boundaries.
pub fn detect_tables(lines: &[TextLine]) -> Vec<Table> {
    use std::collections::BTreeMap;

    let mut by_page: BTreeMap<usize, Vec<&TextLine>> = BTreeMap::new();
    for line in lines {
        by_page.entry(line.page).or_default().push(line);
    }

    let mut found = Vec::new();
    for (&page, group) in &by_page {
        // Column boundaries of every line on this page, computed once up front.
        let signatures: Vec<Vec<f32>> = group
            .iter()
            .map(|line| find_column_boundaries(line))
            .collect();

        let mut start = 0;
        while start < group.len() {
            let anchor = signatures[start].as_slice();
            // Count the lines directly after `start` that align with the anchor.
            let followers = signatures[start + 1..]
                .iter()
                .take_while(|candidate| {
                    boundaries_align(anchor, candidate.as_slice(), BOUNDARY_TOLERANCE)
                })
                .count();
            let end = start + 1 + followers;

            if end - start >= MIN_TABLE_ROWS && !anchor.is_empty() {
                let members = &group[start..end];

                // Accumulate the cell rows and the bounding box in one pass.
                let mut rows = Vec::with_capacity(members.len());
                let (mut x_min, mut y_min) = (f32::INFINITY, f32::INFINITY);
                let (mut x_max, mut y_max) = (f32::NEG_INFINITY, f32::NEG_INFINITY);
                for line in members {
                    rows.push(split_at_boundaries(line, anchor));

                    // Right edge of the last character, or the line start if
                    // the line carries no characters.
                    let right_edge = line.chars.last().map_or(line.x, |c| c.x + c.width);
                    x_min = x_min.min(line.x);
                    x_max = x_max.max(right_edge);
                    y_min = y_min.min(line.y);
                    y_max = y_max.max(line.y);
                }

                found.push(Table {
                    page,
                    x_min,
                    x_max,
                    y_min,
                    y_max,
                    rows,
                });
            }

            // The next run begins where this one stopped, table or not.
            start = end;
        }
    }
    found
}
/// Returns the x positions of the column separators found in `line`.
///
/// A separator is reported wherever the horizontal gap between two adjacent
/// characters exceeds twice the line's average character width; its position
/// is the midpoint of that gap. Lines with fewer than two characters yield no
/// separators.
fn find_column_boundaries(line: &TextLine) -> Vec<f32> {
    let glyphs = &line.chars;
    if glyphs.len() < 2 {
        return Vec::new();
    }

    let mean_width: f32 = glyphs.iter().map(|g| g.width).sum::<f32>() / glyphs.len() as f32;
    let min_gap = mean_width * 2.0;

    glyphs
        .windows(2)
        .filter_map(|pair| {
            let prev_right = pair[0].x + pair[0].width;
            let gap = pair[1].x - prev_right;
            (gap > min_gap).then(|| prev_right + gap / 2.0)
        })
        .collect()
}
/// Reports whether two boundary lists describe the same column layout.
///
/// The lists align when they are non-empty, have the same length, and every
/// pair of corresponding positions differs by strictly less than `tolerance`.
fn boundaries_align(a: &[f32], b: &[f32], tolerance: f32) -> bool {
    let comparable = !a.is_empty() && a.len() == b.len();
    comparable
        && a.iter()
            .zip(b)
            .all(|(&left, &right)| (left - right).abs() < tolerance)
}
/// Distributes the characters of `line` into `boundaries.len() + 1` cells.
///
/// Each character goes into the cell of the first boundary that lies to the
/// right of its `x` position, or into the last cell when no boundary does.
/// Leading and trailing whitespace is trimmed from every cell.
fn split_at_boundaries(line: &TextLine, boundaries: &[f32]) -> Vec<String> {
    let last_column = boundaries.len();
    let mut cells = vec![String::new(); last_column + 1];

    for glyph in &line.chars {
        let column = boundaries
            .iter()
            .enumerate()
            .find(|&(_, &edge)| glyph.x < edge)
            .map_or(last_column, |(idx, _)| idx);
        cells[column].push(glyph.ch);
    }

    cells
        .into_iter()
        .map(|cell| cell.trim().to_owned())
        .collect()
}
#[cfg(test)]
mod tests {
    use super::super::types::PdfChar;
    use super::*;

    /// Builds a single positioned character with a fixed height of 12.0.
    fn make_char(ch: char, x: f32, y: f32, width: f32, page: usize) -> PdfChar {
        PdfChar {
            ch,
            x,
            y,
            width,
            height: 12.0,
            page,
        }
    }

    /// Builds a line of evenly spaced characters with no gaps between them.
    fn make_line(text: &str, x_start: f32, y: f32, char_width: f32, page: usize) -> TextLine {
        let chars: Vec<PdfChar> = text
            .chars()
            .enumerate()
            .map(|(i, ch)| make_char(ch, x_start + i as f32 * char_width, y, char_width, page))
            .collect();
        TextLine {
            text: text.to_string(),
            x: x_start,
            y,
            chars,
            page,
        }
    }

    /// Builds a line laid out as table cells: each cell occupies at least
    /// `col_width` and consecutive cells are separated by an extra `gap`.
    fn make_table_line(cells: &[&str], y: f32, page: usize, col_width: f32, gap: f32) -> TextLine {
        let char_w = 6.0;
        let mut chars = Vec::new();
        let mut full_text = String::new();
        let mut x = 10.0;
        for (col_idx, cell) in cells.iter().enumerate() {
            if col_idx > 0 {
                x += gap;
            }
            for ch in cell.chars() {
                chars.push(make_char(ch, x, y, char_w, page));
                full_text.push(ch);
                x += char_w;
            }
            // Count characters, not bytes: one glyph is placed per `char`, so
            // the byte length would overstate the width of non-ASCII cells.
            let used = cell.chars().count() as f32 * char_w;
            if used < col_width {
                x += col_width - used;
            }
        }
        TextLine {
            text: full_text,
            x: chars.first().map_or(10.0, |c| c.x),
            y,
            chars,
            page,
        }
    }

    #[test]
    fn table_to_markdown_empty() {
        let table = Table {
            page: 0,
            x_min: 0.0,
            x_max: 100.0,
            y_min: 0.0,
            y_max: 100.0,
            rows: vec![],
        };
        assert_eq!(table.to_markdown(), "");
    }

    #[test]
    fn table_to_markdown_simple() {
        let table = Table {
            page: 0,
            x_min: 0.0,
            x_max: 200.0,
            y_min: 0.0,
            y_max: 100.0,
            rows: vec![
                vec!["Name".into(), "Age".into()],
                vec!["Alice".into(), "30".into()],
                vec!["Bob".into(), "25".into()],
            ],
        };
        let md = table.to_markdown();
        assert!(md.contains("| Name | Age |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| Alice | 30 |"));
        assert!(md.contains("| Bob | 25 |"));
    }

    #[test]
    fn table_to_markdown_ragged_rows() {
        let table = Table {
            page: 0,
            x_min: 0.0,
            x_max: 200.0,
            y_min: 0.0,
            y_max: 100.0,
            rows: vec![
                vec!["A".into(), "B".into(), "C".into()],
                vec!["1".into(), "2".into()],
            ],
        };
        let md = table.to_markdown();
        assert!(md.contains("| A | B | C |"));
        // The missing third cell is rendered as empty text between the usual
        // single-space padding, i.e. two spaces before the closing pipe.
        assert!(md.contains("| 1 | 2 |  |"));
    }

    #[test]
    fn detect_tables_finds_aligned_columns() {
        let gap = 50.0;
        let lines: Vec<TextLine> = vec![
            make_table_line(&["Name", "Age", "City"], 100.0, 0, 40.0, gap),
            make_table_line(&["Alice", "30", "NYC"], 88.0, 0, 40.0, gap),
            make_table_line(&["Bob", "25", "LA"], 76.0, 0, 40.0, gap),
            make_table_line(&["Carol", "35", "SF"], 64.0, 0, 40.0, gap),
        ];
        let tables = detect_tables(&lines);
        assert!(!tables.is_empty(), "Should detect at least one table");
        assert_eq!(tables.len(), 1);
        assert_eq!(tables[0].rows.len(), 4);
        assert_eq!(tables[0].rows[1], vec!["Alice", "30", "NYC"]);
    }

    #[test]
    fn detect_tables_ignores_plain_text() {
        let lines: Vec<TextLine> = vec![
            make_line("This is a paragraph of regular text.", 10.0, 100.0, 6.0, 0),
            make_line("Another line of plain text content.", 10.0, 88.0, 6.0, 0),
            make_line("And one more line for good measure.", 10.0, 76.0, 6.0, 0),
        ];
        let tables = detect_tables(&lines);
        assert!(
            tables.is_empty(),
            "Plain text should not be detected as table"
        );
    }

    #[test]
    fn boundaries_align_same() {
        assert!(boundaries_align(&[10.0, 50.0], &[10.0, 50.0], 5.0));
    }

    #[test]
    fn boundaries_align_within_tolerance() {
        assert!(boundaries_align(&[10.0, 50.0], &[12.0, 48.0], 5.0));
    }

    #[test]
    fn boundaries_do_not_align_different_count() {
        assert!(!boundaries_align(&[10.0], &[10.0, 50.0], 5.0));
    }

    #[test]
    fn boundaries_do_not_align_out_of_tolerance() {
        assert!(!boundaries_align(&[10.0, 50.0], &[20.0, 50.0], 5.0));
    }
}