use crate::error::Error;
use crate::geometry::Rect;
use crate::layout::TextBlock;
use crate::structure::types::{StructChild, StructElem, StructType};
#[derive(Debug, Clone)]
pub struct Table {
pub rows: Vec<TableRow>,
pub has_header: bool,
pub col_count: usize,
pub bbox: Option<Rect>,
}
#[derive(Debug, Clone)]
pub struct TableRow {
pub cells: Vec<TableCell>,
pub is_header: bool,
}
#[derive(Debug, Clone)]
pub struct TableCell {
pub text: String,
pub spans: Vec<crate::layout::TextSpan>,
pub colspan: u32,
pub rowspan: u32,
pub mcids: Vec<u32>,
pub bbox: Option<Rect>,
pub is_header: bool,
}
impl Default for Table {
fn default() -> Self {
Self::new()
}
}
impl Table {
pub fn new() -> Self {
Self {
rows: Vec::new(),
has_header: false,
col_count: 0,
bbox: None,
}
}
pub fn render_text(&self) -> String {
let col_count = self.col_count;
if col_count == 0 || self.rows.is_empty() {
return String::new();
}
let mut col_widths = vec![0usize; col_count];
for row in &self.rows {
let mut col_idx = 0;
for cell in &row.cells {
if cell.colspan == 1 && col_idx < col_count {
let w = cell.text.trim().chars().count();
col_widths[col_idx] = col_widths[col_idx].max(w);
}
col_idx += cell.colspan as usize;
}
}
for w in &mut col_widths {
if *w < 2 {
*w = 2;
}
}
let effective_cols = {
let mut eff = col_widths.len();
while eff > 0 {
let col = eff - 1;
let all_empty = self.rows.iter().all(|row| {
let mut ci = 0;
for cell in &row.cells {
let span = cell.colspan as usize;
let covers_col = ci <= col && col < ci + span;
if covers_col {
return cell.text.trim().is_empty();
}
ci += span;
}
true
});
if all_empty {
eff -= 1;
} else {
break;
}
}
eff
};
if effective_cols == 0 {
return String::new();
}
let col_widths = &col_widths[..effective_cols];
let is_right_aligned: Vec<bool> = (0..effective_cols)
.map(|c| {
let mut has_content = false;
for row in &self.rows {
let mut ci = 0;
for cell in &row.cells {
if ci == c && cell.colspan == 1 {
let t = cell.text.trim();
if !t.is_empty() {
has_content = true;
let stripped: String = t
.chars()
.filter(|ch| {
!matches!(
ch,
'$' | '€'
| '£'
| ','
| ' '
| '%'
| '+'
| '-'
| '('
| ')'
)
})
.collect();
if stripped.is_empty() || stripped.parse::<f64>().is_err() {
return false;
}
}
}
ci += cell.colspan as usize;
}
}
has_content
})
.collect();
let separator: String = col_widths
.iter()
.map(|&w| "\u{2500}".repeat(w))
.collect::<Vec<_>>()
.join(" ");
let mut output = String::new();
for (r_idx, row) in self.rows.iter().enumerate() {
let mut col_idx = 0;
let mut cells_text = Vec::new();
for cell in &row.cells {
let text = cell.text.trim();
if col_idx < effective_cols {
let width = if cell.colspan > 1 {
let end = (col_idx + cell.colspan as usize).min(effective_cols);
let base: usize = col_widths[col_idx..end].iter().sum();
base + (end - col_idx).saturating_sub(1) * 2
} else {
col_widths[col_idx]
};
let formatted = if cell.colspan == 1 && is_right_aligned[col_idx] {
format!("{:>width$}", text, width = width)
} else {
format!("{:<width$}", text, width = width)
};
cells_text.push(formatted);
} else {
cells_text.push(text.to_string());
}
col_idx += cell.colspan as usize;
}
output.push_str(cells_text.join(" ").trim_end());
output.push('\n');
if self.has_header
&& row.is_header
&& r_idx + 1 < self.rows.len()
&& !self.rows[r_idx + 1].is_header
{
output.push_str(&separator);
output.push('\n');
}
}
output
}
pub fn add_row(&mut self, row: TableRow) {
if self.col_count == 0 && !row.cells.is_empty() {
self.col_count = row.cells.len();
}
self.rows.push(row);
}
pub fn is_empty(&self) -> bool {
self.rows.is_empty()
}
}
impl TableRow {
pub fn new(is_header: bool) -> Self {
Self {
cells: Vec::new(),
is_header,
}
}
pub fn has_colspan(&self) -> bool {
self.cells.iter().any(|c| c.colspan > 1)
}
pub fn add_cell(&mut self, cell: TableCell) {
self.cells.push(cell);
}
}
impl TableCell {
pub fn new(text: String, is_header: bool) -> Self {
Self {
text,
spans: Vec::new(),
colspan: 1,
rowspan: 1,
mcids: Vec::new(),
bbox: None,
is_header,
}
}
pub fn with_colspan(mut self, colspan: u32) -> Self {
self.colspan = colspan;
self
}
pub fn with_rowspan(mut self, rowspan: u32) -> Self {
self.rowspan = rowspan;
self
}
pub fn add_mcid(&mut self, mcid: u32) {
self.mcids.push(mcid);
}
}
pub fn find_table_elements(
struct_tree: &crate::structure::types::StructTreeRoot,
page_num: u32,
) -> Vec<&StructElem> {
let mut tables = Vec::new();
for elem in &struct_tree.root_elements {
collect_table_elements(elem, page_num, &mut tables);
}
tables
}
fn collect_table_elements<'a>(
elem: &'a StructElem,
page_num: u32,
tables: &mut Vec<&'a StructElem>,
) {
if elem.struct_type == StructType::Table {
if element_has_page_content(elem, page_num) {
tables.push(elem);
}
return; }
for child in &elem.children {
if let StructChild::StructElem(child_elem) = child {
collect_table_elements(child_elem, page_num, tables);
}
}
}
fn element_has_page_content(elem: &StructElem, page_num: u32) -> bool {
if elem.page == Some(page_num) {
return true;
}
for child in &elem.children {
match child {
StructChild::MarkedContentRef { page, .. } => {
if *page == page_num {
return true;
}
},
StructChild::StructElem(child_elem) => {
if element_has_page_content(child_elem, page_num) {
return true;
}
},
StructChild::ObjectRef(_, _) => {},
}
}
false
}
pub fn extract_table_from_spans(
table_elem: &StructElem,
spans: &[crate::layout::TextSpan],
) -> Result<Table, Error> {
let text_blocks: Vec<TextBlock> = spans
.iter()
.filter(|s| s.mcid.is_some())
.map(|s| TextBlock {
chars: Vec::new(),
bbox: s.bbox,
text: s.text.clone(),
avg_font_size: s.font_size,
dominant_font: s.font_name.clone(),
is_bold: s.font_weight.is_bold(),
is_italic: s.is_italic,
mcid: s.mcid,
})
.collect();
extract_table(table_elem, &text_blocks)
}
pub fn extract_table(table_elem: &StructElem, text_blocks: &[TextBlock]) -> Result<Table, Error> {
let mut table = Table::new();
let has_thead = table_elem
.children
.iter()
.any(|child| matches!(child, StructChild::StructElem(elem) if elem.struct_type == StructType::THead));
if has_thead {
table.has_header = true;
}
for child in &table_elem.children {
match child {
StructChild::StructElem(elem) => match elem.struct_type {
StructType::TR => {
let row = extract_row(elem, text_blocks, false)?;
table.add_row(row);
},
StructType::THead => {
extract_row_group(elem, text_blocks, true, &mut table)?;
},
StructType::TBody => {
extract_row_group(elem, text_blocks, false, &mut table)?;
},
StructType::TFoot => {
extract_row_group(elem, text_blocks, false, &mut table)?;
},
_ => {
},
},
StructChild::MarkedContentRef { .. } => {
},
StructChild::ObjectRef(_, _) => {
},
}
}
Ok(table)
}
fn extract_row_group(
group_elem: &StructElem,
text_blocks: &[TextBlock],
is_header: bool,
table: &mut Table,
) -> Result<(), Error> {
for child in &group_elem.children {
match child {
StructChild::StructElem(elem) if elem.struct_type == StructType::TR => {
let row = extract_row(elem, text_blocks, is_header)?;
table.add_row(row);
},
_ => {
},
}
}
Ok(())
}
fn extract_row(
tr_elem: &StructElem,
text_blocks: &[TextBlock],
force_header: bool,
) -> Result<TableRow, Error> {
let mut row = TableRow::new(force_header);
for child in &tr_elem.children {
match child {
StructChild::StructElem(elem) => match elem.struct_type {
StructType::TH => {
let cell = extract_cell(elem, text_blocks, true)?;
row.add_cell(cell);
},
StructType::TD => {
let cell = extract_cell(elem, text_blocks, false)?;
row.add_cell(cell);
},
_ => {
},
},
StructChild::MarkedContentRef { .. } => {
},
StructChild::ObjectRef(_, _) => {
},
}
}
Ok(row)
}
fn extract_cell(
cell_elem: &StructElem,
text_blocks: &[TextBlock],
is_header: bool,
) -> Result<TableCell, Error> {
let mut mcids = Vec::new();
collect_mcids(cell_elem, &mut mcids);
let mut cell_text = String::new();
for mcid in &mcids {
for block in text_blocks {
if let Some(block_mcid) = block.mcid {
if block_mcid == *mcid {
if !cell_text.is_empty() && !cell_text.ends_with(' ') {
cell_text.push(' ');
}
cell_text.push_str(&block.text);
break;
}
}
}
}
let mut cell = TableCell::new(cell_text.trim().to_string(), is_header);
cell.mcids = mcids;
Ok(cell)
}
fn collect_mcids(elem: &StructElem, mcids: &mut Vec<u32>) {
for child in &elem.children {
match child {
StructChild::MarkedContentRef { mcid, .. } => {
mcids.push(*mcid);
},
StructChild::StructElem(child_elem) => {
collect_mcids(child_elem, mcids);
},
StructChild::ObjectRef(_, _) => {
},
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::structure::types::StructTreeRoot;
#[test]
fn test_table_new() {
let table = Table::new();
assert!(table.is_empty());
assert_eq!(table.col_count, 0);
assert!(!table.has_header);
assert!(table.bbox.is_none());
}
#[test]
fn test_table_bbox() {
let mut table = Table::new();
assert!(table.bbox.is_none());
table.bbox = Some(Rect::new(10.0, 20.0, 100.0, 50.0));
assert!(table.bbox.is_some());
let bbox = table.bbox.unwrap();
assert_eq!(bbox.x, 10.0);
assert_eq!(bbox.y, 20.0);
assert_eq!(bbox.width, 100.0);
assert_eq!(bbox.height, 50.0);
}
#[test]
fn test_table_row_new() {
let header_row = TableRow::new(true);
assert!(header_row.is_header);
assert!(header_row.cells.is_empty());
let body_row = TableRow::new(false);
assert!(!body_row.is_header);
}
#[test]
fn test_table_cell_new() {
let cell = TableCell::new("Hello".to_string(), false);
assert_eq!(cell.text, "Hello");
assert!(!cell.is_header);
assert_eq!(cell.colspan, 1);
assert_eq!(cell.rowspan, 1);
assert!(cell.mcids.is_empty());
}
#[test]
fn test_table_cell_with_spans() {
let cell = TableCell::new("Data".to_string(), false)
.with_colspan(2)
.with_rowspan(3);
assert_eq!(cell.colspan, 2);
assert_eq!(cell.rowspan, 3);
}
#[test]
fn test_table_cell_header() {
let cell = TableCell::new("Header".to_string(), true);
assert!(cell.is_header);
}
#[test]
fn test_table_row_add_cells() {
let mut row = TableRow::new(false);
row.add_cell(TableCell::new("Cell1".to_string(), false));
row.add_cell(TableCell::new("Cell2".to_string(), false));
assert_eq!(row.cells.len(), 2);
assert_eq!(row.cells[0].text, "Cell1");
assert_eq!(row.cells[1].text, "Cell2");
}
#[test]
fn test_table_add_rows() {
let mut table = Table::new();
let mut row1 = TableRow::new(false);
row1.add_cell(TableCell::new("A".to_string(), false));
row1.add_cell(TableCell::new("B".to_string(), false));
table.add_row(row1);
assert_eq!(table.col_count, 2);
assert_eq!(table.rows.len(), 1);
}
#[test]
fn test_table_has_header() {
let mut table = Table::new();
assert!(!table.has_header);
table.has_header = true;
assert!(table.has_header);
}
fn make_table_elem(page: u32, mcids: &[u32]) -> StructElem {
let mut table = StructElem::new(StructType::Table);
let mut tr = StructElem::new(StructType::TR);
for &mcid in mcids {
let mut td = StructElem::new(StructType::TD);
td.add_child(StructChild::MarkedContentRef { mcid, page });
tr.add_child(StructChild::StructElem(Box::new(td)));
}
table.add_child(StructChild::StructElem(Box::new(tr)));
table
}
#[test]
fn test_find_table_elements_finds_table_on_matching_page() {
let mut tree = StructTreeRoot::new();
tree.add_root_element(make_table_elem(0, &[1, 2]));
let tables = find_table_elements(&tree, 0);
assert_eq!(tables.len(), 1);
assert_eq!(tables[0].struct_type, StructType::Table);
}
#[test]
fn test_find_table_elements_skips_table_on_different_page() {
let mut tree = StructTreeRoot::new();
tree.add_root_element(make_table_elem(1, &[1, 2]));
let tables = find_table_elements(&tree, 0);
assert!(tables.is_empty());
}
#[test]
fn test_find_table_elements_empty_tree() {
let tree = StructTreeRoot::new();
let tables = find_table_elements(&tree, 0);
assert!(tables.is_empty());
}
#[test]
fn test_find_table_elements_multiple_tables() {
let mut tree = StructTreeRoot::new();
tree.add_root_element(make_table_elem(0, &[1, 2]));
tree.add_root_element(make_table_elem(0, &[3, 4]));
let tables = find_table_elements(&tree, 0);
assert_eq!(tables.len(), 2);
}
#[test]
fn test_find_table_elements_nested_in_section() {
let mut tree = StructTreeRoot::new();
let mut sect = StructElem::new(StructType::Sect);
sect.add_child(StructChild::StructElem(Box::new(make_table_elem(0, &[1]))));
tree.add_root_element(sect);
let tables = find_table_elements(&tree, 0);
assert_eq!(tables.len(), 1);
}
#[test]
fn test_find_table_elements_table_with_page_attribute() {
let mut tree = StructTreeRoot::new();
let mut table = StructElem::new(StructType::Table);
table.page = Some(2);
tree.add_root_element(table);
let tables = find_table_elements(&tree, 2);
assert_eq!(tables.len(), 1);
}
#[test]
fn test_find_table_elements_mixed_pages() {
let mut tree = StructTreeRoot::new();
tree.add_root_element(make_table_elem(0, &[1]));
tree.add_root_element(make_table_elem(1, &[2]));
tree.add_root_element(make_table_elem(0, &[3]));
let page0_tables = find_table_elements(&tree, 0);
assert_eq!(page0_tables.len(), 2);
let page1_tables = find_table_elements(&tree, 1);
assert_eq!(page1_tables.len(), 1);
}
#[test]
fn test_element_has_page_content_via_mcid() {
let mut elem = StructElem::new(StructType::P);
elem.add_child(StructChild::MarkedContentRef { mcid: 1, page: 3 });
assert!(element_has_page_content(&elem, 3));
assert!(!element_has_page_content(&elem, 0));
}
#[test]
fn test_element_has_page_content_via_page_attribute() {
let mut elem = StructElem::new(StructType::P);
elem.page = Some(5);
assert!(element_has_page_content(&elem, 5));
assert!(!element_has_page_content(&elem, 0));
}
#[test]
fn test_element_has_page_content_recursive() {
let mut parent = StructElem::new(StructType::Sect);
let mut child = StructElem::new(StructType::P);
child.add_child(StructChild::MarkedContentRef { mcid: 1, page: 2 });
parent.add_child(StructChild::StructElem(Box::new(child)));
assert!(element_has_page_content(&parent, 2));
assert!(!element_has_page_content(&parent, 0));
}
#[test]
fn test_element_has_page_content_empty() {
let elem = StructElem::new(StructType::P);
assert!(!element_has_page_content(&elem, 0));
}
#[test]
fn test_element_has_page_content_object_ref_ignored() {
let mut elem = StructElem::new(StructType::P);
elem.add_child(StructChild::ObjectRef(1, 0));
assert!(!element_has_page_content(&elem, 0));
}
fn make_text_span(text: &str, mcid: Option<u32>) -> crate::layout::TextSpan {
use crate::layout::text_block::{Color, FontWeight};
crate::layout::TextSpan {
artifact_type: None,
text: text.to_string(),
bbox: Rect::new(0.0, 0.0, 50.0, 12.0),
font_name: "Test".to_string(),
font_size: 12.0,
font_weight: FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: Color::black(),
mcid,
sequence: 0,
split_boundary_before: false,
offset_semantic: false,
char_spacing: 0.0,
word_spacing: 0.0,
horizontal_scaling: 1.0,
primary_detected: false,
char_widths: vec![],
}
}
#[test]
fn test_extract_table_from_spans_basic() {
let mut table_elem = StructElem::new(StructType::Table);
let mut tr = StructElem::new(StructType::TR);
let mut td1 = StructElem::new(StructType::TD);
td1.add_child(StructChild::MarkedContentRef { mcid: 10, page: 0 });
let mut td2 = StructElem::new(StructType::TD);
td2.add_child(StructChild::MarkedContentRef { mcid: 11, page: 0 });
tr.add_child(StructChild::StructElem(Box::new(td1)));
tr.add_child(StructChild::StructElem(Box::new(td2)));
table_elem.add_child(StructChild::StructElem(Box::new(tr)));
let spans = vec![
make_text_span("Hello", Some(10)),
make_text_span("World", Some(11)),
make_text_span("Unrelated", Some(99)),
];
let result = extract_table_from_spans(&table_elem, &spans).unwrap();
assert_eq!(result.rows.len(), 1);
assert_eq!(result.rows[0].cells.len(), 2);
assert_eq!(result.rows[0].cells[0].text, "Hello");
assert_eq!(result.rows[0].cells[1].text, "World");
}
#[test]
fn test_extract_table_from_spans_no_matching_mcids() {
let mut table_elem = StructElem::new(StructType::Table);
let mut tr = StructElem::new(StructType::TR);
let mut td = StructElem::new(StructType::TD);
td.add_child(StructChild::MarkedContentRef { mcid: 10, page: 0 });
tr.add_child(StructChild::StructElem(Box::new(td)));
table_elem.add_child(StructChild::StructElem(Box::new(tr)));
let spans = vec![make_text_span("Other", Some(99))];
let result = extract_table_from_spans(&table_elem, &spans).unwrap();
assert_eq!(result.rows.len(), 1);
assert_eq!(result.rows[0].cells[0].text, ""); }
#[test]
fn test_extract_table_from_spans_filters_no_mcid_spans() {
let mut table_elem = StructElem::new(StructType::Table);
let mut tr = StructElem::new(StructType::TR);
let mut td = StructElem::new(StructType::TD);
td.add_child(StructChild::MarkedContentRef { mcid: 5, page: 0 });
tr.add_child(StructChild::StructElem(Box::new(td)));
table_elem.add_child(StructChild::StructElem(Box::new(tr)));
let spans = vec![
make_text_span("No MCID", None),
make_text_span("Has MCID", Some(5)),
];
let result = extract_table_from_spans(&table_elem, &spans).unwrap();
assert_eq!(result.rows[0].cells[0].text, "Has MCID");
}
#[test]
fn test_extract_table_from_spans_with_thead() {
let mut table_elem = StructElem::new(StructType::Table);
let mut thead = StructElem::new(StructType::THead);
let mut hdr_tr = StructElem::new(StructType::TR);
let mut th = StructElem::new(StructType::TH);
th.add_child(StructChild::MarkedContentRef { mcid: 1, page: 0 });
hdr_tr.add_child(StructChild::StructElem(Box::new(th)));
thead.add_child(StructChild::StructElem(Box::new(hdr_tr)));
table_elem.add_child(StructChild::StructElem(Box::new(thead)));
let mut tbody = StructElem::new(StructType::TBody);
let mut body_tr = StructElem::new(StructType::TR);
let mut td = StructElem::new(StructType::TD);
td.add_child(StructChild::MarkedContentRef { mcid: 2, page: 0 });
body_tr.add_child(StructChild::StructElem(Box::new(td)));
tbody.add_child(StructChild::StructElem(Box::new(body_tr)));
table_elem.add_child(StructChild::StructElem(Box::new(tbody)));
let spans = vec![
make_text_span("Header", Some(1)),
make_text_span("Data", Some(2)),
];
let result = extract_table_from_spans(&table_elem, &spans).unwrap();
assert!(result.has_header);
assert_eq!(result.rows.len(), 2);
assert!(result.rows[0].is_header);
assert!(!result.rows[1].is_header);
assert_eq!(result.rows[0].cells[0].text, "Header");
assert_eq!(result.rows[1].cells[0].text, "Data");
}
#[test]
fn test_extract_table_from_spans_empty_table() {
let table_elem = StructElem::new(StructType::Table);
let spans: Vec<crate::layout::TextSpan> = vec![];
let result = extract_table_from_spans(&table_elem, &spans).unwrap();
assert!(result.is_empty());
}
}