use scraper::{Html, Node};
use oxipdf_ir::TextContent;
use oxipdf_ir::node::{
BorderCollapse, ContentVariant, TableCell as IrTableCell, TableColumn, TableColumnWidth,
TableContent, TableLayoutMode, TableRow as IrTableRow, TableRowGroup, TableRowGroupKind,
};
use oxipdf_ir::semantic::SemanticRole;
use oxipdf_ir::style::ResolvedStyle;
use oxipdf_ir::tree::StyledTreeBuilder;
use oxipdf_ir::units::Pt;
use crate::css::{apply_declarations, parse_declarations};
use crate::error::HtmlError;
use super::cascade::{
apply_important_stylesheet_rules, apply_matching_rules, apply_normal_stylesheet_rules,
};
/// A `<td>`/`<th>` element gathered from the DOM before any IR node is created.
struct CollectedCell<'a> {
    /// Tree node of the cell; used to look up matching stylesheet rules and to
    /// extract the cell's text.
    node_ref: ego_tree::NodeRef<'a, Node>,
    /// Element data of the same node; used to read the inline `style` attribute.
    el: &'a scraper::node::Element,
    /// Parsed `colspan` attribute (1 when missing or unparsable, never 0).
    colspan: u32,
    /// Parsed `rowspan` attribute (1 when missing or unparsable, never 0).
    rowspan: u32,
    /// `true` when the cell is a `<th>`.
    is_header: bool,
    /// Kind of row group (`thead`/`tbody`/`tfoot`) the enclosing row was collected under.
    group_kind: TableRowGroupKind,
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn convert_table(
document: &Html,
table_ref: ego_tree::NodeRef<'_, Node>,
parent_id: oxipdf_ir::node::NodeId,
table_style: ResolvedStyle,
rules: &[crate::css::CssRule],
element_id: Option<String>,
builder: &mut StyledTreeBuilder,
) -> Result<(), HtmlError> {
let mut collected_rows: Vec<Vec<CollectedCell<'_>>> = Vec::new();
for child in table_ref.children() {
if let Node::Element(el) = child.value() {
let tag = el.name().to_lowercase();
match tag.as_str() {
"thead" => {
collect_rows_from_group(child, TableRowGroupKind::Header, &mut collected_rows)
}
"tbody" => {
collect_rows_from_group(child, TableRowGroupKind::Body, &mut collected_rows)
}
"tfoot" => {
collect_rows_from_group(child, TableRowGroupKind::Footer, &mut collected_rows)
}
"tr" => {
if let Some(row) = collect_row(child, TableRowGroupKind::Body) {
collected_rows.push(row);
}
}
_ => {} }
}
}
if collected_rows.is_empty() {
builder.add_child(
parent_id,
ContentVariant::Container,
table_style,
Some(SemanticRole::Table),
element_id,
);
return Ok(());
}
let num_columns = collected_rows
.iter()
.map(|row| row.iter().map(|c| c.colspan).sum::<u32>())
.max()
.unwrap_or(1) as usize;
let table_node_idx = builder.len() as u32;
let first_cell_idx = table_node_idx + 1;
let total_cells: usize = collected_rows.iter().map(|r| r.len()).sum();
let mut row_groups: Vec<TableRowGroup> = Vec::new();
let mut cell_idx = 0u32;
let mut current_kind: Option<TableRowGroupKind> = None;
let mut current_rows: Vec<IrTableRow> = Vec::new();
for row in &collected_rows {
let row_kind = row
.first()
.map(|c| c.group_kind)
.unwrap_or(TableRowGroupKind::Body);
if let Some(prev_kind) = current_kind {
if prev_kind != row_kind && !current_rows.is_empty() {
row_groups.push(TableRowGroup {
kind: prev_kind,
rows: std::mem::take(&mut current_rows),
});
}
}
current_kind = Some(row_kind);
let mut ir_cells = Vec::new();
for cell in row {
let content_node = oxipdf_ir::node::NodeId::from_raw(first_cell_idx + cell_idx);
ir_cells.push(IrTableCell {
content_node,
colspan: cell.colspan,
rowspan: cell.rowspan,
});
cell_idx += 1;
}
current_rows.push(IrTableRow { cells: ir_cells });
}
if !current_rows.is_empty() {
if let Some(kind) = current_kind {
row_groups.push(TableRowGroup {
kind,
rows: current_rows,
});
}
}
let columns: Vec<TableColumn> = (0..num_columns)
.map(|_| TableColumn {
width: TableColumnWidth::Auto,
})
.collect();
let table_content = TableContent {
columns,
row_groups,
border_collapse: BorderCollapse::Collapse,
cell_spacing_h: Pt::new(0.0),
cell_spacing_v: Pt::new(0.0),
table_layout: TableLayoutMode::Auto,
};
let table_id = builder.add_child(
parent_id,
ContentVariant::Table(table_content),
table_style,
Some(SemanticRole::Table),
element_id,
);
debug_assert_eq!(table_id.raw(), table_node_idx);
for row in &collected_rows {
for cell in row {
let mut cell_style = ResolvedStyle::default();
if let Some(inline_css) = cell.el.attr("style") {
let decls = parse_declarations(inline_css);
apply_normal_stylesheet_rules(document, cell.node_ref.id(), &mut cell_style, rules);
let normal: Vec<_> = decls.iter().filter(|d| !d.important).cloned().collect();
if !normal.is_empty() {
apply_declarations(&mut cell_style, &normal);
}
apply_important_stylesheet_rules(
document,
cell.node_ref.id(),
&mut cell_style,
rules,
);
let important: Vec<_> = decls.iter().filter(|d| d.important).cloned().collect();
if !important.is_empty() {
apply_declarations(&mut cell_style, &important);
}
} else {
apply_matching_rules(document, cell.node_ref.id(), &mut cell_style, rules);
}
if cell.is_header && cell_style.typography.font_weight < 700 {
cell_style.typography.font_weight = 700;
}
let cell_text = collect_text_content(cell.node_ref);
let role = if cell.is_header {
Some(SemanticRole::TableHeader)
} else {
Some(SemanticRole::TableCell)
};
let cell_node_id = builder.add_child(
table_id,
ContentVariant::Text(TextContent::new(&cell_text)),
cell_style,
role,
None,
);
let _ = cell_node_id;
}
}
debug_assert_eq!(cell_idx as usize, total_cells);
Ok(())
}
/// Appends every non-empty `<tr>` directly inside a row-group element
/// (`group_ref`) to `rows`, tagging its cells with `kind`.
///
/// Children that are not `<tr>` elements, and rows for which `collect_row`
/// returns `None`, are skipped.
fn collect_rows_from_group<'a>(
    group_ref: ego_tree::NodeRef<'a, Node>,
    kind: TableRowGroupKind,
    rows: &mut Vec<Vec<CollectedCell<'a>>>,
) {
    let tr_nodes = group_ref.children().filter(|candidate| match candidate.value() {
        Node::Element(el) => el.name().eq_ignore_ascii_case("tr"),
        _ => false,
    });
    rows.extend(tr_nodes.filter_map(|tr| collect_row(tr, kind)));
}
/// Collects the `<td>`/`<th>` children of a `<tr>` element.
///
/// Every cell is tagged with `group_kind`. `colspan` and `rowspan` are parsed
/// from the attributes of the same name. Missing, unparsable or zero values
/// become 1, and values above the limits the HTML Standard sets for these
/// attributes are clamped. The clamp keeps a hostile value such as
/// `colspan="4000000000"` from making the caller allocate billions of columns.
///
/// Returns `None` when the row contains no cells.
fn collect_row(
    tr_ref: ego_tree::NodeRef<'_, Node>,
    group_kind: TableRowGroupKind,
) -> Option<Vec<CollectedCell<'_>>> {
    // Upper bounds from the HTML Standard's definition of the attributes.
    const MAX_COLSPAN: u32 = 1000;
    const MAX_ROWSPAN: u32 = 65_534;

    let mut cells = Vec::new();
    for child in tr_ref.children() {
        if let Node::Element(el) = child.value() {
            let name = el.name();
            let is_header = name.eq_ignore_ascii_case("th");
            if is_header || name.eq_ignore_ascii_case("td") {
                cells.push(CollectedCell {
                    node_ref: child,
                    el,
                    colspan: parse_span_attr(el.attr("colspan"), MAX_COLSPAN),
                    rowspan: parse_span_attr(el.attr("rowspan"), MAX_ROWSPAN),
                    is_header,
                    group_kind,
                });
            }
        }
    }

    if cells.is_empty() {
        None
    } else {
        Some(cells)
    }
}

/// Parses a span attribute value into the range `1..=max`.
///
/// Surrounding ASCII whitespace is ignored. A missing value, or one that does
/// not parse as a `u32`, yields 1.
fn parse_span_attr(raw: Option<&str>, max: u32) -> u32 {
    raw.map(|value| value.trim_matches(|c: char| c.is_ascii_whitespace()))
        .and_then(|value| value.parse::<u32>().ok())
        .map(|n| n.max(1).min(max))
        .unwrap_or(1)
}
/// Returns the concatenated text found below `node_ref`, in document order.
///
/// Text nodes are copied verbatim, every `<br>` element contributes a single
/// `'\n'`, any other element contributes the text of its own subtree, and all
/// remaining node kinds are ignored.
fn collect_text_content(node_ref: ego_tree::NodeRef<'_, Node>) -> String {
    /// Appends the text below `parent` to `out`, depth first.
    fn append_text(parent: ego_tree::NodeRef<'_, Node>, out: &mut String) {
        for node in parent.children() {
            match node.value() {
                Node::Text(t) => out.push_str(&t.text),
                Node::Element(el) if el.name().eq_ignore_ascii_case("br") => out.push('\n'),
                Node::Element(_) => append_text(node, out),
                _ => {}
            }
        }
    }

    let mut collected = String::new();
    append_text(node_ref, &mut collected);
    collected
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::convert::html_to_tree;

    /// A table without row groups yields one table node with body rows only.
    #[test]
    fn simple_table_creates_table_node() {
        let html = r#"
<table>
<tr><td>A</td><td>B</td></tr>
<tr><td>C</td><td>D</td></tr>
</table>
"#;
        let tree = html_to_tree(html).unwrap();
        let mut found_table = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Table(ref tc) = node.content {
                found_table = true;
                assert_eq!(tc.columns.len(), 2, "should have 2 columns");
                assert_eq!(tc.total_rows(), 2, "should have 2 rows");
                for rg in &tc.row_groups {
                    assert_eq!(rg.kind, TableRowGroupKind::Body);
                }
            }
        }
        assert!(found_table, "should have a table node");
    }

    /// `<thead>` and `<tbody>` end up in separate row groups.
    #[test]
    fn table_with_thead_tbody() {
        let html = r#"
<table>
<thead><tr><th>Header 1</th><th>Header 2</th></tr></thead>
<tbody>
<tr><td>Data 1</td><td>Data 2</td></tr>
<tr><td>Data 3</td><td>Data 4</td></tr>
</tbody>
</table>
"#;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Table(ref tc) = node.content {
                found = true;
                assert_eq!(tc.columns.len(), 2);
                assert_eq!(tc.total_rows(), 3);
                assert!(
                    tc.row_groups
                        .iter()
                        .any(|rg| rg.kind == TableRowGroupKind::Header),
                    "should have header group"
                );
                assert!(
                    tc.row_groups
                        .iter()
                        .any(|rg| rg.kind == TableRowGroupKind::Body),
                    "should have body group"
                );
                let header_rows: usize = tc
                    .row_groups
                    .iter()
                    .filter(|rg| rg.kind == TableRowGroupKind::Header)
                    .map(|rg| rg.rows.len())
                    .sum();
                assert_eq!(header_rows, 1);
            }
        }
        assert!(found, "should have a table node");
    }

    /// Cell text is carried into the tree as a text node.
    #[test]
    fn table_cell_content_is_text() {
        let html = r#"<table><tr><td>Hello</td></tr></table>"#;
        let tree = html_to_tree(html).unwrap();
        let mut found_hello = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Text(ref t) = node.content {
                if t.text.contains("Hello") {
                    found_hello = true;
                }
            }
        }
        assert!(found_hello, "cell text should be in tree");
    }

    /// `<th>` cells get a bold font weight.
    #[test]
    fn table_th_cells_are_bold() {
        let html = r#"<table><tr><th>Bold</th><td>Normal</td></tr></table>"#;
        let tree = html_to_tree(html).unwrap();
        let mut bold_count = 0;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::TableHeader)
                && node.style.typography.font_weight >= 700
            {
                bold_count += 1;
            }
        }
        assert!(bold_count >= 1, "th cells should be bold");
    }

    /// `colspan` is preserved on the cell and counted towards the column total.
    #[test]
    fn table_colspan() {
        let html = r#"
<table>
<tr><td colspan="2">Wide</td></tr>
<tr><td>A</td><td>B</td></tr>
</table>
"#;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Table(ref tc) = node.content {
                found = true;
                assert_eq!(tc.columns.len(), 2);
                let first_row = &tc.row_groups[0].rows[0];
                assert_eq!(first_row.cells.len(), 1);
                assert_eq!(first_row.cells[0].colspan, 2);
                let second_row = &tc.row_groups[0].rows[1];
                assert_eq!(second_row.cells.len(), 2);
            }
        }
        // Without this the test would pass even if no table was produced.
        assert!(found, "should have a table node");
    }

    /// `rowspan` is preserved on the cell.
    #[test]
    fn table_rowspan() {
        let html = r#"
<table>
<tr><td rowspan="2">Tall</td><td>B</td></tr>
<tr><td>D</td></tr>
</table>
"#;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Table(ref tc) = node.content {
                found = true;
                let first_row = &tc.row_groups[0].rows[0];
                assert_eq!(first_row.cells[0].rowspan, 2);
            }
        }
        // Without this the test would pass even if no table was produced.
        assert!(found, "should have a table node");
    }

    /// A table without rows is emitted as a container, not as a table node.
    #[test]
    fn empty_table_becomes_container() {
        let html = r#"<table></table>"#;
        let tree = html_to_tree(html).unwrap();
        assert!(tree.node_count() >= 2);

        let mut found_container = false;
        for node in tree.iter_nodes() {
            assert!(
                !matches!(node.content, ContentVariant::Table(_)),
                "empty table must not produce a table node"
            );
            if node.semantic_role == Some(SemanticRole::Table)
                && matches!(node.content, ContentVariant::Container)
            {
                found_container = true;
            }
        }
        assert!(
            found_container,
            "empty table should be a container with the table role"
        );
    }

    /// Every node id stored in the table refers to a text node.
    #[test]
    fn table_cell_nodeids_are_valid() {
        let html = r#"
<table>
<tr><td>A</td><td>B</td></tr>
<tr><td>C</td><td>D</td></tr>
</table>
"#;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Table(ref tc) = node.content {
                found = true;
                for row in tc.all_rows() {
                    for cell in &row.cells {
                        let cell_node = tree.node(cell.content_node);
                        assert!(
                            matches!(cell_node.content, ContentVariant::Text(_)),
                            "cell content should be text"
                        );
                    }
                }
            }
        }
        // Without this the test would pass even if no table was produced.
        assert!(found, "should have a table node");
    }
}