use quick_xml::events::Event;
use quick_xml::Reader;
use crate::ir;
/// Parses an HTML `<table>` literal into an [`ir::Block::Table`].
///
/// Surrounding whitespace is ignored. The input must begin with `<table`
/// (ASCII case-insensitive); anything else is rejected with a warning.
/// End-tag name checking is disabled on the reader before the events are
/// walked.
///
/// Returns `None` when the input does not start with `<table` or when
/// `walk_table_events` yields no rows. On success `col_count` is the largest
/// per-row sum of cell `colspan` values and `inner_margin` is `None`.
#[doc(hidden)]
#[must_use]
pub fn parse_html_table(literal: &str) -> Option<ir::Block> {
    let html = literal.trim();

    // The first six bytes must spell "<table"; a match implies they are all
    // ASCII, so this byte-level check agrees with slicing the `str`.
    let starts_with_table = html
        .as_bytes()
        .get(..6)
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(b"<table"));
    if !starts_with_table {
        tracing::warn!("html_table: input does not start with <table, skipping");
        return None;
    }

    let mut reader = Reader::from_str(html);
    reader.config_mut().check_end_names = false;
    let rows = walk_table_events(&mut reader)?;

    // Widest row, measured in grid columns (each cell counts `colspan` times).
    let mut col_count: usize = 0;
    for row in &rows {
        let width: usize = row.cells.iter().map(|cell| cell.colspan as usize).sum();
        col_count = col_count.max(width);
    }

    Some(ir::Block::Table { rows, col_count, inner_margin: None })
}
#[allow(clippy::too_many_lines)] fn walk_table_events(reader: &mut Reader<&[u8]>) -> Option<Vec<ir::TableRow>> {
let mut table_depth: u32 = 0;
let mut current_row: Option<RowBuilder> = None;
let mut current_cell: Option<CellBuilder> = None;
let mut rows: Vec<ir::TableRow> = Vec::new();
loop {
match reader.read_event() {
Ok(Event::Start(ref e)) => {
let local = local_name(e.name().as_ref());
match local.to_ascii_lowercase().as_str() {
"table" => {
table_depth += 1;
if table_depth > 1 {
tracing::warn!(
"html_table: nested <table> not supported, returning None"
);
return None;
}
}
"tr" => {
if let Some(row) = current_row.take() {
if !row.cells.is_empty() {
rows.push(finish_row(row));
}
}
current_row = Some(RowBuilder::default());
}
"th" | "td" => {
if current_row.is_none() {
current_row = Some(RowBuilder::default());
}
let is_th = local.eq_ignore_ascii_case("th");
let (colspan, rowspan) = parse_span_attrs(e);
current_cell =
Some(CellBuilder { text: String::new(), colspan, rowspan, is_th });
}
_ => {}
}
}
Ok(Event::Empty(ref e)) => {
let local = local_name(e.name().as_ref());
match local.to_ascii_lowercase().as_str() {
"table" => {
table_depth += 1;
if table_depth > 1 {
tracing::warn!(
"html_table: nested <table> not supported, returning None"
);
return None;
}
}
"tr" => {
if let Some(row) = current_row.take() {
if !row.cells.is_empty() {
rows.push(finish_row(row));
}
}
current_row = Some(RowBuilder::default());
}
"th" | "td" => {
if current_row.is_none() {
current_row = Some(RowBuilder::default());
}
let is_th = local.eq_ignore_ascii_case("th");
let (colspan, rowspan) = parse_span_attrs(e);
let cell = CellBuilder { text: String::new(), colspan, rowspan, is_th };
if let Some(row) = current_row.as_mut() {
row.cells.push(cell);
}
}
_ => {}
}
}
Ok(Event::End(ref e)) => {
let local = local_name(e.name().as_ref());
match local.to_ascii_lowercase().as_str() {
"table" => {
table_depth = table_depth.saturating_sub(1);
if table_depth == 0 {
if let Some(row) = current_row.take() {
if !row.cells.is_empty() {
rows.push(finish_row(row));
}
}
}
}
"th" | "td" => {
if let Some(cell) = current_cell.take() {
if let Some(row) = current_row.as_mut() {
row.cells.push(cell);
}
}
}
"tr" => {
if let Some(row) = current_row.take() {
if !row.cells.is_empty() {
rows.push(finish_row(row));
}
}
}
_ => {}
}
}
Ok(Event::Text(e)) => {
if let Some(cell) = current_cell.as_mut() {
match e.unescape() {
Ok(text) => cell.text.push_str(&text),
Err(err) => tracing::warn!("html_table: entity decode error: {err}"),
}
}
}
Ok(Event::Eof) => break,
Err(err) => {
tracing::warn!("html_table: quick_xml parse error: {err}");
return None;
}
_ => {}
}
}
if rows.is_empty() {
tracing::warn!("html_table: table has zero rows, returning None");
return None;
}
Some(rows)
}
/// Returns the part of a tag name after its last `:` (the whole name when
/// there is no `:`), as an owned string.
///
/// Names that are not valid UTF-8 yield an empty string.
fn local_name(name: &[u8]) -> String {
    let Ok(qualified) = std::str::from_utf8(name) else {
        return String::new();
    };
    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
/// Parses one span attribute value: UTF-8, optional surrounding whitespace,
/// an unsigned integer, clamped to at least 1. Returns `None` if it is not
/// a valid `u32`.
fn parse_span_value(raw: &[u8]) -> Option<u32> {
    let parsed = std::str::from_utf8(raw).ok()?.trim().parse::<u32>().ok()?;
    Some(parsed.max(1))
}

/// Extracts `(colspan, rowspan)` from a cell's start tag.
///
/// Both values default to 1. Attribute names are matched ASCII
/// case-insensitively, consistent with how tag names are matched in this
/// file, so `COLSPAN="2"` is honoured. Values that are not valid unsigned
/// integers leave the default in place, and `0` is raised to 1. Attributes
/// that quick-xml fails to parse are skipped.
fn parse_span_attrs(e: &quick_xml::events::BytesStart<'_>) -> (u32, u32) {
    let mut colspan: u32 = 1;
    let mut rowspan: u32 = 1;
    for attr in e.attributes().flatten() {
        let key: &[u8] = attr.key.as_ref();
        let target = if key.eq_ignore_ascii_case(b"colspan") {
            &mut colspan
        } else if key.eq_ignore_ascii_case(b"rowspan") {
            &mut rowspan
        } else {
            continue;
        };
        if let Some(span) = parse_span_value(attr.value.as_ref()) {
            *target = span;
        }
    }
    (colspan, rowspan)
}
/// Accumulates the cells of one table row while the event stream is walked.
/// Converted into an `ir::TableRow` by `finish_row`.
#[derive(Default)]
struct RowBuilder {
/// Cells collected for this row so far, in the order they were pushed.
cells: Vec<CellBuilder>,
}
/// Accumulates one `<td>`/`<th>` cell while the event stream is walked.
struct CellBuilder {
/// Unescaped text appended from the text events seen while the cell is open.
text: String,
/// Column span as returned by `parse_span_attrs` (at least 1).
colspan: u32,
/// Row span as returned by `parse_span_attrs` (at least 1).
rowspan: u32,
/// `true` when the cell was opened by a `th` tag rather than `td`.
is_th: bool,
}
fn finish_row(row: RowBuilder) -> ir::TableRow {
let is_header = !row.cells.is_empty() && row.cells.iter().all(|c| c.is_th);
let cells = row
.cells
.into_iter()
.map(|c| ir::TableCell {
blocks: vec![ir::Block::Paragraph {
inlines: vec![ir::Inline::plain(c.text)],
}],
colspan: c.colspan,
rowspan: c.rowspan,
})
.collect();
ir::TableRow { cells, is_header }
}