use scraper::ElementRef;
/// Decides whether `table` looks like a data table rather than a layout table.
///
/// An explicit `role` attribute wins: `grid` / `table` mean data, while
/// `presentation` / `none` mean layout. Any other (or missing) role falls back
/// to structure: a `<th>` or `<caption>` anywhere among the table's
/// descendants marks it as data; otherwise at least two `<tr>` rows must each
/// hold more than one direct `<td>`/`<th>` child with substantive text.
pub fn is_data_table(table: ElementRef) -> bool {
    match table.value().attr("role") {
        Some("grid") | Some("table") => return true,
        Some("presentation") | Some("none") => return false,
        _ => {}
    }

    let mut populated_rows = 0u32;
    for node in table.descendants().filter_map(ElementRef::wrap) {
        match node.value().name() {
            // Either marker alone settles the verdict, so there is no need to keep scanning.
            "th" | "caption" => return true,
            "tr" => {
                let mut substantive_cells = 0usize;
                for cell in node.children().filter_map(ElementRef::wrap) {
                    let is_cell = matches!(cell.value().name(), "td" | "th");
                    if is_cell && has_substantive_text(cell) {
                        substantive_cells += 1;
                    }
                }
                if substantive_cells >= 2 {
                    populated_rows += 1;
                }
            }
            _ => {}
        }
    }

    populated_rows >= 2
}
/// Reports whether `el` contains more than one character of meaningful text.
///
/// The element's text is concatenated, surrounding whitespace is trimmed and
/// non-breaking spaces (U+00A0) are dropped, so empty spacer cells and
/// single-glyph separators are rejected.
///
/// Length is measured in Unicode scalar values rather than UTF-8 bytes: a byte
/// count would let a lone multi-byte glyph such as `•` or `»` pass while a
/// lone ASCII `|` does not.
fn has_substantive_text(el: ElementRef) -> bool {
    let text = el.text().collect::<String>();
    text.trim()
        .chars()
        .filter(|&c| c != '\u{a0}')
        // A second remaining character means "more than one".
        .nth(1)
        .is_some()
}
pub fn extract_table_data(table: ElementRef) -> (Vec<String>, Vec<Vec<String>>) {
let mut headers: Vec<String> = Vec::new();
let mut rows: Vec<Vec<String>> = Vec::new();
for descendant in table.children().filter_map(ElementRef::wrap) {
let name = descendant.value().name();
if name == "thead" {
for tr in descendant.children().filter_map(ElementRef::wrap) {
if tr.value().name() == "tr" {
headers = extract_cells(tr);
break; }
}
} else if name == "tbody" || name == "tr" {
let trs: Box<dyn Iterator<Item = ElementRef>> = if name == "tbody" {
Box::new(
descendant
.children()
.filter_map(ElementRef::wrap)
.filter(|e| e.value().name() == "tr"),
)
} else {
Box::new(std::iter::once(descendant))
};
for tr in trs {
let cells = extract_cells(tr);
if !cells.is_empty() {
if headers.is_empty() && has_th_cells(tr) {
headers = cells;
} else {
rows.push(cells);
}
}
}
}
}
if headers.is_empty() && !rows.is_empty() {
headers = rows.remove(0);
}
(headers, rows)
}
/// Collects the text of every direct `<td>`/`<th>` child of `tr`, in order.
///
/// Each cell's text is whitespace-normalised: runs of whitespace collapse to
/// a single space and leading/trailing whitespace is removed.
fn extract_cells(tr: ElementRef) -> Vec<String> {
    let mut cells = Vec::new();
    for child in tr.children() {
        if let Some(cell) = ElementRef::wrap(child) {
            if !matches!(cell.value().name(), "td" | "th") {
                continue;
            }
            let raw: String = cell.text().collect();
            let mut normalized = String::with_capacity(raw.len());
            for word in raw.split_whitespace() {
                if !normalized.is_empty() {
                    normalized.push(' ');
                }
                normalized.push_str(word);
            }
            cells.push(normalized);
        }
    }
    cells
}
/// Returns `true` when at least one direct element child of `tr` is a `<th>`.
fn has_th_cells(tr: ElementRef) -> bool {
    for child in tr.children() {
        if let Some(el) = ElementRef::wrap(child) {
            if el.value().name() == "th" {
                return true;
            }
        }
    }
    false
}
/// Renders `headers` and `rows` as a pipe-delimited Markdown table.
///
/// The column count is fixed by `headers`: rows with fewer cells are padded
/// with empty cells and surplus cells are ignored. An empty `headers` slice
/// yields an empty string. The result has no trailing newline.
///
/// Cell text is made safe for a single table row: a literal `|` is escaped as
/// `\|` so it cannot split the cell into extra columns, and embedded line
/// breaks are replaced by spaces so a cell cannot terminate its row.
pub fn render_markdown_table(headers: &[String], rows: &[Vec<String>]) -> String {
    /// Appends one `| a | b |` line (including the trailing newline) to `out`.
    fn push_row<'a>(out: &mut String, cells: impl Iterator<Item = &'a str>) {
        out.push('|');
        for cell in cells {
            out.push(' ');
            for ch in cell.chars() {
                match ch {
                    '|' => out.push_str("\\|"),
                    '\n' | '\r' => out.push(' '),
                    other => out.push(other),
                }
            }
            out.push_str(" |");
        }
        out.push('\n');
    }

    if headers.is_empty() {
        return String::new();
    }

    let col_count = headers.len();
    let mut out = String::new();

    push_row(&mut out, headers.iter().map(String::as_str));
    push_row(&mut out, std::iter::repeat("---").take(col_count));
    for row in rows {
        // Index by column so short rows are padded and long rows truncated.
        push_row(
            &mut out,
            (0..col_count).map(|i| row.get(i).map_or("", |cell| cell.as_str())),
        );
    }

    // Every line ends in "|\n"; drop only the final newline.
    out.pop();
    out
}
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parses `html` as a full document so a `<table>` can be selected from it.
    fn parse_table(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Returns the first `<table>` element in `doc`; panics when there is none.
    fn select_table(doc: &Html) -> ElementRef<'_> {
        let sel = Selector::parse("table").unwrap();
        doc.select(&sel).next().unwrap()
    }

    // ---- is_data_table: structural heuristics ----

    #[test]
    fn single_cell_is_layout() {
        let doc = parse_table("<table><tr><td>content</td></tr></table>");
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn table_with_th_is_data() {
        let doc = parse_table(
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn table_with_caption_is_data() {
        let doc = parse_table(
            "<table><caption>Users</caption><tr><td>Alice</td><td>30</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_presentation_is_layout() {
        let doc = parse_table(
            r#"<table role="presentation"><tr><td>layout</td><td>stuff</td></tr></table>"#,
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_grid_is_data() {
        let doc =
            parse_table(r#"<table role="grid"><tr><td>Alice</td><td>30</td></tr></table>"#);
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn multi_row_multi_cell_is_data() {
        let doc = parse_table(
            "<table>\
             <tr><td>Alice</td><td>Engineer</td></tr>\
             <tr><td>Bob</td><td>Designer</td></tr>\
             </table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn spacer_cells_not_substantive() {
        // One spacer holds a plain space, the other an explicit no-break
        // space (U+00A0); neither may count as a substantive cell.
        let doc = parse_table(
            "<table><tr><td>content</td><td> </td></tr>\
             <tr><td>more</td><td>\u{a0}</td></tr></table>",
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    // ---- render_markdown_table ----

    #[test]
    fn render_simple_table() {
        let headers = vec!["Name".into(), "Age".into()];
        let rows = vec![
            vec!["Alice".into(), "30".into()],
            vec!["Bob".into(), "25".into()],
        ];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(
            md,
            "| Name | Age |\n| --- | --- |\n| Alice | 30 |\n| Bob | 25 |"
        );
    }

    #[test]
    fn render_empty_headers() {
        let md = render_markdown_table(&[], &[]);
        assert_eq!(md, "");
    }

    // ---- extract_table_data ----

    #[test]
    fn extract_with_thead() {
        let doc = parse_table(
            "<table><thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><td>1</td><td>2</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn extract_promotes_first_row() {
        let doc = parse_table(
            "<table><tr><td>Name</td><td>Val</td></tr>\
             <tr><td>X</td><td>Y</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }

    // ---- is_data_table: role handling ----

    #[test]
    fn role_none_is_layout() {
        let doc = parse_table(
            r#"<table role="none"><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>"#,
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_table_is_data() {
        let doc = parse_table(r#"<table role="table"><tr><td>a</td></tr></table>"#);
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_unknown_falls_through_to_structural() {
        let doc =
            parse_table(r#"<table role="banner"><tr><td>only one cell</td></tr></table>"#);
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_presentation_overrides_structure() {
        // `concat!` joins the pieces at compile time. A trailing `\` inside a
        // raw string is NOT a line continuation and would leave a stray
        // backslash, newline and indentation in the markup.
        let doc = parse_table(concat!(
            r#"<table role="presentation"><tr><td>Alice</td><td>Engineer</td></tr>"#,
            r#"<tr><td>Bob</td><td>Designer</td></tr></table>"#,
        ));
        assert!(!is_data_table(select_table(&doc)));
    }

    // ---- is_data_table: substantive-text threshold ----

    #[test]
    fn single_char_cells_not_substantive() {
        let doc = parse_table(
            "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>",
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn two_char_cells_are_substantive() {
        let doc = parse_table(
            "<table><tr><td>ab</td><td>cd</td></tr><tr><td>ef</td><td>gh</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    // ---- extract_table_data: header selection ----

    #[test]
    fn extract_with_tbody_no_thead() {
        let doc = parse_table(
            "<table><tbody><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }

    #[test]
    fn thead_present_blocks_later_th_row_promotion() {
        let doc = parse_table(
            "<table><thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"], "thead headers must not be overwritten");
        assert_eq!(r.len(), 2);
    }

    #[test]
    fn no_thead_th_row_promotes_to_headers() {
        let doc = parse_table(
            "<table><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["X", "Y"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn all_td_rows_promote_first_to_headers() {
        let doc = parse_table(
            "<table><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr><tr><td>P</td><td>Q</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r.len(), 2);
    }

    #[test]
    fn td_only_row_is_not_a_header_row() {
        // With no `<th>` anywhere, only the first row is promoted by the
        // final fallback; the later `<td>` rows stay data rows.
        let doc = parse_table(
            "<table><tr><td>data-1</td><td>data-2</td></tr>\
             <tr><td>data-3</td><td>data-4</td></tr>\
             <tr><td>data-5</td><td>data-6</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["data-1", "data-2"]);
        assert_eq!(r.len(), 2, "remaining rows should be data, not headers");
    }
}