use scraper::node::Node;
use scraper::{ElementRef, Html};
use crate::elements::{self, BlockKind, ElementAction, InlineKind};
use crate::replies;
use crate::tables;
use crate::whitespace;
/// Converts an HTML string into Markdown text.
///
/// An empty input yields an empty string without parsing anything. Any other
/// input is parsed as a full document, the tree below the root element is
/// walked into a fresh `Context`, and the accumulated buffer is returned
/// after being passed through `whitespace::normalize`.
pub fn convert(html: &str) -> String {
    match html {
        "" => String::new(),
        markup => {
            let parsed = Html::parse_document(markup);
            let mut state = Context::new();
            walk_children(parsed.root_element(), &mut state);
            whitespace::normalize(&state.output)
        }
    }
}
/// Mutable state threaded through the HTML tree walk.
struct Context {
    /// Markdown accumulated so far.
    output: String,
    /// Number of list containers currently open around the walker.
    list_depth: u32,
    /// True while walking inside a preformatted block; text is copied verbatim.
    in_pre: bool,
    /// True while collecting the label of a link, so nested anchors are not
    /// turned into links of their own.
    in_link: bool,
    /// One entry per open list, innermost last.
    list_stack: Vec<ListType>,
}

/// Kind of list container that is currently open.
#[derive(Clone, Copy)]
enum ListType {
    /// Bulleted list; every item is prefixed with `- `.
    Unordered,
    /// Numbered list; the payload is the number handed to the most recent
    /// item (0 before the first item).
    Ordered(u32),
}

impl Context {
    /// Creates an empty context with a pre-sized output buffer.
    fn new() -> Self {
        Self {
            output: String::with_capacity(4096),
            list_depth: 0,
            in_pre: false,
            in_link: false,
            list_stack: Vec::new(),
        }
    }

    /// Appends `s` to the output verbatim.
    fn push(&mut self, s: &str) {
        self.output.push_str(s);
    }

    /// Appends a single character to the output verbatim.
    fn push_char(&mut self, c: char) {
        self.output.push(c);
    }

    /// Makes the output end with exactly one blank line (`"\n\n"`).
    ///
    /// Nothing is written while the output is empty or holds only spaces, so
    /// a document never starts with blank lines. Trailing spaces before the
    /// separator are dropped. Only the newlines that are actually missing are
    /// appended, so a buffer that already ends with one `\n` does not end up
    /// with three.
    fn ensure_blank_line(&mut self) {
        let kept = self.output.trim_end_matches(' ').len();
        if kept == 0 {
            return;
        }
        let trailing_newlines = self.output[..kept]
            .bytes()
            .rev()
            .take_while(|&b| b == b'\n')
            .count();
        if trailing_newlines >= 2 {
            return;
        }
        self.output.truncate(kept);
        for _ in trailing_newlines..2 {
            self.output.push('\n');
        }
    }

    /// Terminates the current line unless the output is empty or already
    /// ends with a newline.
    fn ensure_newline(&mut self) {
        if !self.output.is_empty() && !self.output.ends_with('\n') {
            self.output.push('\n');
        }
    }

    /// Returns the indentation to emit before a list item marker.
    ///
    /// Top-level items (depth 0 or 1) are not indented. A nested item is
    /// indented by the combined width of the markers of the items that
    /// enclose it, so its own marker starts in the column where the parent
    /// item's text starts. That is the position CommonMark requires for a
    /// nested list: two columns under a `- ` parent, and the number's digits
    /// plus two under a numbered parent. A single space per level is not
    /// enough for the item to be parsed as nested.
    fn list_indent(&self) -> String {
        if self.list_depth <= 1 {
            return String::new();
        }
        // Every open list except the innermost one belongs to an enclosing item.
        let enclosing = self.list_stack.len().saturating_sub(1);
        let width: usize = self.list_stack[..enclosing]
            .iter()
            .map(|list| match list {
                ListType::Unordered => "- ".len(),
                ListType::Ordered(n) => n.to_string().len() + ". ".len(),
            })
            .sum();
        " ".repeat(width)
    }
}
/// Visits the direct children of `parent` in document order.
///
/// Text nodes go to `handle_text`, element nodes go to `handle_element`, and
/// every other node kind is ignored.
fn walk_children(parent: ElementRef, ctx: &mut Context) {
    for node in parent.children() {
        if let Node::Text(content) = node.value() {
            handle_text(&content.text, ctx);
        } else if let Some(element) = ElementRef::wrap(node) {
            // `wrap` only succeeds for element nodes, so anything else falls through.
            handle_element(element, ctx);
        }
    }
}
/// Appends a text node to the output.
///
/// Inside a preformatted block the text is copied unchanged. Elsewhere every
/// run of ASCII whitespace is reduced to a single space, and a run is dropped
/// entirely when the output already ends with a space or a newline.
fn handle_text(text: &str, ctx: &mut Context) {
    if ctx.in_pre {
        ctx.push(text);
        return;
    }

    // Tracks whether the character before the current one counts as blank.
    let mut prev_blank = matches!(ctx.output.chars().next_back(), Some(' ' | '\n'));
    for ch in text.chars() {
        let blank = ch.is_ascii_whitespace();
        if !blank {
            ctx.push_char(ch);
        } else if !prev_blank {
            ctx.push_char(' ');
        }
        prev_blank = blank;
    }
}
/// Dispatches one element node to the matching renderer.
///
/// The checks run in a fixed order: hidden elements produce nothing, reply
/// boundaries are rendered as quoted blocks, Outlook separators become a
/// paragraph of their own, and everything else is routed by the action that
/// `elements::classify` returns.
fn handle_element(el: ElementRef, ctx: &mut Context) {
    let node = el.value();
    if elements::is_hidden(node) {
        return;
    }
    if replies::is_reply_boundary(el) {
        return render_reply_block(el, ctx);
    }
    if replies::is_outlook_separator(el) {
        return render_outlook_separator(el, ctx);
    }

    match elements::classify(node) {
        ElementAction::Inline(kind) => handle_inline(el, ctx, kind),
        ElementAction::Block(kind) => handle_block(el, ctx, kind),
        ElementAction::Transparent => walk_children(el, ctx),
        ElementAction::Skip => {}
    }
}

/// Emits the text of an Outlook separator as its own paragraph, with all
/// whitespace runs collapsed to single spaces.
fn render_outlook_separator(el: ElementRef, ctx: &mut Context) {
    let raw = el.text().collect::<String>();
    let words: Vec<&str> = raw.split_whitespace().collect();
    ctx.ensure_blank_line();
    ctx.push(&words.join(" "));
    ctx.ensure_blank_line();
}
fn handle_block(el: ElementRef, ctx: &mut Context, kind: BlockKind) {
match kind {
BlockKind::Paragraph => {
ctx.ensure_blank_line();
walk_children(el, ctx);
ctx.ensure_blank_line();
}
BlockKind::Heading(level) => {
ctx.ensure_blank_line();
let prefix = "#".repeat(level as usize);
ctx.push(&prefix);
ctx.push_char(' ');
walk_children(el, ctx);
ctx.ensure_blank_line();
}
BlockKind::Blockquote => {
ctx.ensure_blank_line();
let mut inner_ctx = Context::new();
inner_ctx.in_pre = ctx.in_pre;
inner_ctx.in_link = ctx.in_link;
walk_children(el, &mut inner_ctx);
let inner = whitespace::normalize(&inner_ctx.output);
for line in inner.lines() {
ctx.push("> ");
ctx.push(line);
ctx.push_char('\n');
}
ctx.push_char('\n');
}
BlockKind::UnorderedList => {
ctx.ensure_blank_line();
ctx.list_depth += 1;
ctx.list_stack.push(ListType::Unordered);
walk_children(el, ctx);
ctx.list_stack.pop();
ctx.list_depth -= 1;
ctx.ensure_blank_line();
}
BlockKind::OrderedList => {
ctx.ensure_blank_line();
ctx.list_depth += 1;
ctx.list_stack.push(ListType::Ordered(0));
walk_children(el, ctx);
ctx.list_stack.pop();
ctx.list_depth -= 1;
ctx.ensure_blank_line();
}
BlockKind::ListItem => {
ctx.ensure_newline();
let indent = ctx.list_indent();
ctx.push(&indent);
let marker = match ctx.list_stack.last_mut() {
Some(ListType::Unordered) => "- ".to_string(),
Some(ListType::Ordered(n)) => {
*n += 1;
format!("{}. ", *n)
}
None => "- ".to_string(),
};
ctx.push(&marker);
walk_children(el, ctx);
ctx.ensure_newline();
}
BlockKind::PreFormatted => {
ctx.ensure_blank_line();
ctx.push("```\n");
ctx.in_pre = true;
walk_children(el, ctx);
ctx.in_pre = false;
ctx.ensure_newline();
ctx.push("```");
ctx.ensure_blank_line();
}
BlockKind::HorizontalRule => {
ctx.ensure_blank_line();
ctx.push("---");
ctx.ensure_blank_line();
}
BlockKind::Table => {
ctx.ensure_blank_line();
if tables::is_data_table(el) {
let (headers, rows) = tables::extract_table_data(el);
let md = tables::render_markdown_table(&headers, &rows);
if !md.is_empty() {
ctx.push(&md);
}
} else {
render_layout_table(el, ctx);
}
ctx.ensure_blank_line();
}
BlockKind::Div => {
ctx.ensure_blank_line();
walk_children(el, ctx);
ctx.ensure_blank_line();
}
}
}
fn handle_inline(el: ElementRef, ctx: &mut Context, kind: InlineKind) {
match kind {
InlineKind::Bold => {
ctx.push("**");
walk_children(el, ctx);
ctx.push("**");
}
InlineKind::Italic => {
ctx.push("*");
walk_children(el, ctx);
ctx.push("*");
}
InlineKind::Strikethrough => {
ctx.push("~~");
walk_children(el, ctx);
ctx.push("~~");
}
InlineKind::Code => {
if ctx.in_pre {
walk_children(el, ctx);
} else {
ctx.push("`");
walk_children(el, ctx);
ctx.push("`");
}
}
InlineKind::Link => {
if ctx.in_link {
walk_children(el, ctx);
return;
}
let href = el.value().attr("href").unwrap_or("");
if href.is_empty() || href == "#" {
walk_children(el, ctx);
return;
}
let mut text_ctx = Context::new();
text_ctx.in_link = true;
walk_children(el, &mut text_ctx);
let text = text_ctx.output.trim().to_string();
if text.is_empty() {
ctx.push(href);
} else if text == href {
ctx.push(href);
} else {
ctx.push("[");
ctx.push(&text);
ctx.push("](");
ctx.push(href);
ctx.push(")");
}
}
InlineKind::Image => {
let element = el.value();
if elements::is_tracking_pixel(element) {
return;
}
let alt = element.attr("alt").unwrap_or("");
let src = element.attr("src").unwrap_or("");
if src.is_empty() {
return;
}
ctx.push(";
ctx.push(src);
ctx.push(")");
}
InlineKind::LineBreak => {
ctx.push_char('\n');
}
InlineKind::Superscript => {
ctx.push("^");
walk_children(el, ctx);
}
InlineKind::Subscript => {
ctx.push("~");
walk_children(el, ctx);
}
}
}
fn render_reply_block(el: ElementRef, ctx: &mut Context) {
ctx.ensure_blank_line();
if let Some(attribution) = replies::find_attribution(el) {
ctx.push(&attribution);
ctx.push_char('\n');
}
let mut inner_ctx = Context::new();
inner_ctx.in_pre = ctx.in_pre;
inner_ctx.in_link = ctx.in_link;
walk_children(el, &mut inner_ctx);
let inner = whitespace::normalize(&inner_ctx.output);
if !inner.is_empty() {
for line in inner.lines() {
ctx.push("> ");
ctx.push(line);
ctx.push_char('\n');
}
ctx.push_char('\n');
}
}
/// Linearizes a layout table: each visible cell that belongs to this table
/// is rendered as a block of its own, in document order.
///
/// Tables nested inside a cell are not visited here. They are reached when
/// the cell's children are walked, so their content is rendered once.
/// Iterating over all descendants rendered it a second time after the outer
/// cell had already been walked.
fn render_layout_table(table: ElementRef, ctx: &mut Context) {
    render_layout_cells(table, ctx);
}

/// Walks the structural children of a table (row groups, rows, ...) and
/// renders every `td`/`th` found, without descending into the cells
/// themselves. Hidden elements are skipped together with everything below
/// them, so cells of a hidden row are not rendered.
fn render_layout_cells(container: ElementRef, ctx: &mut Context) {
    for child in container.children().filter_map(ElementRef::wrap) {
        let element = child.value();
        if elements::is_hidden(element) {
            continue;
        }
        match element.name() {
            "td" | "th" => {
                walk_children(child, ctx);
                ctx.ensure_blank_line();
            }
            _ => render_layout_cells(child, ctx),
        }
    }
}
/// End-to-end tests for `convert`, one HTML construct per test.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_input() {
        assert_eq!(convert(""), "");
    }

    #[test]
    fn plain_text() {
        assert_eq!(convert("hello world"), "hello world");
    }

    #[test]
    fn paragraph() {
        assert_eq!(convert("<p>one</p><p>two</p>"), "one\n\ntwo");
    }

    #[test]
    fn headings() {
        assert_eq!(convert("<h1>Title</h1>"), "# Title");
        assert_eq!(convert("<h3>Sub</h3>"), "### Sub");
    }

    #[test]
    fn bold_and_italic() {
        assert_eq!(
            convert("<p><strong>bold</strong> and <em>italic</em></p>"),
            "**bold** and *italic*"
        );
    }

    #[test]
    fn link() {
        assert_eq!(
            convert(r#"<a href="https://example.com">click</a>"#),
            "[click](https://example.com)"
        );
    }

    #[test]
    fn link_text_matches_url() {
        assert_eq!(
            convert(r#"<a href="https://example.com">https://example.com</a>"#),
            "https://example.com"
        );
    }

    #[test]
    fn link_empty_href() {
        assert_eq!(convert(r#"<a href="">click</a>"#), "click");
    }

    #[test]
    fn image() {
        assert_eq!(
            convert(r#"<img src="photo.jpg" alt="A photo">"#),
            "![A photo](photo.jpg)"
        );
    }

    #[test]
    fn tracking_pixel_skipped() {
        assert_eq!(convert(r#"<img src="track.gif" width="1" height="1">"#), "");
    }

    #[test]
    fn unordered_list() {
        assert_eq!(
            convert("<ul><li>one</li><li>two</li></ul>"),
            "- one\n- two"
        );
    }

    #[test]
    fn ordered_list() {
        assert_eq!(
            convert("<ol><li>first</li><li>second</li></ol>"),
            "1. first\n2. second"
        );
    }

    #[test]
    fn nested_list() {
        let html = "<ul><li>outer<ul><li>inner</li></ul></li></ul>";
        let md = convert(html);
        assert!(md.contains("- outer"));
        assert!(md.contains(" - inner"));
    }

    #[test]
    fn nested_list_exact_indent_depth_2() {
        // A nested bullet is indented to the text column of its `- ` parent.
        assert_eq!(
            convert("<ul><li>A<ul><li>B</li></ul></li></ul>"),
            "- A\n\n  - B"
        );
    }

    #[test]
    fn triple_nested_list_exact_indent_depth_3() {
        assert_eq!(
            convert("<ul><li>A<ul><li>B<ul><li>C</li></ul></li></ul></li></ul>"),
            "- A\n\n  - B\n\n    - C"
        );
    }

    #[test]
    fn sibling_top_level_lists_have_no_indent_after_nesting() {
        let md = convert("<ul><li>A<ul><li>B</li></ul></li></ul><ul><li>C</li></ul>");
        assert!(
            md.contains("\n- C"),
            "second top-level list must not be indented after a nested list closes; got: {md:?}"
        );
        // Independent of the indent width: no indented line may carry the item.
        assert!(
            !md.lines()
                .any(|line| line.starts_with(' ') && line.trim_start() == "- C"),
            "second list incorrectly indented; got: {md:?}"
        );
    }

    #[test]
    fn ordered_list_decrements_depth_after_nesting() {
        let md = convert("<ol><li>A<ol><li>B</li></ol></li></ol><ol><li>C</li></ol>");
        assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}");
        assert!(
            !md.lines()
                .any(|line| line.starts_with(' ') && line.trim_start() == "1. C"),
            "second ol indented incorrectly: {md:?}"
        );
    }

    #[test]
    fn blockquote() {
        assert_eq!(convert("<blockquote>quoted text</blockquote>"), "> quoted text");
    }

    #[test]
    fn nested_blockquote() {
        let html = "<blockquote>outer<blockquote>inner</blockquote></blockquote>";
        let md = convert(html);
        assert!(md.contains("> outer"));
        assert!(md.contains("> > inner"));
    }

    #[test]
    fn preformatted() {
        let html = "<pre><code>fn main() {\n println!(\"hi\");\n}</code></pre>";
        let md = convert(html);
        assert!(md.starts_with("```\n"));
        assert!(md.contains("fn main()"));
        assert!(md.ends_with("\n```"));
    }

    #[test]
    fn horizontal_rule() {
        assert_eq!(convert("<p>above</p><hr><p>below</p>"), "above\n\n---\n\nbelow");
    }

    #[test]
    fn br_tag() {
        assert_eq!(convert("line one<br>line two"), "line one\nline two");
    }

    #[test]
    fn strikethrough() {
        assert_eq!(convert("<del>removed</del>"), "~~removed~~");
    }

    #[test]
    fn inline_code() {
        assert_eq!(convert("use <code>pter</code> here"), "use `pter` here");
    }

    #[test]
    fn script_and_style_stripped() {
        assert_eq!(
            convert("<p>text</p><script>alert('x')</script><style>.x{}</style>"),
            "text"
        );
    }

    #[test]
    fn unknown_elements_transparent() {
        assert_eq!(convert("<span>hello</span>"), "hello");
    }

    #[test]
    fn hidden_element_skipped() {
        assert_eq!(
            convert(r#"<p>visible</p><div style="display:none">hidden</div>"#),
            "visible"
        );
    }

    #[test]
    fn whitespace_collapsed() {
        // Leading, trailing and repeated inner whitespace must all collapse.
        assert_eq!(convert("  lots   of   space  "), "lots of space");
    }

    #[test]
    fn entities_decoded() {
        assert_eq!(convert("<p>&amp; &lt; &gt; &quot;</p>"), "& < > \"");
    }

    #[test]
    fn sup_and_sub() {
        assert_eq!(convert("x<sup>2</sup>"), "x^2");
        assert_eq!(convert("H<sub>2</sub>O"), "H~2O");
    }

    #[test]
    fn div_separates_blocks() {
        assert_eq!(convert("<div>one</div><div>two</div>"), "one\n\ntwo");
    }

    #[test]
    fn layout_table_single_cell_unwrapped() {
        let html = "<table><tr><td><p>Hello world</p></td></tr></table>";
        assert_eq!(convert(html), "Hello world");
    }

    #[test]
    fn layout_table_multi_column_linearized() {
        let html = "<table><tr><td>Left</td><td>Right</td></tr></table>";
        let md = convert(html);
        assert!(md.contains("Left"));
        assert!(md.contains("Right"));
    }

    #[test]
    fn data_table_rendered_as_markdown() {
        let html = "<table><tr><th>Name</th><th>Age</th></tr>\
                    <tr><td>Alice</td><td>30</td></tr>\
                    <tr><td>Bob</td><td>25</td></tr></table>";
        let md = convert(html);
        assert!(md.contains("| Name | Age |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| Alice | 30 |"));
        assert!(md.contains("| Bob | 25 |"));
    }

    #[test]
    fn nested_layout_tables_unwrapped() {
        let html = "<table><tr><td>\
                    <table><tr><td>Inner content</td></tr></table>\
                    </td></tr></table>";
        let md = convert(html);
        assert!(md.contains("Inner content"));
        assert!(!md.contains("|"));
    }

    #[test]
    fn presentation_role_is_layout() {
        let html = r#"<table role="presentation"><tr><td>Content</td><td> </td></tr></table>"#;
        let md = convert(html);
        assert!(md.contains("Content"));
        assert!(!md.contains("|"));
    }

    #[test]
    fn spacer_element_hidden() {
        let html = r#"<p>real</p><div style="font-size:0">spacer</div><p>also real</p>"#;
        let md = convert(html);
        assert!(md.contains("real"));
        assert!(!md.contains("spacer"));
        assert!(md.contains("also real"));
    }

    #[test]
    fn mixed_content() {
        let html = r#"
<h1>Subject</h1>
<p>Hello <strong>Max</strong>,</p>
<p>Check out <a href="https://example.com">this link</a>.</p>
<ul>
<li>Item one</li>
<li>Item two</li>
</ul>
"#;
        let md = convert(html);
        assert!(md.starts_with("# Subject"));
        assert!(md.contains("Hello **Max**,"));
        assert!(md.contains("[this link](https://example.com)"));
        assert!(md.contains("- Item one\n- Item two"));
    }
}