use itertools::Itertools;
use miette::miette;
use super::iframe::detect_embed;
use super::node::HtmlElement;
use super::node::HtmlNode;
use super::options::ConversionOptions;
/// Flag attached to a rendered fragment: `true` when the fragment is inline
/// content (appended without a blank-line separator by
/// `convert_nodes_to_markdown`), `false` for block-level content.
type MarkdownInline = bool;
/// A rendered Markdown fragment paired with its inline/block flag.
type MarkdownBlock = (String, MarkdownInline);
/// Collects the raw text found under `nodes`, depth first.
///
/// Text nodes are copied verbatim, every `<br>` element contributes a single
/// `'\n'`, comments are ignored, and any other element (including `<code>`)
/// is replaced by the text of its children.
fn extract_text_from_pre_children(nodes: &[HtmlNode]) -> String {
    /// Appends the text of `nodes` to `out`, recursing into child elements.
    fn append_text(nodes: &[HtmlNode], out: &mut String) {
        for node in nodes {
            match node {
                HtmlNode::Text(text) => out.push_str(text),
                HtmlNode::Comment(_) => {}
                HtmlNode::Element(el) if el.tag_name == "br" => out.push('\n'),
                HtmlNode::Element(el) => append_text(&el.children, out),
            }
        }
    }

    let mut collected = String::new();
    append_text(nodes, &mut collected);
    collected
}
/// Returns a copy of `text` in which NO-BREAK SPACE (U+00A0), NARROW NO-BREAK
/// SPACE (U+202F) and THIN SPACE (U+2009) are each replaced by a plain ASCII
/// space. Every other character is kept unchanged.
fn normalize_unicode_whitespace(text: &str) -> String {
    const SPECIAL_SPACES: [char; 3] = ['\u{00A0}', '\u{202F}', '\u{2009}'];
    // `replace` already yields an unchanged copy when nothing matches.
    text.replace(&SPECIAL_SPACES[..], " ")
}
/// Horizontal alignment of a table column, as produced by
/// `get_cell_alignment` and consumed when the table separator row is written.
#[derive(PartialEq, Debug, Clone, Copy)]
enum Alignment {
/// Left aligned; rendered as `:---` in the separator row.
Left,
/// Centered; rendered as `:---:` in the separator row.
Center,
/// Right aligned; rendered as `---:` in the separator row.
Right,
/// No explicit alignment found; rendered as `---` in the separator row.
Default,
}
/// Determines the horizontal alignment requested by a table cell.
///
/// The inline `style` attribute is consulted first: the first `text-align`
/// declaration whose value is `left`, `center` or `right` wins. If none is
/// found, the legacy `align` attribute is checked. Anything else yields
/// [`Alignment::Default`].
///
/// Both the property name and the keyword are matched case-insensitively
/// (CSS property names and these keywords are case-insensitive), and
/// surrounding whitespace is ignored.
fn get_cell_alignment(element: &HtmlElement) -> Alignment {
    /// Maps an alignment keyword to its variant; unknown keywords give `None`.
    fn parse_keyword(keyword: &str) -> Option<Alignment> {
        match keyword.trim().to_lowercase().as_str() {
            "left" => Some(Alignment::Left),
            "center" => Some(Alignment::Center),
            "right" => Some(Alignment::Right),
            _ => None,
        }
    }

    let from_style = element
        .attributes
        .get("style")
        .and_then(|value| value.as_deref())
        .and_then(|style| {
            style.split(';').find_map(|declaration| {
                let (property, value) = declaration.split_once(':')?;
                if property.trim().eq_ignore_ascii_case("text-align") {
                    // An unrecognised value does not stop the search; a later
                    // declaration may still carry a usable keyword.
                    parse_keyword(value)
                } else {
                    None
                }
            })
        });

    from_style
        .or_else(|| {
            element
                .attributes
                .get("align")
                .and_then(|value| value.as_deref())
                .and_then(parse_keyword)
        })
        .unwrap_or(Alignment::Default)
}
/// Makes `content` safe to place inside a single Markdown table cell.
///
/// * Every `|` is escaped as `\|` so it is not read as a column separator.
/// * Every line break is replaced by an inline `<br>`, because a table row
///   must stay on one physical line. Whitespace directly before the line
///   break (hard-break spaces, a `'\r'`) is dropped.
///
/// Content without line breaks is returned unchanged apart from pipe
/// escaping.
fn escape_table_cell_content(content: &str) -> String {
    let mut escaped = String::with_capacity(content.len());
    let mut segments = content.split('\n').peekable();
    while let Some(segment) = segments.next() {
        let is_last = segments.peek().is_none();
        // Only segments that are followed by a line break lose their trailing
        // whitespace; the final segment is kept exactly as given.
        let segment = if is_last { segment } else { segment.trim_end() };
        for ch in segment.chars() {
            if ch == '|' {
                escaped.push('\\');
            }
            escaped.push(ch);
        }
        if !is_last {
            escaped.push_str("<br>");
        }
    }
    escaped
}
fn convert_html_table_to_markdown(table_element: &HtmlElement) -> miette::Result<String> {
let mut caption_text: Option<String> = None;
for node in &table_element.children {
if let HtmlNode::Element(el) = node
&& el.tag_name == "caption"
{
let text = convert_children_to_string(&el.children)?;
let trimmed = text.trim().to_string();
if !trimmed.is_empty() {
caption_text = Some(trimmed);
}
break;
}
}
let mut header_cells: Vec<String> = Vec::new();
let mut header_alignments: Vec<Alignment> = Vec::new();
let mut body_rows: Vec<Vec<String>> = Vec::new();
let mut first_tbody_first_row_used_as_header = false;
for node in &table_element.children {
if let HtmlNode::Element(thead_element) = node
&& thead_element.tag_name == "thead"
&& let Some(HtmlNode::Element(tr_element)) = thead_element
.children
.iter()
.find(|n| matches!(n, HtmlNode::Element(el) if el.tag_name == "tr"))
{
for cell_node in &tr_element.children {
if let HtmlNode::Element(cell_element) = cell_node
&& (cell_element.tag_name == "th" || cell_element.tag_name == "td")
{
let cell_content = convert_children_to_string(&cell_element.children)?;
header_cells.push(escape_table_cell_content(cell_content.trim()));
header_alignments.push(get_cell_alignment(cell_element));
}
}
break;
}
}
if header_cells.is_empty() {
for node in &table_element.children {
if let HtmlNode::Element(tbody_element) = node {
if tbody_element.tag_name == "tbody"
&& let Some(HtmlNode::Element(tr_element)) = tbody_element
.children
.iter()
.find(|n| matches!(n, HtmlNode::Element(el) if el.tag_name == "tr"))
{
for cell_node in &tr_element.children {
if let HtmlNode::Element(cell_element) = cell_node
&& (cell_element.tag_name == "td" || cell_element.tag_name == "th")
{
let cell_content = convert_children_to_string(&cell_element.children)?;
header_cells.push(escape_table_cell_content(cell_content.trim()));
header_alignments.push(get_cell_alignment(cell_element));
}
}
if !header_cells.is_empty() {
first_tbody_first_row_used_as_header = true;
}
}
break;
}
}
}
if header_cells.is_empty() {
return Ok("".to_string());
}
let column_count = header_cells.len();
let mut first_tbody_processed_for_data = false;
for node in &table_element.children {
if let HtmlNode::Element(tbody_element) = node
&& tbody_element.tag_name == "tbody"
{
let mut rows_to_iterate = tbody_element.children.iter();
if first_tbody_first_row_used_as_header && !first_tbody_processed_for_data {
rows_to_iterate.next();
first_tbody_processed_for_data = true;
}
for tr_node in rows_to_iterate {
if let HtmlNode::Element(tr_element) = tr_node
&& tr_element.tag_name == "tr"
{
let mut current_row_cells: Vec<String> = Vec::new();
for td_node in &tr_element.children {
if let HtmlNode::Element(td_element) = td_node
&& (td_element.tag_name == "td" || td_element.tag_name == "th")
{
let cell_content = convert_children_to_string(&td_element.children)?;
current_row_cells.push(escape_table_cell_content(cell_content.trim()));
}
}
body_rows.push(current_row_cells);
}
}
}
}
let mut markdown_table = String::new();
markdown_table.push_str("| ");
markdown_table.push_str(&header_cells.join(" | "));
markdown_table.push_str(" |\n");
markdown_table.push('|');
for i in 0..column_count {
let align = header_alignments.get(i).unwrap_or(&Alignment::Default);
let sep_str = match align {
Alignment::Left => ":---",
Alignment::Center => ":---:",
Alignment::Right => "---:",
Alignment::Default => "---",
};
markdown_table.push_str(sep_str);
markdown_table.push('|');
}
markdown_table.push('\n');
for row_cells in &body_rows {
markdown_table.push_str("| ");
for cell_idx in 0..column_count {
if let Some(cell_content) = row_cells.get(cell_idx) {
markdown_table.push_str(cell_content);
}
markdown_table.push_str(" | ");
}
if column_count > 0 {
markdown_table.truncate(markdown_table.len() - 3);
}
markdown_table.push_str(" |\n");
}
let table_md = markdown_table.trim_end_matches('\n').to_string();
if let Some(caption) = caption_text {
Ok(format!("{}\n\n{}", caption, table_md))
} else {
Ok(table_md)
}
}
/// Prepares `url` for use as a Markdown link destination.
///
/// Spaces are percent-encoded as `%20`. The result is wrapped in `<...>` when
/// the input is empty, contained a space, or contains `(` or `)`.
fn process_url_for_markdown(url: &str) -> String {
    let encoded = url.replace(' ', "%20");
    let has_parenthesis = encoded.contains(['(', ')']);
    if url.is_empty() || url.contains(' ') || has_parenthesis {
        return format!("<{encoded}>");
    }
    encoded
}
/// Renders a heading element as an ATX heading (`#` markers, a space, then
/// the inline content of the element).
///
/// The level is read from the characters after the first one of the tag name
/// (`"h3"` gives 3). If that part is missing or not a number the level falls
/// back to 1, and the level is clamped to the 1..=6 range of valid ATX
/// headings, so an unexpected tag name can neither panic nor produce an
/// oversized marker.
///
/// # Errors
/// Propagates any error from `convert_children_to_string`.
fn handle_heading_element(element: &HtmlElement) -> miette::Result<String> {
    let heading_text = convert_children_to_string(&element.children)?;
    let level = element
        .tag_name
        // `get` instead of indexing: no panic on an empty or non-ASCII name.
        .get(1..)
        .and_then(|digits| digits.parse::<usize>().ok())
        .unwrap_or(1)
        .clamp(1, 6);
    Ok(format!("{} {}", "#".repeat(level), heading_text))
}
/// Renders a paragraph-like element as the inline Markdown of its children
/// (delegates to `convert_children_to_string`; errors are propagated).
fn handle_paragraph_element(element: &HtmlElement) -> miette::Result<String> {
convert_children_to_string(&element.children)
}
/// Renders a horizontal rule; always succeeds with the literal `---`.
fn handle_hr_element() -> miette::Result<String> {
    let rule = String::from("---");
    Ok(rule)
}
/// Renders a list element by delegating to `convert_html_list_to_markdown`
/// with an indent level of 0; errors are propagated.
fn handle_list_element(element: &HtmlElement, options: ConversionOptions) -> miette::Result<String> {
convert_html_list_to_markdown(element, 0, options)
}
/// Renders a `<blockquote>`: the children are converted as blocks and every
/// resulting line is prefixed with `"> "`. An empty quote is rendered as a
/// lone `>`.
///
/// # Errors
/// Propagates any error from `convert_nodes_to_markdown`.
fn handle_blockquote_element(element: &HtmlElement, options: ConversionOptions) -> miette::Result<String> {
    let quoted_body = convert_nodes_to_markdown(&element.children, options)?;
    if quoted_body.is_empty() {
        return Ok(">".to_string());
    }

    let mut quoted = String::with_capacity(quoted_body.len() + 2);
    for (index, line) in quoted_body.lines().enumerate() {
        if index > 0 {
            quoted.push('\n');
        }
        quoted.push_str("> ");
        quoted.push_str(line);
    }
    Ok(quoted)
}
/// Removes the common leading indentation from every line of `text`.
///
/// The common indent is the smallest number of leading whitespace characters
/// over all lines that contain something other than whitespace. Each line
/// then loses up to that many leading whitespace characters; whitespace-only
/// lines shorter than the indent become empty. Lines are re-joined with
/// `'\n'`.
///
/// When the common indent is zero, `text` is returned unchanged.
///
/// Indentation is counted in characters rather than bytes, so multi-byte
/// whitespace (for example U+3000) can never cause a slice inside a
/// character.
fn dedent(text: &str) -> String {
    let min_indent = text
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
        .min()
        .unwrap_or(0);
    if min_indent == 0 {
        return text.to_owned();
    }

    text.lines()
        .map(|line| {
            // Byte length of at most `min_indent` leading whitespace chars;
            // summing `len_utf8` keeps the cut on a character boundary.
            let strip_len: usize = line
                .chars()
                .take_while(|c| c.is_whitespace())
                .take(min_indent)
                .map(char::len_utf8)
                .sum();
            &line[strip_len..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
/// Renders a `<pre>` element as a fenced code block.
///
/// * Language: taken from the first `language-*` or `lang-*` class on the
///   first `<code>` child, if any.
/// * Content: the text of that first `<code>` child followed by the text of
///   all non-`<code>` siblings; without a `<code>` child, the text of all
///   children. One leading newline and all trailing newlines are removed and
///   the common indentation is stripped via `dedent`.
/// * Fence: backticks, at least three and always one more than the longest
///   backtick run in the content, so code that itself contains a fence cannot
///   terminate the block early.
///
/// `_options` is currently unused.
fn handle_pre_element(element: &HtmlElement, _options: ConversionOptions) -> miette::Result<String> {
    let code_child = element.children.iter().find_map(|node| match node {
        HtmlNode::Element(el) if el.tag_name == "code" => Some(el),
        _ => None,
    });

    let lang_specifier = code_child
        .and_then(|code| code.attributes.get("class"))
        .and_then(|class_attr| class_attr.as_deref())
        .and_then(|class_attr| {
            class_attr.split_whitespace().find_map(|class_name| {
                class_name
                    .strip_prefix("language-")
                    .or_else(|| class_name.strip_prefix("lang-"))
            })
        })
        .unwrap_or_default();

    let raw_text = match code_child {
        Some(code_element) => {
            let mut text = extract_text_from_pre_children(&code_element.children);
            for sibling in &element.children {
                let is_code = matches!(sibling, HtmlNode::Element(el) if el.tag_name == "code");
                if !is_code {
                    // Borrow the sibling as a one-element slice instead of
                    // cloning its whole subtree.
                    text.push_str(&extract_text_from_pre_children(std::slice::from_ref(sibling)));
                }
            }
            text
        }
        None => extract_text_from_pre_children(&element.children),
    };

    let body = raw_text.strip_prefix('\n').unwrap_or(&raw_text);
    let body = dedent(body.trim_end_matches('\n'));

    let longest_backtick_run = body
        .split(|c: char| c != '`')
        .map(str::len)
        .max()
        .unwrap_or(0);
    let fence = "`".repeat(longest_backtick_run.max(2) + 1);

    Ok(format!("{fence}{lang_specifier}\n{body}\n{fence}"))
}
/// Renders a `<table>` element by delegating to
/// `convert_html_table_to_markdown`; `_options` is currently unused.
fn handle_table_element(element: &HtmlElement, _options: ConversionOptions) -> miette::Result<String> {
convert_html_table_to_markdown(element)
}
/// Renders a `<dl>` definition list.
///
/// * `<dt>`: inline content, trimmed and wrapped in `**...**` unless it
///   already starts and ends with `**`.
/// * `<dd>`: block content with every line prefixed by one space; empty
///   definitions are dropped.
/// * Whitespace-only text and comments are ignored.
/// * Any other child is converted as a block and kept when non-empty.
///
/// The rendered pieces are joined with single newlines.
///
/// # Errors
/// Propagates conversion errors from the child converters.
fn handle_dl_element(element: &HtmlElement, options: ConversionOptions) -> miette::Result<String> {
    let mut rendered: Vec<String> = Vec::new();

    for child in &element.children {
        match child {
            HtmlNode::Comment(_) => {}
            HtmlNode::Text(text) if text.trim().is_empty() => {}
            HtmlNode::Element(term_el) if term_el.tag_name == "dt" => {
                let term = convert_children_to_string(&term_el.children)?;
                let term = term.trim();
                let already_bold = term.starts_with("**") && term.ends_with("**");
                rendered.push(if already_bold {
                    term.to_string()
                } else {
                    format!("**{}**", term)
                });
            }
            HtmlNode::Element(def_el) if def_el.tag_name == "dd" => {
                let definition = convert_nodes_to_markdown(&def_el.children, options)?;
                if !definition.is_empty() {
                    let indented = definition
                        .lines()
                        .map(|line| format!(" {}", line))
                        .collect::<Vec<String>>()
                        .join("\n");
                    rendered.push(indented);
                }
            }
            other => {
                let block = convert_nodes_to_markdown(std::slice::from_ref(other), options)?;
                if !block.is_empty() {
                    rendered.push(block);
                }
            }
        }
    }

    // Joining an empty list yields the empty string.
    Ok(rendered.join("\n"))
}
/// Optionally renders an inline `<script>` as a fenced code block.
///
/// Returns `Ok(None)` when `options.extract_scripts_as_code_blocks` is off or
/// when the element has a `src` attribute with a value (external script).
///
/// Otherwise the script text is emitted with one leading newline and all
/// trailing newlines removed. The fence language is derived from the
/// lower-cased `type` attribute: JavaScript MIME types and `module` map to
/// `javascript`, JSON types to `json`, anything else to no language.
///
/// The fence is made of backticks, at least three and always one more than
/// the longest backtick run in the script, so script text containing a fence
/// cannot terminate the block early.
fn handle_script_element(element: &HtmlElement, options: ConversionOptions) -> miette::Result<Option<String>> {
    if !options.extract_scripts_as_code_blocks {
        return Ok(None);
    }
    if matches!(element.attributes.get("src"), Some(Some(_))) {
        return Ok(None);
    }

    let script_type = element
        .attributes
        .get("type")
        .and_then(|value| value.as_deref())
        .map(str::to_lowercase);
    let lang_specifier = match script_type.as_deref() {
        Some("text/javascript" | "application/javascript" | "module") => "javascript",
        Some("application/json" | "application/ld+json") => "json",
        _ => "",
    };

    let raw_script = extract_text_from_pre_children(&element.children);
    let content = raw_script
        .strip_prefix('\n')
        .unwrap_or(&raw_script)
        .trim_end_matches('\n');

    let longest_backtick_run = content
        .split(|c: char| c != '`')
        .map(str::len)
        .max()
        .unwrap_or(0);
    let fence = "`".repeat(longest_backtick_run.max(2) + 1);

    Ok(Some(format!("{fence}{lang_specifier}\n{content}\n{fence}")))
}
fn handle_embedded_content_element(element: &HtmlElement) -> miette::Result<Option<String>> {
let tag_name = element.tag_name.as_str();
let mut src_url: Option<String> = None;
let mut additional_info = String::new();
match tag_name {
"iframe" => {
let src = element.attributes.get("src").and_then(|opt| opt.as_ref().cloned());
if let Some(ref s) = src
&& let Some((description, canonical_url)) = detect_embed(s)
{
return Ok(Some(format!("[{}]({})", description, canonical_url)));
}
src_url = src;
}
"embed" => src_url = element.attributes.get("src").and_then(|opt| opt.as_ref().cloned()),
"video" | "audio" => {
src_url = element.attributes.get("src").and_then(|opt| opt.as_ref().cloned());
if src_url.is_none() {
for child_node in &element.children {
if let HtmlNode::Element(source_el) = child_node
&& source_el.tag_name == "source"
&& let Some(Some(s_src)) = source_el.attributes.get("src")
{
src_url = Some(s_src.clone());
break;
}
}
}
if tag_name == "video"
&& let Some(Some(poster_url)) = element.attributes.get("poster")
&& !poster_url.is_empty()
{
additional_info = format!(" (Poster: {})", poster_url);
}
}
"object" => src_url = element.attributes.get("data").and_then(|opt| opt.as_ref().cloned()),
_ => {}
}
if let Some(url) = src_url {
if !url.is_empty() {
let title_val_opt = element.attributes.get("title").and_then(|opt| opt.as_ref());
let final_description_text = match title_val_opt {
Some(title_str) if !title_str.is_empty() => title_str.clone(),
_ => match tag_name {
"iframe" => "Embedded Iframe".to_string(),
"video" => "Video".to_string(),
"audio" => "Audio".to_string(),
"embed" => "Embedded Content".to_string(),
"object" => "Embedded Object".to_string(),
_ => "Embedded Resource".to_string(),
},
};
let title_md_part = title_val_opt
.filter(|t_str| !t_str.is_empty())
.map(|t_str| format!(" \"{}\"", t_str.replace('"', "\\\"")))
.unwrap_or_default();
Ok(Some(format!(
"[{}]({}{}){}",
final_description_text, url, title_md_part, additional_info
)))
} else {
Ok(None)
}
} else {
Ok(None)
}
}
/// Renders an `<svg>` element as a textual placeholder.
///
/// Only the first `<title>` child is considered: if its trimmed inline text is
/// non-empty the result is `[SVG: <title>]`, otherwise `[SVG Image]`.
///
/// # Errors
/// Propagates any error from `convert_children_to_string`.
fn handle_svg_element(element: &HtmlElement) -> miette::Result<String> {
    let first_title = element.children.iter().find_map(|child| match child {
        HtmlNode::Element(el) if el.tag_name == "title" => Some(el),
        _ => None,
    });
    let Some(title_element) = first_title else {
        return Ok("[SVG Image]".to_string());
    };

    let raw_title = convert_children_to_string(&title_element.children)?;
    match raw_title.trim() {
        "" => Ok("[SVG Image]".to_string()),
        label => Ok(format!("[SVG: {}]", label)),
    }
}
/// Renders a `<ul>` or `<ol>` element as a Markdown list.
///
/// Each `<li>` child is converted as block content. Its first line carries
/// the marker (`* ` for `ul`, `N. ` for `ol`, counting from the `start`
/// attribute or 1); following lines are indented by the marker width. Every
/// line is additionally prefixed by `indent_level` spaces. Children that are
/// not `<li>` elements are ignored, and lines that end up whitespace-only are
/// dropped from the output.
///
/// # Errors
/// Returns an error when an `<li>` is encountered and `list_element` is
/// neither `ul` nor `ol`; conversion errors from item content are propagated.
fn convert_html_list_to_markdown(
    list_element: &HtmlElement,
    indent_level: usize,
    options: ConversionOptions,
) -> miette::Result<String> {
    let outer_indent = " ".repeat(indent_level);
    let mut next_number = match list_element.tag_name.as_str() {
        "ol" => list_element
            .attributes
            .get("start")
            .and_then(|value| value.as_ref())
            .and_then(|raw| raw.parse::<usize>().ok())
            .unwrap_or(1),
        _ => 0,
    };

    let mut output_lines: Vec<String> = Vec::new();
    for node in &list_element.children {
        let HtmlNode::Element(item) = node else {
            continue;
        };
        if item.tag_name != "li" {
            continue;
        }

        let marker = match list_element.tag_name.as_str() {
            "ul" => "* ".to_string(),
            "ol" => {
                let numbered = format!("{}. ", next_number);
                next_number += 1;
                numbered
            }
            _ => {
                return Err(miette!("Unexpected list tag name: {}", list_element.tag_name,));
            }
        };

        let item_markdown = convert_nodes_to_markdown(&item.children, options)?;
        if item_markdown.is_empty() {
            output_lines.push(format!("{}{}", outer_indent, marker));
            continue;
        }

        let hanging_indent = " ".repeat(marker.len());
        for (line_no, line) in item_markdown.lines().enumerate() {
            let lead = if line_no == 0 { marker.as_str() } else { hanging_indent.as_str() };
            output_lines.push(format!("{}{}{}", outer_indent, lead, line));
        }
    }

    Ok(output_lines
        .iter()
        .filter(|line| !line.trim().is_empty())
        .join("\n"))
}
pub fn convert_children_to_string(nodes: &[HtmlNode]) -> miette::Result<String> {
let mut parts = Vec::new();
for node in nodes {
match node {
HtmlNode::Text(text) => {
let normalized = normalize_unicode_whitespace(text);
let trimmed = normalized.trim_start_matches('\n').trim_end_matches('\n');
let collapsed = if trimmed.contains('\n') {
let leading_space = trimmed.starts_with(' ');
let trailing_space = trimmed.ends_with(' ');
let inner = trimmed
.split('\n')
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
match (leading_space, trailing_space) {
(true, true) => format!(" {} ", inner),
(true, false) => format!(" {}", inner),
(false, true) => format!("{} ", inner),
(false, false) => inner,
}
} else {
let trimmed = if trimmed.starts_with(' ') {
format!(" {}", trimmed.trim_start())
} else {
trimmed.to_owned()
};
if trimmed.ends_with(' ') {
format!("{} ", trimmed.trim_end())
} else {
trimmed
}
};
parts.push(collapsed);
}
HtmlNode::Element(element) => {
let link_text = convert_children_to_string(&element.children)?;
match element.tag_name.as_str() {
"strong" => {
if !link_text.is_empty() {
parts.push(format!("**{}**", link_text));
}
}
"em" => {
if !link_text.is_empty() {
parts.push(format!("*{}*", link_text));
}
}
"a" => {
if let Some(Some(href)) = element.attributes.get("href") {
let title_part = element
.attributes
.get("title")
.and_then(|opt_title| opt_title.as_ref())
.filter(|title_str| !title_str.is_empty())
.map(|title_str| format!(" \"{}\"", title_str.replace('"', "\\\"")))
.unwrap_or_default();
let processed_href = process_url_for_markdown(href);
parts.push(format!(
"[{}]({}{})",
link_text.replace("\n", "").trim(),
processed_href,
title_part
));
} else if !link_text.is_empty() {
parts.push(link_text);
}
}
"code" => {
if !link_text.is_empty() {
parts.push(format!("`{}`", link_text));
} else {
parts.push("``".to_string());
}
}
"br" => parts.push(" \n".to_string()),
"img" => {
if let Some(Some(src_url)) = element.attributes.get("src")
&& !src_url.is_empty()
{
let alt_text = element
.attributes
.get("alt")
.and_then(|opt_alt| opt_alt.as_ref())
.map(|s| s.as_str())
.unwrap_or("");
let title_part = element
.attributes
.get("title")
.and_then(|opt_title| opt_title.as_ref())
.filter(|title_str| !title_str.is_empty())
.map(|title_str| format!(" \"{}\"", title_str.replace('"', "\\\"")))
.unwrap_or_default();
let processed_src = process_url_for_markdown(src_url);
parts.push(format!("", alt_text, processed_src, title_part));
}
}
"input" => {
let is_ui_toggle = element.attributes.get("role").and_then(|v| v.as_deref()) == Some("button")
|| element.attributes.get("aria-haspopup").and_then(|v| v.as_deref()) == Some("true");
if is_ui_toggle {
} else if let Some(Some(type_attr)) = element.attributes.get("type") {
match type_attr.to_lowercase().as_str() {
"checkbox" | "radio" => {
if element.attributes.contains_key("checked") {
parts.push("[x] ".to_string());
} else {
parts.push("[ ] ".to_string());
}
}
"text" | "number" | "button" | "url" | "email"
if element.attributes.contains_key("value") =>
{
parts.push(element.attributes.get("value").cloned().unwrap().unwrap_or_default());
}
_ => {}
}
}
}
"s" | "strike" | "del" => {
if !link_text.is_empty() {
parts.push(format!("~~{}~~", link_text));
}
}
"kbd" => parts.push(format!("<kbd>{}</kbd>", link_text)),
"u" => {
parts.push(format!("<u>{}</u>", link_text));
}
"sub" => parts.push(format!("<sub>{}</sub>", link_text)),
"sup" => parts.push(format!("<sup>{}</sup>", link_text)),
"q" => {
if !link_text.is_empty() {
parts.push(format!("\"{}\"", link_text));
}
}
"cite" => {
if !link_text.is_empty() {
parts.push(format!("*{}*", link_text));
}
}
"ins" => {
if !link_text.is_empty() {
parts.push(link_text);
}
}
"mark" => parts.push(format!("<mark>{}</mark>", link_text)),
"summary" => {
if !link_text.is_empty() {
parts.push(format!("**{}**", link_text));
}
}
"abbr" => {
if !link_text.is_empty() {
if let Some(Some(title)) = element.attributes.get("title")
&& !title.is_empty()
{
parts.push(format!("{} ({})", link_text, title));
} else {
parts.push(link_text);
}
}
}
"picture" => parts.push(link_text),
"ruby" => {
let mut base = String::new();
let mut annotation = String::new();
for child in &element.children {
match child {
HtmlNode::Text(t) => base.push_str(t),
HtmlNode::Element(el) if el.tag_name == "rt" => {
annotation.push_str(&convert_children_to_string(&el.children)?);
}
HtmlNode::Element(el) if el.tag_name == "rp" => {}
HtmlNode::Element(el) => {
base.push_str(&convert_children_to_string(&el.children)?);
}
HtmlNode::Comment(_) => {}
}
}
let base = base.trim();
let annotation = annotation.trim();
if !annotation.is_empty() {
parts.push(format!("{}({})", base, annotation));
} else if !base.is_empty() {
parts.push(base.to_string());
}
}
"dfn" => {
if !link_text.is_empty() {
parts.push(format!("*{}*", link_text));
}
}
"time" | "small" | "bdi" => {
if !link_text.is_empty() {
parts.push(link_text);
}
}
"span" => parts.push(link_text),
"nav" | "aside" | "noscript" => {} _ => parts.push(link_text),
}
}
HtmlNode::Comment(_) => {}
}
}
Ok(parts.join("").to_string())
}
/// Heuristic for containers that are CSS-only dropdown/toggle widgets.
///
/// Returns `true` when a direct `<input>` child has `role="button"` or
/// `aria-haspopup="true"`, or when the container has at least one `<input>`
/// and one `<label>` child and nothing else besides whitespace text and
/// comments.
fn is_css_dropdown_widget(element: &HtmlElement) -> bool {
    let mut saw_input = false;
    let mut saw_label = false;
    let mut only_form_controls = true;

    for child in &element.children {
        match child {
            HtmlNode::Comment(_) => {}
            HtmlNode::Text(text) => {
                if !text.trim().is_empty() {
                    only_form_controls = false;
                }
            }
            HtmlNode::Element(el) => match el.tag_name.as_str() {
                "input" => {
                    let attr_equals = |name: &str, expected: &str| {
                        el.attributes.get(name).and_then(|value| value.as_deref()) == Some(expected)
                    };
                    // A toggle input decides the answer on its own.
                    if attr_equals("role", "button") || attr_equals("aria-haspopup", "true") {
                        return true;
                    }
                    saw_input = true;
                }
                "label" => saw_label = true,
                _ => only_form_controls = false,
            },
        }
    }

    saw_input && saw_label && only_form_controls
}
/// Returns `true` when the container holds at least one `h1`–`h6` child and
/// every other child is whitespace text, a comment, or a `span`, `input`,
/// `label` or `button` element.
fn is_heading_with_aux_siblings(element: &HtmlElement) -> bool {
    let mut found_heading = false;

    for child in &element.children {
        match child {
            HtmlNode::Comment(_) => {}
            HtmlNode::Text(text) if text.trim().is_empty() => {}
            HtmlNode::Text(_) => return false,
            HtmlNode::Element(el) => match el.tag_name.as_str() {
                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => found_heading = true,
                "span" | "input" | "label" | "button" => {}
                _ => return false,
            },
        }
    }

    found_heading
}
/// Converts a sequence of sibling nodes to block-level Markdown.
///
/// Each node is rendered to a fragment flagged as inline or block:
///
/// * non-blank text, `<br>`, linked `<a>` elements and phrasing elements
///   (`strong`, `em`, `img`, ...) are inline;
/// * headings, paragraphs, lists, quotes, code, tables, definition lists,
///   embeds and similar are blocks;
/// * structural containers (`div`, `section`, `body`, ...) are recursed into,
///   except that CSS dropdown widgets are dropped and "heading plus auxiliary
///   controls" containers contribute only their headings;
/// * `nav`, `aside`, `noscript`, `style`, `title` and comments are dropped;
/// * unknown elements and `<a>` without an `href` value are recursed into.
///
/// Fragments are concatenated in order. A blank line is inserted before a
/// non-empty block fragment (other than the first fragment) unless the output
/// so far already ends with a blank line, a code fence line, a `>` line or a
/// hard line break, or unless the output ends with a newline and the fragment
/// starts with one. Block fragments have leading whitespace removed.
///
/// # Errors
/// Propagates errors from the individual element handlers.
pub fn convert_nodes_to_markdown(nodes: &[HtmlNode], options: ConversionOptions) -> miette::Result<String> {
    /// Renders `node` through the inline converter; `None` when it is empty.
    fn inline_block(node: &HtmlNode) -> miette::Result<Option<MarkdownBlock>> {
        // A one-element borrowed slice avoids cloning the element subtree.
        let inline_md = convert_children_to_string(std::slice::from_ref(node))?;
        Ok((!inline_md.is_empty()).then(|| (inline_md.trim().to_string(), true)))
    }

    let mut markdown_blocks: Vec<MarkdownBlock> = Vec::new();

    for node in nodes {
        let element = match node {
            HtmlNode::Text(text) => {
                if !text.trim().is_empty() {
                    markdown_blocks.push((text.to_string(), true));
                }
                continue;
            }
            HtmlNode::Comment(_) => continue,
            HtmlNode::Element(element) => element,
        };

        match element.tag_name.as_str() {
            "nav" | "aside" | "noscript" | "style" | "title" => {}
            "html" | "head" | "header" | "footer" | "body" | "div" | "main" | "article" | "section"
            | "hgroup" | "details" | "figure" => {
                if is_css_dropdown_widget(element) {
                    // Pure UI chrome: nothing to render.
                } else if is_heading_with_aux_siblings(element) {
                    for child in &element.children {
                        if let HtmlNode::Element(el) = child
                            && matches!(el.tag_name.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
                        {
                            markdown_blocks.push((handle_heading_element(el)?, false));
                        }
                    }
                } else {
                    let markdown_block = convert_nodes_to_markdown(&element.children, options)?;
                    if !markdown_block.is_empty() {
                        markdown_blocks.push((markdown_block, false));
                    }
                }
            }
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                markdown_blocks.push((handle_heading_element(element)?, false));
            }
            "p" => markdown_blocks.push((handle_paragraph_element(element)?, false)),
            "hr" => markdown_blocks.push((handle_hr_element()?, false)),
            "ul" | "ol" => markdown_blocks.push((handle_list_element(element, options)?, false)),
            "blockquote" => markdown_blocks.push((handle_blockquote_element(element, options)?, false)),
            "pre" => markdown_blocks.push((handle_pre_element(element, options)?, false)),
            "table" => {
                let table_md = handle_table_element(element, options)?;
                if !table_md.is_empty() {
                    markdown_blocks.push((table_md, false));
                }
            }
            "dl" => {
                let dl_md = handle_dl_element(element, options)?;
                if !dl_md.is_empty() {
                    markdown_blocks.push((dl_md, false));
                }
            }
            "summary" => {
                let summary_text = convert_children_to_string(&element.children)?;
                if !summary_text.is_empty() {
                    markdown_blocks.push((format!("**{}**", summary_text.trim()), false));
                }
            }
            "figcaption" => {
                let caption = handle_paragraph_element(element)?;
                if !caption.is_empty() {
                    markdown_blocks.push((caption, false));
                }
            }
            "address" => {
                let content = convert_children_to_string(&element.children)?;
                if !content.is_empty() {
                    markdown_blocks.push((format!("*{}*", content.trim()), false));
                }
            }
            "script" => {
                if let Some(script_md) = handle_script_element(element, options)? {
                    markdown_blocks.push((script_md, false));
                }
            }
            "iframe" | "video" | "audio" | "embed" | "object" => {
                if let Some(embed_md) = handle_embedded_content_element(element)? {
                    markdown_blocks.push((embed_md, false));
                }
            }
            "svg" => markdown_blocks.push((handle_svg_element(element)?, false)),
            // CommonMark hard line break: two trailing spaces before the newline.
            "br" => markdown_blocks.push(("  \n".to_string(), true)),
            // A real link is inline; an anchor without an href value falls
            // through to the generic container handling below.
            "a" if matches!(element.attributes.get("href"), Some(Some(_))) => {
                markdown_blocks.extend(inline_block(node)?);
            }
            "strong" | "em" | "code" | "span" | "img" | "input" | "s" | "strike" | "del" | "kbd" | "sub"
            | "sup" | "q" | "cite" | "mark" | "abbr" | "picture" | "ruby" | "dfn" | "time" | "small"
            | "bdi" => {
                markdown_blocks.extend(inline_block(node)?);
            }
            _ => {
                let block_md = convert_nodes_to_markdown(&element.children, options)?;
                if !block_md.is_empty() {
                    markdown_blocks.push((block_md, false));
                }
            }
        }
    }

    let mut result = String::new();
    for (index, (block_content, is_inline)) in markdown_blocks.iter().enumerate() {
        let already_separated = result.ends_with("\n\n")
            || result.ends_with("```\n")
            || result.ends_with(">\n")
            || result.ends_with(" \n")
            || (result.ends_with('\n') && block_content.starts_with('\n'));
        if !*is_inline && index > 0 && !block_content.is_empty() && !already_separated {
            result.push_str("\n\n");
        }
        result.push_str(if *is_inline {
            block_content
        } else {
            block_content.trim_start()
        });
    }
    Ok(result)
}
/// Unit tests for the node-to-Markdown conversion entry point.
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;
    use rustc_hash::FxHashMap;

    /// Builds a text node.
    fn text_node(text: &str) -> HtmlNode {
        HtmlNode::Text(text.to_string())
    }

    /// Builds an element node without attributes.
    fn element_node(tag: &str, children: Vec<HtmlNode>) -> HtmlNode {
        HtmlNode::Element(HtmlElement {
            tag_name: tag.to_string(),
            attributes: FxHashMap::default(),
            children,
        })
    }

    #[rstest]
    #[case(
        vec![element_node("p", vec![text_node("Hello, world!")])],
        "Hello, world!"
    )]
    #[case(
        vec![element_node("h2", vec![text_node("Title")])],
        "## Title"
    )]
    #[case(
        vec![element_node(
            "p",
            vec![
                element_node("strong", vec![text_node("Bold")]),
                text_node(" and "),
                element_node("em", vec![text_node("Italic")]),
            ],
        )],
        "**Bold** and *Italic*"
    )]
    #[case(
        {
            let mut node = element_node("a", vec![text_node("link")]);
            if let HtmlNode::Element(ref mut el) = node {
                el.attributes.insert("href".to_string(), Some("https://example.com".to_string()));
            }
            vec![node]
        },
        "[link](https://example.com)"
    )]
    #[case(
        vec![element_node(
            "ul",
            vec![
                element_node("li", vec![text_node("Item 1")]),
                element_node("li", vec![text_node("Item 2")]),
            ],
        )],
        "* Item 1\n* Item 2"
    )]
    #[case(
        vec![element_node(
            "ol",
            vec![
                element_node("li", vec![text_node("First")]),
                element_node("li", vec![text_node("Second")]),
            ],
        )],
        "1. First\n2. Second"
    )]
    #[case(
        vec![element_node(
            "pre",
            vec![element_node("code", vec![text_node("let x = 1;")])],
        )],
        "```\nlet x = 1;\n```"
    )]
    #[case(
        {
            let th = element_node("th", vec![text_node("Header")]);
            let td = element_node("td", vec![text_node("Cell")]);
            let tr_head = element_node("tr", vec![th]);
            let tr_body = element_node("tr", vec![td]);
            let thead = element_node("thead", vec![tr_head]);
            let tbody = element_node("tbody", vec![tr_body]);
            let table = HtmlNode::Element(HtmlElement {
                tag_name: "table".to_string(),
                attributes: FxHashMap::default(),
                children: vec![thead, tbody],
            });
            vec![table]
        },
        "| Header |\n|---|\n| Cell |"
    )]
    #[case(
        vec![element_node(
            "blockquote",
            vec![element_node("p", vec![text_node("Quote")])],
        )],
        "> Quote"
    )]
    #[case(
        {
            let mut attrs = FxHashMap::default();
            attrs.insert("src".to_string(), Some("img.png".to_string()));
            attrs.insert("alt".to_string(), Some("alt text".to_string()));
            let img = HtmlNode::Element(HtmlElement {
                tag_name: "img".to_string(),
                attributes: attrs,
                children: vec![],
            });
            vec![img]
        },
        // An image with `src` and `alt` must render as Markdown image syntax.
        "![alt text](img.png)"
    )]
    fn test_convert_nodes_to_markdown_param(#[case] nodes: Vec<HtmlNode>, #[case] expected: &str) {
        let md = convert_nodes_to_markdown(&nodes, ConversionOptions::default()).unwrap();
        let md_trimmed = md.trim();
        assert_eq!(md_trimmed, expected);
    }

    #[rstest]
    #[case(
        vec![element_node("nav", vec![element_node("a", vec![text_node("Home")])])],
        ""
    )]
    #[case(
        vec![element_node("aside", vec![text_node("Related")])],
        ""
    )]
    #[case(
        vec![element_node("noscript", vec![text_node("Enable JavaScript")])],
        ""
    )]
    fn test_noisy_elements_are_skipped(#[case] nodes: Vec<HtmlNode>, #[case] expected: &str) {
        let md = convert_nodes_to_markdown(&nodes, ConversionOptions::default()).unwrap();
        assert_eq!(md.trim(), expected);
    }
}