use crate::TranslatorConfig;
use crate::config::{default_block_elements, default_ignore_elements};
use crate::parser;
use crate::translator::TranslatorCollection;
use crate::utilities::{
collapse_whitespace, get_trailing_whitespace_info, is_whitespace_only, markdown_url_encode,
surround, tag_surround, trim_newlines,
};
use crate::{CodeBlockStyle, HtmlToMarkdown, Options};
use fancy_regex::Regex as FancyRegex;
use markup5ever_rcdom::Handle;
use regex::Regex;
use std::fmt::Write;
use std::sync::LazyLock;
/// Matches a non-newline character (capture group 1) followed by one or more line
/// breaks, unless what comes after the breaks starts (after optional whitespace) with
/// a `-`, `*` or `+`, with digits and a `.`, or with a `|`.
///
/// Compiled with `fancy_regex` because the pattern uses a negative look-ahead.
static LIST_BREAK_RE: LazyLock<FancyRegex> = LazyLock::new(|| {
FancyRegex::new("([^\\r\\n])(?:\\r?\\n)+(?!\\s*[-*+]|\\s*\\d+\\.|\\s*\\|)")
.expect("list break regex is valid")
});
/// Matches, per line (`(?m)`), a run of non-whitespace characters (capture group 1)
/// followed by one or more whitespace characters other than `\r`/`\n` that reach the
/// end of the line.
static LIST_TRAILING_SPACE_RE: LazyLock<FancyRegex> = LazyLock::new(|| {
FancyRegex::new("(?m)(\\S+?)[^\\S\\r\\n]+$").expect("list trailing regex is valid")
});
/// Traversal state handed from a node to its children.
///
/// The type is `Copy`, so every recursive `visit` call works on its own value and
/// changes made for a subtree do not leak back to the caller.
#[derive(Clone, Copy, Default)]
struct Context {
/// Kind of the nearest enclosing list, or `None` when not inside a list.
list_kind: Option<ListKind>,
/// Number printed in the `N. ` marker of an ordered list item.
list_index: usize,
/// List nesting depth; the item prefix repeats the configured indent this many times.
indent_level: usize,
/// When set, `process_text` skips the escape and text-replacement rules.
no_escape: bool,
/// When set, text is emitted without newline trimming or whitespace collapsing.
preserve_whitespace: bool,
/// Rendering scope; selects which handlers and custom translators apply.
scope: Scope,
}
/// Distinguishes the two list flavours when choosing a list-item marker.
#[derive(Clone, Copy, Eq, PartialEq)]
enum ListKind {
/// `<ol>`: items are prefixed with `N. `.
Ordered,
/// `<ul>`: items are prefixed with the configured bullet marker.
Unordered,
}
/// Rendering scope the visitor is currently in.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
enum Scope {
/// Regular document flow; the default scope.
#[default]
Normal,
/// Set while rendering the children of an `<a>` element that has an `href`.
Anchor,
/// Set while rendering the `<code>` child of a `<pre>` element.
CodeBlock,
/// Used while rendering table cells and table captions.
TableCell,
}
/// Renders the DOM subtree rooted at `root` to Markdown, using the options and
/// translator collections of `instance`.
///
/// After the tree walk the output is post-processed:
/// 1. when `use_link_reference_definitions` is set, trailing line breaks are removed
///    and the collected URLs are appended as `[n]: url` lines;
/// 2. when `max_consecutive_newlines` is greater than zero, longer newline runs are
///    cut down to that many;
/// 3. surrounding newlines are trimmed, and a single trailing space is removed while a
///    trailing run of two or more spaces (a Markdown hard line break) is kept.
pub(crate) fn get_markdown_for_html_nodes(instance: &HtmlToMarkdown, root: &Handle) -> String {
    let options = instance.options();
    let mut visitor = Visitor {
        options,
        result: String::new(),
        url_definitions: Vec::new(),
        translators: instance.translators(),
        code_block_translators: instance.code_block_translators(),
    };
    visitor.visit(root, Context::default());
    let mut result = visitor.result;

    if options.use_link_reference_definitions {
        // Drop every trailing line break, then leave exactly one separator line before
        // the definitions (nothing is added to empty output).
        let content_len = result.trim_end_matches(['\n', '\r']).len();
        result.truncate(content_len);
        if !result.is_empty() {
            result.push('\n');
        }
        for (index, url) in visitor.url_definitions.iter().enumerate() {
            // Writing into a `String` cannot fail.
            let _ = write!(&mut result, "\n[{}]: {}", index + 1, url);
        }
    }

    if options.max_consecutive_newlines > 0 {
        let newline_limit = Regex::new(&format!(
            r"(?:\r?\n\s*)+((?:\r?\n\s*){{{}}})",
            options.max_consecutive_newlines
        ))
        .expect("newline limit regex is valid");
        result = newline_limit.replace_all(&result, "$1").to_string();
    }

    let mut result = trim_newlines(&result);
    // Counting the trailing spaces avoids comparing against a whitespace-only literal:
    // exactly one trailing space is noise, two or more form a hard line break.
    let trailing_spaces = result.len() - result.trim_end_matches(' ').len();
    if trailing_spaces == 1 {
        let kept = result.trim_end().len();
        result.truncate(kept);
    }
    result
}
/// Walks a DOM tree and accumulates the Markdown rendering in `result`.
struct Visitor<'a> {
/// Conversion options (delimiters, bullet marker, escape rules, ...).
options: &'a Options,
/// Markdown produced so far; handlers append to it and may rewrite its tail.
result: String,
/// URLs collected for link reference definitions; a link refers to index + 1.
url_definitions: Vec<String>,
/// Custom translators consulted in the normal scope.
translators: &'a TranslatorCollection,
/// Custom translators consulted inside code blocks.
code_block_translators: &'a TranslatorCollection,
}
#[allow(clippy::needless_lifetimes, reason = "false positive in Rust 1.85")]
impl<'a> Visitor<'a> {
/// Dispatches `node` to the handler matching its kind.
///
/// * Text nodes go to `visit_text`.
/// * Nodes without a tag name are transparent: only their children are visited.
/// * Ignored tags produce nothing.
/// * A custom translator registered for the tag in the current scope takes precedence
///   over the built-in handlers.
/// * Inside a table cell only anchors, images and the strong/strike/emphasis tags keep
///   their handlers; every other tag just has its children visited.
fn visit(&mut self, node: &Handle, context: Context) {
    if parser::is_text(node) {
        self.visit_text(node, &context);
        return;
    }
    let tag = match parser::tag_name(node) {
        Some(tag) => tag,
        None => {
            for child in parser::children(node) {
                self.visit(&child, context);
            }
            return;
        }
    };
    if self.is_ignored(&tag) {
        return;
    }
    if let Some(custom) = self.custom_translator(context.scope, &tag) {
        self.visit_custom(node, context, &custom);
        return;
    }

    // Copy the shared options reference so delimiter borrows are independent of `self`.
    let options = self.options;
    let inside_table_cell = context.scope == Scope::TableCell;
    match tag.as_str() {
        // Handlers available in every scope, including table cells.
        "A" => self.visit_anchor(node, context),
        "IMG" => self.visit_image(node, &context),
        "B" | "STRONG" => self.visit_delimited(node, context, &options.strong_delimiter, &tag),
        "DEL" | "S" | "STRIKE" => {
            self.visit_delimited(node, context, &options.strike_delimiter, &tag)
        }
        "EM" | "I" => self.visit_delimited(node, context, &options.em_delimiter, &tag),
        // Anything else inside a table cell is flattened to its children.
        _ if inside_table_cell => {
            for child in parser::children(node) {
                self.visit(&child, context);
            }
        }
        "BR" => self.visit_br(&context),
        "HR" => self.visit_hr(&context),
        "OL" => self.visit_list(node, context, ListKind::Ordered),
        "UL" => self.visit_list(node, context, ListKind::Unordered),
        "LI" => {
            // The "was anything emitted" flag only matters to `visit_list`.
            let _ = self.visit_list_item(node, context);
        }
        "BLOCKQUOTE" => self.visit_blockquote(node, context),
        "H1" | "H2" | "H3" | "H4" | "H5" | "H6" => self.visit_heading(node, context, &tag),
        "PRE" => self.visit_pre(node, context),
        "CODE" => self.visit_code(node, context),
        "TABLE" => self.visit_table(node, context),
        _ => self.visit_generic(node, context, &tag),
    }
}
/// Appends the text of a text node.
///
/// With `preserve_whitespace` the text is processed verbatim. Otherwise surrounding
/// newlines are trimmed first; a whitespace-only node contributes at most a single
/// space (and only when the output is non-empty and does not already end in
/// whitespace), and the processed text has a leading space removed when the raw text
/// began with a newline, and trailing spaces removed when the raw text did not end in
/// whitespace.
fn visit_text(&mut self, node: &Handle, context: &Context) {
    let Some(text) = parser::text(node) else {
        return;
    };
    let keep_whitespace = context.preserve_whitespace;
    #[allow(clippy::redundant_clone, reason = "false positive on 1.85")]
    let source = if keep_whitespace {
        text.clone()
    } else {
        trim_newlines(&text)
    };

    if !keep_whitespace && is_whitespace_only(&source) {
        let (trailing_whitespace, _) = get_trailing_whitespace_info(&self.result);
        let suppress = self.result.is_empty() || trailing_whitespace > 0;
        if !suppress {
            self.append_result(" ", None, false);
        }
        return;
    }

    let mut rendered = self.process_text(&source, context);
    if !keep_whitespace && !rendered.is_empty() {
        if text.starts_with('\n') {
            if let Some(stripped) = rendered.strip_prefix(' ') {
                rendered = stripped.to_owned();
            }
        }
        let raw_ends_with_whitespace = text.chars().next_back().is_some_and(char::is_whitespace);
        if !raw_ends_with_whitespace {
            let kept = rendered.trim_end_matches(' ').len();
            rendered.truncate(kept);
        }
    }
    self.append_result(&rendered, None, false);
}
/// Emits a line break for `<br>`.
///
/// Inside a code block or an anchor a plain newline is written. Elsewhere a Markdown
/// hard line break is written, which requires two spaces before the newline; a single
/// trailing space would be rendered as a soft break.
fn visit_br(&mut self, context: &Context) {
    // Spelled with escapes so the two trailing spaces cannot be lost to whitespace
    // normalisation in editors or tooling.
    const HARD_BREAK: &str = "\x20\x20\n";
    match context.scope {
        Scope::CodeBlock | Scope::Anchor => self.append_result("\n", None, false),
        Scope::Normal | Scope::TableCell => self.append_result(HARD_BREAK, None, false),
    }
}
/// Emits a thematic break for `<hr>`.
///
/// Inside an anchor only a newline is written; in every other scope `---` is written
/// with two surrounding newlines.
fn visit_hr(&mut self, context: &Context) {
    match context.scope {
        Scope::Anchor => self.append_result("\n", None, false),
        Scope::CodeBlock | Scope::Normal | Scope::TableCell => self.wrap_content("---", 2),
    }
}
fn visit_list(&mut self, node: &Handle, mut context: Context, kind: ListKind) {
if context.scope == Scope::CodeBlock {
for child in parser::children(node) {
self.visit(&child, context);
}
return;
}
let surrounding = if context.list_kind.is_some() { 1 } else { 2 };
context.indent_level = if context.list_kind.is_some() {
context.indent_level + 1
} else {
0
};
context.list_kind = Some(kind);
self.append_newlines(surrounding);
let mut ordered_index = 0usize;
for child in parser::children(node) {
if parser::tag_name(&child).as_deref() == Some("LI") {
let mut child_context = context;
if kind == ListKind::Ordered {
child_context.list_index = ordered_index + 1;
}
if self.visit_list_item(&child, child_context) && kind == ListKind::Ordered {
ordered_index += 1;
}
} else {
self.visit(&child, context);
}
}
self.append_newlines(surrounding);
}
fn visit_list_item(&mut self, node: &Handle, context: Context) -> bool {
if context.scope == Scope::CodeBlock {
let rendered = self.render_children_to_string(node, context);
if !is_whitespace_only(&rendered) {
self.append_result(&rendered, None, false);
return true;
}
return false;
}
let start_outer = self.result.len();
self.append_newlines(1);
let marker = match context.list_kind {
Some(ListKind::Ordered) => format!("{}. ", context.list_index),
_ => format!("{} ", self.options.bullet_marker),
};
let prefix = format!(
"{}{}",
self.options.indent.repeat(context.indent_level),
marker
);
self.append_result(&prefix, None, false);
let start_inner = self.result.len();
for child in parser::children(node) {
self.visit(&child, context);
}
let content = self.result[start_inner..].to_owned();
if is_whitespace_only(&content) {
self.append_result("", Some(start_outer), false);
return false;
}
let processed = self.postprocess_list_item(&content, context.indent_level);
self.append_result(&processed, Some(start_inner), false);
self.append_newlines(1);
true
}
/// Renders `node` according to a user-supplied translator configuration.
///
/// Order of output: surrounding newlines, `prefix`, then either the fixed `content` or
/// (unless `recurse` is `Some(false)`) the rendered children, then `postfix` and the
/// surrounding newlines again. `no_escape` and `preserve_whitespace` are switched on
/// for the subtree when the translator requests them.
///
/// An element is skipped entirely when the translator is marked `ignore`, or when
/// there is nothing to render: no fixed `content`, no children, and
/// `preserve_if_empty` not set. A translator that supplies fixed `content` is always
/// rendered, even for childless elements.
fn visit_custom(&mut self, node: &Handle, mut context: Context, custom: &TranslatorConfig) {
    if custom.ignore {
        return;
    }
    let child_nodes = parser::children(node);
    let has_children = !child_nodes.is_empty();
    // Only an element with neither fixed content nor children counts as empty.
    if custom.content.is_none() && !custom.preserve_if_empty && !has_children {
        return;
    }
    if custom.no_escape {
        context.no_escape = true;
    }
    if custom.preserve_whitespace {
        context.preserve_whitespace = true;
    }

    let start_outer = self.result.len();
    if let crate::SurroundingNewlines::Count(count) = custom.surrounding_newlines {
        self.append_newlines(count);
    }
    if let Some(prefix) = &custom.prefix {
        self.append_result(prefix, None, false);
    }

    if let Some(content) = &custom.content {
        self.append_result(content, None, custom.space_if_repeating_char);
    } else if custom.recurse != Some(false) {
        for child in child_nodes {
            self.visit(&child, context);
        }
    }

    // NOTE(review): this only runs when nothing at all has been written so far; with a
    // non-empty `content` that cannot be the case, so it looks like a no-op — confirm
    // the intent before removing it.
    if custom.preserve_if_empty && start_outer == self.result.len() {
        if let Some(content) = &custom.content {
            self.append_result(content, None, custom.space_if_repeating_char);
        }
    }

    if let Some(postfix) = &custom.postfix {
        self.append_result(postfix, None, false);
    }
    if let crate::SurroundingNewlines::Count(count) = custom.surrounding_newlines {
        self.append_newlines(count);
    }
}
/// Renders an element whose content is wrapped in `delimiter` (strong, emphasis,
/// strike-through).
///
/// Inside a code block the element is transparent. Otherwise the children are rendered
/// and then replaced by the delimited form; whitespace-only content is removed instead
/// of being wrapped. Tags configured as block elements are additionally surrounded by
/// two newlines.
fn visit_delimited(&mut self, node: &Handle, context: Context, delimiter: &str, tag: &str) {
    if context.scope == Scope::CodeBlock {
        for child in parser::children(node) {
            self.visit(&child, context);
        }
        return;
    }

    let block_level = self.is_block(tag);
    if block_level {
        self.append_newlines(2);
    }

    let inner_start = self.result.len();
    for child in parser::children(node) {
        self.visit(&child, context);
    }
    let inner = self.result[inner_start..].to_owned();
    if is_whitespace_only(&inner) {
        // Nothing worth delimiting: drop whatever the children wrote.
        self.append_result("", Some(inner_start), false);
    } else {
        let wrapped = tag_surround(&inner, delimiter);
        self.append_result(&wrapped, Some(inner_start), true);
    }

    if block_level {
        self.append_newlines(2);
    }
}
fn visit_blockquote(&mut self, node: &Handle, context: Context) {
if context.scope == Scope::CodeBlock {
for child in parser::children(node) {
self.visit(&child, context);
}
return;
}
self.append_newlines(2);
let start = self.result.len();
for child in parser::children(node) {
self.visit(&child, context);
}
let content = self.result[start..].to_owned();
let formatted = trim_newlines(&content)
.lines()
.map(format_blockquote_line)
.collect::<Vec<_>>()
.join("\n");
self.append_result(&formatted, Some(start), false);
self.append_newlines(2);
}
/// Renders an `H1`..`H6` element.
///
/// Inside a code block the heading text is written on its own line between `[` and
/// `]`. Otherwise it becomes an ATX heading: as many `#` as the digit in `tag` (1 when
/// the digit cannot be parsed), a space, then the rendered children, surrounded by two
/// newlines.
fn visit_heading(&mut self, node: &Handle, context: Context, tag: &str) {
    let in_code_block = context.scope == Scope::CodeBlock;

    if in_code_block {
        self.append_newlines(1);
        self.append_result("[", None, false);
    } else {
        self.append_newlines(2);
        let level = tag[1..].parse::<usize>().unwrap_or(1);
        let mut marker = "#".repeat(level);
        marker.push(' ');
        self.append_result(&marker, None, false);
    }

    for child in parser::children(node) {
        self.visit(&child, context);
    }

    if in_code_block {
        self.append_result("]", None, false);
        self.append_newlines(1);
    } else {
        self.append_newlines(2);
    }
}
/// Renders an `<a>` element.
///
/// Inside a code block, or when the element has no `href`, only the children are
/// rendered. When the element's inner text equals the `href` and `use_inline_links` is
/// set, an autolink `<url>` is written. Otherwise the children are rendered in the
/// `Anchor` scope with line breaks collapsed to spaces, followed by either a reference
/// (`[text][n]`, when `use_link_reference_definitions` is set) or an inline
/// destination (`[text](url)` / `[text](url "title")`).
///
/// An empty `title` attribute is treated like a missing one, matching `visit_image`.
fn visit_anchor(&mut self, node: &Handle, mut context: Context) {
    if context.scope == Scope::CodeBlock {
        for child in parser::children(node) {
            self.visit(&child, context);
        }
        return;
    }
    let Some(href) = parser::attr(node, "href") else {
        for child in parser::children(node) {
            self.visit(&child, context);
        }
        return;
    };

    let encoded_href = markdown_url_encode(&href);
    let raw_text = parser::inner_text(node);
    if raw_text == href && self.options.use_inline_links {
        self.append_result(&format!("<{encoded_href}>"), None, false);
        return;
    }

    // `title=""` carries no information; emitting `(url "")` would only add noise.
    let title = parser::attr(node, "title").filter(|title| !title.is_empty());

    self.append_result("[", None, false);
    let content_start = self.result.len();
    context.scope = Scope::Anchor;
    for child in parser::children(node) {
        self.visit(&child, context);
    }
    // Link text must stay on one line.
    let content = collapse_anchor_newlines(&self.result[content_start..]);
    self.append_result(&content, Some(content_start), false);

    if self.options.use_link_reference_definitions {
        let index = self.add_or_get_url_definition(&encoded_href);
        self.append_result(&format!("][{index}]"), None, false);
    } else if let Some(title) = title {
        self.append_result(&format!("]({encoded_href} \"{title}\")"), None, false);
    } else {
        self.append_result(&format!("]({encoded_href})"), None, false);
    }
}
/// Renders an `<img>` element as a Markdown image.
///
/// Nothing is written inside a code block, when `src` is missing or empty, or when
/// `src` is a `data:` URL and `keep_data_images` is not set. Otherwise the output is
/// `![alt](src)`, or `![alt](src "title")` when a non-empty `title` is present; a
/// missing `alt` is rendered as empty text.
fn visit_image(&mut self, node: &Handle, context: &Context) {
    if context.scope == Scope::CodeBlock {
        return;
    }
    let Some(src) = parser::attr(node, "src") else {
        return;
    };
    if src.is_empty()
        || (!self.options.keep_data_images && src.to_ascii_lowercase().starts_with("data:"))
    {
        return;
    }

    let alt = parser::attr(node, "alt").unwrap_or_default();
    let title = parser::attr(node, "title").unwrap_or_default();
    let markdown = if title.is_empty() {
        format!("![{alt}]({src})")
    } else {
        format!("![{alt}]({src} \"{title}\")")
    };
    self.append_result(&markdown, None, false);
}
/// Renders a `<pre>` element.
///
/// Whitespace is preserved and escaping disabled for the whole subtree. When the
/// element has a direct `<code>` child, that child is rendered in the `CodeBlock`
/// scope and emitted as a code block surrounded by two newlines:
/// * `Fenced`: the configured fence, the language taken from a `language-*` class of
///   the `<code>` element (if any), the content, and the closing fence;
/// * `Indented`: every content line prefixed with four spaces, the minimum Markdown
///   requires for an indented code block.
///
/// Without a `<code>` child the children are simply visited.
fn visit_pre(&mut self, node: &Handle, mut context: Context) {
    context.preserve_whitespace = true;
    context.no_escape = true;

    let children = parser::children(node);
    let code_child = children
        .iter()
        .find(|child| parser::tag_name(child).as_deref() == Some("CODE"));

    if let Some(code_child) = code_child {
        let language = parser::attr(code_child, "class").and_then(|class| {
            class
                .split_whitespace()
                .find_map(|part| part.strip_prefix("language-").map(ToOwned::to_owned))
        });
        let content = self.render_node_to_string(
            code_child,
            Context {
                scope: Scope::CodeBlock,
                preserve_whitespace: true,
                no_escape: true,
                ..context
            },
        );

        self.append_newlines(2);
        match self.options.code_block_style {
            CodeBlockStyle::Fenced => {
                self.append_result(
                    &format!(
                        "{}{}\n{}\n{}",
                        self.options.code_fence,
                        language.unwrap_or_default(),
                        content,
                        self.options.code_fence
                    ),
                    None,
                    false,
                );
            }
            CodeBlockStyle::Indented => {
                // Built with `repeat` so the width is explicit and cannot be altered
                // by whitespace normalisation of the source file.
                let indent = " ".repeat(4);
                let indented = content
                    .lines()
                    .map(|line| format!("{indent}{line}"))
                    .collect::<Vec<_>>()
                    .join("\n");
                self.append_result(&indented, None, false);
            }
        }
        self.append_newlines(2);
        return;
    }

    for child in children {
        self.visit(&child, context);
    }
}
fn visit_table(&mut self, node: &Handle, context: Context) {
if context.scope == Scope::CodeBlock {
for child in parser::children(node) {
self.visit(&child, context);
}
return;
}
let (caption, rows) = self.extract_table_rows(node);
if rows.is_empty() {
return;
}
self.append_newlines(2);
if let Some(caption) = caption {
self.append_result(&format!("__{caption}__\n"), None, false);
}
let widths = table_widths(&rows);
for (row_index, row) in rows.iter().enumerate() {
self.append_result("| ", None, false);
for (col_index, width) in widths.iter().enumerate() {
let mut cell = row.get(col_index).cloned().unwrap_or_default();
if cell.len() < *width {
cell.push_str(&" ".repeat(*width - cell.len()));
}
self.append_result(&cell, None, false);
self.append_result(
if col_index + 1 < widths.len() {
" | "
} else {
" |"
},
None,
false,
);
}
if row_index == 0 {
self.append_result("\n|", None, false);
for width in &widths {
self.append_result(&format!(" {} |", "-".repeat(*width)), None, false);
}
if rows.len() > 1 {
self.append_result("\n", None, false);
}
} else if row_index + 1 < rows.len() {
self.append_result("\n", None, false);
}
}
self.append_newlines(2);
}
/// Renders a `<code>` element.
///
/// Inside a code block the children are emitted verbatim (no escaping, whitespace
/// preserved). Otherwise the children are rendered without escaping and wrapped as an
/// inline code span whose backtick delimiter is one longer than the longest backtick
/// run in the content; when the content contains backticks it is additionally padded
/// with one space on each side.
fn visit_code(&mut self, node: &Handle, mut context: Context) {
    context.no_escape = true;
    if context.scope == Scope::CodeBlock {
        context.preserve_whitespace = true;
        for child in parser::children(node) {
            self.visit(&child, context);
        }
        return;
    }

    let span_start = self.result.len();
    for child in parser::children(node) {
        self.visit(&child, context);
    }
    let code = self.result[span_start..].to_owned();

    // Length of the longest run of consecutive backticks in the content.
    let mut longest_run = 0usize;
    let mut current_run = 0usize;
    for ch in code.chars() {
        if ch == '`' {
            current_run += 1;
            longest_run = longest_run.max(current_run);
        } else {
            current_run = 0;
        }
    }

    let fence = "`".repeat(longest_run + 1);
    let padding = if longest_run > 0 { " " } else { "" };
    let wrapped = surround(&surround(&code, padding), &fence);
    self.append_result(&wrapped, Some(span_start), true);
}
/// Renders an element that has no dedicated handler by visiting its children.
///
/// Outside code blocks, tags that count as block elements are surrounded by two
/// newlines; everything else is emitted inline.
fn visit_generic(&mut self, node: &Handle, context: Context, tag: &str) {
    let as_block = context.scope != Scope::CodeBlock && self.is_block(tag);

    if as_block {
        self.append_newlines(2);
    }
    for child in parser::children(node) {
        self.visit(&child, context);
    }
    if as_block {
        self.append_newlines(2);
    }
}
/// Normalises and escapes a piece of text.
///
/// Whitespace is collapsed unless `preserve_whitespace` is set. Unless `no_escape` is
/// set, the configured rules are then applied in order: the global escape, the
/// line-start escape, and finally every `text_replace` pair.
fn process_text(&self, text: &str, context: &Context) -> String {
    let normalized = if context.preserve_whitespace {
        text.to_owned()
    } else {
        collapse_whitespace(text)
    };
    if context.no_escape {
        return normalized;
    }

    let options = self.options;
    let escaped = options
        .global_escape
        .0
        .replace_all(&normalized, options.global_escape.1.as_str())
        .to_string();
    let mut output = options
        .line_start_escape
        .0
        .replace_all(&escaped, options.line_start_escape.1.as_str())
        .to_string();
    for (pattern, replacement) in &options.text_replace {
        output = pattern
            .replace_all(&output, replacement.as_str())
            .to_string();
    }
    output
}
/// Normalises the rendered content of a list item.
///
/// The content is trimmed. Every run of line breaks matched by `LIST_BREAK_RE` is
/// replaced by a Markdown hard line break (two spaces and a newline) followed by the
/// item's indentation, so continuation lines stay inside the item. Horizontal
/// whitespace at the end of any line is then normalised to exactly two spaces.
fn postprocess_list_item(&self, content: &str, indent_level: usize) -> String {
    let indent = self.options.indent.repeat(indent_level);
    // `\x20` is a plain space; the escapes keep the two-space hard break intact even
    // if the source file passes through whitespace-normalising tooling.
    let hard_break = format!("$1\x20\x20\n{indent}");
    let processed = LIST_BREAK_RE
        .replace_all(content.trim(), hard_break.as_str())
        .into_owned();
    LIST_TRAILING_SPACE_RE
        .replace_all(&processed, "$1\x20\x20")
        .into_owned()
}
fn render_node_to_string(&self, node: &Handle, context: Context) -> String {
let mut nested = Self {
options: self.options,
result: String::new(),
url_definitions: self.url_definitions.clone(),
translators: self.translators,
code_block_translators: self.code_block_translators,
};
nested.visit(node, context);
nested.result
}
/// Appends `text` to the output.
///
/// * `start_pos`: when given, the output is first truncated to that byte offset, so
///   the call replaces everything written from there on (an empty `text` then simply
///   removes it). Without it, an empty `text` is a no-op.
/// * `space_if_repeating_char`: when set, a single space is inserted if the last
///   character of the output equals the first character of `text`, keeping adjacent
///   delimiters from merging.
fn append_result(
    &mut self,
    text: &str,
    start_pos: Option<usize>,
    space_if_repeating_char: bool,
) {
    match start_pos {
        Some(position) => self.result.truncate(position),
        None if text.is_empty() => return,
        None => {}
    }
    let repeats_boundary_char =
        space_if_repeating_char && self.result.chars().next_back() == text.chars().next();
    if repeats_boundary_char {
        self.result.push(' ');
    }
    self.result.push_str(text);
}
/// Ensures the output ends with at least `count` newlines.
///
/// Only the missing newlines are added; when the output already ends with `count` or
/// more (as reported by `get_trailing_whitespace_info`), nothing is written.
fn append_newlines(&mut self, count: usize) {
    let (_, trailing_newlines) = get_trailing_whitespace_info(&self.result);
    let missing = count.saturating_sub(trailing_newlines);
    self.result.push_str(&"\n".repeat(missing));
}
/// Writes `content` with at least `surrounding_newlines` newlines before and after it.
fn wrap_content(&mut self, content: &str, surrounding_newlines: usize) {
// Top up the newlines already at the end of the output, then write the content.
self.append_newlines(surrounding_newlines);
self.append_result(content, None, false);
self.append_newlines(surrounding_newlines);
}
/// Returns the 1-based reference number for `url`, registering the URL first when it
/// has not been seen before. Identical URLs share one number.
fn add_or_get_url_definition(&mut self, url: &str) -> usize {
    let known = self
        .url_definitions
        .iter()
        .position(|existing| existing == url);
    match known {
        Some(index) => index + 1,
        None => {
            self.url_definitions.push(url.to_owned());
            // The new entry is last, so its 1-based number equals the length.
            self.url_definitions.len()
        }
    }
}
/// Looks up a custom translator for `tag` in the collection that belongs to `scope`.
///
/// Code blocks use the code-block translators and the normal scope uses the regular
/// ones; anchors and table cells never use custom translators.
fn custom_translator(&self, scope: Scope, tag: &str) -> Option<TranslatorConfig> {
    let collection = match scope {
        Scope::CodeBlock => self.code_block_translators,
        Scope::Normal => self.translators,
        Scope::Anchor | Scope::TableCell => return None,
    };
    collection.get(tag).cloned()
}
/// Returns `true` when `tag` must be skipped entirely: it is one of the default
/// ignored elements, or it matches an entry of the `ignore` option (compared
/// ASCII-case-insensitively).
fn is_ignored(&self, tag: &str) -> bool {
    if default_ignore_elements().contains(&tag) {
        return true;
    }
    self.options
        .ignore
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(tag))
}
/// Returns `true` when `tag` is rendered as a block: it is one of the default block
/// elements, or it matches an entry of the `block_elements` option (compared
/// ASCII-case-insensitively).
fn is_block(&self, tag: &str) -> bool {
    if default_block_elements().contains(&tag) {
        return true;
    }
    self.options
        .block_elements
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(tag))
}
/// Collects the caption and the rendered rows of a `<table>` element.
///
/// `CAPTION`, `TR`, `TD` and `TH` elements are recognised both as direct children of
/// the table and one level further down (children of any other child). A non-empty
/// caption replaces an earlier one, rows without cells are skipped, and a `TD`/`TH`
/// that is not inside a `TR` is added to the most recent row, or starts the first row
/// when there is none yet.
fn extract_table_rows(&self, node: &Handle) -> (Option<String>, Vec<Vec<String>>) {
    let mut caption: Option<String> = None;
    let mut rows: Vec<Vec<String>> = Vec::new();

    // Handles one element if it is table structure; returns whether it was recognised.
    let mut absorb = |element: &Handle, tag: Option<&str>| -> bool {
        match tag {
            Some("CAPTION") => {
                let rendered = self.render_children_to_string(
                    element,
                    Context {
                        scope: Scope::TableCell,
                        ..Context::default()
                    },
                );
                let text = collapse_anchor_newlines(&trim_newlines(&rendered))
                    .trim()
                    .to_owned();
                if !text.is_empty() {
                    caption = Some(text);
                }
                true
            }
            Some("TR") => {
                if let Some(row) = self.extract_table_row(element) {
                    rows.push(row);
                }
                true
            }
            Some("TD" | "TH") => {
                let cell = self.render_table_cell(element);
                match rows.last_mut() {
                    Some(row) => row.push(cell),
                    None => rows.push(vec![cell]),
                }
                true
            }
            _ => false,
        }
    };

    for child in parser::children(node) {
        let child_tag = parser::tag_name(&child);
        if absorb(&child, child_tag.as_deref()) {
            continue;
        }
        // Any other child: look exactly one level deeper.
        for grandchild in parser::children(&child) {
            let grandchild_tag = parser::tag_name(&grandchild);
            absorb(&grandchild, grandchild_tag.as_deref());
        }
    }

    (caption, rows)
}
/// Renders the `TD`/`TH` children of a row element, in document order.
///
/// Returns `None` when the row contains no cells.
fn extract_table_row(&self, node: &Handle) -> Option<Vec<String>> {
    let cells: Vec<String> = parser::children(node)
        .iter()
        .filter(|child| matches!(parser::tag_name(child).as_deref(), Some("TD" | "TH")))
        .map(|cell| self.render_table_cell(cell))
        .collect();
    (!cells.is_empty()).then_some(cells)
}
/// Renders the content of a table cell as a single line of text.
///
/// The children are rendered in the `TableCell` scope, surrounding newlines are
/// trimmed, remaining line breaks are collapsed to spaces, and every `|` is escaped as
/// `\|` so cell content cannot be mistaken for a column separator.
fn render_table_cell(&self, node: &Handle) -> String {
    let rendered = self.render_children_to_string(
        node,
        Context {
            scope: Scope::TableCell,
            ..Context::default()
        },
    );
    collapse_anchor_newlines(&trim_newlines(&rendered))
        // Escape all pipes, not just the first: any unescaped one splits the cell.
        .replace('|', "\\|")
        .trim()
        .to_owned()
}
fn render_children_to_string(&self, node: &Handle, context: Context) -> String {
let mut nested = Self {
options: self.options,
result: String::new(),
url_definitions: self.url_definitions.clone(),
translators: self.translators,
code_block_translators: self.code_block_translators,
};
for child in parser::children(node) {
nested.visit(&child, context);
}
nested.result
}
}
/// Adds one level of block-quote nesting to `line`.
///
/// Leading `>` markers are kept and one more is put in front; spaces and tabs after
/// the markers are dropped and replaced by a single space before the rest of the line.
fn format_blockquote_line(line: &str) -> String {
    let after_markers = line.trim_start_matches('>');
    // `>` is a single byte, so the byte difference is the number of markers.
    let existing_markers = &line[..line.len() - after_markers.len()];
    let body = after_markers.trim_start_matches(|ch: char| ch == ' ' || ch == '\t');

    let mut quoted = String::with_capacity(line.len() + 2);
    quoted.push('>');
    quoted.push_str(existing_markers);
    quoted.push(' ');
    quoted.push_str(body);
    quoted
}
/// Replaces every run of `\n`/`\r` characters that is followed by more text with a
/// single space.
///
/// A run at the very end of `text` is dropped; a run at the very start becomes a
/// leading space when other text follows.
fn collapse_anchor_newlines(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    let mut segments = text.split(['\n', '\r']);
    // The first segment is the text before the first line break (possibly empty).
    if let Some(first) = segments.next() {
        collapsed.push_str(first);
    }
    // Empty segments come from adjacent or trailing line breaks and add nothing.
    for segment in segments.filter(|segment| !segment.is_empty()) {
        collapsed.push(' ');
        collapsed.push_str(segment);
    }
    collapsed
}
/// Computes the width of every table column as the largest `String::len` (byte length)
/// of any cell in that column.
///
/// The result has as many entries as the longest row; rows with fewer cells simply do
/// not contribute to the later columns.
fn table_widths(rows: &[Vec<String>]) -> Vec<usize> {
    rows.iter().fold(Vec::<usize>::new(), |mut widths, row| {
        if widths.len() < row.len() {
            widths.resize(row.len(), 0);
        }
        for (width, cell) in widths.iter_mut().zip(row) {
            *width = (*width).max(cell.len());
        }
        widths
    })
}