use regex::Regex;
use std::sync::LazyLock;
/// Converts `html` to Markdown and cleans up the converter's output.
///
/// The HTML is run through `htmd::convert`; if that call returns an error the
/// converted text is taken to be empty. The Markdown then passes through three
/// clean-up steps, in this order: anchor-link artifacts are removed, inline
/// `data:` images are removed, and indented code blocks are rewritten as
/// fenced blocks.
pub fn html_to_markdown(html: &str) -> String {
    let converted = htmd::convert(html).unwrap_or_default();
    let without_anchors = strip_anchor_artifacts(&converted);
    let without_data_images = strip_data_uris(&without_anchors);
    convert_indented_code_to_fenced(&without_data_images)
}
/// Rewrites indented Markdown code blocks as triple-backtick fenced blocks.
///
/// A line counts as indented code when it starts with a tab or with at least
/// four spaces. One level of that indentation is removed and any deeper
/// indentation is kept. Lines indented by one to three spaces are ordinary
/// text (wrapped paragraphs, list continuations) and pass through untouched.
///
/// Blank lines inside a run of code lines stay in the block; blank lines that
/// trail the run are dropped when the block is closed. Lines that belong to an
/// existing fenced block are copied verbatim.
///
/// The returned text ends with a newline only if `md` does.
fn convert_indented_code_to_fenced(md: &str) -> String {
    // Leading spaces required before a line is treated as code. Anything
    // shallower is regular text and must not be wrapped in a fence.
    const CODE_INDENT_WIDTH: usize = 4;

    let mut out = String::with_capacity(md.len());
    let mut pending: Vec<&str> = Vec::new();
    let mut inside_fence = false;

    for line in md.lines() {
        let is_fence = line.trim_start().starts_with("```");
        if is_fence {
            inside_fence = !inside_fence;
            // A fence marker ends any indented block collected so far.
            flush_code_block(&mut out, &mut pending);
        }
        if is_fence || inside_fence {
            out.push_str(line);
            out.push('\n');
            continue;
        }

        let code_text = if let Some(rest) = line.strip_prefix('\t') {
            Some(rest)
        } else if line.bytes().take_while(|&b| b == b' ').count() >= CODE_INDENT_WIDTH {
            // The first CODE_INDENT_WIDTH bytes are ASCII spaces, so this
            // slice always starts on a character boundary.
            Some(&line[CODE_INDENT_WIDTH..])
        } else {
            None
        };

        match code_text {
            Some(code) => pending.push(code),
            // A blank line after code may still be followed by more code, so
            // hold on to it until the block is closed.
            None if !pending.is_empty() && line.trim().is_empty() => pending.push(""),
            None => {
                flush_code_block(&mut out, &mut pending);
                out.push_str(line);
                out.push('\n');
            }
        }
    }
    flush_code_block(&mut out, &mut pending);

    // Every line was written with a trailing newline; drop the last one when
    // the input did not have it.
    if out.ends_with('\n') && !md.ends_with('\n') {
        out.pop();
    }
    out
}

/// Appends the collected `code_lines` to `result` as one fenced block and
/// empties `code_lines`.
///
/// Trailing empty lines are left out of the block. If nothing but empty lines
/// was collected, `result` is not modified.
fn flush_code_block(result: &mut String, code_lines: &mut Vec<&str>) {
    // Number of lines up to and including the last non-empty one.
    let kept = code_lines
        .iter()
        .rposition(|line| !line.is_empty())
        .map_or(0, |last| last + 1);

    if kept > 0 {
        result.push_str("```\n");
        for line in &code_lines[..kept] {
            result.push_str(line);
            result.push('\n');
        }
        result.push_str("```\n");
    }
    code_lines.clear();
}
/// Removes Markdown images whose target is an inline `data:` URI.
///
/// Only images are matched (`![alt](data:...)`), and only when at least ten
/// characters follow `data:` before the closing parenthesis. Each match is
/// deleted outright; nothing is put in its place.
fn strip_data_uris(md: &str) -> String {
    // Compiled on first use and shared by every later call.
    static DATA_URI_RE: LazyLock<Regex> = LazyLock::new(|| {
        let pattern = r"!\[[^\]]*\]\(data:[^)]{10,}\)";
        Regex::new(pattern).unwrap()
    });

    let stripped = DATA_URI_RE.replace_all(md, "");
    stripped.into_owned()
}
/// Removes heading-anchor leftovers from converted Markdown.
///
/// Three things are deleted, in this order:
/// 1. links to an in-page fragment whose text is empty or a lone pilcrow,
///    i.e. `[](#...)` and `[¶](#...)`;
/// 2. every remaining pilcrow character (U+00B6);
/// 3. every section sign (U+00A7) that directly follows a space, together
///    with that space.
///
/// Text around a deleted link is left as it was, including the spaces on
/// either side of the link.
fn strip_anchor_artifacts(md: &str) -> String {
    // Compiled on first use and shared by every later call.
    static EMPTY_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
        let pattern = r#"\[¶?\]\(#[^)]*\)"#;
        Regex::new(pattern).unwrap()
    });

    let without_links = EMPTY_ANCHOR_RE.replace_all(md, "");
    let without_pilcrows = without_links.replace('\u{00b6}', "");
    without_pilcrows.replace(" \u{00a7}", "")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings and inline emphasis survive the full conversion pipeline.
    #[test]
    fn converts_basic_html() {
        let html = "<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";
        let md = html_to_markdown(html);
        assert!(md.contains("# Title"));
        assert!(md.contains("**bold**"));
    }

    /// A pilcrow permalink inside a heading is removed; the heading text stays.
    #[test]
    fn strips_pilcrow_from_headers() {
        let html = r##"<h2>Section <a href="#section">¶</a></h2>"##;
        let md = html_to_markdown(html);
        assert!(!md.contains('\u{00b6}'));
        assert!(md.contains("Section"));
    }

    /// Empty and pilcrow-only fragment links are removed; nearby text is kept.
    #[test]
    fn strips_empty_anchor_links() {
        let input = "## Heading [](#heading) rest\n\nSome [¶](#foo \"title\") text";
        let result = strip_anchor_artifacts(input);
        assert!(!result.contains("[](#"));
        assert!(!result.contains("[¶](#"));
        // Deleting a link leaves the spaces on both sides of it in place, so
        // compare the surviving words instead of asserting on exact spacing.
        let words: Vec<&str> = result.split_whitespace().collect();
        assert_eq!(words, ["##", "Heading", "rest", "Some", "text"]);
    }

    /// Ordinary links with an absolute URL are preserved.
    #[test]
    fn converts_links() {
        let html = r#"<p><a href="https://example.com">Link</a></p>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("[Link](https://example.com)"));
    }
}