use crate::html::convert_relative_urls;
use crate::Result;
use regex::Regex;
use scraper::{Html, Selector};
use tracing::{debug, info};
/// Converts an HTML document into cleaned-up Markdown.
///
/// When `base_url` is given, the HTML is first passed through
/// [`convert_relative_urls`] with that base; otherwise it is used as-is. The
/// markup is then cleaned, headings are rewritten so their numbering and
/// images are handled separately, and the result is converted with `html2md`.
/// Finally the Markdown is renumbered, entity-decoded, has non-breaking spaces
/// replaced by plain spaces, and is whitespace-normalized.
///
/// # Errors
///
/// Every step visible in this function is infallible, so it currently always
/// returns `Ok`.
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    info!("Converting HTML to Markdown");

    let absolute_html = match base_url {
        Some(base) => convert_relative_urls(html, base),
        None => html.to_string(),
    };

    // HTML-level preparation: clean first, then fix heading numbering, then
    // pull images out of headings.
    let prepared_html = hoist_images_from_headings(&preserve_leading_heading_numbering(
        &clean_html(&absolute_html),
    ));

    // The list numbers are read from the HTML before conversion and applied to
    // the Markdown produced from that same HTML.
    let list_numbers = compute_top_level_ordered_list_item_numbers(&prepared_html);
    let raw_markdown = html2md::parse_html(&prepared_html);
    let renumbered = renumber_top_level_ordered_list_lines(&raw_markdown, &list_numbers);

    let output = clean_markdown(
        &crate::html::decode_html_entities(&renumbered).replace('\u{00A0}', " "),
    );

    info!(
        "Successfully converted to Markdown ({} bytes)",
        output.len()
    );
    Ok(output)
}
/// Returns the outer HTML of the first element matching a CSS selector.
///
/// Yields `None` when `selector_str` cannot be parsed as a selector or when
/// no element in `html` matches it.
#[must_use]
pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
    let Ok(selector) = Selector::parse(selector_str) else {
        return None;
    };

    let document = Html::parse_document(html);
    let first_match = document.select(&selector).next()?;
    Some(first_match.html())
}
/// Strips `script`, `style`, and `noscript` elements from `html`.
///
/// The input is parsed once. For every matching element, its serialized outer
/// HTML is removed from a copy of the input text by plain string replacement.
///
/// NOTE(review): an element is only removed when its serialized form occurs
/// verbatim in the input text; presumably markup that the parser re-serializes
/// differently is left in place — confirm against `scraper`'s serializer.
fn clean_html(html: &str) -> String {
    debug!("Cleaning HTML for Markdown conversion");

    let document = Html::parse_document(html);
    let mut cleaned = html.to_string();

    // Tags are handled one at a time, in this fixed order.
    for tag in ["script", "style", "noscript"] {
        let Ok(selector) = Selector::parse(tag) else {
            continue;
        };
        for element in document.select(&selector) {
            cleaned = cleaned.replace(&element.html(), "");
        }
    }

    cleaned
}
fn preserve_leading_heading_numbering(html: &str) -> String {
let pattern = Regex::new(
r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
)
.expect("valid regex");
let leading_number_in_strong =
Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
.expect("valid regex");
let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
let unwrapped = pattern
.replace_all(html, |caps: ®ex::Captures<'_>| {
let heading = &caps[1];
if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
{
heading.to_string()
} else {
caps[0].to_string()
}
})
.into_owned();
leading_number_in_strong
.replace_all(&unwrapped, |caps: ®ex::Captures<'_>| {
let open = &caps[1];
let number = &caps[2];
let inner = &caps[3];
format!("{open}{number}<strong>{inner}</strong>")
})
.into_owned()
}
fn hoist_images_from_headings(html: &str) -> String {
use std::fmt::Write;
let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
let mut result = html.to_string();
for level in 1..=6 {
let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
.expect("valid regex");
result = heading_re
.replace_all(&result, |caps: ®ex::Captures<'_>| {
let open = &caps[1];
let inner = &caps[2];
let close = &caps[3];
let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
if imgs.is_empty() {
return caps[0].to_string();
}
let stripped = img_re.replace_all(inner, "").to_string();
let mut out = format!("{open}{stripped}{close}");
for img in imgs {
write!(out, "\n<p>{img}</p>").expect("write to String");
}
out
})
.into_owned();
}
result
}
/// Computes the number of every `<li>` that is a direct child of a top-level `<ol>`.
///
/// An `<ol>` counts as top-level when none of its ancestors is an `<ol>` or
/// `<ul>`. Lists are visited in the order the selector yields them, and one
/// number is produced per direct `<li>` child.
///
/// Numbering starts at 1. A `start` attribute that parses as `u32` (after
/// trimming) sets the running counter; a list without one continues from the
/// value the previous top-level list ended on. The counter saturates at
/// `u32::MAX` rather than overflowing.
fn compute_top_level_ordered_list_item_numbers(html: &str) -> Vec<u32> {
    let Ok(ol_selector) = Selector::parse("ol") else {
        return Vec::new();
    };
    let document = Html::parse_document(html);

    let mut numbers = Vec::new();
    let mut next_number: u32 = 1;

    let top_level_lists = document.select(&ol_selector).filter(|list| {
        !list.ancestors().any(|ancestor| {
            ancestor
                .value()
                .as_element()
                .is_some_and(|el| matches!(el.name(), "ol" | "ul"))
        })
    });

    for list in top_level_lists {
        let explicit_start = list
            .value()
            .attr("start")
            .and_then(|raw| raw.trim().parse::<u32>().ok());
        if let Some(start) = explicit_start {
            next_number = start;
        }

        let item_count = list
            .children()
            .filter(|child| {
                child
                    .value()
                    .as_element()
                    .is_some_and(|el| el.name() == "li")
            })
            .count();

        numbers.extend((0..item_count).map(|_| {
            let current = next_number;
            next_number = next_number.saturating_add(1);
            current
        }));
    }

    numbers
}
/// Rewrites the numbers of unindented ordered-list lines in `markdown`.
///
/// A line qualifies when it starts at column zero with digits, a dot, and one
/// whitespace character. Qualifying lines consume `numbers` in order and have
/// their leading number replaced; the separator and the rest of the line are
/// kept. A qualifying line is left alone (and consumes no number) when the
/// next line consists only of `=` or only of `-` characters, since it is then
/// a setext heading rather than a list item. Once `numbers` is exhausted, the
/// remaining lines are copied through unchanged.
fn renumber_top_level_ordered_list_lines(markdown: &str, numbers: &[u32]) -> String {
    use std::fmt::Write;

    /// Drops a single trailing `\n`, if present.
    fn trim_newline(line: &str) -> &str {
        line.strip_suffix('\n').unwrap_or(line)
    }

    let item_re = Regex::new(r"^(\d+)\.(\s)").expect("valid regex");
    let setext_re = Regex::new(r"^(=+|-+)\s*$").expect("valid regex");

    let lines: Vec<&str> = markdown.split_inclusive('\n').collect();
    let mut pending = numbers.iter().copied();
    let mut out = String::with_capacity(markdown.len());

    for (pos, raw) in lines.iter().copied().enumerate() {
        let body = trim_newline(raw);

        let Some(caps) = item_re.captures(body) else {
            out.push_str(raw);
            continue;
        };

        // A missing next line is treated as empty, which never looks like an underline.
        let following = lines
            .get(pos + 1)
            .copied()
            .map_or("", |next| trim_newline(next));
        if setext_re.is_match(following) {
            out.push_str(raw);
            continue;
        }

        // A number is only taken once the line is known to be a list item.
        let Some(number) = pending.next() else {
            out.push_str(raw);
            continue;
        };

        let prefix_end = caps.get(0).expect("match 0").end();
        let separator = caps.get(2).expect("group 2").as_str();
        let rest = &body[prefix_end..];
        write!(out, "{number}.{separator}{rest}").expect("write to String");
        if raw.ends_with('\n') {
            out.push('\n');
        }
    }

    out
}
/// Normalizes whitespace in generated Markdown.
///
/// Runs of three or more consecutive `\n` characters are reduced to exactly
/// two, leading and trailing whitespace is trimmed, and a single trailing
/// newline is appended. Input that is empty after trimming yields an empty
/// string.
pub fn clean_markdown(markdown: &str) -> String {
    debug!("Cleaning Markdown output");

    // Copy the text through, keeping at most two newlines from any run.
    let mut collapsed = String::with_capacity(markdown.len());
    let mut newline_run = 0usize;
    for ch in markdown.chars() {
        if ch == '\n' {
            newline_run += 1;
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        collapsed.push(ch);
    }

    let trimmed = collapsed.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    // Trimmed text never ends in a newline, so exactly one is added here.
    let mut cleaned = String::with_capacity(trimmed.len() + 1);
    cleaned.push_str(trimmed);
    cleaned.push('\n');
    cleaned
}