use crate::html::convert_relative_urls;
use crate::Result;
use scraper::{Html, Selector};
use tracing::{debug, info};
/// Converts an HTML string into cleaned-up Markdown.
///
/// The pipeline is:
/// 1. If `base_url` is `Some`, run the HTML through `convert_relative_urls`
///    with that base; otherwise use the HTML unchanged.
/// 2. Strip unwanted elements with `clean_html`.
/// 3. Convert to Markdown via `html2md::parse_html`.
/// 4. Decode HTML entities and replace every U+00A0 (no-break space) with a
///    regular space.
/// 5. Tidy the result with [`clean_markdown`].
///
/// # Errors
///
/// The steps visible in this function do not produce an error; it always
/// returns `Ok`. The `Result` wrapper is part of the public signature.
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    info!("Converting HTML to Markdown");

    // Only rewrite URLs when the caller supplied a base to resolve against.
    let resolved = match base_url {
        Some(base) => convert_relative_urls(html, base),
        None => html.to_string(),
    };

    let sanitized = clean_html(&resolved);
    let raw = html2md::parse_html(&sanitized);
    let decoded = crate::html::decode_html_entities(&raw);

    // No-break spaces become ordinary spaces before the final tidy-up.
    let markdown = clean_markdown(&decoded.replace('\u{00A0}', " "));

    info!(
        "Successfully converted to Markdown ({} bytes)",
        markdown.len()
    );
    Ok(markdown)
}
/// Removes `<script>`, `<style>` and `<noscript>` elements from `html`.
///
/// The input is parsed once. For each matching element, the element's
/// serialized markup (`element.html()`) is deleted from a working copy of the
/// original text wherever that exact string occurs. Tags are processed in the
/// order script, style, noscript.
///
/// NOTE(review): removal is a plain textual replace, so an element is only
/// stripped when its serialized form appears verbatim in the source text.
/// Presumably the serializer can normalize markup (attribute quoting,
/// entities, unclosed tags); confirm whether such elements can slip through.
fn clean_html(html: &str) -> String {
    debug!("Cleaning HTML for Markdown conversion");

    let document = Html::parse_document(html);
    let mut cleaned = html.to_string();

    for tag in ["script", "style", "noscript"] {
        // A selector that fails to parse is skipped silently.
        if let Ok(selector) = Selector::parse(tag) {
            for element in document.select(&selector) {
                cleaned = cleaned.replace(&element.html(), "");
            }
        }
    }

    cleaned
}
/// Normalizes whitespace in Markdown output.
///
/// * Leading and trailing whitespace is removed.
/// * Any run of three or more consecutive `\n` characters is reduced to
///   exactly two (i.e. at most one blank line between blocks).
/// * A non-empty result is terminated with a single `\n`; an input that is
///   empty or all whitespace yields an empty string.
pub fn clean_markdown(markdown: &str) -> String {
    debug!("Cleaning Markdown output");

    // Trimming up front is equivalent to trimming after the collapse: a run of
    // newlines never straddles the boundary between edge whitespace and content.
    let body = markdown.trim();

    let mut cleaned = String::with_capacity(body.len() + 1);
    let mut newline_run = 0usize;
    for ch in body.chars() {
        if ch == '\n' {
            newline_run += 1;
            // Drop the third and any later newline of a consecutive run.
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        cleaned.push(ch);
    }

    // After trimming, a non-empty body cannot already end in '\n'.
    if !cleaned.is_empty() {
        cleaned.push('\n');
    }
    cleaned
}