1use crate::html::convert_relative_urls;
6use crate::Result;
7use scraper::{Html, Selector};
8use tracing::{debug, info};
9
10pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
28 info!("Converting HTML to Markdown");
29
30 let processed_html = base_url.map_or_else(
32 || html.to_string(),
33 |base| convert_relative_urls(html, base),
34 );
35
36 let cleaned_html = clean_html(&processed_html);
38
39 let markdown = html2md::parse_html(&cleaned_html);
41
42 let decoded_markdown = crate::html::decode_html_entities(&markdown);
44
45 let normalized_markdown = decoded_markdown.replace('\u{00A0}', " ");
47
48 let cleaned_markdown = clean_markdown(&normalized_markdown);
50
51 info!(
52 "Successfully converted to Markdown ({} bytes)",
53 cleaned_markdown.len()
54 );
55 Ok(cleaned_markdown)
56}
57
58fn clean_html(html: &str) -> String {
62 debug!("Cleaning HTML for Markdown conversion");
63
64 let document = Html::parse_document(html);
65
66 let mut cleaned = html.to_string();
68
69 if let Ok(selector) = Selector::parse("script") {
71 for element in document.select(&selector) {
72 let outer_html = element.html();
73 cleaned = cleaned.replace(&outer_html, "");
74 }
75 }
76
77 if let Ok(selector) = Selector::parse("style") {
79 for element in document.select(&selector) {
80 let outer_html = element.html();
81 cleaned = cleaned.replace(&outer_html, "");
82 }
83 }
84
85 if let Ok(selector) = Selector::parse("noscript") {
87 for element in document.select(&selector) {
88 let outer_html = element.html();
89 cleaned = cleaned.replace(&outer_html, "");
90 }
91 }
92
93 cleaned
94}
95
96pub fn clean_markdown(markdown: &str) -> String {
100 debug!("Cleaning Markdown output");
101
102 let mut result = markdown.to_string();
104
105 while result.contains("\n\n\n") {
107 result = result.replace("\n\n\n", "\n\n");
108 }
109
110 result = result.trim().to_string();
112
113 if !result.is_empty() && !result.ends_with('\n') {
115 result.push('\n');
116 }
117
118 result
119}