1use crate::html::convert_relative_urls;
6use crate::Result;
7use scraper::{Html, Selector};
8use tracing::{debug, info};
9
10pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
28 info!("Converting HTML to Markdown");
29
30 let processed_html = base_url.map_or_else(
32 || html.to_string(),
33 |base| convert_relative_urls(html, base),
34 );
35
36 let cleaned_html = clean_html(&processed_html);
38
39 let markdown = html2md::parse_html(&cleaned_html);
41
42 let decoded_markdown = crate::html::decode_html_entities(&markdown);
44
45 let cleaned_markdown = clean_markdown(&decoded_markdown);
47
48 info!(
49 "Successfully converted to Markdown ({} bytes)",
50 cleaned_markdown.len()
51 );
52 Ok(cleaned_markdown)
53}
54
55fn clean_html(html: &str) -> String {
59 debug!("Cleaning HTML for Markdown conversion");
60
61 let document = Html::parse_document(html);
62
63 let mut cleaned = html.to_string();
65
66 if let Ok(selector) = Selector::parse("script") {
68 for element in document.select(&selector) {
69 let outer_html = element.html();
70 cleaned = cleaned.replace(&outer_html, "");
71 }
72 }
73
74 if let Ok(selector) = Selector::parse("style") {
76 for element in document.select(&selector) {
77 let outer_html = element.html();
78 cleaned = cleaned.replace(&outer_html, "");
79 }
80 }
81
82 if let Ok(selector) = Selector::parse("noscript") {
84 for element in document.select(&selector) {
85 let outer_html = element.html();
86 cleaned = cleaned.replace(&outer_html, "");
87 }
88 }
89
90 cleaned
91}
92
93pub fn clean_markdown(markdown: &str) -> String {
97 debug!("Cleaning Markdown output");
98
99 let mut result = markdown.to_string();
101
102 while result.contains("\n\n\n") {
104 result = result.replace("\n\n\n", "\n\n");
105 }
106
107 result = result.trim().to_string();
109
110 if !result.is_empty() && !result.ends_with('\n') {
112 result.push('\n');
113 }
114
115 result
116}