1use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29 info!("Converting HTML to Markdown");
30
31 let processed_html = base_url.map_or_else(
33 || html.to_string(),
34 |base| convert_relative_urls(html, base),
35 );
36
37 let cleaned_html = clean_html(&processed_html);
39
40 let heading_safe_html = hoist_images_from_headings(&cleaned_html);
44
45 let markdown = html2md::parse_html(&heading_safe_html);
47
48 let decoded_markdown = crate::html::decode_html_entities(&markdown);
50
51 let normalized_markdown = decoded_markdown.replace('\u{00A0}', " ");
53
54 let cleaned_markdown = clean_markdown(&normalized_markdown);
56
57 info!(
58 "Successfully converted to Markdown ({} bytes)",
59 cleaned_markdown.len()
60 );
61 Ok(cleaned_markdown)
62}
63
64#[must_use]
65pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
66 let selector = Selector::parse(selector_str).ok()?;
67 let document = Html::parse_document(html);
68 document
69 .select(&selector)
70 .next()
71 .map(|element| element.html())
72}
73
74fn clean_html(html: &str) -> String {
78 debug!("Cleaning HTML for Markdown conversion");
79
80 let document = Html::parse_document(html);
81
82 let mut cleaned = html.to_string();
84
85 if let Ok(selector) = Selector::parse("script") {
87 for element in document.select(&selector) {
88 let outer_html = element.html();
89 cleaned = cleaned.replace(&outer_html, "");
90 }
91 }
92
93 if let Ok(selector) = Selector::parse("style") {
95 for element in document.select(&selector) {
96 let outer_html = element.html();
97 cleaned = cleaned.replace(&outer_html, "");
98 }
99 }
100
101 if let Ok(selector) = Selector::parse("noscript") {
103 for element in document.select(&selector) {
104 let outer_html = element.html();
105 cleaned = cleaned.replace(&outer_html, "");
106 }
107 }
108
109 cleaned
110}
111
112fn hoist_images_from_headings(html: &str) -> String {
118 use std::fmt::Write;
119
120 let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
121 let mut result = html.to_string();
122
123 for level in 1..=6 {
124 let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
125 .expect("valid regex");
126
127 result = heading_re
128 .replace_all(&result, |caps: ®ex::Captures<'_>| {
129 let open = &caps[1];
130 let inner = &caps[2];
131 let close = &caps[3];
132
133 let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
134
135 if imgs.is_empty() {
136 return caps[0].to_string();
137 }
138
139 let stripped = img_re.replace_all(inner, "").to_string();
140 let mut out = format!("{open}{stripped}{close}");
141 for img in imgs {
142 write!(out, "\n<p>{img}</p>").expect("write to String");
143 }
144 out
145 })
146 .into_owned();
147 }
148
149 result
150}
151
152pub fn clean_markdown(markdown: &str) -> String {
156 debug!("Cleaning Markdown output");
157
158 let mut result = markdown.to_string();
160
161 while result.contains("\n\n\n") {
163 result = result.replace("\n\n\n", "\n\n");
164 }
165
166 result = result.trim().to_string();
168
169 if !result.is_empty() && !result.ends_with('\n') {
171 result.push('\n');
172 }
173
174 result
175}