1use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29 info!("Converting HTML to Markdown");
30
31 let processed_html = base_url.map_or_else(
33 || html.to_string(),
34 |base| convert_relative_urls(html, base),
35 );
36
37 let cleaned_html = clean_html(&processed_html);
39
40 let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
45
46 let heading_safe_html = hoist_images_from_headings(&cleaned_html);
50
51 let markdown = html2md::parse_html(&heading_safe_html);
53
54 let decoded_markdown = crate::html::decode_html_entities(&markdown);
56
57 let normalized_markdown = decoded_markdown.replace('\u{00A0}', " ");
59
60 let cleaned_markdown = clean_markdown(&normalized_markdown);
62
63 info!(
64 "Successfully converted to Markdown ({} bytes)",
65 cleaned_markdown.len()
66 );
67 Ok(cleaned_markdown)
68}
69
70#[must_use]
71pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
72 let selector = Selector::parse(selector_str).ok()?;
73 let document = Html::parse_document(html);
74 document
75 .select(&selector)
76 .next()
77 .map(|element| element.html())
78}
79
80fn clean_html(html: &str) -> String {
84 debug!("Cleaning HTML for Markdown conversion");
85
86 let document = Html::parse_document(html);
87
88 let mut cleaned = html.to_string();
90
91 if let Ok(selector) = Selector::parse("script") {
93 for element in document.select(&selector) {
94 let outer_html = element.html();
95 cleaned = cleaned.replace(&outer_html, "");
96 }
97 }
98
99 if let Ok(selector) = Selector::parse("style") {
101 for element in document.select(&selector) {
102 let outer_html = element.html();
103 cleaned = cleaned.replace(&outer_html, "");
104 }
105 }
106
107 if let Ok(selector) = Selector::parse("noscript") {
109 for element in document.select(&selector) {
110 let outer_html = element.html();
111 cleaned = cleaned.replace(&outer_html, "");
112 }
113 }
114
115 cleaned
116}
117
118fn preserve_leading_heading_numbering(html: &str) -> String {
126 let pattern = Regex::new(
127 r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
128 )
129 .expect("valid regex");
130 let leading_number_in_strong =
131 Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
132 .expect("valid regex");
133 let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
134
135 let unwrapped = pattern
136 .replace_all(html, |caps: ®ex::Captures<'_>| {
137 let heading = &caps[1];
138 if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
139 {
140 heading.to_string()
141 } else {
142 caps[0].to_string()
143 }
144 })
145 .into_owned();
146
147 leading_number_in_strong
148 .replace_all(&unwrapped, |caps: ®ex::Captures<'_>| {
149 let open = &caps[1];
150 let number = &caps[2];
151 let inner = &caps[3];
152 format!("{open}{number}<strong>{inner}</strong>")
153 })
154 .into_owned()
155}
156
157fn hoist_images_from_headings(html: &str) -> String {
163 use std::fmt::Write;
164
165 let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
166 let mut result = html.to_string();
167
168 for level in 1..=6 {
169 let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
170 .expect("valid regex");
171
172 result = heading_re
173 .replace_all(&result, |caps: ®ex::Captures<'_>| {
174 let open = &caps[1];
175 let inner = &caps[2];
176 let close = &caps[3];
177
178 let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
179
180 if imgs.is_empty() {
181 return caps[0].to_string();
182 }
183
184 let stripped = img_re.replace_all(inner, "").to_string();
185 let mut out = format!("{open}{stripped}{close}");
186 for img in imgs {
187 write!(out, "\n<p>{img}</p>").expect("write to String");
188 }
189 out
190 })
191 .into_owned();
192 }
193
194 result
195}
196
197pub fn clean_markdown(markdown: &str) -> String {
201 debug!("Cleaning Markdown output");
202
203 let mut result = markdown.to_string();
205
206 while result.contains("\n\n\n") {
208 result = result.replace("\n\n\n", "\n\n");
209 }
210
211 result = result.trim().to_string();
213
214 if !result.is_empty() && !result.ends_with('\n') {
216 result.push('\n');
217 }
218
219 result
220}