1use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29 info!("Converting HTML to Markdown");
30
31 let processed_html = base_url.map_or_else(
33 || html.to_string(),
34 |base| convert_relative_urls(html, base),
35 );
36
37 let cleaned_html = clean_html(&processed_html);
39
40 let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
45
46 let cleaned_html = strip_empty_img_titles(&cleaned_html);
51
52 let heading_safe_html = hoist_images_from_headings(&cleaned_html);
56
57 let ol_item_numbers = compute_top_level_ordered_list_item_numbers(&heading_safe_html);
62
63 let markdown = html2md::parse_html(&heading_safe_html);
65
66 let markdown = renumber_top_level_ordered_list_lines(&markdown, &ol_item_numbers);
70
71 let decoded_markdown = crate::html::decode_html_entities(&markdown);
73
74 let normalized_markdown = decoded_markdown.replace('\u{00A0}', " ");
76
77 let cleaned_markdown = clean_markdown(&normalized_markdown);
79
80 info!(
81 "Successfully converted to Markdown ({} bytes)",
82 cleaned_markdown.len()
83 );
84 Ok(cleaned_markdown)
85}
86
87#[must_use]
88pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
89 let selector = Selector::parse(selector_str).ok()?;
90 let document = Html::parse_document(html);
91 document
92 .select(&selector)
93 .next()
94 .map(|element| element.html())
95}
96
97fn clean_html(html: &str) -> String {
101 debug!("Cleaning HTML for Markdown conversion");
102
103 let document = Html::parse_document(html);
104
105 let mut cleaned = html.to_string();
107
108 if let Ok(selector) = Selector::parse("script") {
110 for element in document.select(&selector) {
111 let outer_html = element.html();
112 cleaned = cleaned.replace(&outer_html, "");
113 }
114 }
115
116 if let Ok(selector) = Selector::parse("style") {
118 for element in document.select(&selector) {
119 let outer_html = element.html();
120 cleaned = cleaned.replace(&outer_html, "");
121 }
122 }
123
124 if let Ok(selector) = Selector::parse("noscript") {
126 for element in document.select(&selector) {
127 let outer_html = element.html();
128 cleaned = cleaned.replace(&outer_html, "");
129 }
130 }
131
132 cleaned
133}
134
135fn preserve_leading_heading_numbering(html: &str) -> String {
143 let pattern = Regex::new(
144 r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
145 )
146 .expect("valid regex");
147 let leading_number_in_strong =
148 Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
149 .expect("valid regex");
150 let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
151
152 let unwrapped = pattern
153 .replace_all(html, |caps: ®ex::Captures<'_>| {
154 let heading = &caps[1];
155 if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
156 {
157 heading.to_string()
158 } else {
159 caps[0].to_string()
160 }
161 })
162 .into_owned();
163
164 leading_number_in_strong
165 .replace_all(&unwrapped, |caps: ®ex::Captures<'_>| {
166 let open = &caps[1];
167 let number = &caps[2];
168 let inner = &caps[3];
169 format!("{open}{number}<strong>{inner}</strong>")
170 })
171 .into_owned()
172}
173
174fn hoist_images_from_headings(html: &str) -> String {
180 use std::fmt::Write;
181
182 let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
183 let mut result = html.to_string();
184
185 for level in 1..=6 {
186 let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
187 .expect("valid regex");
188
189 result = heading_re
190 .replace_all(&result, |caps: ®ex::Captures<'_>| {
191 let open = &caps[1];
192 let inner = &caps[2];
193 let close = &caps[3];
194
195 let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
196
197 if imgs.is_empty() {
198 return caps[0].to_string();
199 }
200
201 let stripped = img_re.replace_all(inner, "").to_string();
202 let mut out = format!("{open}{stripped}{close}");
203 for img in imgs {
204 write!(out, "\n<p>{img}</p>").expect("write to String");
205 }
206 out
207 })
208 .into_owned();
209 }
210
211 result
212}
213
214fn strip_empty_img_titles(html: &str) -> String {
227 let img_re = Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
228 let empty_title_re = Regex::new(r#"(?i)\s+title\s*=\s*(?:""|'')"#).expect("valid regex");
229 img_re
230 .replace_all(html, |caps: ®ex::Captures<'_>| {
231 let tag = caps.get(0).expect("match 0").as_str();
232 empty_title_re.replace_all(tag, "").into_owned()
233 })
234 .into_owned()
235}
236
237fn compute_top_level_ordered_list_item_numbers(html: &str) -> Vec<u32> {
246 let document = Html::parse_document(html);
247 let Ok(ol_selector) = Selector::parse("ol") else {
248 return Vec::new();
249 };
250 let mut numbers = Vec::new();
251 let mut counter: u32 = 1;
252 for ol in document.select(&ol_selector) {
253 let nested = ol.ancestors().any(|n| {
254 n.value()
255 .as_element()
256 .is_some_and(|e| e.name() == "ol" || e.name() == "ul")
257 });
258 if nested {
259 continue;
260 }
261 if let Some(start) = ol
262 .value()
263 .attr("start")
264 .and_then(|s| s.trim().parse::<u32>().ok())
265 {
266 counter = start;
267 }
268 let li_count = ol
269 .children()
270 .filter(|n| n.value().as_element().is_some_and(|e| e.name() == "li"))
271 .count();
272 for _ in 0..li_count {
273 numbers.push(counter);
274 counter = counter.saturating_add(1);
275 }
276 }
277 numbers
278}
279
280fn renumber_top_level_ordered_list_lines(markdown: &str, numbers: &[u32]) -> String {
287 use std::fmt::Write;
288
289 let item_re = Regex::new(r"^(\d+)\.(\s)").expect("valid regex");
290 let setext_re = Regex::new(r"^(=+|-+)\s*$").expect("valid regex");
291 let lines: Vec<&str> = markdown.split_inclusive('\n').collect();
292 let mut out = String::with_capacity(markdown.len());
293 let mut idx: usize = 0;
294
295 for (i, line) in lines.iter().enumerate() {
296 let body = line.strip_suffix('\n').unwrap_or(line);
297 if let Some(caps) = item_re.captures(body) {
298 let next_body = lines
299 .get(i + 1)
300 .map_or("", |l| l.strip_suffix('\n').unwrap_or(l));
301 let is_setext_heading = setext_re.is_match(next_body);
302 if !is_setext_heading {
303 if let Some(&n) = numbers.get(idx) {
304 let after = &body[caps.get(0).expect("match 0").end()..];
305 let sep = caps.get(2).expect("group 2").as_str();
306 write!(out, "{n}.{sep}{after}").expect("write to String");
307 if line.ends_with('\n') {
308 out.push('\n');
309 }
310 idx += 1;
311 continue;
312 }
313 }
314 }
315 out.push_str(line);
316 }
317 out
318}
319
320pub fn clean_markdown(markdown: &str) -> String {
324 debug!("Cleaning Markdown output");
325
326 let mut result = markdown.to_string();
328
329 while result.contains("\n\n\n") {
331 result = result.replace("\n\n\n", "\n\n");
332 }
333
334 result = result.trim().to_string();
336
337 if !result.is_empty() && !result.ends_with('\n') {
339 result.push('\n');
340 }
341
342 result
343}