1use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29 info!("Converting HTML to Markdown");
30
31 let processed_html = base_url.map_or_else(
33 || html.to_string(),
34 |base| convert_relative_urls(html, base),
35 );
36 let processed_html = reorder_visual_layout_elements(&processed_html);
37
38 let cleaned_html = clean_html(&processed_html);
40
41 let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
46
47 let cleaned_html = strip_empty_img_titles(&cleaned_html);
52
53 let heading_safe_html = hoist_images_from_headings(&cleaned_html);
57
58 let ol_item_numbers = compute_top_level_ordered_list_item_numbers(&heading_safe_html);
63
64 let markdown = html2md::parse_html(&heading_safe_html);
66
67 let markdown = renumber_top_level_ordered_list_lines(&markdown, &ol_item_numbers);
71
72 let decoded_markdown = crate::html::decode_html_entities(&markdown);
74
75 let normalized_markdown = decoded_markdown.replace('\u{00A0}', " ");
77
78 let cleaned_markdown = clean_markdown(&normalized_markdown);
80
81 info!(
82 "Successfully converted to Markdown ({} bytes)",
83 cleaned_markdown.len()
84 );
85 Ok(cleaned_markdown)
86}
87
88#[must_use]
89pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
90 let selector = Selector::parse(selector_str).ok()?;
91 let document = Html::parse_document(html);
92 document
93 .select(&selector)
94 .next()
95 .map(|element| element.html())
96}
97
98fn clean_html(html: &str) -> String {
102 debug!("Cleaning HTML for Markdown conversion");
103
104 let document = Html::parse_document(html);
105
106 let mut cleaned = html.to_string();
108
109 if let Ok(selector) = Selector::parse("script") {
111 for element in document.select(&selector) {
112 let outer_html = element.html();
113 cleaned = cleaned.replace(&outer_html, "");
114 }
115 }
116
117 if let Ok(selector) = Selector::parse("style") {
119 for element in document.select(&selector) {
120 let outer_html = element.html();
121 cleaned = cleaned.replace(&outer_html, "");
122 }
123 }
124
125 if let Ok(selector) = Selector::parse("noscript") {
127 for element in document.select(&selector) {
128 let outer_html = element.html();
129 cleaned = cleaned.replace(&outer_html, "");
130 }
131 }
132
133 cleaned
134}
135
136fn reorder_visual_layout_elements(html: &str) -> String {
137 fn move_before(html: &str, moving: &Regex, anchor: &Regex) -> String {
138 let Some(moving_match) = moving.find(html) else {
139 return html.to_string();
140 };
141 let Some(anchor_match) = anchor.find(html) else {
142 return html.to_string();
143 };
144 if moving_match.start() < anchor_match.start() {
145 return html.to_string();
146 }
147
148 let moved = moving_match.as_str().to_string();
149 let mut without_moved = html.to_string();
150 without_moved.replace_range(moving_match.range(), "");
151 if let Some(anchor_match) = anchor.find(&without_moved) {
152 without_moved.insert_str(anchor_match.start(), &moved);
153 }
154 without_moved
155 }
156
157 fn move_after(html: &str, moving: &Regex, anchor: &Regex) -> String {
158 let Some(moving_match) = moving.find(html) else {
159 return html.to_string();
160 };
161 let Some(anchor_match) = anchor.find(html) else {
162 return html.to_string();
163 };
164 if moving_match.start() > anchor_match.start() {
165 return html.to_string();
166 }
167
168 let moved = moving_match.as_str().to_string();
169 let mut without_moved = html.to_string();
170 without_moved.replace_range(moving_match.range(), "");
171 if let Some(anchor_match) = anchor.find(&without_moved) {
172 without_moved.insert_str(anchor_match.end(), &moved);
173 }
174 without_moved
175 }
176
177 let header = Regex::new(r"(?is)<header\b[^>]*>.*?</header>").expect("valid regex");
178 let main = Regex::new(r"(?is)<main\b[^>]*>.*?</main>").expect("valid regex");
179 let footer = Regex::new(r"(?is)<footer\b[^>]*>.*?</footer>").expect("valid regex");
180
181 let html = move_before(html, &header, &main);
182 move_after(&html, &footer, &main)
183}
184
185fn preserve_leading_heading_numbering(html: &str) -> String {
193 let pattern = Regex::new(
194 r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
195 )
196 .expect("valid regex");
197 let leading_number_in_strong =
198 Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
199 .expect("valid regex");
200 let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
201
202 let unwrapped = pattern
203 .replace_all(html, |caps: ®ex::Captures<'_>| {
204 let heading = &caps[1];
205 if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
206 {
207 heading.to_string()
208 } else {
209 caps[0].to_string()
210 }
211 })
212 .into_owned();
213
214 leading_number_in_strong
215 .replace_all(&unwrapped, |caps: ®ex::Captures<'_>| {
216 let open = &caps[1];
217 let number = &caps[2];
218 let inner = &caps[3];
219 format!("{open}{number}<strong>{inner}</strong>")
220 })
221 .into_owned()
222}
223
224fn hoist_images_from_headings(html: &str) -> String {
230 use std::fmt::Write;
231
232 let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
233 let mut result = html.to_string();
234
235 for level in 1..=6 {
236 let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
237 .expect("valid regex");
238
239 result = heading_re
240 .replace_all(&result, |caps: ®ex::Captures<'_>| {
241 let open = &caps[1];
242 let inner = &caps[2];
243 let close = &caps[3];
244
245 let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
246
247 if imgs.is_empty() {
248 return caps[0].to_string();
249 }
250
251 let stripped = img_re.replace_all(inner, "").to_string();
252 let mut out = format!("{open}{stripped}{close}");
253 for img in imgs {
254 write!(out, "\n<p>{img}</p>").expect("write to String");
255 }
256 out
257 })
258 .into_owned();
259 }
260
261 result
262}
263
264fn strip_empty_img_titles(html: &str) -> String {
277 let img_re = Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
278 let empty_title_re = Regex::new(r#"(?i)\s+title\s*=\s*(?:""|'')"#).expect("valid regex");
279 img_re
280 .replace_all(html, |caps: ®ex::Captures<'_>| {
281 let tag = caps.get(0).expect("match 0").as_str();
282 empty_title_re.replace_all(tag, "").into_owned()
283 })
284 .into_owned()
285}
286
287fn compute_top_level_ordered_list_item_numbers(html: &str) -> Vec<u32> {
296 let document = Html::parse_document(html);
297 let Ok(ol_selector) = Selector::parse("ol") else {
298 return Vec::new();
299 };
300 let mut numbers = Vec::new();
301 let mut counter: u32 = 1;
302 for ol in document.select(&ol_selector) {
303 let nested = ol.ancestors().any(|n| {
304 n.value()
305 .as_element()
306 .is_some_and(|e| e.name() == "ol" || e.name() == "ul")
307 });
308 if nested {
309 continue;
310 }
311 if let Some(start) = ol
312 .value()
313 .attr("start")
314 .and_then(|s| s.trim().parse::<u32>().ok())
315 {
316 counter = start;
317 }
318 let li_count = ol
319 .children()
320 .filter(|n| n.value().as_element().is_some_and(|e| e.name() == "li"))
321 .count();
322 for _ in 0..li_count {
323 numbers.push(counter);
324 counter = counter.saturating_add(1);
325 }
326 }
327 numbers
328}
329
330fn renumber_top_level_ordered_list_lines(markdown: &str, numbers: &[u32]) -> String {
337 use std::fmt::Write;
338
339 let item_re = Regex::new(r"^(\d+)\.(\s)").expect("valid regex");
340 let setext_re = Regex::new(r"^(=+|-+)\s*$").expect("valid regex");
341 let lines: Vec<&str> = markdown.split_inclusive('\n').collect();
342 let mut out = String::with_capacity(markdown.len());
343 let mut idx: usize = 0;
344
345 for (i, line) in lines.iter().enumerate() {
346 let body = line.strip_suffix('\n').unwrap_or(line);
347 if let Some(caps) = item_re.captures(body) {
348 let next_body = lines
349 .get(i + 1)
350 .map_or("", |l| l.strip_suffix('\n').unwrap_or(l));
351 let is_setext_heading = setext_re.is_match(next_body);
352 if !is_setext_heading {
353 if let Some(&n) = numbers.get(idx) {
354 let after = &body[caps.get(0).expect("match 0").end()..];
355 let sep = caps.get(2).expect("group 2").as_str();
356 write!(out, "{n}.{sep}{after}").expect("write to String");
357 if line.ends_with('\n') {
358 out.push('\n');
359 }
360 idx += 1;
361 continue;
362 }
363 }
364 }
365 out.push_str(line);
366 }
367 out
368}
369
370pub fn clean_markdown(markdown: &str) -> String {
374 debug!("Cleaning Markdown output");
375
376 let mut result = markdown.to_string();
378
379 while result.contains("\n\n\n") {
381 result = result.replace("\n\n\n", "\n\n");
382 }
383
384 result = result.trim().to_string();
386
387 if !result.is_empty() && !result.ends_with('\n') {
389 result.push('\n');
390 }
391
392 result
393}