1use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29 info!("Converting HTML to Markdown");
30
31 let processed_html = base_url.map_or_else(
33 || html.to_string(),
34 |base| convert_relative_urls(html, base),
35 );
36
37 let cleaned_html = clean_html(&processed_html);
39
40 let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
45
46 let heading_safe_html = hoist_images_from_headings(&cleaned_html);
50
51 let ol_item_numbers = compute_top_level_ordered_list_item_numbers(&heading_safe_html);
56
57 let markdown = html2md::parse_html(&heading_safe_html);
59
60 let markdown = renumber_top_level_ordered_list_lines(&markdown, &ol_item_numbers);
64
65 let decoded_markdown = crate::html::decode_html_entities(&markdown);
67
68 let normalized_markdown = decoded_markdown.replace('\u{00A0}', " ");
70
71 let cleaned_markdown = clean_markdown(&normalized_markdown);
73
74 info!(
75 "Successfully converted to Markdown ({} bytes)",
76 cleaned_markdown.len()
77 );
78 Ok(cleaned_markdown)
79}
80
81#[must_use]
82pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
83 let selector = Selector::parse(selector_str).ok()?;
84 let document = Html::parse_document(html);
85 document
86 .select(&selector)
87 .next()
88 .map(|element| element.html())
89}
90
91fn clean_html(html: &str) -> String {
95 debug!("Cleaning HTML for Markdown conversion");
96
97 let document = Html::parse_document(html);
98
99 let mut cleaned = html.to_string();
101
102 if let Ok(selector) = Selector::parse("script") {
104 for element in document.select(&selector) {
105 let outer_html = element.html();
106 cleaned = cleaned.replace(&outer_html, "");
107 }
108 }
109
110 if let Ok(selector) = Selector::parse("style") {
112 for element in document.select(&selector) {
113 let outer_html = element.html();
114 cleaned = cleaned.replace(&outer_html, "");
115 }
116 }
117
118 if let Ok(selector) = Selector::parse("noscript") {
120 for element in document.select(&selector) {
121 let outer_html = element.html();
122 cleaned = cleaned.replace(&outer_html, "");
123 }
124 }
125
126 cleaned
127}
128
129fn preserve_leading_heading_numbering(html: &str) -> String {
137 let pattern = Regex::new(
138 r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
139 )
140 .expect("valid regex");
141 let leading_number_in_strong =
142 Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
143 .expect("valid regex");
144 let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
145
146 let unwrapped = pattern
147 .replace_all(html, |caps: ®ex::Captures<'_>| {
148 let heading = &caps[1];
149 if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
150 {
151 heading.to_string()
152 } else {
153 caps[0].to_string()
154 }
155 })
156 .into_owned();
157
158 leading_number_in_strong
159 .replace_all(&unwrapped, |caps: ®ex::Captures<'_>| {
160 let open = &caps[1];
161 let number = &caps[2];
162 let inner = &caps[3];
163 format!("{open}{number}<strong>{inner}</strong>")
164 })
165 .into_owned()
166}
167
168fn hoist_images_from_headings(html: &str) -> String {
174 use std::fmt::Write;
175
176 let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
177 let mut result = html.to_string();
178
179 for level in 1..=6 {
180 let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
181 .expect("valid regex");
182
183 result = heading_re
184 .replace_all(&result, |caps: ®ex::Captures<'_>| {
185 let open = &caps[1];
186 let inner = &caps[2];
187 let close = &caps[3];
188
189 let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
190
191 if imgs.is_empty() {
192 return caps[0].to_string();
193 }
194
195 let stripped = img_re.replace_all(inner, "").to_string();
196 let mut out = format!("{open}{stripped}{close}");
197 for img in imgs {
198 write!(out, "\n<p>{img}</p>").expect("write to String");
199 }
200 out
201 })
202 .into_owned();
203 }
204
205 result
206}
207
208fn compute_top_level_ordered_list_item_numbers(html: &str) -> Vec<u32> {
217 let document = Html::parse_document(html);
218 let Ok(ol_selector) = Selector::parse("ol") else {
219 return Vec::new();
220 };
221 let mut numbers = Vec::new();
222 let mut counter: u32 = 1;
223 for ol in document.select(&ol_selector) {
224 let nested = ol.ancestors().any(|n| {
225 n.value()
226 .as_element()
227 .is_some_and(|e| e.name() == "ol" || e.name() == "ul")
228 });
229 if nested {
230 continue;
231 }
232 if let Some(start) = ol
233 .value()
234 .attr("start")
235 .and_then(|s| s.trim().parse::<u32>().ok())
236 {
237 counter = start;
238 }
239 let li_count = ol
240 .children()
241 .filter(|n| n.value().as_element().is_some_and(|e| e.name() == "li"))
242 .count();
243 for _ in 0..li_count {
244 numbers.push(counter);
245 counter = counter.saturating_add(1);
246 }
247 }
248 numbers
249}
250
251fn renumber_top_level_ordered_list_lines(markdown: &str, numbers: &[u32]) -> String {
258 use std::fmt::Write;
259
260 let item_re = Regex::new(r"^(\d+)\.(\s)").expect("valid regex");
261 let setext_re = Regex::new(r"^(=+|-+)\s*$").expect("valid regex");
262 let lines: Vec<&str> = markdown.split_inclusive('\n').collect();
263 let mut out = String::with_capacity(markdown.len());
264 let mut idx: usize = 0;
265
266 for (i, line) in lines.iter().enumerate() {
267 let body = line.strip_suffix('\n').unwrap_or(line);
268 if let Some(caps) = item_re.captures(body) {
269 let next_body = lines
270 .get(i + 1)
271 .map_or("", |l| l.strip_suffix('\n').unwrap_or(l));
272 let is_setext_heading = setext_re.is_match(next_body);
273 if !is_setext_heading {
274 if let Some(&n) = numbers.get(idx) {
275 let after = &body[caps.get(0).expect("match 0").end()..];
276 let sep = caps.get(2).expect("group 2").as_str();
277 write!(out, "{n}.{sep}{after}").expect("write to String");
278 if line.ends_with('\n') {
279 out.push('\n');
280 }
281 idx += 1;
282 continue;
283 }
284 }
285 }
286 out.push_str(line);
287 }
288 out
289}
290
291pub fn clean_markdown(markdown: &str) -> String {
295 debug!("Cleaning Markdown output");
296
297 let mut result = markdown.to_string();
299
300 while result.contains("\n\n\n") {
302 result = result.replace("\n\n\n", "\n\n");
303 }
304
305 result = result.trim().to_string();
307
308 if !result.is_empty() && !result.ends_with('\n') {
310 result.push('\n');
311 }
312
313 result
314}