web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11/// Convert HTML content to Markdown
12///
13/// This function cleans the HTML (removing scripts, styles, etc.)
14/// and converts it to Markdown format.
15///
16/// # Arguments
17///
18/// * `html` - The HTML content to convert
19/// * `base_url` - Optional base URL for converting relative URLs to absolute
20///
21/// # Returns
22///
23/// The Markdown content as a string
24///
25/// # Errors
26///
27/// Returns an error if conversion fails
28pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29    info!("Converting HTML to Markdown");
30
31    // Convert relative URLs to absolute if base_url is provided
32    let processed_html = base_url.map_or_else(
33        || html.to_string(),
34        |base| convert_relative_urls(html, base),
35    );
36
37    // Parse and clean the HTML
38    let cleaned_html = clean_html(&processed_html);
39
40    // Preserve hierarchical heading numbering (e.g. "13. Foo", "13.1 Bar").
41    // Unwrap <ol><li><hN>13. Foo</hN></li></ol> -> <hN>13. Foo</hN> so that
42    // html2md does not restart the OL counter at "1." and clobber the source
43    // number that already lives inside the heading text.
44    let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
45
46    // Move <img> elements out of headings so html2md always sees them.
47    // Some html2md versions only emit text children for <h1>..<h6>,
48    // silently dropping inline images.
49    let heading_safe_html = hoist_images_from_headings(&cleaned_html);
50
51    // Compute the number that each top-level <ol> item should carry, in
52    // document order, so we can rewrite html2md's per-list-restarting "1."
53    // prefixes into a single continuous sequence (matching the JS converter's
54    // output) and honour explicit `<ol start="N">` attributes.
55    let ol_item_numbers = compute_top_level_ordered_list_item_numbers(&heading_safe_html);
56
57    // Convert to Markdown using html2md
58    let markdown = html2md::parse_html(&heading_safe_html);
59
60    // Renumber unindented ordered-list lines in the markdown output to match
61    // the precomputed numbers. Indented lines (nested lists) are left alone
62    // so html2md's per-list "1." restart for nested levels is preserved.
63    let markdown = renumber_top_level_ordered_list_lines(&markdown, &ol_item_numbers);
64
65    // Decode HTML entities to unicode characters
66    let decoded_markdown = crate::html::decode_html_entities(&markdown);
67
68    // Preserve non-breaking spaces as &nbsp; entities for clear marking
69    let normalized_markdown = decoded_markdown.replace('\u{00A0}', "&nbsp;");
70
71    // Clean up the markdown output
72    let cleaned_markdown = clean_markdown(&normalized_markdown);
73
74    info!(
75        "Successfully converted to Markdown ({} bytes)",
76        cleaned_markdown.len()
77    );
78    Ok(cleaned_markdown)
79}
80
81#[must_use]
82pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
83    let selector = Selector::parse(selector_str).ok()?;
84    let document = Html::parse_document(html);
85    document
86        .select(&selector)
87        .next()
88        .map(|element| element.html())
89}
90
91/// Clean HTML content before Markdown conversion
92///
93/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
94fn clean_html(html: &str) -> String {
95    debug!("Cleaning HTML for Markdown conversion");
96
97    let document = Html::parse_document(html);
98
99    // Create a mutable string to build our cleaned HTML
100    let mut cleaned = html.to_string();
101
102    // Remove script tags
103    if let Ok(selector) = Selector::parse("script") {
104        for element in document.select(&selector) {
105            let outer_html = element.html();
106            cleaned = cleaned.replace(&outer_html, "");
107        }
108    }
109
110    // Remove style tags
111    if let Ok(selector) = Selector::parse("style") {
112        for element in document.select(&selector) {
113            let outer_html = element.html();
114            cleaned = cleaned.replace(&outer_html, "");
115        }
116    }
117
118    // Remove noscript tags
119    if let Ok(selector) = Selector::parse("noscript") {
120        for element in document.select(&selector) {
121            let outer_html = element.html();
122            cleaned = cleaned.replace(&outer_html, "");
123        }
124    }
125
126    cleaned
127}
128
129/// Unwrap `<ol><li><hN>...</hN></li></ol>` when the heading text already
130/// carries a leading number (e.g. "13. Foo"), and replace such a list with the
131/// bare heading. Without this, `html2md` restarts ordered-list numbering at
132/// "1." and the document loses the original section number.
133///
134/// Also lifts a leading "13. " out of an inner `<strong>` so html2md emits
135/// `#### 13. Foo` (matchable by the test) rather than `#### **13. Foo**`.
136fn preserve_leading_heading_numbering(html: &str) -> String {
137    let pattern = Regex::new(
138        r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
139    )
140    .expect("valid regex");
141    let leading_number_in_strong =
142        Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
143            .expect("valid regex");
144    let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
145
146    let unwrapped = pattern
147        .replace_all(html, |caps: &regex::Captures<'_>| {
148            let heading = &caps[1];
149            if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
150            {
151                heading.to_string()
152            } else {
153                caps[0].to_string()
154            }
155        })
156        .into_owned();
157
158    leading_number_in_strong
159        .replace_all(&unwrapped, |caps: &regex::Captures<'_>| {
160            let open = &caps[1];
161            let number = &caps[2];
162            let inner = &caps[3];
163            format!("{open}{number}<strong>{inner}</strong>")
164        })
165        .into_owned()
166}
167
168/// Move `<img>` tags out of `<h1>`..`<h6>` elements.
169///
170/// Rewrites `<hN>...<img ...>...text</hN>` →
171/// `<hN>...text</hN>\n<p><img ...></p>` so that any HTML→Markdown
172/// converter sees the images at block level.
173fn hoist_images_from_headings(html: &str) -> String {
174    use std::fmt::Write;
175
176    let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
177    let mut result = html.to_string();
178
179    for level in 1..=6 {
180        let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
181            .expect("valid regex");
182
183        result = heading_re
184            .replace_all(&result, |caps: &regex::Captures<'_>| {
185                let open = &caps[1];
186                let inner = &caps[2];
187                let close = &caps[3];
188
189                let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
190
191                if imgs.is_empty() {
192                    return caps[0].to_string();
193                }
194
195                let stripped = img_re.replace_all(inner, "").to_string();
196                let mut out = format!("{open}{stripped}{close}");
197                for img in imgs {
198                    write!(out, "\n<p>{img}</p>").expect("write to String");
199                }
200                out
201            })
202            .into_owned();
203    }
204
205    result
206}
207
208/// Walk every top-level `<ol>` in document order and return the number that
209/// each direct `<li>` child should carry. Without an explicit `start="N"`,
210/// lists continue the running counter from the previous list (e.g. 1, 2 then
211/// 3, 4 across two consecutive `<ol>`s). With `start="N"`, the counter resets
212/// to `N` for that list and subsequent lists continue from there.
213///
214/// Top-level here means "not nested inside another `<ol>` or `<ul>`" — nested
215/// lists keep their own per-list numbering, matching `html2md`'s default.
216fn compute_top_level_ordered_list_item_numbers(html: &str) -> Vec<u32> {
217    let document = Html::parse_document(html);
218    let Ok(ol_selector) = Selector::parse("ol") else {
219        return Vec::new();
220    };
221    let mut numbers = Vec::new();
222    let mut counter: u32 = 1;
223    for ol in document.select(&ol_selector) {
224        let nested = ol.ancestors().any(|n| {
225            n.value()
226                .as_element()
227                .is_some_and(|e| e.name() == "ol" || e.name() == "ul")
228        });
229        if nested {
230            continue;
231        }
232        if let Some(start) = ol
233            .value()
234            .attr("start")
235            .and_then(|s| s.trim().parse::<u32>().ok())
236        {
237            counter = start;
238        }
239        let li_count = ol
240            .children()
241            .filter(|n| n.value().as_element().is_some_and(|e| e.name() == "li"))
242            .count();
243        for _ in 0..li_count {
244            numbers.push(counter);
245            counter = counter.saturating_add(1);
246        }
247    }
248    numbers
249}
250
251/// Replace the prefix of each unindented `^\d+\.\s` line in `markdown` with
252/// the next number from `numbers`, in document order. Indented lines (nested
253/// list items) are skipped so `html2md`'s per-list "1." restart for nested
254/// levels is preserved. Setext heading underlines (`====` / `----`) are
255/// detected so a `1. Headings` line followed by an underline is not treated
256/// as a list item. Lines beyond the end of `numbers` are left untouched.
257fn renumber_top_level_ordered_list_lines(markdown: &str, numbers: &[u32]) -> String {
258    use std::fmt::Write;
259
260    let item_re = Regex::new(r"^(\d+)\.(\s)").expect("valid regex");
261    let setext_re = Regex::new(r"^(=+|-+)\s*$").expect("valid regex");
262    let lines: Vec<&str> = markdown.split_inclusive('\n').collect();
263    let mut out = String::with_capacity(markdown.len());
264    let mut idx: usize = 0;
265
266    for (i, line) in lines.iter().enumerate() {
267        let body = line.strip_suffix('\n').unwrap_or(line);
268        if let Some(caps) = item_re.captures(body) {
269            let next_body = lines
270                .get(i + 1)
271                .map_or("", |l| l.strip_suffix('\n').unwrap_or(l));
272            let is_setext_heading = setext_re.is_match(next_body);
273            if !is_setext_heading {
274                if let Some(&n) = numbers.get(idx) {
275                    let after = &body[caps.get(0).expect("match 0").end()..];
276                    let sep = caps.get(2).expect("group 2").as_str();
277                    write!(out, "{n}.{sep}{after}").expect("write to String");
278                    if line.ends_with('\n') {
279                        out.push('\n');
280                    }
281                    idx += 1;
282                    continue;
283                }
284            }
285        }
286        out.push_str(line);
287    }
288    out
289}
290
291/// Clean up Markdown output
292///
293/// Removes excessive whitespace and normalizes the output.
294pub fn clean_markdown(markdown: &str) -> String {
295    debug!("Cleaning Markdown output");
296
297    // Remove excessive blank lines (more than 2 consecutive newlines)
298    let mut result = markdown.to_string();
299
300    // Replace multiple consecutive newlines with at most two
301    while result.contains("\n\n\n") {
302        result = result.replace("\n\n\n", "\n\n");
303    }
304
305    // Trim leading and trailing whitespace
306    result = result.trim().to_string();
307
308    // Ensure the document ends with a newline
309    if !result.is_empty() && !result.ends_with('\n') {
310        result.push('\n');
311    }
312
313    result
314}
web_capture/markdown.rs

web_capture/
markdown.rs