Skip to main content

web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11/// Convert HTML content to Markdown
12///
13/// This function cleans the HTML (removing scripts, styles, etc.)
14/// and converts it to Markdown format.
15///
16/// # Arguments
17///
18/// * `html` - The HTML content to convert
19/// * `base_url` - Optional base URL for converting relative URLs to absolute
20///
21/// # Returns
22///
23/// The Markdown content as a string
24///
25/// # Errors
26///
27/// Returns an error if conversion fails
28pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29    info!("Converting HTML to Markdown");
30
31    // Convert relative URLs to absolute if base_url is provided
32    let processed_html = base_url.map_or_else(
33        || html.to_string(),
34        |base| convert_relative_urls(html, base),
35    );
36    let processed_html = reorder_visual_layout_elements(&processed_html);
37
38    // Parse and clean the HTML
39    let cleaned_html = clean_html(&processed_html);
40
41    // Preserve hierarchical heading numbering (e.g. "13. Foo", "13.1 Bar").
42    // Unwrap <ol><li><hN>13. Foo</hN></li></ol> -> <hN>13. Foo</hN> so that
43    // html2md does not restart the OL counter at "1." and clobber the source
44    // number that already lives inside the heading text.
45    let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
46
47    // Drop empty `title=""` attributes from `<img>` tags. html2md emits
48    // `![](src "")` for them, which the markdown-side base64 image extractor
49    // mis-parses (its `[^)]+` payload group swallows the trailing ` ""`,
50    // making the decoded base64 invalid).
51    let cleaned_html = strip_empty_img_titles(&cleaned_html);
52
53    // Move <img> elements out of headings so html2md always sees them.
54    // Some html2md versions only emit text children for <h1>..<h6>,
55    // silently dropping inline images.
56    let heading_safe_html = hoist_images_from_headings(&cleaned_html);
57
58    // Compute the number that each top-level <ol> item should carry, in
59    // document order, so we can rewrite html2md's per-list-restarting "1."
60    // prefixes into a single continuous sequence (matching the JS converter's
61    // output) and honour explicit `<ol start="N">` attributes.
62    let ol_item_numbers = compute_top_level_ordered_list_item_numbers(&heading_safe_html);
63
64    // Convert to Markdown using html2md
65    let markdown = html2md::parse_html(&heading_safe_html);
66
67    // Renumber unindented ordered-list lines in the markdown output to match
68    // the precomputed numbers. Indented lines (nested lists) are left alone
69    // so html2md's per-list "1." restart for nested levels is preserved.
70    let markdown = renumber_top_level_ordered_list_lines(&markdown, &ol_item_numbers);
71
72    // Decode HTML entities to unicode characters
73    let decoded_markdown = crate::html::decode_html_entities(&markdown);
74
75    // Preserve non-breaking spaces as &nbsp; entities for clear marking
76    let normalized_markdown = decoded_markdown.replace('\u{00A0}', "&nbsp;");
77
78    // Clean up the markdown output
79    let cleaned_markdown = clean_markdown(&normalized_markdown);
80
81    info!(
82        "Successfully converted to Markdown ({} bytes)",
83        cleaned_markdown.len()
84    );
85    Ok(cleaned_markdown)
86}
87
88#[must_use]
89pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
90    let selector = Selector::parse(selector_str).ok()?;
91    let document = Html::parse_document(html);
92    document
93        .select(&selector)
94        .next()
95        .map(|element| element.html())
96}
97
98/// Clean HTML content before Markdown conversion
99///
100/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
101fn clean_html(html: &str) -> String {
102    debug!("Cleaning HTML for Markdown conversion");
103
104    let document = Html::parse_document(html);
105
106    // Create a mutable string to build our cleaned HTML
107    let mut cleaned = html.to_string();
108
109    // Remove script tags
110    if let Ok(selector) = Selector::parse("script") {
111        for element in document.select(&selector) {
112            let outer_html = element.html();
113            cleaned = cleaned.replace(&outer_html, "");
114        }
115    }
116
117    // Remove style tags
118    if let Ok(selector) = Selector::parse("style") {
119        for element in document.select(&selector) {
120            let outer_html = element.html();
121            cleaned = cleaned.replace(&outer_html, "");
122        }
123    }
124
125    // Remove noscript tags
126    if let Ok(selector) = Selector::parse("noscript") {
127        for element in document.select(&selector) {
128            let outer_html = element.html();
129            cleaned = cleaned.replace(&outer_html, "");
130        }
131    }
132
133    cleaned
134}
135
136fn reorder_visual_layout_elements(html: &str) -> String {
137    fn move_before(html: &str, moving: &Regex, anchor: &Regex) -> String {
138        let Some(moving_match) = moving.find(html) else {
139            return html.to_string();
140        };
141        let Some(anchor_match) = anchor.find(html) else {
142            return html.to_string();
143        };
144        if moving_match.start() < anchor_match.start() {
145            return html.to_string();
146        }
147
148        let moved = moving_match.as_str().to_string();
149        let mut without_moved = html.to_string();
150        without_moved.replace_range(moving_match.range(), "");
151        if let Some(anchor_match) = anchor.find(&without_moved) {
152            without_moved.insert_str(anchor_match.start(), &moved);
153        }
154        without_moved
155    }
156
157    fn move_after(html: &str, moving: &Regex, anchor: &Regex) -> String {
158        let Some(moving_match) = moving.find(html) else {
159            return html.to_string();
160        };
161        let Some(anchor_match) = anchor.find(html) else {
162            return html.to_string();
163        };
164        if moving_match.start() > anchor_match.start() {
165            return html.to_string();
166        }
167
168        let moved = moving_match.as_str().to_string();
169        let mut without_moved = html.to_string();
170        without_moved.replace_range(moving_match.range(), "");
171        if let Some(anchor_match) = anchor.find(&without_moved) {
172            without_moved.insert_str(anchor_match.end(), &moved);
173        }
174        without_moved
175    }
176
177    let header = Regex::new(r"(?is)<header\b[^>]*>.*?</header>").expect("valid regex");
178    let main = Regex::new(r"(?is)<main\b[^>]*>.*?</main>").expect("valid regex");
179    let footer = Regex::new(r"(?is)<footer\b[^>]*>.*?</footer>").expect("valid regex");
180
181    let html = move_before(html, &header, &main);
182    move_after(&html, &footer, &main)
183}
184
185/// Unwrap `<ol><li><hN>...</hN></li></ol>` when the heading text already
186/// carries a leading number (e.g. "13. Foo"), and replace such a list with the
187/// bare heading. Without this, `html2md` restarts ordered-list numbering at
188/// "1." and the document loses the original section number.
189///
190/// Also lifts a leading "13. " out of an inner `<strong>` so html2md emits
191/// `#### 13. Foo` (matchable by the test) rather than `#### **13. Foo**`.
192fn preserve_leading_heading_numbering(html: &str) -> String {
193    let pattern = Regex::new(
194        r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
195    )
196    .expect("valid regex");
197    let leading_number_in_strong =
198        Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
199            .expect("valid regex");
200    let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
201
202    let unwrapped = pattern
203        .replace_all(html, |caps: &regex::Captures<'_>| {
204            let heading = &caps[1];
205            if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
206            {
207                heading.to_string()
208            } else {
209                caps[0].to_string()
210            }
211        })
212        .into_owned();
213
214    leading_number_in_strong
215        .replace_all(&unwrapped, |caps: &regex::Captures<'_>| {
216            let open = &caps[1];
217            let number = &caps[2];
218            let inner = &caps[3];
219            format!("{open}{number}<strong>{inner}</strong>")
220        })
221        .into_owned()
222}
223
224/// Move `<img>` tags out of `<h1>`..`<h6>` elements.
225///
226/// Rewrites `<hN>...<img ...>...text</hN>` →
227/// `<hN>...text</hN>\n<p><img ...></p>` so that any HTML→Markdown
228/// converter sees the images at block level.
229fn hoist_images_from_headings(html: &str) -> String {
230    use std::fmt::Write;
231
232    let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
233    let mut result = html.to_string();
234
235    for level in 1..=6 {
236        let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
237            .expect("valid regex");
238
239        result = heading_re
240            .replace_all(&result, |caps: &regex::Captures<'_>| {
241                let open = &caps[1];
242                let inner = &caps[2];
243                let close = &caps[3];
244
245                let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
246
247                if imgs.is_empty() {
248                    return caps[0].to_string();
249                }
250
251                let stripped = img_re.replace_all(inner, "").to_string();
252                let mut out = format!("{open}{stripped}{close}");
253                for img in imgs {
254                    write!(out, "\n<p>{img}</p>").expect("write to String");
255                }
256                out
257            })
258            .into_owned();
259    }
260
261    result
262}
263
264/// Remove `title=""` (case-insensitive, single or double quoted, with any
265/// surrounding whitespace) from `<img>` tags.
266///
267/// Background: Google Docs HTML exports emit `<img title="" alt="" src="...">`.
268/// The `html2md` crate renders that as `![](src "")` — a syntactically valid
269/// markdown image, but the empty title attribute then trips up the markdown-side
270/// base64 image extractor, whose `[^)]+` payload group greedily swallows the
271/// trailing ` ""`, leaving an invalid base64 string that fails to decode.
272///
273/// Stripping the attribute here keeps the rendered markdown clean *and* lets
274/// any downstream tool that parses the markdown image syntax do the right
275/// thing. Non-empty titles are left untouched.
276fn strip_empty_img_titles(html: &str) -> String {
277    let img_re = Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
278    let empty_title_re = Regex::new(r#"(?i)\s+title\s*=\s*(?:""|'')"#).expect("valid regex");
279    img_re
280        .replace_all(html, |caps: &regex::Captures<'_>| {
281            let tag = caps.get(0).expect("match 0").as_str();
282            empty_title_re.replace_all(tag, "").into_owned()
283        })
284        .into_owned()
285}
286
287/// Walk every top-level `<ol>` in document order and return the number that
288/// each direct `<li>` child should carry. Without an explicit `start="N"`,
289/// lists continue the running counter from the previous list (e.g. 1, 2 then
290/// 3, 4 across two consecutive `<ol>`s). With `start="N"`, the counter resets
291/// to `N` for that list and subsequent lists continue from there.
292///
293/// Top-level here means "not nested inside another `<ol>` or `<ul>`" — nested
294/// lists keep their own per-list numbering, matching `html2md`'s default.
295fn compute_top_level_ordered_list_item_numbers(html: &str) -> Vec<u32> {
296    let document = Html::parse_document(html);
297    let Ok(ol_selector) = Selector::parse("ol") else {
298        return Vec::new();
299    };
300    let mut numbers = Vec::new();
301    let mut counter: u32 = 1;
302    for ol in document.select(&ol_selector) {
303        let nested = ol.ancestors().any(|n| {
304            n.value()
305                .as_element()
306                .is_some_and(|e| e.name() == "ol" || e.name() == "ul")
307        });
308        if nested {
309            continue;
310        }
311        if let Some(start) = ol
312            .value()
313            .attr("start")
314            .and_then(|s| s.trim().parse::<u32>().ok())
315        {
316            counter = start;
317        }
318        let li_count = ol
319            .children()
320            .filter(|n| n.value().as_element().is_some_and(|e| e.name() == "li"))
321            .count();
322        for _ in 0..li_count {
323            numbers.push(counter);
324            counter = counter.saturating_add(1);
325        }
326    }
327    numbers
328}
329
330/// Replace the prefix of each unindented `^\d+\.\s` line in `markdown` with
331/// the next number from `numbers`, in document order. Indented lines (nested
332/// list items) are skipped so `html2md`'s per-list "1." restart for nested
333/// levels is preserved. Setext heading underlines (`====` / `----`) are
334/// detected so a `1. Headings` line followed by an underline is not treated
335/// as a list item. Lines beyond the end of `numbers` are left untouched.
336fn renumber_top_level_ordered_list_lines(markdown: &str, numbers: &[u32]) -> String {
337    use std::fmt::Write;
338
339    let item_re = Regex::new(r"^(\d+)\.(\s)").expect("valid regex");
340    let setext_re = Regex::new(r"^(=+|-+)\s*$").expect("valid regex");
341    let lines: Vec<&str> = markdown.split_inclusive('\n').collect();
342    let mut out = String::with_capacity(markdown.len());
343    let mut idx: usize = 0;
344
345    for (i, line) in lines.iter().enumerate() {
346        let body = line.strip_suffix('\n').unwrap_or(line);
347        if let Some(caps) = item_re.captures(body) {
348            let next_body = lines
349                .get(i + 1)
350                .map_or("", |l| l.strip_suffix('\n').unwrap_or(l));
351            let is_setext_heading = setext_re.is_match(next_body);
352            if !is_setext_heading {
353                if let Some(&n) = numbers.get(idx) {
354                    let after = &body[caps.get(0).expect("match 0").end()..];
355                    let sep = caps.get(2).expect("group 2").as_str();
356                    write!(out, "{n}.{sep}{after}").expect("write to String");
357                    if line.ends_with('\n') {
358                        out.push('\n');
359                    }
360                    idx += 1;
361                    continue;
362                }
363            }
364        }
365        out.push_str(line);
366    }
367    out
368}
369
370/// Clean up Markdown output
371///
372/// Removes excessive whitespace and normalizes the output.
373pub fn clean_markdown(markdown: &str) -> String {
374    debug!("Cleaning Markdown output");
375
376    // Remove excessive blank lines (more than 2 consecutive newlines)
377    let mut result = markdown.to_string();
378
379    // Replace multiple consecutive newlines with at most two
380    while result.contains("\n\n\n") {
381        result = result.replace("\n\n\n", "\n\n");
382    }
383
384    // Trim leading and trailing whitespace
385    result = result.trim().to_string();
386
387    // Ensure the document ends with a newline
388    if !result.is_empty() && !result.ends_with('\n') {
389        result.push('\n');
390    }
391
392    result
393}