Skip to main content

web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11/// Convert HTML content to Markdown
12///
13/// This function cleans the HTML (removing scripts, styles, etc.)
14/// and converts it to Markdown format.
15///
16/// # Arguments
17///
18/// * `html` - The HTML content to convert
19/// * `base_url` - Optional base URL for converting relative URLs to absolute
20///
21/// # Returns
22///
23/// The Markdown content as a string
24///
25/// # Errors
26///
27/// Returns an error if conversion fails
28pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29    info!("Converting HTML to Markdown");
30
31    // Convert relative URLs to absolute if base_url is provided
32    let processed_html = base_url.map_or_else(
33        || html.to_string(),
34        |base| convert_relative_urls(html, base),
35    );
36
37    // Parse and clean the HTML
38    let cleaned_html = clean_html(&processed_html);
39
40    // Preserve hierarchical heading numbering (e.g. "13. Foo", "13.1 Bar").
41    // Unwrap <ol><li><hN>13. Foo</hN></li></ol> -> <hN>13. Foo</hN> so that
42    // html2md does not restart the OL counter at "1." and clobber the source
43    // number that already lives inside the heading text.
44    let cleaned_html = preserve_leading_heading_numbering(&cleaned_html);
45
46    // Move <img> elements out of headings so html2md always sees them.
47    // Some html2md versions only emit text children for <h1>..<h6>,
48    // silently dropping inline images.
49    let heading_safe_html = hoist_images_from_headings(&cleaned_html);
50
51    // Convert to Markdown using html2md
52    let markdown = html2md::parse_html(&heading_safe_html);
53
54    // Decode HTML entities to unicode characters
55    let decoded_markdown = crate::html::decode_html_entities(&markdown);
56
57    // Preserve non-breaking spaces as &nbsp; entities for clear marking
58    let normalized_markdown = decoded_markdown.replace('\u{00A0}', "&nbsp;");
59
60    // Clean up the markdown output
61    let cleaned_markdown = clean_markdown(&normalized_markdown);
62
63    info!(
64        "Successfully converted to Markdown ({} bytes)",
65        cleaned_markdown.len()
66    );
67    Ok(cleaned_markdown)
68}
69
70#[must_use]
71pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
72    let selector = Selector::parse(selector_str).ok()?;
73    let document = Html::parse_document(html);
74    document
75        .select(&selector)
76        .next()
77        .map(|element| element.html())
78}
79
80/// Clean HTML content before Markdown conversion
81///
82/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
83fn clean_html(html: &str) -> String {
84    debug!("Cleaning HTML for Markdown conversion");
85
86    let document = Html::parse_document(html);
87
88    // Create a mutable string to build our cleaned HTML
89    let mut cleaned = html.to_string();
90
91    // Remove script tags
92    if let Ok(selector) = Selector::parse("script") {
93        for element in document.select(&selector) {
94            let outer_html = element.html();
95            cleaned = cleaned.replace(&outer_html, "");
96        }
97    }
98
99    // Remove style tags
100    if let Ok(selector) = Selector::parse("style") {
101        for element in document.select(&selector) {
102            let outer_html = element.html();
103            cleaned = cleaned.replace(&outer_html, "");
104        }
105    }
106
107    // Remove noscript tags
108    if let Ok(selector) = Selector::parse("noscript") {
109        for element in document.select(&selector) {
110            let outer_html = element.html();
111            cleaned = cleaned.replace(&outer_html, "");
112        }
113    }
114
115    cleaned
116}
117
118/// Unwrap `<ol><li><hN>...</hN></li></ol>` when the heading text already
119/// carries a leading number (e.g. "13. Foo"), and replace such a list with the
120/// bare heading. Without this, `html2md` restarts ordered-list numbering at
121/// "1." and the document loses the original section number.
122///
123/// Also lifts a leading "13. " out of an inner `<strong>` so html2md emits
124/// `#### 13. Foo` (matchable by the test) rather than `#### **13. Foo**`.
125fn preserve_leading_heading_numbering(html: &str) -> String {
126    let pattern = Regex::new(
127        r"(?is)<ol\b[^>]*>\s*<li\b[^>]*>\s*(<h[1-6]\b[^>]*>(?:.*?)</h[1-6]>)\s*</li>\s*</ol>",
128    )
129    .expect("valid regex");
130    let leading_number_in_strong =
131        Regex::new(r"(?is)(<h[1-6]\b[^>]*>)\s*<strong\b[^>]*>\s*(\d+\.\s+)([\s\S]*?)</strong>")
132            .expect("valid regex");
133    let leading_number_plain = Regex::new(r"(?is)<h[1-6]\b[^>]*>\s*\d+\.\s").expect("valid regex");
134
135    let unwrapped = pattern
136        .replace_all(html, |caps: &regex::Captures<'_>| {
137            let heading = &caps[1];
138            if leading_number_in_strong.is_match(heading) || leading_number_plain.is_match(heading)
139            {
140                heading.to_string()
141            } else {
142                caps[0].to_string()
143            }
144        })
145        .into_owned();
146
147    leading_number_in_strong
148        .replace_all(&unwrapped, |caps: &regex::Captures<'_>| {
149            let open = &caps[1];
150            let number = &caps[2];
151            let inner = &caps[3];
152            format!("{open}{number}<strong>{inner}</strong>")
153        })
154        .into_owned()
155}
156
157/// Move `<img>` tags out of `<h1>`..`<h6>` elements.
158///
159/// Rewrites `<hN>...<img ...>...text</hN>` →
160/// `<hN>...text</hN>\n<p><img ...></p>` so that any HTML→Markdown
161/// converter sees the images at block level.
162fn hoist_images_from_headings(html: &str) -> String {
163    use std::fmt::Write;
164
165    let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
166    let mut result = html.to_string();
167
168    for level in 1..=6 {
169        let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
170            .expect("valid regex");
171
172        result = heading_re
173            .replace_all(&result, |caps: &regex::Captures<'_>| {
174                let open = &caps[1];
175                let inner = &caps[2];
176                let close = &caps[3];
177
178                let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
179
180                if imgs.is_empty() {
181                    return caps[0].to_string();
182                }
183
184                let stripped = img_re.replace_all(inner, "").to_string();
185                let mut out = format!("{open}{stripped}{close}");
186                for img in imgs {
187                    write!(out, "\n<p>{img}</p>").expect("write to String");
188                }
189                out
190            })
191            .into_owned();
192    }
193
194    result
195}
196
197/// Clean up Markdown output
198///
199/// Removes excessive whitespace and normalizes the output.
200pub fn clean_markdown(markdown: &str) -> String {
201    debug!("Cleaning Markdown output");
202
203    // Remove excessive blank lines (more than 2 consecutive newlines)
204    let mut result = markdown.to_string();
205
206    // Replace multiple consecutive newlines with at most two
207    while result.contains("\n\n\n") {
208        result = result.replace("\n\n\n", "\n\n");
209    }
210
211    // Trim leading and trailing whitespace
212    result = result.trim().to_string();
213
214    // Ensure the document ends with a newline
215    if !result.is_empty() && !result.ends_with('\n') {
216        result.push('\n');
217    }
218
219    result
220}