Skip to main content

web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11/// Convert HTML content to Markdown
12///
13/// This function cleans the HTML (removing scripts, styles, etc.)
14/// and converts it to Markdown format.
15///
16/// # Arguments
17///
18/// * `html` - The HTML content to convert
19/// * `base_url` - Optional base URL for converting relative URLs to absolute
20///
21/// # Returns
22///
23/// The Markdown content as a string
24///
25/// # Errors
26///
27/// Returns an error if conversion fails
28pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29    info!("Converting HTML to Markdown");
30
31    // Convert relative URLs to absolute if base_url is provided
32    let processed_html = base_url.map_or_else(
33        || html.to_string(),
34        |base| convert_relative_urls(html, base),
35    );
36
37    // Parse and clean the HTML
38    let cleaned_html = clean_html(&processed_html);
39
40    // Move <img> elements out of headings so html2md always sees them.
41    // Some html2md versions only emit text children for <h1>..<h6>,
42    // silently dropping inline images.
43    let heading_safe_html = hoist_images_from_headings(&cleaned_html);
44
45    // Convert to Markdown using html2md
46    let markdown = html2md::parse_html(&heading_safe_html);
47
48    // Decode HTML entities to unicode characters
49    let decoded_markdown = crate::html::decode_html_entities(&markdown);
50
51    // Preserve non-breaking spaces as &nbsp; entities for clear marking
52    let normalized_markdown = decoded_markdown.replace('\u{00A0}', "&nbsp;");
53
54    // Clean up the markdown output
55    let cleaned_markdown = clean_markdown(&normalized_markdown);
56
57    info!(
58        "Successfully converted to Markdown ({} bytes)",
59        cleaned_markdown.len()
60    );
61    Ok(cleaned_markdown)
62}
63
64#[must_use]
65pub fn select_html(html: &str, selector_str: &str) -> Option<String> {
66    let selector = Selector::parse(selector_str).ok()?;
67    let document = Html::parse_document(html);
68    document
69        .select(&selector)
70        .next()
71        .map(|element| element.html())
72}
73
74/// Clean HTML content before Markdown conversion
75///
76/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
77fn clean_html(html: &str) -> String {
78    debug!("Cleaning HTML for Markdown conversion");
79
80    let document = Html::parse_document(html);
81
82    // Create a mutable string to build our cleaned HTML
83    let mut cleaned = html.to_string();
84
85    // Remove script tags
86    if let Ok(selector) = Selector::parse("script") {
87        for element in document.select(&selector) {
88            let outer_html = element.html();
89            cleaned = cleaned.replace(&outer_html, "");
90        }
91    }
92
93    // Remove style tags
94    if let Ok(selector) = Selector::parse("style") {
95        for element in document.select(&selector) {
96            let outer_html = element.html();
97            cleaned = cleaned.replace(&outer_html, "");
98        }
99    }
100
101    // Remove noscript tags
102    if let Ok(selector) = Selector::parse("noscript") {
103        for element in document.select(&selector) {
104            let outer_html = element.html();
105            cleaned = cleaned.replace(&outer_html, "");
106        }
107    }
108
109    cleaned
110}
111
112/// Move `<img>` tags out of `<h1>`..`<h6>` elements.
113///
114/// Rewrites `<hN>...<img ...>...text</hN>` →
115/// `<hN>...text</hN>\n<p><img ...></p>` so that any HTML→Markdown
116/// converter sees the images at block level.
117fn hoist_images_from_headings(html: &str) -> String {
118    use std::fmt::Write;
119
120    let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
121    let mut result = html.to_string();
122
123    for level in 1..=6 {
124        let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
125            .expect("valid regex");
126
127        result = heading_re
128            .replace_all(&result, |caps: &regex::Captures<'_>| {
129                let open = &caps[1];
130                let inner = &caps[2];
131                let close = &caps[3];
132
133                let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
134
135                if imgs.is_empty() {
136                    return caps[0].to_string();
137                }
138
139                let stripped = img_re.replace_all(inner, "").to_string();
140                let mut out = format!("{open}{stripped}{close}");
141                for img in imgs {
142                    write!(out, "\n<p>{img}</p>").expect("write to String");
143                }
144                out
145            })
146            .into_owned();
147    }
148
149    result
150}
151
152/// Clean up Markdown output
153///
154/// Removes excessive whitespace and normalizes the output.
155pub fn clean_markdown(markdown: &str) -> String {
156    debug!("Cleaning Markdown output");
157
158    // Remove excessive blank lines (more than 2 consecutive newlines)
159    let mut result = markdown.to_string();
160
161    // Replace multiple consecutive newlines with at most two
162    while result.contains("\n\n\n") {
163        result = result.replace("\n\n\n", "\n\n");
164    }
165
166    // Trim leading and trailing whitespace
167    result = result.trim().to_string();
168
169    // Ensure the document ends with a newline
170    if !result.is_empty() && !result.ends_with('\n') {
171        result.push('\n');
172    }
173
174    result
175}