Skip to main content

web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use regex::Regex;
8use scraper::{Html, Selector};
9use tracing::{debug, info};
10
11/// Convert HTML content to Markdown
12///
13/// This function cleans the HTML (removing scripts, styles, etc.)
14/// and converts it to Markdown format.
15///
16/// # Arguments
17///
18/// * `html` - The HTML content to convert
19/// * `base_url` - Optional base URL for converting relative URLs to absolute
20///
21/// # Returns
22///
23/// The Markdown content as a string
24///
25/// # Errors
26///
27/// Returns an error if conversion fails
28pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
29    info!("Converting HTML to Markdown");
30
31    // Convert relative URLs to absolute if base_url is provided
32    let processed_html = base_url.map_or_else(
33        || html.to_string(),
34        |base| convert_relative_urls(html, base),
35    );
36
37    // Parse and clean the HTML
38    let cleaned_html = clean_html(&processed_html);
39
40    // Move <img> elements out of headings so html2md always sees them.
41    // Some html2md versions only emit text children for <h1>..<h6>,
42    // silently dropping inline images.
43    let heading_safe_html = hoist_images_from_headings(&cleaned_html);
44
45    // Convert to Markdown using html2md
46    let markdown = html2md::parse_html(&heading_safe_html);
47
48    // Decode HTML entities to unicode characters
49    let decoded_markdown = crate::html::decode_html_entities(&markdown);
50
51    // Preserve non-breaking spaces as &nbsp; entities for clear marking
52    let normalized_markdown = decoded_markdown.replace('\u{00A0}', "&nbsp;");
53
54    // Clean up the markdown output
55    let cleaned_markdown = clean_markdown(&normalized_markdown);
56
57    info!(
58        "Successfully converted to Markdown ({} bytes)",
59        cleaned_markdown.len()
60    );
61    Ok(cleaned_markdown)
62}
63
64/// Clean HTML content before Markdown conversion
65///
66/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
67fn clean_html(html: &str) -> String {
68    debug!("Cleaning HTML for Markdown conversion");
69
70    let document = Html::parse_document(html);
71
72    // Create a mutable string to build our cleaned HTML
73    let mut cleaned = html.to_string();
74
75    // Remove script tags
76    if let Ok(selector) = Selector::parse("script") {
77        for element in document.select(&selector) {
78            let outer_html = element.html();
79            cleaned = cleaned.replace(&outer_html, "");
80        }
81    }
82
83    // Remove style tags
84    if let Ok(selector) = Selector::parse("style") {
85        for element in document.select(&selector) {
86            let outer_html = element.html();
87            cleaned = cleaned.replace(&outer_html, "");
88        }
89    }
90
91    // Remove noscript tags
92    if let Ok(selector) = Selector::parse("noscript") {
93        for element in document.select(&selector) {
94            let outer_html = element.html();
95            cleaned = cleaned.replace(&outer_html, "");
96        }
97    }
98
99    cleaned
100}
101
102/// Move `<img>` tags out of `<h1>`..`<h6>` elements.
103///
104/// Rewrites `<hN>...<img ...>...text</hN>` →
105/// `<hN>...text</hN>\n<p><img ...></p>` so that any HTML→Markdown
106/// converter sees the images at block level.
107fn hoist_images_from_headings(html: &str) -> String {
108    use std::fmt::Write;
109
110    let img_re = Regex::new(r"<img\s[^>]*>").expect("valid regex");
111    let mut result = html.to_string();
112
113    for level in 1..=6 {
114        let heading_re = Regex::new(&format!(r"(?si)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
115            .expect("valid regex");
116
117        result = heading_re
118            .replace_all(&result, |caps: &regex::Captures<'_>| {
119                let open = &caps[1];
120                let inner = &caps[2];
121                let close = &caps[3];
122
123                let imgs: Vec<&str> = img_re.find_iter(inner).map(|m| m.as_str()).collect();
124
125                if imgs.is_empty() {
126                    return caps[0].to_string();
127                }
128
129                let stripped = img_re.replace_all(inner, "").to_string();
130                let mut out = format!("{open}{stripped}{close}");
131                for img in imgs {
132                    write!(out, "\n<p>{img}</p>").expect("write to String");
133                }
134                out
135            })
136            .into_owned();
137    }
138
139    result
140}
141
142/// Clean up Markdown output
143///
144/// Removes excessive whitespace and normalizes the output.
145pub fn clean_markdown(markdown: &str) -> String {
146    debug!("Cleaning Markdown output");
147
148    // Remove excessive blank lines (more than 2 consecutive newlines)
149    let mut result = markdown.to_string();
150
151    // Replace multiple consecutive newlines with at most two
152    while result.contains("\n\n\n") {
153        result = result.replace("\n\n\n", "\n\n");
154    }
155
156    // Trim leading and trailing whitespace
157    result = result.trim().to_string();
158
159    // Ensure the document ends with a newline
160    if !result.is_empty() && !result.ends_with('\n') {
161        result.push('\n');
162    }
163
164    result
165}