Skip to main content

web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use scraper::{Html, Selector};
8use tracing::{debug, info};
9
10/// Convert HTML content to Markdown
11///
12/// This function cleans the HTML (removing scripts, styles, etc.)
13/// and converts it to Markdown format.
14///
15/// # Arguments
16///
17/// * `html` - The HTML content to convert
18/// * `base_url` - Optional base URL for converting relative URLs to absolute
19///
20/// # Returns
21///
22/// The Markdown content as a string
23///
24/// # Errors
25///
26/// Returns an error if conversion fails
27pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
28    info!("Converting HTML to Markdown");
29
30    // Convert relative URLs to absolute if base_url is provided
31    let processed_html = base_url.map_or_else(
32        || html.to_string(),
33        |base| convert_relative_urls(html, base),
34    );
35
36    // Parse and clean the HTML
37    let cleaned_html = clean_html(&processed_html);
38
39    // Convert to Markdown using html2md
40    let markdown = html2md::parse_html(&cleaned_html);
41
42    // Decode HTML entities to unicode characters
43    let decoded_markdown = crate::html::decode_html_entities(&markdown);
44
45    // Clean up the markdown output
46    let cleaned_markdown = clean_markdown(&decoded_markdown);
47
48    info!(
49        "Successfully converted to Markdown ({} bytes)",
50        cleaned_markdown.len()
51    );
52    Ok(cleaned_markdown)
53}
54
55/// Clean HTML content before Markdown conversion
56///
57/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
58fn clean_html(html: &str) -> String {
59    debug!("Cleaning HTML for Markdown conversion");
60
61    let document = Html::parse_document(html);
62
63    // Create a mutable string to build our cleaned HTML
64    let mut cleaned = html.to_string();
65
66    // Remove script tags
67    if let Ok(selector) = Selector::parse("script") {
68        for element in document.select(&selector) {
69            let outer_html = element.html();
70            cleaned = cleaned.replace(&outer_html, "");
71        }
72    }
73
74    // Remove style tags
75    if let Ok(selector) = Selector::parse("style") {
76        for element in document.select(&selector) {
77            let outer_html = element.html();
78            cleaned = cleaned.replace(&outer_html, "");
79        }
80    }
81
82    // Remove noscript tags
83    if let Ok(selector) = Selector::parse("noscript") {
84        for element in document.select(&selector) {
85            let outer_html = element.html();
86            cleaned = cleaned.replace(&outer_html, "");
87        }
88    }
89
90    cleaned
91}
92
93/// Clean up Markdown output
94///
95/// Removes excessive whitespace and normalizes the output.
96pub fn clean_markdown(markdown: &str) -> String {
97    debug!("Cleaning Markdown output");
98
99    // Remove excessive blank lines (more than 2 consecutive newlines)
100    let mut result = markdown.to_string();
101
102    // Replace multiple consecutive newlines with at most two
103    while result.contains("\n\n\n") {
104        result = result.replace("\n\n\n", "\n\n");
105    }
106
107    // Trim leading and trailing whitespace
108    result = result.trim().to_string();
109
110    // Ensure the document ends with a newline
111    if !result.is_empty() && !result.ends_with('\n') {
112        result.push('\n');
113    }
114
115    result
116}