Skip to main content

web_capture/
markdown.rs

1//! Markdown conversion module
2//!
3//! This module provides functions for converting HTML to Markdown format.
4
5use crate::html::convert_relative_urls;
6use crate::Result;
7use scraper::{Html, Selector};
8use tracing::{debug, info};
9
10/// Convert HTML content to Markdown
11///
12/// This function cleans the HTML (removing scripts, styles, etc.)
13/// and converts it to Markdown format.
14///
15/// # Arguments
16///
17/// * `html` - The HTML content to convert
18/// * `base_url` - Optional base URL for converting relative URLs to absolute
19///
20/// # Returns
21///
22/// The Markdown content as a string
23///
24/// # Errors
25///
26/// Returns an error if conversion fails
27pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
28    info!("Converting HTML to Markdown");
29
30    // Convert relative URLs to absolute if base_url is provided
31    let processed_html = base_url.map_or_else(
32        || html.to_string(),
33        |base| convert_relative_urls(html, base),
34    );
35
36    // Parse and clean the HTML
37    let cleaned_html = clean_html(&processed_html);
38
39    // Convert to Markdown using html2md
40    let markdown = html2md::parse_html(&cleaned_html);
41
42    // Decode HTML entities to unicode characters
43    let decoded_markdown = crate::html::decode_html_entities(&markdown);
44
45    // Preserve non-breaking spaces as &nbsp; entities for clear marking
46    let normalized_markdown = decoded_markdown.replace('\u{00A0}', "&nbsp;");
47
48    // Clean up the markdown output
49    let cleaned_markdown = clean_markdown(&normalized_markdown);
50
51    info!(
52        "Successfully converted to Markdown ({} bytes)",
53        cleaned_markdown.len()
54    );
55    Ok(cleaned_markdown)
56}
57
58/// Clean HTML content before Markdown conversion
59///
60/// Removes scripts, styles, and other elements that shouldn't be in Markdown.
61fn clean_html(html: &str) -> String {
62    debug!("Cleaning HTML for Markdown conversion");
63
64    let document = Html::parse_document(html);
65
66    // Create a mutable string to build our cleaned HTML
67    let mut cleaned = html.to_string();
68
69    // Remove script tags
70    if let Ok(selector) = Selector::parse("script") {
71        for element in document.select(&selector) {
72            let outer_html = element.html();
73            cleaned = cleaned.replace(&outer_html, "");
74        }
75    }
76
77    // Remove style tags
78    if let Ok(selector) = Selector::parse("style") {
79        for element in document.select(&selector) {
80            let outer_html = element.html();
81            cleaned = cleaned.replace(&outer_html, "");
82        }
83    }
84
85    // Remove noscript tags
86    if let Ok(selector) = Selector::parse("noscript") {
87        for element in document.select(&selector) {
88            let outer_html = element.html();
89            cleaned = cleaned.replace(&outer_html, "");
90        }
91    }
92
93    cleaned
94}
95
/// Normalize the whitespace of generated Markdown.
///
/// * Any run of three or more consecutive `\n` characters is shortened to
///   exactly two, i.e. at most one blank line between blocks. Only `\n` is
///   counted; other characters (including `\r` and spaces) end a run.
/// * Leading and trailing whitespace is trimmed.
/// * Non-empty output is terminated with a single `\n`; input that is empty
///   or whitespace-only yields an empty string.
pub fn clean_markdown(markdown: &str) -> String {
    debug!("Cleaning Markdown output");

    // Single pass: copy every character, dropping the third and later
    // newline of each consecutive run.
    let mut collapsed = String::with_capacity(markdown.len());
    let mut newline_run = 0usize;
    for ch in markdown.chars() {
        if ch == '\n' {
            newline_run += 1;
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        collapsed.push(ch);
    }

    let body = collapsed.trim();
    if body.is_empty() {
        return String::new();
    }

    // The trimmed text cannot end in a newline, so exactly one is appended.
    let mut output = String::with_capacity(body.len() + 1);
    output.push_str(body);
    output.push('\n');
    output
}