Skip to main content

web_capture/
html.rs

1//! HTML processing module
2//!
3//! This module provides functions for fetching, parsing, and processing HTML content.
4
5use crate::{Result, WebCaptureError};
6use regex::Regex;
7use tracing::{debug, info};
8use url::Url;
9
/// Browser-like `User-Agent` header value used when fetching pages over HTTP.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
12
13/// Fetch HTML content from a URL
14///
15/// This function makes a simple HTTP GET request to fetch the HTML content.
16///
17/// # Arguments
18///
19/// * `url` - The URL to fetch
20///
21/// # Returns
22///
23/// The HTML content as a string
24///
25/// # Errors
26///
27/// Returns an error if the fetch fails or the response cannot be decoded
28pub async fn fetch_html(url: &str) -> Result<String> {
29    info!("Fetching HTML from URL: {}", url);
30
31    if crate::stackoverflow::is_stackoverflow_question_url(url) {
32        return crate::stackoverflow::fetch_stackoverflow_html(url).await;
33    }
34
35    let client = reqwest::Client::builder()
36        .user_agent(USER_AGENT)
37        .build()
38        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
39
40    let response = client
41        .get(url)
42        .header("Accept-Language", "en-US,en;q=0.9")
43        .header("Accept-Charset", "utf-8")
44        .send()
45        .await
46        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
47
48    let html = response
49        .text()
50        .await
51        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
52
53    info!("Successfully fetched HTML ({} bytes)", html.len());
54    Ok(html)
55}
56
57/// Convert relative URLs to absolute URLs in HTML content
58///
59/// Processes various HTML attributes that contain URLs and converts
60/// relative URLs to absolute URLs using the provided base URL.
61///
62/// # Arguments
63///
64/// * `html` - The HTML content to process
65/// * `base_url` - The base URL to use for resolving relative URLs
66///
67/// # Returns
68///
69/// The HTML content with absolute URLs
70pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
71    debug!(
72        "Converting relative URLs to absolute using base: {}",
73        base_url
74    );
75
76    let Ok(base) = Url::parse(base_url) else {
77        return html.to_string();
78    };
79
80    let mut result = html.to_string();
81
82    // List of tag/attribute combinations to process
83    let attributes = [
84        ("a", "href"),
85        ("img", "src"),
86        ("script", "src"),
87        ("link", "href"),
88        ("form", "action"),
89        ("video", "src"),
90        ("audio", "src"),
91        ("source", "src"),
92        ("track", "src"),
93        ("embed", "src"),
94        ("object", "data"),
95        ("iframe", "src"),
96    ];
97
98    for (tag, attr) in &attributes {
99        let pattern = format!(r#"<{tag}[^>]*{attr}=["']([^"']+)["'][^>]*>"#);
100        if let Ok(regex) = Regex::new(&pattern) {
101            result = regex
102                .replace_all(&result, |caps: &regex::Captures| {
103                    let full_match = caps.get(0).map_or("", |m| m.as_str());
104                    let url_match = caps.get(1).map_or("", |m| m.as_str());
105
106                    let absolute_url = to_absolute_url(url_match, &base);
107                    full_match.replace(url_match, &absolute_url)
108                })
109                .to_string();
110        }
111    }
112
113    // Handle inline styles with url()
114    if let Ok(url_regex) = Regex::new(r#"url\(['"]?([^'"()]+)['"]?\)"#) {
115        result = url_regex
116            .replace_all(&result, |caps: &regex::Captures| {
117                let url_match = caps.get(1).map_or("", |m| m.as_str());
118                let absolute_url = to_absolute_url(url_match, &base);
119                format!(r#"url("{absolute_url}")"#)
120            })
121            .to_string();
122    }
123
124    debug!("URL conversion complete");
125    result
126}
127
128/// Convert a potentially relative URL to an absolute URL
129fn to_absolute_url(url: &str, base: &Url) -> String {
130    // Skip data:, blob:, and javascript: URLs
131    if url.is_empty()
132        || url.starts_with("data:")
133        || url.starts_with("blob:")
134        || url.starts_with("javascript:")
135    {
136        return url.to_string();
137    }
138
139    // Try to resolve the URL against the base
140    base.join(url)
141        .map_or_else(|_| url.to_string(), |absolute| absolute.to_string())
142}
143
144/// Convert HTML content to UTF-8 encoding
145///
146/// Detects the current encoding from meta tags and ensures UTF-8 encoding.
147///
148/// # Arguments
149///
150/// * `html` - The HTML content to convert
151///
152/// # Returns
153///
154/// The UTF-8 encoded HTML content
155pub fn convert_to_utf8(html: &str) -> String {
156    debug!("Converting HTML to UTF-8");
157
158    // Check for charset meta tag
159    let charset_regex = Regex::new(r#"<meta[^>]+charset=["']?([^"'>\s]+)"#).ok();
160
161    let current_charset = charset_regex
162        .as_ref()
163        .and_then(|re| re.captures(html))
164        .and_then(|caps| caps.get(1))
165        .map_or_else(|| "utf-8".to_string(), |m| m.as_str().to_lowercase());
166
167    // If already UTF-8, ensure the meta tag is present
168    if current_charset == "utf-8" || current_charset == "utf8" {
169        // Add meta charset if not present
170        if !html.to_lowercase().contains("charset") {
171            if let Ok(head_regex) = Regex::new(r"<head[^>]*>") {
172                return head_regex
173                    .replace(html, r#"$0<meta charset="utf-8">"#)
174                    .to_string();
175            }
176        }
177        return html.to_string();
178    }
179
180    // For other charsets, try to convert and update the meta tag
181    let charset_update_regex = Regex::new(r#"<meta[^>]+charset=["']?[^"'>\s]+["']?"#).ok();
182
183    charset_update_regex.map_or_else(
184        || html.to_string(),
185        |regex| regex.replace(html, r#"<meta charset="utf-8""#).to_string(),
186    )
187}
188
189/// Check if HTML content contains JavaScript
190///
191/// # Arguments
192///
193/// * `html` - The HTML content to check
194///
195/// # Returns
196///
197/// True if the HTML contains JavaScript
198#[must_use]
199pub fn has_javascript(html: &str) -> bool {
200    let pattern = r"<script[^>]*>[\s\S]*?</script>|<script[^>]*/\s*>|javascript:";
201    Regex::new(pattern).is_ok_and(|re| re.is_match(html))
202}
203
204/// Check if content is valid HTML
205///
206/// # Arguments
207///
208/// * `html` - The content to check
209///
210/// # Returns
211///
212/// True if the content appears to be valid HTML
213#[must_use]
214pub fn is_html(html: &str) -> bool {
215    let pattern = r"<html[^>]*>[\s\S]*?</html>";
216    Regex::new(pattern).is_ok_and(|re| re.is_match(html))
217}
218
219/// Decode HTML entities to unicode characters.
220///
221/// Converts HTML entities like `&amp;`, `&lt;`, `&#39;`, `&#x27;` etc.
222/// to their actual unicode character equivalents.
223///
224/// # Arguments
225///
226/// * `html` - The HTML content containing entities to decode
227///
228/// # Returns
229///
230/// The content with all HTML entities decoded to unicode
231#[must_use]
232pub fn decode_html_entities(html: &str) -> String {
233    html_escape::decode_html_entities(html).into_owned()
234}
235
236/// Pretty-print HTML with indentation.
237///
238/// Adds newlines and indentation to make HTML human-readable.
239/// Void elements (br, hr, img, input, meta, link) are not indented as blocks.
240///
241/// # Arguments
242///
243/// * `html` - The HTML content to format
244///
245/// # Returns
246///
247/// The pretty-printed HTML content
248#[must_use]
249pub fn pretty_print_html(html: &str) -> String {
250    use std::sync::OnceLock;
251
252    static TAG_RE: OnceLock<Regex> = OnceLock::new();
253    static VOID_RE: OnceLock<Regex> = OnceLock::new();
254
255    let re = TAG_RE.get_or_init(|| Regex::new(r"(</?[a-zA-Z][^>]*?>)").unwrap());
256    let void_pat = VOID_RE.get_or_init(|| {
257        Regex::new(
258            r"(?i)^<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\b",
259        )
260        .unwrap()
261    });
262    let mut result = String::with_capacity(html.len() * 2);
263    let mut indent: usize = 0;
264    let indent_str = "  ";
265    let mut last_end = 0;
266    let mut parts: Vec<(bool, &str)> = Vec::new();
267
268    for m in re.find_iter(html) {
269        let before = &html[last_end..m.start()];
270        if !before.trim().is_empty() {
271            parts.push((false, before));
272        }
273        parts.push((true, m.as_str()));
274        last_end = m.end();
275    }
276    let trailing = &html[last_end..];
277    if !trailing.trim().is_empty() {
278        parts.push((false, trailing));
279    }
280
281    for (is_tag, content) in &parts {
282        if *is_tag {
283            let tag = *content;
284            let is_closing = tag.starts_with("</");
285            let is_void = void_pat.is_match(tag);
286            let is_self_closing = tag.ends_with("/>");
287
288            if is_closing {
289                indent = indent.saturating_sub(1);
290            }
291            for _ in 0..indent {
292                result.push_str(indent_str);
293            }
294            result.push_str(tag);
295            result.push('\n');
296            if !is_closing && !is_void && !is_self_closing {
297                indent += 1;
298            }
299        } else {
300            let text = content.trim();
301            if !text.is_empty() {
302                for _ in 0..indent {
303                    result.push_str(indent_str);
304                }
305                result.push_str(text);
306                result.push('\n');
307            }
308        }
309    }
310
311    result
312}
313
314/// Normalize URL to ensure it's absolute.
315///
316/// Prepends `https://` if no scheme is present and validates the URL.
317///
318/// # Errors
319///
320/// Returns an error string if the URL is empty or invalid.
321pub fn normalize_url(url: &str) -> std::result::Result<String, String> {
322    if url.is_empty() {
323        return Err("Missing url parameter".to_string());
324    }
325
326    let absolute_url = if url.starts_with("http://") || url.starts_with("https://") {
327        url.to_string()
328    } else {
329        format!("https://{url}")
330    };
331
332    // Validate the URL
333    Url::parse(&absolute_url).map_err(|e| format!("Invalid URL: {e}"))?;
334
335    Ok(absolute_url)
336}