Skip to main content

web_capture/
html.rs

1//! HTML processing module
2//!
3//! This module provides functions for fetching, parsing, and processing HTML content.
4
5use crate::{Result, WebCaptureError};
6use regex::Regex;
7use tracing::{debug, info};
8use url::Url;
9
/// Default `User-Agent` header value sent by [`fetch_html`].
///
/// The string identifies the client as desktop Chrome 120 on 64-bit Windows 10.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
12
13/// Fetch HTML content from a URL
14///
15/// This function makes a simple HTTP GET request to fetch the HTML content.
16///
17/// # Arguments
18///
19/// * `url` - The URL to fetch
20///
21/// # Returns
22///
23/// The HTML content as a string
24///
25/// # Errors
26///
27/// Returns an error if the fetch fails or the response cannot be decoded
28pub async fn fetch_html(url: &str) -> Result<String> {
29    info!("Fetching HTML from URL: {}", url);
30
31    let client = reqwest::Client::builder()
32        .user_agent(USER_AGENT)
33        .build()
34        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
35
36    let response = client
37        .get(url)
38        .header("Accept-Language", "en-US,en;q=0.9")
39        .header("Accept-Charset", "utf-8")
40        .send()
41        .await
42        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
43
44    let html = response
45        .text()
46        .await
47        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
48
49    info!("Successfully fetched HTML ({} bytes)", html.len());
50    Ok(html)
51}
52
53/// Convert relative URLs to absolute URLs in HTML content
54///
55/// Processes various HTML attributes that contain URLs and converts
56/// relative URLs to absolute URLs using the provided base URL.
57///
58/// # Arguments
59///
60/// * `html` - The HTML content to process
61/// * `base_url` - The base URL to use for resolving relative URLs
62///
63/// # Returns
64///
65/// The HTML content with absolute URLs
66pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
67    debug!(
68        "Converting relative URLs to absolute using base: {}",
69        base_url
70    );
71
72    let Ok(base) = Url::parse(base_url) else {
73        return html.to_string();
74    };
75
76    let mut result = html.to_string();
77
78    // List of tag/attribute combinations to process
79    let attributes = [
80        ("a", "href"),
81        ("img", "src"),
82        ("script", "src"),
83        ("link", "href"),
84        ("form", "action"),
85        ("video", "src"),
86        ("audio", "src"),
87        ("source", "src"),
88        ("track", "src"),
89        ("embed", "src"),
90        ("object", "data"),
91        ("iframe", "src"),
92    ];
93
94    for (tag, attr) in &attributes {
95        let pattern = format!(r#"<{tag}[^>]*{attr}=["']([^"']+)["'][^>]*>"#);
96        if let Ok(regex) = Regex::new(&pattern) {
97            result = regex
98                .replace_all(&result, |caps: &regex::Captures| {
99                    let full_match = caps.get(0).map_or("", |m| m.as_str());
100                    let url_match = caps.get(1).map_or("", |m| m.as_str());
101
102                    let absolute_url = to_absolute_url(url_match, &base);
103                    full_match.replace(url_match, &absolute_url)
104                })
105                .to_string();
106        }
107    }
108
109    // Handle inline styles with url()
110    if let Ok(url_regex) = Regex::new(r#"url\(['"]?([^'"()]+)['"]?\)"#) {
111        result = url_regex
112            .replace_all(&result, |caps: &regex::Captures| {
113                let url_match = caps.get(1).map_or("", |m| m.as_str());
114                let absolute_url = to_absolute_url(url_match, &base);
115                format!(r#"url("{absolute_url}")"#)
116            })
117            .to_string();
118    }
119
120    debug!("URL conversion complete");
121    result
122}
123
124/// Convert a potentially relative URL to an absolute URL
125fn to_absolute_url(url: &str, base: &Url) -> String {
126    // Skip data:, blob:, and javascript: URLs
127    if url.is_empty()
128        || url.starts_with("data:")
129        || url.starts_with("blob:")
130        || url.starts_with("javascript:")
131    {
132        return url.to_string();
133    }
134
135    // Try to resolve the URL against the base
136    base.join(url)
137        .map_or_else(|_| url.to_string(), |absolute| absolute.to_string())
138}
139
140/// Convert HTML content to UTF-8 encoding
141///
142/// Detects the current encoding from meta tags and ensures UTF-8 encoding.
143///
144/// # Arguments
145///
146/// * `html` - The HTML content to convert
147///
148/// # Returns
149///
150/// The UTF-8 encoded HTML content
151pub fn convert_to_utf8(html: &str) -> String {
152    debug!("Converting HTML to UTF-8");
153
154    // Check for charset meta tag
155    let charset_regex = Regex::new(r#"<meta[^>]+charset=["']?([^"'>\s]+)"#).ok();
156
157    let current_charset = charset_regex
158        .as_ref()
159        .and_then(|re| re.captures(html))
160        .and_then(|caps| caps.get(1))
161        .map_or_else(|| "utf-8".to_string(), |m| m.as_str().to_lowercase());
162
163    // If already UTF-8, ensure the meta tag is present
164    if current_charset == "utf-8" || current_charset == "utf8" {
165        // Add meta charset if not present
166        if !html.to_lowercase().contains("charset") {
167            if let Ok(head_regex) = Regex::new(r"<head[^>]*>") {
168                return head_regex
169                    .replace(html, r#"$0<meta charset="utf-8">"#)
170                    .to_string();
171            }
172        }
173        return html.to_string();
174    }
175
176    // For other charsets, try to convert and update the meta tag
177    let charset_update_regex = Regex::new(r#"<meta[^>]+charset=["']?[^"'>\s]+["']?"#).ok();
178
179    charset_update_regex.map_or_else(
180        || html.to_string(),
181        |regex| regex.replace(html, r#"<meta charset="utf-8""#).to_string(),
182    )
183}
184
185/// Check if HTML content contains JavaScript
186///
187/// # Arguments
188///
189/// * `html` - The HTML content to check
190///
191/// # Returns
192///
193/// True if the HTML contains JavaScript
194#[must_use]
195pub fn has_javascript(html: &str) -> bool {
196    let pattern = r"<script[^>]*>[\s\S]*?</script>|<script[^>]*/\s*>|javascript:";
197    Regex::new(pattern)
198        .map(|re| re.is_match(html))
199        .unwrap_or(false)
200}
201
202/// Check if content is valid HTML
203///
204/// # Arguments
205///
206/// * `html` - The content to check
207///
208/// # Returns
209///
210/// True if the content appears to be valid HTML
211#[must_use]
212pub fn is_html(html: &str) -> bool {
213    let pattern = r"<html[^>]*>[\s\S]*?</html>";
214    Regex::new(pattern)
215        .map(|re| re.is_match(html))
216        .unwrap_or(false)
217}
218
219/// Decode HTML entities to unicode characters.
220///
221/// Converts HTML entities like `&amp;`, `&lt;`, `&#39;`, `&#x27;` etc.
222/// to their actual unicode character equivalents.
223///
224/// # Arguments
225///
226/// * `html` - The HTML content containing entities to decode
227///
228/// # Returns
229///
230/// The content with all HTML entities decoded to unicode
231#[must_use]
232pub fn decode_html_entities(html: &str) -> String {
233    html_escape::decode_html_entities(html).into_owned()
234}
235
236/// Pretty-print HTML with indentation.
237///
238/// Adds newlines and indentation to make HTML human-readable.
239/// Void elements (br, hr, img, input, meta, link) are not indented as blocks.
240///
241/// # Arguments
242///
243/// * `html` - The HTML content to format
244///
245/// # Returns
246///
247/// The pretty-printed HTML content
248#[must_use]
249pub fn pretty_print_html(html: &str) -> String {
250    use std::sync::OnceLock;
251
252    static TAG_RE: OnceLock<Regex> = OnceLock::new();
253    static VOID_RE: OnceLock<Regex> = OnceLock::new();
254
255    let re = TAG_RE.get_or_init(|| Regex::new(r"(</?[a-zA-Z][^>]*?>)").unwrap());
256    let void_pat = VOID_RE.get_or_init(|| {
257        Regex::new(
258            r"(?i)^<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\b",
259        )
260        .unwrap()
261    });
262    let mut result = String::with_capacity(html.len() * 2);
263    let mut indent: usize = 0;
264    let indent_str = "  ";
265    let mut last_end = 0;
266    let mut parts: Vec<(bool, &str)> = Vec::new();
267
268    for m in re.find_iter(html) {
269        let before = &html[last_end..m.start()];
270        if !before.trim().is_empty() {
271            parts.push((false, before));
272        }
273        parts.push((true, m.as_str()));
274        last_end = m.end();
275    }
276    let trailing = &html[last_end..];
277    if !trailing.trim().is_empty() {
278        parts.push((false, trailing));
279    }
280
281    for (is_tag, content) in &parts {
282        if *is_tag {
283            let tag = *content;
284            let is_closing = tag.starts_with("</");
285            let is_void = void_pat.is_match(tag);
286            let is_self_closing = tag.ends_with("/>");
287
288            if is_closing {
289                indent = indent.saturating_sub(1);
290            }
291            for _ in 0..indent {
292                result.push_str(indent_str);
293            }
294            result.push_str(tag);
295            result.push('\n');
296            if !is_closing && !is_void && !is_self_closing {
297                indent += 1;
298            }
299        } else {
300            let text = content.trim();
301            if !text.is_empty() {
302                for _ in 0..indent {
303                    result.push_str(indent_str);
304                }
305                result.push_str(text);
306                result.push('\n');
307            }
308        }
309    }
310
311    result
312}
313
314/// Normalize URL to ensure it's absolute.
315///
316/// Prepends `https://` if no scheme is present and validates the URL.
317///
318/// # Errors
319///
320/// Returns an error string if the URL is empty or invalid.
321pub fn normalize_url(url: &str) -> std::result::Result<String, String> {
322    if url.is_empty() {
323        return Err("Missing url parameter".to_string());
324    }
325
326    let absolute_url = if url.starts_with("http://") || url.starts_with("https://") {
327        url.to_string()
328    } else {
329        format!("https://{url}")
330    };
331
332    // Validate the URL
333    Url::parse(&absolute_url).map_err(|e| format!("Invalid URL: {e}"))?;
334
335    Ok(absolute_url)
336}