Skip to main content

web_capture/
html.rs

1//! HTML processing module
2//!
3//! This module provides functions for fetching, parsing, and processing HTML content.
4
5use crate::{Result, WebCaptureError};
6use regex::Regex;
7use tracing::{debug, info};
8use url::Url;
9
/// Default user agent string.
///
/// A desktop-browser style value (Chrome 120 on 64-bit Windows 10) that
/// `fetch_html` installs on the HTTP client it builds for each request.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
12
13/// Fetch HTML content from a URL
14///
15/// This function makes a simple HTTP GET request to fetch the HTML content.
16///
17/// # Arguments
18///
19/// * `url` - The URL to fetch
20///
21/// # Returns
22///
23/// The HTML content as a string
24///
25/// # Errors
26///
27/// Returns an error if the fetch fails or the response cannot be decoded
28pub async fn fetch_html(url: &str) -> Result<String> {
29    info!("Fetching HTML from URL: {}", url);
30
31    let client = reqwest::Client::builder()
32        .user_agent(USER_AGENT)
33        .build()
34        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
35
36    let response = client
37        .get(url)
38        .header("Accept-Language", "en-US,en;q=0.9")
39        .header("Accept-Charset", "utf-8")
40        .send()
41        .await
42        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
43
44    let html = response
45        .text()
46        .await
47        .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
48
49    info!("Successfully fetched HTML ({} bytes)", html.len());
50    Ok(html)
51}
52
53/// Convert relative URLs to absolute URLs in HTML content
54///
55/// Processes various HTML attributes that contain URLs and converts
56/// relative URLs to absolute URLs using the provided base URL.
57///
58/// # Arguments
59///
60/// * `html` - The HTML content to process
61/// * `base_url` - The base URL to use for resolving relative URLs
62///
63/// # Returns
64///
65/// The HTML content with absolute URLs
66pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
67    debug!(
68        "Converting relative URLs to absolute using base: {}",
69        base_url
70    );
71
72    let Ok(base) = Url::parse(base_url) else {
73        return html.to_string();
74    };
75
76    let mut result = html.to_string();
77
78    // List of tag/attribute combinations to process
79    let attributes = [
80        ("a", "href"),
81        ("img", "src"),
82        ("script", "src"),
83        ("link", "href"),
84        ("form", "action"),
85        ("video", "src"),
86        ("audio", "src"),
87        ("source", "src"),
88        ("track", "src"),
89        ("embed", "src"),
90        ("object", "data"),
91        ("iframe", "src"),
92    ];
93
94    for (tag, attr) in &attributes {
95        let pattern = format!(r#"<{tag}[^>]*{attr}=["']([^"']+)["'][^>]*>"#);
96        if let Ok(regex) = Regex::new(&pattern) {
97            result = regex
98                .replace_all(&result, |caps: &regex::Captures| {
99                    let full_match = caps.get(0).map_or("", |m| m.as_str());
100                    let url_match = caps.get(1).map_or("", |m| m.as_str());
101
102                    let absolute_url = to_absolute_url(url_match, &base);
103                    full_match.replace(url_match, &absolute_url)
104                })
105                .to_string();
106        }
107    }
108
109    // Handle inline styles with url()
110    if let Ok(url_regex) = Regex::new(r#"url\(['"]?([^'"()]+)['"]?\)"#) {
111        result = url_regex
112            .replace_all(&result, |caps: &regex::Captures| {
113                let url_match = caps.get(1).map_or("", |m| m.as_str());
114                let absolute_url = to_absolute_url(url_match, &base);
115                format!(r#"url("{absolute_url}")"#)
116            })
117            .to_string();
118    }
119
120    debug!("URL conversion complete");
121    result
122}
123
124/// Convert a potentially relative URL to an absolute URL
125fn to_absolute_url(url: &str, base: &Url) -> String {
126    // Skip data:, blob:, and javascript: URLs
127    if url.is_empty()
128        || url.starts_with("data:")
129        || url.starts_with("blob:")
130        || url.starts_with("javascript:")
131    {
132        return url.to_string();
133    }
134
135    // Try to resolve the URL against the base
136    base.join(url)
137        .map_or_else(|_| url.to_string(), |absolute| absolute.to_string())
138}
139
140/// Convert HTML content to UTF-8 encoding
141///
142/// Detects the current encoding from meta tags and ensures UTF-8 encoding.
143///
144/// # Arguments
145///
146/// * `html` - The HTML content to convert
147///
148/// # Returns
149///
150/// The UTF-8 encoded HTML content
151pub fn convert_to_utf8(html: &str) -> String {
152    debug!("Converting HTML to UTF-8");
153
154    // Check for charset meta tag
155    let charset_regex = Regex::new(r#"<meta[^>]+charset=["']?([^"'>\s]+)"#).ok();
156
157    let current_charset = charset_regex
158        .as_ref()
159        .and_then(|re| re.captures(html))
160        .and_then(|caps| caps.get(1))
161        .map_or_else(|| "utf-8".to_string(), |m| m.as_str().to_lowercase());
162
163    // If already UTF-8, ensure the meta tag is present
164    if current_charset == "utf-8" || current_charset == "utf8" {
165        // Add meta charset if not present
166        if !html.to_lowercase().contains("charset") {
167            if let Ok(head_regex) = Regex::new(r"<head[^>]*>") {
168                return head_regex
169                    .replace(html, r#"$0<meta charset="utf-8">"#)
170                    .to_string();
171            }
172        }
173        return html.to_string();
174    }
175
176    // For other charsets, try to convert and update the meta tag
177    let charset_update_regex = Regex::new(r#"<meta[^>]+charset=["']?[^"'>\s]+["']?"#).ok();
178
179    charset_update_regex.map_or_else(
180        || html.to_string(),
181        |regex| regex.replace(html, r#"<meta charset="utf-8""#).to_string(),
182    )
183}
184
185/// Check if HTML content contains JavaScript
186///
187/// # Arguments
188///
189/// * `html` - The HTML content to check
190///
191/// # Returns
192///
193/// True if the HTML contains JavaScript
194#[must_use]
195pub fn has_javascript(html: &str) -> bool {
196    let pattern = r"<script[^>]*>[\s\S]*?</script>|<script[^>]*/\s*>|javascript:";
197    Regex::new(pattern).is_ok_and(|re| re.is_match(html))
198}
199
200/// Check if content is valid HTML
201///
202/// # Arguments
203///
204/// * `html` - The content to check
205///
206/// # Returns
207///
208/// True if the content appears to be valid HTML
209#[must_use]
210pub fn is_html(html: &str) -> bool {
211    let pattern = r"<html[^>]*>[\s\S]*?</html>";
212    Regex::new(pattern).is_ok_and(|re| re.is_match(html))
213}
214
215/// Decode HTML entities to unicode characters.
216///
217/// Converts HTML entities like `&amp;`, `&lt;`, `&#39;`, `&#x27;` etc.
218/// to their actual unicode character equivalents.
219///
220/// # Arguments
221///
222/// * `html` - The HTML content containing entities to decode
223///
224/// # Returns
225///
226/// The content with all HTML entities decoded to unicode
227#[must_use]
228pub fn decode_html_entities(html: &str) -> String {
229    html_escape::decode_html_entities(html).into_owned()
230}
231
232/// Pretty-print HTML with indentation.
233///
234/// Adds newlines and indentation to make HTML human-readable.
235/// Void elements (br, hr, img, input, meta, link) are not indented as blocks.
236///
237/// # Arguments
238///
239/// * `html` - The HTML content to format
240///
241/// # Returns
242///
243/// The pretty-printed HTML content
244#[must_use]
245pub fn pretty_print_html(html: &str) -> String {
246    use std::sync::OnceLock;
247
248    static TAG_RE: OnceLock<Regex> = OnceLock::new();
249    static VOID_RE: OnceLock<Regex> = OnceLock::new();
250
251    let re = TAG_RE.get_or_init(|| Regex::new(r"(</?[a-zA-Z][^>]*?>)").unwrap());
252    let void_pat = VOID_RE.get_or_init(|| {
253        Regex::new(
254            r"(?i)^<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\b",
255        )
256        .unwrap()
257    });
258    let mut result = String::with_capacity(html.len() * 2);
259    let mut indent: usize = 0;
260    let indent_str = "  ";
261    let mut last_end = 0;
262    let mut parts: Vec<(bool, &str)> = Vec::new();
263
264    for m in re.find_iter(html) {
265        let before = &html[last_end..m.start()];
266        if !before.trim().is_empty() {
267            parts.push((false, before));
268        }
269        parts.push((true, m.as_str()));
270        last_end = m.end();
271    }
272    let trailing = &html[last_end..];
273    if !trailing.trim().is_empty() {
274        parts.push((false, trailing));
275    }
276
277    for (is_tag, content) in &parts {
278        if *is_tag {
279            let tag = *content;
280            let is_closing = tag.starts_with("</");
281            let is_void = void_pat.is_match(tag);
282            let is_self_closing = tag.ends_with("/>");
283
284            if is_closing {
285                indent = indent.saturating_sub(1);
286            }
287            for _ in 0..indent {
288                result.push_str(indent_str);
289            }
290            result.push_str(tag);
291            result.push('\n');
292            if !is_closing && !is_void && !is_self_closing {
293                indent += 1;
294            }
295        } else {
296            let text = content.trim();
297            if !text.is_empty() {
298                for _ in 0..indent {
299                    result.push_str(indent_str);
300                }
301                result.push_str(text);
302                result.push('\n');
303            }
304        }
305    }
306
307    result
308}
309
310/// Normalize URL to ensure it's absolute.
311///
312/// Prepends `https://` if no scheme is present and validates the URL.
313///
314/// # Errors
315///
316/// Returns an error string if the URL is empty or invalid.
317pub fn normalize_url(url: &str) -> std::result::Result<String, String> {
318    if url.is_empty() {
319        return Err("Missing url parameter".to_string());
320    }
321
322    let absolute_url = if url.starts_with("http://") || url.starts_with("https://") {
323        url.to_string()
324    } else {
325        format!("https://{url}")
326    };
327
328    // Validate the URL
329    Url::parse(&absolute_url).map_err(|e| format!("Invalid URL: {e}"))?;
330
331    Ok(absolute_url)
332}