1use crate::{Result, WebCaptureError};
6use regex::Regex;
7use tracing::{debug, info};
8use url::Url;
9
/// `User-Agent` header value sent by `fetch_html`; it identifies the client as
/// desktop Chrome 120 on 64-bit Windows 10.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
12
13pub async fn fetch_html(url: &str) -> Result<String> {
29 info!("Fetching HTML from URL: {}", url);
30
31 let client = reqwest::Client::builder()
32 .user_agent(USER_AGENT)
33 .build()
34 .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
35
36 let response = client
37 .get(url)
38 .header("Accept-Language", "en-US,en;q=0.9")
39 .header("Accept-Charset", "utf-8")
40 .send()
41 .await
42 .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
43
44 let html = response
45 .text()
46 .await
47 .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
48
49 info!("Successfully fetched HTML ({} bytes)", html.len());
50 Ok(html)
51}
52
53pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
67 debug!(
68 "Converting relative URLs to absolute using base: {}",
69 base_url
70 );
71
72 let Ok(base) = Url::parse(base_url) else {
73 return html.to_string();
74 };
75
76 let mut result = html.to_string();
77
78 let attributes = [
80 ("a", "href"),
81 ("img", "src"),
82 ("script", "src"),
83 ("link", "href"),
84 ("form", "action"),
85 ("video", "src"),
86 ("audio", "src"),
87 ("source", "src"),
88 ("track", "src"),
89 ("embed", "src"),
90 ("object", "data"),
91 ("iframe", "src"),
92 ];
93
94 for (tag, attr) in &attributes {
95 let pattern = format!(r#"<{tag}[^>]*{attr}=["']([^"']+)["'][^>]*>"#);
96 if let Ok(regex) = Regex::new(&pattern) {
97 result = regex
98 .replace_all(&result, |caps: ®ex::Captures| {
99 let full_match = caps.get(0).map_or("", |m| m.as_str());
100 let url_match = caps.get(1).map_or("", |m| m.as_str());
101
102 let absolute_url = to_absolute_url(url_match, &base);
103 full_match.replace(url_match, &absolute_url)
104 })
105 .to_string();
106 }
107 }
108
109 if let Ok(url_regex) = Regex::new(r#"url\(['"]?([^'"()]+)['"]?\)"#) {
111 result = url_regex
112 .replace_all(&result, |caps: ®ex::Captures| {
113 let url_match = caps.get(1).map_or("", |m| m.as_str());
114 let absolute_url = to_absolute_url(url_match, &base);
115 format!(r#"url("{absolute_url}")"#)
116 })
117 .to_string();
118 }
119
120 debug!("URL conversion complete");
121 result
122}
123
124fn to_absolute_url(url: &str, base: &Url) -> String {
126 if url.is_empty()
128 || url.starts_with("data:")
129 || url.starts_with("blob:")
130 || url.starts_with("javascript:")
131 {
132 return url.to_string();
133 }
134
135 base.join(url)
137 .map_or_else(|_| url.to_string(), |absolute| absolute.to_string())
138}
139
140pub fn convert_to_utf8(html: &str) -> String {
152 debug!("Converting HTML to UTF-8");
153
154 let charset_regex = Regex::new(r#"<meta[^>]+charset=["']?([^"'>\s]+)"#).ok();
156
157 let current_charset = charset_regex
158 .as_ref()
159 .and_then(|re| re.captures(html))
160 .and_then(|caps| caps.get(1))
161 .map_or_else(|| "utf-8".to_string(), |m| m.as_str().to_lowercase());
162
163 if current_charset == "utf-8" || current_charset == "utf8" {
165 if !html.to_lowercase().contains("charset") {
167 if let Ok(head_regex) = Regex::new(r"<head[^>]*>") {
168 return head_regex
169 .replace(html, r#"$0<meta charset="utf-8">"#)
170 .to_string();
171 }
172 }
173 return html.to_string();
174 }
175
176 let charset_update_regex = Regex::new(r#"<meta[^>]+charset=["']?[^"'>\s]+["']?"#).ok();
178
179 charset_update_regex.map_or_else(
180 || html.to_string(),
181 |regex| regex.replace(html, r#"<meta charset="utf-8""#).to_string(),
182 )
183}
184
185#[must_use]
195pub fn has_javascript(html: &str) -> bool {
196 let pattern = r"<script[^>]*>[\s\S]*?</script>|<script[^>]*/\s*>|javascript:";
197 Regex::new(pattern).is_ok_and(|re| re.is_match(html))
198}
199
200#[must_use]
210pub fn is_html(html: &str) -> bool {
211 let pattern = r"<html[^>]*>[\s\S]*?</html>";
212 Regex::new(pattern).is_ok_and(|re| re.is_match(html))
213}
214
215#[must_use]
228pub fn decode_html_entities(html: &str) -> String {
229 html_escape::decode_html_entities(html).into_owned()
230}
231
232#[must_use]
245pub fn pretty_print_html(html: &str) -> String {
246 use std::sync::OnceLock;
247
248 static TAG_RE: OnceLock<Regex> = OnceLock::new();
249 static VOID_RE: OnceLock<Regex> = OnceLock::new();
250
251 let re = TAG_RE.get_or_init(|| Regex::new(r"(</?[a-zA-Z][^>]*?>)").unwrap());
252 let void_pat = VOID_RE.get_or_init(|| {
253 Regex::new(
254 r"(?i)^<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\b",
255 )
256 .unwrap()
257 });
258 let mut result = String::with_capacity(html.len() * 2);
259 let mut indent: usize = 0;
260 let indent_str = " ";
261 let mut last_end = 0;
262 let mut parts: Vec<(bool, &str)> = Vec::new();
263
264 for m in re.find_iter(html) {
265 let before = &html[last_end..m.start()];
266 if !before.trim().is_empty() {
267 parts.push((false, before));
268 }
269 parts.push((true, m.as_str()));
270 last_end = m.end();
271 }
272 let trailing = &html[last_end..];
273 if !trailing.trim().is_empty() {
274 parts.push((false, trailing));
275 }
276
277 for (is_tag, content) in &parts {
278 if *is_tag {
279 let tag = *content;
280 let is_closing = tag.starts_with("</");
281 let is_void = void_pat.is_match(tag);
282 let is_self_closing = tag.ends_with("/>");
283
284 if is_closing {
285 indent = indent.saturating_sub(1);
286 }
287 for _ in 0..indent {
288 result.push_str(indent_str);
289 }
290 result.push_str(tag);
291 result.push('\n');
292 if !is_closing && !is_void && !is_self_closing {
293 indent += 1;
294 }
295 } else {
296 let text = content.trim();
297 if !text.is_empty() {
298 for _ in 0..indent {
299 result.push_str(indent_str);
300 }
301 result.push_str(text);
302 result.push('\n');
303 }
304 }
305 }
306
307 result
308}
309
310pub fn normalize_url(url: &str) -> std::result::Result<String, String> {
318 if url.is_empty() {
319 return Err("Missing url parameter".to_string());
320 }
321
322 let absolute_url = if url.starts_with("http://") || url.starts_with("https://") {
323 url.to_string()
324 } else {
325 format!("https://{url}")
326 };
327
328 Url::parse(&absolute_url).map_err(|e| format!("Invalid URL: {e}"))?;
330
331 Ok(absolute_url)
332}