1use crate::{Result, WebCaptureError};
6use regex::Regex;
7use tracing::{debug, info};
8use url::Url;
9
/// `User-Agent` header value handed to the `reqwest` client built in `fetch_html`.
///
/// The string identifies the client as desktop Chrome 120 on 64-bit Windows 10.
/// NOTE(review): presumably chosen so servers answer as they would for a regular
/// browser — confirm before changing.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
12
13pub async fn fetch_html(url: &str) -> Result<String> {
29 info!("Fetching HTML from URL: {}", url);
30
31 if crate::stackoverflow::is_stackoverflow_question_url(url) {
32 return crate::stackoverflow::fetch_stackoverflow_html(url).await;
33 }
34
35 let client = reqwest::Client::builder()
36 .user_agent(USER_AGENT)
37 .build()
38 .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
39
40 let response = client
41 .get(url)
42 .header("Accept-Language", "en-US,en;q=0.9")
43 .header("Accept-Charset", "utf-8")
44 .send()
45 .await
46 .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
47
48 let html = response
49 .text()
50 .await
51 .map_err(|e| WebCaptureError::FetchError(e.to_string()))?;
52
53 info!("Successfully fetched HTML ({} bytes)", html.len());
54 Ok(html)
55}
56
57pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
71 debug!(
72 "Converting relative URLs to absolute using base: {}",
73 base_url
74 );
75
76 let Ok(base) = Url::parse(base_url) else {
77 return html.to_string();
78 };
79
80 let mut result = html.to_string();
81
82 let attributes = [
84 ("a", "href"),
85 ("img", "src"),
86 ("script", "src"),
87 ("link", "href"),
88 ("form", "action"),
89 ("video", "src"),
90 ("audio", "src"),
91 ("source", "src"),
92 ("track", "src"),
93 ("embed", "src"),
94 ("object", "data"),
95 ("iframe", "src"),
96 ];
97
98 for (tag, attr) in &attributes {
99 let pattern = format!(r#"<{tag}[^>]*{attr}=["']([^"']+)["'][^>]*>"#);
100 if let Ok(regex) = Regex::new(&pattern) {
101 result = regex
102 .replace_all(&result, |caps: ®ex::Captures| {
103 let full_match = caps.get(0).map_or("", |m| m.as_str());
104 let url_match = caps.get(1).map_or("", |m| m.as_str());
105
106 let absolute_url = to_absolute_url(url_match, &base);
107 full_match.replace(url_match, &absolute_url)
108 })
109 .to_string();
110 }
111 }
112
113 if let Ok(url_regex) = Regex::new(r#"url\(['"]?([^'"()]+)['"]?\)"#) {
115 result = url_regex
116 .replace_all(&result, |caps: ®ex::Captures| {
117 let url_match = caps.get(1).map_or("", |m| m.as_str());
118 let absolute_url = to_absolute_url(url_match, &base);
119 format!(r#"url("{absolute_url}")"#)
120 })
121 .to_string();
122 }
123
124 debug!("URL conversion complete");
125 result
126}
127
128fn to_absolute_url(url: &str, base: &Url) -> String {
130 if url.is_empty()
132 || url.starts_with("data:")
133 || url.starts_with("blob:")
134 || url.starts_with("javascript:")
135 {
136 return url.to_string();
137 }
138
139 base.join(url)
141 .map_or_else(|_| url.to_string(), |absolute| absolute.to_string())
142}
143
144pub fn convert_to_utf8(html: &str) -> String {
156 debug!("Converting HTML to UTF-8");
157
158 let charset_regex = Regex::new(r#"<meta[^>]+charset=["']?([^"'>\s]+)"#).ok();
160
161 let current_charset = charset_regex
162 .as_ref()
163 .and_then(|re| re.captures(html))
164 .and_then(|caps| caps.get(1))
165 .map_or_else(|| "utf-8".to_string(), |m| m.as_str().to_lowercase());
166
167 if current_charset == "utf-8" || current_charset == "utf8" {
169 if !html.to_lowercase().contains("charset") {
171 if let Ok(head_regex) = Regex::new(r"<head[^>]*>") {
172 return head_regex
173 .replace(html, r#"$0<meta charset="utf-8">"#)
174 .to_string();
175 }
176 }
177 return html.to_string();
178 }
179
180 let charset_update_regex = Regex::new(r#"<meta[^>]+charset=["']?[^"'>\s]+["']?"#).ok();
182
183 charset_update_regex.map_or_else(
184 || html.to_string(),
185 |regex| regex.replace(html, r#"<meta charset="utf-8""#).to_string(),
186 )
187}
188
189#[must_use]
199pub fn has_javascript(html: &str) -> bool {
200 let pattern = r"<script[^>]*>[\s\S]*?</script>|<script[^>]*/\s*>|javascript:";
201 Regex::new(pattern).is_ok_and(|re| re.is_match(html))
202}
203
204#[must_use]
214pub fn is_html(html: &str) -> bool {
215 let pattern = r"<html[^>]*>[\s\S]*?</html>";
216 Regex::new(pattern).is_ok_and(|re| re.is_match(html))
217}
218
219#[must_use]
232pub fn decode_html_entities(html: &str) -> String {
233 html_escape::decode_html_entities(html).into_owned()
234}
235
236#[must_use]
249pub fn pretty_print_html(html: &str) -> String {
250 use std::sync::OnceLock;
251
252 static TAG_RE: OnceLock<Regex> = OnceLock::new();
253 static VOID_RE: OnceLock<Regex> = OnceLock::new();
254
255 let re = TAG_RE.get_or_init(|| Regex::new(r"(</?[a-zA-Z][^>]*?>)").unwrap());
256 let void_pat = VOID_RE.get_or_init(|| {
257 Regex::new(
258 r"(?i)^<(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)\b",
259 )
260 .unwrap()
261 });
262 let mut result = String::with_capacity(html.len() * 2);
263 let mut indent: usize = 0;
264 let indent_str = " ";
265 let mut last_end = 0;
266 let mut parts: Vec<(bool, &str)> = Vec::new();
267
268 for m in re.find_iter(html) {
269 let before = &html[last_end..m.start()];
270 if !before.trim().is_empty() {
271 parts.push((false, before));
272 }
273 parts.push((true, m.as_str()));
274 last_end = m.end();
275 }
276 let trailing = &html[last_end..];
277 if !trailing.trim().is_empty() {
278 parts.push((false, trailing));
279 }
280
281 for (is_tag, content) in &parts {
282 if *is_tag {
283 let tag = *content;
284 let is_closing = tag.starts_with("</");
285 let is_void = void_pat.is_match(tag);
286 let is_self_closing = tag.ends_with("/>");
287
288 if is_closing {
289 indent = indent.saturating_sub(1);
290 }
291 for _ in 0..indent {
292 result.push_str(indent_str);
293 }
294 result.push_str(tag);
295 result.push('\n');
296 if !is_closing && !is_void && !is_self_closing {
297 indent += 1;
298 }
299 } else {
300 let text = content.trim();
301 if !text.is_empty() {
302 for _ in 0..indent {
303 result.push_str(indent_str);
304 }
305 result.push_str(text);
306 result.push('\n');
307 }
308 }
309 }
310
311 result
312}
313
314pub fn normalize_url(url: &str) -> std::result::Result<String, String> {
322 if url.is_empty() {
323 return Err("Missing url parameter".to_string());
324 }
325
326 let absolute_url = if url.starts_with("http://") || url.starts_with("https://") {
327 url.to_string()
328 } else {
329 format!("https://{url}")
330 };
331
332 Url::parse(&absolute_url).map_err(|e| format!("Invalid URL: {e}"))?;
334
335 Ok(absolute_url)
336}