1use crate::{
2 engines::{RawScrapeResult, ScrapeEngine},
3 error::{Result, ScrapeError},
4 types::ScrapeRequest,
5 utils::{
6 dns_cache::DnsCache,
7 retry::{retry_with_backoff, RetryStrategy},
8 url_rewrites::rewrite_url,
9 user_agents::random_user_agent,
10 },
11};
12use async_trait::async_trait;
13use encoding_rs::Encoding;
14use reqwest::{redirect::Policy, Client};
15use std::time::{Duration, Instant};
16use tracing::{debug, info, warn};
17
18pub struct HttpEngine {
19 client: Client,
20 dns_cache: DnsCache,
21}
22
23impl HttpEngine {
24 pub fn new() -> Result<Self> {
25 let client = Client::builder()
26 .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
27 .gzip(true)
28 .brotli(true)
29 .redirect(Policy::limited(10))
30 .timeout(Duration::from_secs(30))
31 .connect_timeout(Duration::from_secs(10))
32 .pool_max_idle_per_host(10)
33 .pool_idle_timeout(Duration::from_secs(90))
34 .tcp_keepalive(Duration::from_secs(60))
35 .build()
36 .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
37
38 let dns_cache = DnsCache::new()?;
39
40 Ok(Self { client, dns_cache })
41 }
42
43 pub fn with_timeout(timeout_ms: u64) -> Result<Self> {
44 let client = Client::builder()
45 .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
46 .gzip(true)
47 .brotli(true)
48 .redirect(Policy::limited(10))
49 .timeout(Duration::from_millis(timeout_ms))
50 .connect_timeout(Duration::from_millis(timeout_ms.min(10000)))
51 .pool_max_idle_per_host(10)
52 .pool_idle_timeout(Duration::from_secs(90))
53 .tcp_keepalive(Duration::from_secs(60))
54 .build()
55 .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
56
57 let dns_cache = DnsCache::new()?;
58
59 Ok(Self { client, dns_cache })
60 }
61
62 pub fn with_options(timeout_ms: u64, skip_tls_verification: bool) -> Result<Self> {
63 let mut builder = Client::builder()
64 .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
65 .gzip(true)
66 .brotli(true)
67 .redirect(Policy::limited(10))
68 .timeout(Duration::from_millis(timeout_ms))
69 .connect_timeout(Duration::from_millis(timeout_ms.min(10000)))
70 .pool_max_idle_per_host(10)
71 .pool_idle_timeout(Duration::from_secs(90))
72 .tcp_keepalive(Duration::from_secs(60));
73
74 if skip_tls_verification {
75 builder = builder.danger_accept_invalid_certs(true);
76 }
77
78 let client = builder
79 .build()
80 .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
81
82 let dns_cache = DnsCache::new()?;
83
84 Ok(Self { client, dns_cache })
85 }
86
87 pub async fn dns_stats(&self) -> crate::utils::dns_cache::CacheStats {
89 self.dns_cache.stats().await
90 }
91
92 pub async fn clear_dns_cache(&self) {
94 self.dns_cache.clear().await
95 }
96}
97
98impl Default for HttpEngine {
99 fn default() -> Self {
100 Self::new().expect("Failed to create default HTTP engine")
101 }
102}
103
104#[async_trait]
105impl ScrapeEngine for HttpEngine {
106 async fn scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
107 let start = Instant::now();
108
109 let retry_config = RetryStrategy::Aggressive.to_config();
112
113 debug!(
114 "HTTP engine starting request to {} with retry config: max_retries={}, initial_delay={:?}",
115 request.url,
116 retry_config.max_retries,
117 retry_config.initial_interval
118 );
119
120 let result = retry_with_backoff(
122 || async { self.scrape_once(request).await },
123 &retry_config,
124 )
125 .await;
126
127 let duration = start.elapsed().as_secs_f64();
129
130 if let Err(ref e) = result {
132 warn!(
133 "HTTP engine failed for {} after {:.2}s: {}",
134 request.url,
135 duration,
136 e
137 );
138 } else {
139 info!(
140 "HTTP engine succeeded for {} in {:.2}s",
141 request.url,
142 duration
143 );
144 }
145
146 result
147 }
148}
149
150impl HttpEngine {
151 async fn scrape_once(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
153 let request_start = Instant::now();
154
155 let url_str = rewrite_url(&request.url);
157
158 let url = reqwest::Url::parse(&url_str)
160 .map_err(|e| {
161 warn!("URL parsing failed for '{}': {}", url_str, e);
162 ScrapeError::InvalidUrl(format!("Invalid URL: {}", e))
163 })?;
164
165 debug!("Starting HTTP request to: {}", url);
166
167 if let Some(host) = url.host_str() {
169 let dns_start = Instant::now();
170 match self.dns_cache.lookup(host).await {
171 Ok(ips) => {
172 let dns_elapsed = dns_start.elapsed();
173 let stats = self.dns_cache.stats().await;
174 debug!(
175 "DNS resolved {} to {} IPs in {:.2}ms (cache hit rate: {:.1}%)",
176 host,
177 ips.len(),
178 dns_elapsed.as_secs_f64() * 1000.0,
179 stats.hit_rate() * 100.0
180 );
181 }
182 Err(e) => {
183 debug!("DNS pre-flight resolution failed for {}: {}", host, e);
186 }
187 }
188 }
189
190 let mut req_builder = self.client.get(url.clone());
192
193 let user_agent = request
195 .headers
196 .get("User-Agent")
197 .or_else(|| request.headers.get("user-agent"))
198 .cloned()
199 .unwrap_or_else(|| random_user_agent().to_string());
200
201 debug!("Using User-Agent: {}", user_agent);
202 req_builder = req_builder
203 .header("User-Agent", &user_agent)
204 .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
206 .header("Accept-Language", "en-US,en;q=0.5")
207 .header("Upgrade-Insecure-Requests", "1")
208 .header("Sec-Fetch-Dest", "document")
209 .header("Sec-Fetch-Mode", "navigate")
210 .header("Sec-Fetch-Site", "none")
211 .header("Sec-Fetch-User", "?1");
212
213 for (key, value) in &request.headers {
215 if key.to_lowercase() != "user-agent" {
216 req_builder = req_builder.header(key, value);
217 }
218 }
219
220 let send_start = Instant::now();
222 let response = req_builder.send().await.map_err(|e| {
223 let elapsed = send_start.elapsed();
224
225 if e.is_timeout() {
227 warn!(
228 "Request timeout for {} after {:.2}s",
229 url,
230 elapsed.as_secs_f64()
231 );
232 ScrapeError::Timeout
233 } else if e.is_connect() {
234 warn!(
235 "Connection failed for {} after {:.2}ms: {}",
236 url,
237 elapsed.as_secs_f64() * 1000.0,
238 e
239 );
240 ScrapeError::RequestFailed(e)
241 } else if e.is_request() {
242 warn!(
243 "Request error for {} after {:.2}ms: {}",
244 url,
245 elapsed.as_secs_f64() * 1000.0,
246 e
247 );
248 ScrapeError::RequestFailed(e)
249 } else {
250 warn!(
251 "Network error for {} after {:.2}ms: {}",
252 url,
253 elapsed.as_secs_f64() * 1000.0,
254 e
255 );
256 ScrapeError::RequestFailed(e)
257 }
258 })?;
259
260 let request_elapsed = request_start.elapsed();
261 info!(
262 "HTTP request to {} completed in {:.2}ms (status: {})",
263 url,
264 request_elapsed.as_secs_f64() * 1000.0,
265 response.status()
266 );
267
268 let final_url = response.url().to_string();
270 let status_code = response.status().as_u16();
271 let content_type = response
272 .headers()
273 .get(reqwest::header::CONTENT_TYPE)
274 .and_then(|v| v.to_str().ok())
275 .map(|s| s.to_string());
276
277 let headers: Vec<(String, String)> = response
279 .headers()
280 .iter()
281 .filter_map(|(k, v)| v.to_str().ok().map(|val| (k.to_string(), val.to_string())))
282 .collect();
283
284 let bytes = response
286 .bytes()
287 .await
288 .map_err(ScrapeError::RequestFailed)?;
289
290 let encoding = detect_charset(&bytes, content_type.as_deref());
292
293 let (html, _, had_errors) = encoding.decode(&bytes);
295 if had_errors {
296 debug!("Charset decoding had errors for {}, encoding: {}", url, encoding.name());
297 }
298 let html = html.into_owned();
299
300 let max_response_size_mb = std::env::var("MAX_RESPONSE_SIZE_MB")
302 .ok()
303 .and_then(|s| s.parse::<usize>().ok())
304 .unwrap_or(50);
305
306 let max_size_bytes = max_response_size_mb * 1024 * 1024;
307
308 if html.len() > max_size_bytes {
309 return Err(ScrapeError::ResourceLimit(format!(
310 "Response too large: {:.2}MB > {}MB",
311 html.len() as f64 / (1024.0 * 1024.0),
312 max_response_size_mb
313 )));
314 }
315
316 Ok(RawScrapeResult {
317 url: final_url,
318 status_code,
319 content_type,
320 html,
321 headers,
322 })
323 }
324}
325
326fn detect_charset(bytes: &[u8], content_type: Option<&str>) -> &'static Encoding {
328 if let Some(ct) = content_type {
330 if let Some(charset) = extract_charset_from_header(ct) {
331 if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
332 debug!("Detected charset from Content-Type: {}", charset);
333 return encoding;
334 }
335 }
336 }
337
338 let preview = std::str::from_utf8(&bytes[..bytes.len().min(2048)]).unwrap_or("");
340 if let Some(charset) = extract_charset_from_meta(preview) {
341 if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
342 debug!("Detected charset from meta tag: {}", charset);
343 return encoding;
344 }
345 }
346
347 debug!("No charset detected, using UTF-8");
349 encoding_rs::UTF_8
350}
351
/// Extracts the `charset` parameter from a `Content-Type` header value.
///
/// The parameter name is matched case-insensitively and whitespace around
/// the `=` is tolerated. Surrounding double or single quotes are removed from
/// the value.
///
/// Returns `None` when no `charset` parameter with a non-empty value exists.
fn extract_charset_from_header(content_type: &str) -> Option<String> {
    content_type.split(';').find_map(|param| {
        let (name, value) = param.split_once('=')?;
        if !name.trim().eq_ignore_ascii_case("charset") {
            return None;
        }

        let value = value
            .trim()
            .trim_matches(|c: char| c == '"' || c == '\'')
            .trim();

        if value.is_empty() {
            None
        } else {
            Some(value.to_string())
        }
    })
}
365
366fn extract_charset_from_meta(html: &str) -> Option<String> {
368 use regex::Regex;
370
371 if let Ok(re) = Regex::new(r#"(?i)<meta\s+[^>]*charset\s*=\s*["']?([^"'\s/>]+)"#) {
373 if let Some(caps) = re.captures(html) {
374 if let Some(m) = caps.get(1) {
375 return Some(m.as_str().to_string());
376 }
377 }
378 }
379
380 if let Ok(re) = Regex::new(r#"(?i)<meta\s+http-equiv\s*=\s*["']?content-type["']?\s+content\s*=\s*["'][^"']*charset=([^"'\s;]+)"#) {
382 if let Some(caps) = re.captures(html) {
383 if let Some(m) = caps.get(1) {
384 return Some(m.as_str().to_string());
385 }
386 }
387 }
388
389 None
390}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the `Some(String)` values expected by the extractors.
    fn label(s: &str) -> Option<String> {
        Some(s.to_string())
    }

    // --- engine construction -------------------------------------------

    #[test]
    fn test_engine_creation() {
        assert!(HttpEngine::new().is_ok());
    }

    #[test]
    fn test_engine_with_timeout() {
        assert!(HttpEngine::with_timeout(5000).is_ok());
    }

    #[test]
    fn test_engine_with_options() {
        assert!(HttpEngine::with_options(5000, true).is_ok());
    }

    // --- charset detection ---------------------------------------------

    #[test]
    fn test_detect_charset_from_content_type() {
        let content_type = Some("text/html; charset=EUC-JP");
        assert_eq!(detect_charset(b"", content_type).name(), "EUC-JP");
    }

    #[test]
    fn test_detect_charset_from_meta() {
        let body = r#"<html><head><meta charset="ISO-8859-1"></head></html>"#;
        // The ISO-8859-1 label resolves to the windows-1252 encoding.
        assert_eq!(detect_charset(body.as_bytes(), None).name(), "windows-1252");
    }

    #[test]
    fn test_detect_charset_from_meta_http_equiv() {
        let body = r#"<html><head><meta http-equiv="Content-Type" content="text/html; charset=GB2312"></head></html>"#;
        // The GB2312 label resolves to the GBK encoding.
        assert_eq!(detect_charset(body.as_bytes(), None).name(), "GBK");
    }

    #[test]
    fn test_detect_charset_utf8_default() {
        assert_eq!(detect_charset(b"", None).name(), "UTF-8");
    }

    // --- label extraction ----------------------------------------------

    #[test]
    fn test_extract_charset_from_header() {
        let cases = [
            ("text/html; charset=UTF-8", label("UTF-8")),
            ("text/html; charset=\"ISO-8859-1\"", label("ISO-8859-1")),
            ("text/html", None),
        ];

        for (header, expected) in cases.iter() {
            assert_eq!(&extract_charset_from_header(header), expected);
        }
    }

    #[test]
    fn test_extract_charset_from_meta() {
        let cases = [
            (r#"<meta charset="UTF-8">"#, label("UTF-8")),
            (
                r#"<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">"#,
                label("ISO-8859-1"),
            ),
            ("<html></html>", None),
        ];

        for (markup, expected) in cases.iter() {
            assert_eq!(&extract_charset_from_meta(markup), expected);
        }
    }
}