Skip to main content

essence/engines/
http.rs

1use crate::{
2    engines::{RawScrapeResult, ScrapeEngine},
3    error::{Result, ScrapeError},
4    types::ScrapeRequest,
5    utils::{
6        dns_cache::DnsCache,
7        retry::{retry_with_backoff, RetryStrategy},
8        url_rewrites::rewrite_url,
9        user_agents::random_user_agent,
10    },
11};
12use async_trait::async_trait;
13use encoding_rs::Encoding;
14use reqwest::{redirect::Policy, Client};
15use std::time::{Duration, Instant};
16use tracing::{debug, info, warn};
17
18pub struct HttpEngine {
19    client: Client,
20    dns_cache: DnsCache,
21}
22
23impl HttpEngine {
24    pub fn new() -> Result<Self> {
25        let client = Client::builder()
26            .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
27            .gzip(true)
28            .brotli(true)
29            .redirect(Policy::limited(10))
30            .timeout(Duration::from_secs(30))
31            .connect_timeout(Duration::from_secs(10))
32            .pool_max_idle_per_host(10)
33            .pool_idle_timeout(Duration::from_secs(90))
34            .tcp_keepalive(Duration::from_secs(60))
35            .build()
36            .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
37
38        let dns_cache = DnsCache::new()?;
39
40        Ok(Self { client, dns_cache })
41    }
42
43    pub fn with_timeout(timeout_ms: u64) -> Result<Self> {
44        let client = Client::builder()
45            .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
46            .gzip(true)
47            .brotli(true)
48            .redirect(Policy::limited(10))
49            .timeout(Duration::from_millis(timeout_ms))
50            .connect_timeout(Duration::from_millis(timeout_ms.min(10000)))
51            .pool_max_idle_per_host(10)
52            .pool_idle_timeout(Duration::from_secs(90))
53            .tcp_keepalive(Duration::from_secs(60))
54            .build()
55            .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
56
57        let dns_cache = DnsCache::new()?;
58
59        Ok(Self { client, dns_cache })
60    }
61
62    pub fn with_options(timeout_ms: u64, skip_tls_verification: bool) -> Result<Self> {
63        let mut builder = Client::builder()
64            .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
65            .gzip(true)
66            .brotli(true)
67            .redirect(Policy::limited(10))
68            .timeout(Duration::from_millis(timeout_ms))
69            .connect_timeout(Duration::from_millis(timeout_ms.min(10000)))
70            .pool_max_idle_per_host(10)
71            .pool_idle_timeout(Duration::from_secs(90))
72            .tcp_keepalive(Duration::from_secs(60));
73
74        if skip_tls_verification {
75            builder = builder.danger_accept_invalid_certs(true);
76        }
77
78        let client = builder
79            .build()
80            .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
81
82        let dns_cache = DnsCache::new()?;
83
84        Ok(Self { client, dns_cache })
85    }
86
87    /// Get DNS cache statistics
88    pub async fn dns_stats(&self) -> crate::utils::dns_cache::CacheStats {
89        self.dns_cache.stats().await
90    }
91
92    /// Clear DNS cache
93    pub async fn clear_dns_cache(&self) {
94        self.dns_cache.clear().await
95    }
96}
97
98impl Default for HttpEngine {
99    fn default() -> Self {
100        Self::new().expect("Failed to create default HTTP engine")
101    }
102}
103
104#[async_trait]
105impl ScrapeEngine for HttpEngine {
106    async fn scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
107        let start = Instant::now();
108
109        // Use aggressive retry strategy for HTTP requests to handle network failures
110        // This will retry up to 5 times with exponential backoff starting at 200ms
111        let retry_config = RetryStrategy::Aggressive.to_config();
112
113        debug!(
114            "HTTP engine starting request to {} with retry config: max_retries={}, initial_delay={:?}",
115            request.url,
116            retry_config.max_retries,
117            retry_config.initial_interval
118        );
119
120        // Wrap the scrape operation in retry logic
121        let result = retry_with_backoff(
122            || async { self.scrape_once(request).await },
123            &retry_config,
124        )
125        .await;
126
127        // Track duration
128        let duration = start.elapsed().as_secs_f64();
129
130        // Log result
131        if let Err(ref e) = result {
132            warn!(
133                "HTTP engine failed for {} after {:.2}s: {}",
134                request.url,
135                duration,
136                e
137            );
138        } else {
139            info!(
140                "HTTP engine succeeded for {} in {:.2}s",
141                request.url,
142                duration
143            );
144        }
145
146        result
147    }
148}
149
150impl HttpEngine {
151    /// Perform a single scrape attempt without retry logic
152    async fn scrape_once(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
153        let request_start = Instant::now();
154
155        // Rewrite URL if needed (e.g., Google Docs → export URL)
156        let url_str = rewrite_url(&request.url);
157
158        // Validate URL
159        let url = reqwest::Url::parse(&url_str)
160            .map_err(|e| {
161                warn!("URL parsing failed for '{}': {}", url_str, e);
162                ScrapeError::InvalidUrl(format!("Invalid URL: {}", e))
163            })?;
164
165        debug!("Starting HTTP request to: {}", url);
166
167        // Pre-flight DNS resolution for metrics and early failure detection
168        if let Some(host) = url.host_str() {
169            let dns_start = Instant::now();
170            match self.dns_cache.lookup(host).await {
171                Ok(ips) => {
172                    let dns_elapsed = dns_start.elapsed();
173                    let stats = self.dns_cache.stats().await;
174                    debug!(
175                        "DNS resolved {} to {} IPs in {:.2}ms (cache hit rate: {:.1}%)",
176                        host,
177                        ips.len(),
178                        dns_elapsed.as_secs_f64() * 1000.0,
179                        stats.hit_rate() * 100.0
180                    );
181                }
182                Err(e) => {
183                    // DNS resolution failed - this will likely fail the HTTP request too
184                    // but we let reqwest handle it for consistency
185                    debug!("DNS pre-flight resolution failed for {}: {}", host, e);
186                }
187            }
188        }
189
190        // Build request with custom headers
191        let mut req_builder = self.client.get(url.clone());
192
193        // Use random user agent if not provided in request headers
194        let user_agent = request
195            .headers
196            .get("User-Agent")
197            .or_else(|| request.headers.get("user-agent"))
198            .cloned()
199            .unwrap_or_else(|| random_user_agent().to_string());
200
201        debug!("Using User-Agent: {}", user_agent);
202        req_builder = req_builder
203            .header("User-Agent", &user_agent)
204            // Browser-like headers to reduce anti-bot detection
205            .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
206            .header("Accept-Language", "en-US,en;q=0.5")
207            .header("Upgrade-Insecure-Requests", "1")
208            .header("Sec-Fetch-Dest", "document")
209            .header("Sec-Fetch-Mode", "navigate")
210            .header("Sec-Fetch-Site", "none")
211            .header("Sec-Fetch-User", "?1");
212
213        // Add custom headers (overriding defaults if specified)
214        for (key, value) in &request.headers {
215            if key.to_lowercase() != "user-agent" {
216                req_builder = req_builder.header(key, value);
217            }
218        }
219
220        // Send request
221        let send_start = Instant::now();
222        let response = req_builder.send().await.map_err(|e| {
223            let elapsed = send_start.elapsed();
224
225            // Classify the error for better diagnostics
226            if e.is_timeout() {
227                warn!(
228                    "Request timeout for {} after {:.2}s",
229                    url,
230                    elapsed.as_secs_f64()
231                );
232                ScrapeError::Timeout
233            } else if e.is_connect() {
234                warn!(
235                    "Connection failed for {} after {:.2}ms: {}",
236                    url,
237                    elapsed.as_secs_f64() * 1000.0,
238                    e
239                );
240                ScrapeError::RequestFailed(e)
241            } else if e.is_request() {
242                warn!(
243                    "Request error for {} after {:.2}ms: {}",
244                    url,
245                    elapsed.as_secs_f64() * 1000.0,
246                    e
247                );
248                ScrapeError::RequestFailed(e)
249            } else {
250                warn!(
251                    "Network error for {} after {:.2}ms: {}",
252                    url,
253                    elapsed.as_secs_f64() * 1000.0,
254                    e
255                );
256                ScrapeError::RequestFailed(e)
257            }
258        })?;
259
260        let request_elapsed = request_start.elapsed();
261        info!(
262            "HTTP request to {} completed in {:.2}ms (status: {})",
263            url,
264            request_elapsed.as_secs_f64() * 1000.0,
265            response.status()
266        );
267
268        // Extract metadata
269        let final_url = response.url().to_string();
270        let status_code = response.status().as_u16();
271        let content_type = response
272            .headers()
273            .get(reqwest::header::CONTENT_TYPE)
274            .and_then(|v| v.to_str().ok())
275            .map(|s| s.to_string());
276
277        // Collect headers
278        let headers: Vec<(String, String)> = response
279            .headers()
280            .iter()
281            .filter_map(|(k, v)| v.to_str().ok().map(|val| (k.to_string(), val.to_string())))
282            .collect();
283
284        // Get HTML content with charset detection
285        let bytes = response
286            .bytes()
287            .await
288            .map_err(ScrapeError::RequestFailed)?;
289
290        // Detect charset from Content-Type or HTML meta tag
291        let encoding = detect_charset(&bytes, content_type.as_deref());
292
293        // Decode with detected charset
294        let (html, _, had_errors) = encoding.decode(&bytes);
295        if had_errors {
296            debug!("Charset decoding had errors for {}, encoding: {}", url, encoding.name());
297        }
298        let html = html.into_owned();
299
300        // Check response size limit
301        let max_response_size_mb = std::env::var("MAX_RESPONSE_SIZE_MB")
302            .ok()
303            .and_then(|s| s.parse::<usize>().ok())
304            .unwrap_or(50);
305
306        let max_size_bytes = max_response_size_mb * 1024 * 1024;
307
308        if html.len() > max_size_bytes {
309            return Err(ScrapeError::ResourceLimit(format!(
310                "Response too large: {:.2}MB > {}MB",
311                html.len() as f64 / (1024.0 * 1024.0),
312                max_response_size_mb
313            )));
314        }
315
316        Ok(RawScrapeResult {
317            url: final_url,
318            status_code,
319            content_type,
320            html,
321            headers,
322        })
323    }
324}
325
326/// Detect charset from Content-Type header or HTML meta tag
327fn detect_charset(bytes: &[u8], content_type: Option<&str>) -> &'static Encoding {
328    // 1. Try Content-Type header first
329    if let Some(ct) = content_type {
330        if let Some(charset) = extract_charset_from_header(ct) {
331            if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
332                debug!("Detected charset from Content-Type: {}", charset);
333                return encoding;
334            }
335        }
336    }
337
338    // 2. Try HTML meta tag (look in first 2KB)
339    let preview = std::str::from_utf8(&bytes[..bytes.len().min(2048)]).unwrap_or("");
340    if let Some(charset) = extract_charset_from_meta(preview) {
341        if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
342            debug!("Detected charset from meta tag: {}", charset);
343            return encoding;
344        }
345    }
346
347    // 3. Default to UTF-8
348    debug!("No charset detected, using UTF-8");
349    encoding_rs::UTF_8
350}
351
/// Extract the `charset` parameter from a `Content-Type` header value.
///
/// For example `text/html; charset=UTF-8` yields `Some("UTF-8")`.
///
/// The parameter name is matched case-insensitively (`Charset=`, `CHARSET=`),
/// whitespace around `=` is tolerated, and surrounding single or double quotes
/// are stripped from the value. Returns `None` when no non-empty `charset`
/// parameter is present.
fn extract_charset_from_header(content_type: &str) -> Option<String> {
    content_type.split(';').find_map(|param| {
        let (name, value) = param.split_once('=')?;
        if !name.trim().eq_ignore_ascii_case("charset") {
            return None;
        }

        let label = value
            .trim()
            .trim_matches(|c: char| c == '"' || c == '\'')
            .trim();

        if label.is_empty() {
            None
        } else {
            Some(label.to_string())
        }
    })
}
365
366/// Extract charset from HTML meta tag
367fn extract_charset_from_meta(html: &str) -> Option<String> {
368    // Match: <meta charset="UTF-8"> or <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
369    use regex::Regex;
370
371    // Try <meta charset="..."> first
372    if let Ok(re) = Regex::new(r#"(?i)<meta\s+[^>]*charset\s*=\s*["']?([^"'\s/>]+)"#) {
373        if let Some(caps) = re.captures(html) {
374            if let Some(m) = caps.get(1) {
375                return Some(m.as_str().to_string());
376            }
377        }
378    }
379
380    // Try <meta http-equiv="Content-Type" content="...; charset=...">
381    if let Ok(re) = Regex::new(r#"(?i)<meta\s+http-equiv\s*=\s*["']?content-type["']?\s+content\s*=\s*["'][^"']*charset=([^"'\s;]+)"#) {
382        if let Some(caps) = re.captures(html) {
383            if let Some(m) = caps.get(1) {
384                return Some(m.as_str().to_string());
385            }
386        }
387    }
388
389    None
390}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the owned label the extractors are expected to return.
    fn label(text: &str) -> Option<String> {
        Some(text.to_string())
    }

    // --- Engine construction -------------------------------------------------

    #[test]
    fn test_engine_creation() {
        assert!(HttpEngine::new().is_ok());
    }

    #[test]
    fn test_engine_with_timeout() {
        assert!(HttpEngine::with_timeout(5000).is_ok());
    }

    #[test]
    fn test_engine_with_options() {
        assert!(HttpEngine::with_options(5000, true).is_ok());
    }

    // --- Charset detection ---------------------------------------------------

    #[test]
    fn test_detect_charset_from_content_type() {
        let detected = detect_charset(b"", Some("text/html; charset=EUC-JP"));
        assert_eq!(detected.name(), "EUC-JP");
    }

    #[test]
    fn test_detect_charset_from_meta() {
        let page = r#"<html><head><meta charset="ISO-8859-1"></head></html>"#;
        let detected = detect_charset(page.as_bytes(), None);
        // ISO-8859-1 maps to windows-1252 in encoding_rs
        assert_eq!(detected.name(), "windows-1252");
    }

    #[test]
    fn test_detect_charset_from_meta_http_equiv() {
        let page = r#"<html><head><meta http-equiv="Content-Type" content="text/html; charset=GB2312"></head></html>"#;
        let detected = detect_charset(page.as_bytes(), None);
        assert_eq!(detected.name(), "GBK");
    }

    #[test]
    fn test_detect_charset_utf8_default() {
        let detected = detect_charset(b"", None);
        assert_eq!(detected.name(), "UTF-8");
    }

    // --- Label extraction ----------------------------------------------------

    #[test]
    fn test_extract_charset_from_header() {
        let plain = extract_charset_from_header("text/html; charset=UTF-8");
        assert_eq!(plain, label("UTF-8"));

        let quoted = extract_charset_from_header("text/html; charset=\"ISO-8859-1\"");
        assert_eq!(quoted, label("ISO-8859-1"));

        let absent = extract_charset_from_header("text/html");
        assert_eq!(absent, None);
    }

    #[test]
    fn test_extract_charset_from_meta() {
        let short_form = extract_charset_from_meta(r#"<meta charset="UTF-8">"#);
        assert_eq!(short_form, label("UTF-8"));

        let http_equiv = extract_charset_from_meta(
            r#"<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">"#,
        );
        assert_eq!(http_equiv, label("ISO-8859-1"));

        let absent = extract_charset_from_meta("<html></html>");
        assert_eq!(absent, None);
    }
}