Skip to main content

stygian_graph/adapters/
http.rs

1//! HTTP scraping adapter with anti-bot features
2//!
3//! Implements the `ScrapingService` port using reqwest with:
4//! - Realistic browser headers and User-Agent rotation
5//! - Cookie jar persistence across requests in a session
6//! - Exponential backoff retry (up to 3 attempts)
7//! - Configurable timeouts
8//! - Optional proxy support
9//!
10//! # Example
11//!
12//! ```no_run
13//! use stygian_graph::adapters::http::{HttpAdapter, HttpConfig};
14//! use stygian_graph::ports::{ScrapingService, ServiceInput};
15//! use serde_json::json;
16//!
17//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
18//! let adapter = HttpAdapter::with_config(HttpConfig::default());
19//! let input = ServiceInput {
20//!     url: "https://httpbin.org/get".to_string(),
21//!     params: json!({}),
22//! };
23//! // let result = adapter.execute(input).await.unwrap();
24//! # });
25//! ```
26
27use std::time::Duration;
28
29use async_trait::async_trait;
30use reqwest::{Client, Proxy, header};
31
32use crate::domain::error::{Result, ServiceError, StygianError};
33use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
34
/// Rotating pool of realistic browser User-Agent strings.
///
/// Entries are served round-robin, so the order here is the order in which
/// they appear on the wire.
///
/// The macOS entries deliberately use the *frozen* platform tokens that real
/// browsers send: Chrome and Safari report `10_15_7` and Firefox reports
/// `10.15`, regardless of the actual macOS release. Spelling out a newer
/// version such as `14.7` produces a string no shipping browser emits, which
/// is an easy signal for bot detection.
static USER_AGENTS: &[&str] = &[
    // Chrome 131 — Windows, macOS, Linux
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 — Windows, macOS
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.1 — macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
];
44
/// Tunable settings for [`HttpAdapter`].
///
/// Cloning a config does not reset User-Agent rotation: the rotation counter
/// lives behind an `Arc`, so every clone advances the same counter.
#[derive(Clone, Debug)]
pub struct HttpConfig {
    /// Timeout handed to the underlying client builder (default: 30 seconds).
    pub timeout: Duration,
    /// How many times a transient failure is retried *after* the initial
    /// attempt (default: 3).
    pub max_retries: u32,
    /// Delay before the first retry; it doubles for each further retry
    /// (default: 1 second).
    pub retry_base_delay: Duration,
    /// Optional HTTP/SOCKS5 proxy URL. A value the proxy parser rejects is
    /// ignored when the adapter is built.
    pub proxy_url: Option<String>,
    /// When `true`, each request takes the next entry of `USER_AGENTS`;
    /// when `false`, the first entry is always used.
    pub rotate_user_agent: bool,
    /// Monotonic counter used modulo the pool size to pick the next
    /// `USER_AGENTS` entry (round-robin, wraps).
    pub(crate) ua_counter: std::sync::Arc<std::sync::atomic::AtomicUsize>,
}
61
62impl Default for HttpConfig {
63    fn default() -> Self {
64        Self {
65            timeout: Duration::from_secs(30),
66            max_retries: 3,
67            retry_base_delay: Duration::from_secs(1),
68            proxy_url: None,
69            rotate_user_agent: true,
70            ua_counter: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)),
71        }
72    }
73}
74
75/// HTTP client adapter with anti-bot features.
76///
77/// Thread-safe and cheaply cloneable — the internal `reqwest::Client` uses
78/// an `Arc` internally and maintains a shared cookie jar.
79#[derive(Clone)]
80pub struct HttpAdapter {
81    client: Client,
82    config: HttpConfig,
83}
84
85impl HttpAdapter {
86    /// Create a new HTTP adapter with default configuration.
87    ///
88    /// # Example
89    ///
90    /// ```no_run
91    /// use stygian_graph::adapters::http::HttpAdapter;
92    /// let adapter = HttpAdapter::new();
93    /// ```
94    #[must_use]
95    pub fn new() -> Self {
96        Self::with_config(HttpConfig::default())
97    }
98
99    /// Create an HTTP adapter with custom configuration.
100    ///
101    /// # Panics
102    ///
103    /// Panics only if TLS configuration is unavailable (extremely rare).
104    #[must_use]
105    pub fn with_config(config: HttpConfig) -> Self {
106        let mut builder = Client::builder()
107            .timeout(config.timeout)
108            .cookie_store(true)
109            .gzip(true)
110            .brotli(true)
111            .use_rustls_tls()
112            .default_headers(Self::default_headers());
113
114        if let Some(ref proxy_url) = config.proxy_url
115            && let Ok(proxy) = Proxy::all(proxy_url)
116        {
117            builder = builder.proxy(proxy);
118        }
119
120        // SAFETY: TLS via rustls is always available; build() can only fail if
121        // TLS backend is completely absent, which cannot happen with use_rustls_tls().
122        #[allow(clippy::expect_used)]
123        let client = builder.build().expect("TLS backend unavailable");
124
125        Self { client, config }
126    }
127
128    /// Build a realistic set of browser-like default headers.
129    fn default_headers() -> header::HeaderMap {
130        let mut headers = header::HeaderMap::new();
131        headers.insert(
132            header::ACCEPT,
133            header::HeaderValue::from_static(
134                "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
135            ),
136        );
137        headers.insert(
138            header::ACCEPT_LANGUAGE,
139            header::HeaderValue::from_static("en-US,en;q=0.5"),
140        );
141        headers.insert(
142            header::ACCEPT_ENCODING,
143            header::HeaderValue::from_static("gzip, deflate, br"),
144        );
145        headers.insert("DNT", header::HeaderValue::from_static("1"));
146        headers.insert(
147            "Upgrade-Insecure-Requests",
148            header::HeaderValue::from_static("1"),
149        );
150        headers
151    }
152
153    /// Pick the next User-Agent via round-robin.
154    fn next_user_agent(&self) -> &'static str {
155        let idx = self
156            .config
157            .ua_counter
158            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
159        let len = USER_AGENTS.len();
160        USER_AGENTS.get(idx % len).copied().unwrap_or("")
161    }
162
163    /// Execute a single HTTP GET with the provided URL and return raw content.
164    async fn fetch(&self, url: &str) -> Result<(String, serde_json::Value)> {
165        let ua = if self.config.rotate_user_agent {
166            self.next_user_agent()
167        } else {
168            USER_AGENTS.first().copied().unwrap_or("")
169        };
170
171        let response = self
172            .client
173            .get(url)
174            .header(header::USER_AGENT, ua)
175            .send()
176            .await
177            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
178
179        let status = response.status();
180        let content_type = response
181            .headers()
182            .get(header::CONTENT_TYPE)
183            .and_then(|v| v.to_str().ok())
184            .unwrap_or("text/plain")
185            .to_string();
186
187        if !status.is_success() {
188            return Err(StygianError::Service(ServiceError::Unavailable(format!(
189                "HTTP {status} for {url}"
190            ))));
191        }
192
193        let body = response
194            .text()
195            .await
196            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
197
198        let metadata = serde_json::json!({
199            "status_code": status.as_u16(),
200            "content_type": content_type,
201            "user_agent": ua,
202            "url": url,
203        });
204
205        Ok((body, metadata))
206    }
207
208    /// Check whether a status code is a transient error worth retrying.
209    const fn is_retryable_status(code: u16) -> bool {
210        matches!(code, 429 | 500 | 502 | 503 | 504)
211    }
212}
213
214impl Default for HttpAdapter {
215    fn default() -> Self {
216        Self::new()
217    }
218}
219
220#[async_trait]
221impl ScrapingService for HttpAdapter {
222    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
223        let mut last_err: Option<StygianError> = None;
224
225        for attempt in 0..=self.config.max_retries {
226            if attempt > 0 {
227                // Exponential backoff: 1s, 2s, 4s, …
228                let delay = self.config.retry_base_delay * 2u32.saturating_pow(attempt - 1);
229                tokio::time::sleep(delay).await;
230            }
231
232            match self.fetch(&input.url).await {
233                Ok((data, metadata)) => {
234                    return Ok(ServiceOutput { data, metadata });
235                }
236                Err(StygianError::Service(ServiceError::Unavailable(ref msg))) => {
237                    // Check if we got a retryable HTTP status embedded in the message
238                    let retryable = msg
239                        .split_whitespace()
240                        .find_map(|w| w.parse::<u16>().ok())
241                        .is_none_or(Self::is_retryable_status);
242
243                    if retryable && attempt < self.config.max_retries {
244                        last_err = Some(StygianError::Service(ServiceError::Unavailable(
245                            msg.clone(),
246                        )));
247                        continue;
248                    }
249                    return Err(StygianError::Service(ServiceError::Unavailable(
250                        msg.clone(),
251                    )));
252                }
253                Err(e) => return Err(e),
254            }
255        }
256
257        Err(last_err.unwrap_or_else(|| {
258            StygianError::Service(ServiceError::Unavailable("Max retries exceeded".into()))
259        }))
260    }
261
262    fn name(&self) -> &'static str {
263        "http"
264    }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config allows three retries, rotates User-Agents and
    /// uses no proxy.
    #[test]
    fn test_default_config() {
        let HttpConfig {
            max_retries,
            rotate_user_agent,
            proxy_url,
            ..
        } = HttpConfig::default();

        assert_eq!(max_retries, 3);
        assert!(rotate_user_agent);
        assert!(proxy_url.is_none());
    }

    /// Two consecutive picks both come from the pool and differ.
    #[test]
    fn test_user_agent_rotation() {
        let adapter = HttpAdapter::new();
        let first = adapter.next_user_agent();
        let second = adapter.next_user_agent();

        for picked in [first, second] {
            assert!(USER_AGENTS.contains(&picked));
        }
        assert_ne!(first, second);
    }

    /// After a full pass over the pool the next pick is still a pool entry.
    #[test]
    fn test_user_agent_wraps_around() {
        let adapter = HttpAdapter::new();
        for _ in USER_AGENTS {
            adapter.next_user_agent();
        }

        let after_wrap = adapter.next_user_agent();
        assert!(USER_AGENTS.contains(&after_wrap));
    }

    /// 429 and 503 are transient; 404 and 200 are not.
    #[test]
    fn test_retryable_status_codes() {
        for transient in [429, 503] {
            assert!(HttpAdapter::is_retryable_status(transient));
        }
        for terminal in [404, 200] {
            assert!(!HttpAdapter::is_retryable_status(terminal));
        }
    }

    /// The adapter reports its name as `"http"`.
    #[test]
    fn test_adapter_name() {
        assert_eq!(HttpAdapter::new().name(), "http");
    }
}