Skip to main content

scrapling_fetch/
client.rs

//! HTTP client implementations for making requests.
//!
//! This module provides two client types for different use cases:
//!
//! - [`Fetcher`] -- A stateless client that creates a fresh wreq client for every
//!   request. This is the simplest option and works well when you do not need to
//!   persist cookies or connection state between requests.
//!
//! - [`FetcherSession`] -- A session-based client that maintains a persistent wreq
//!   client with an automatic cookie jar. Use this when you need to log in to a site
//!   and carry cookies across subsequent requests. Call [`open()`](FetcherSession::open)
//!   before making requests and [`close()`](FetcherSession::close) when done.
//!
//! Both clients support browser impersonation, stealth headers, and per-request
//! configuration overrides via [`RequestConfig`]. Automatic retries and proxy rotation
//! are performed by [`Fetcher`]; [`FetcherSession`] sends each request once through
//! its persistent client.
16
17use std::collections::HashMap;
18use std::time::Duration;
19
20use serde_json::Value;
21use tracing::{debug, error, warn};
22use wreq_util::Emulation;
23
24use crate::config::{
25    FetcherConfig, FetcherConfigBuilder, FollowRedirects, Impersonate, ParserConfig,
26};
27use crate::error::{FetchError, Result};
28use crate::fingerprint::{default_user_agent, generate_headers};
29use crate::proxy::{Proxy, ProxyRotator};
30use crate::response::{Response, build_response_async};
31
32fn merge_headers(
33    base: &HashMap<String, String>,
34    req: &RequestConfig,
35    stealth: bool,
36    impersonate_enabled: bool,
37) -> HashMap<String, String> {
38    let mut headers = base.clone();
39
40    if let Some(req_headers) = &req.headers {
41        headers.extend(req_headers.iter().map(|(k, v)| (k.clone(), v.clone())));
42    }
43
44    let keys_lower: std::collections::HashSet<String> =
45        headers.keys().map(|k| k.to_lowercase()).collect();
46
47    match (stealth, impersonate_enabled) {
48        (true, _) => {
49            if !keys_lower.contains("referer") {
50                headers.insert("referer".into(), "https://www.google.com/".into());
51            }
52            if !impersonate_enabled {
53                generate_headers(false)
54                    .into_iter()
55                    .filter(|(k, _)| !keys_lower.contains(&k.to_lowercase()))
56                    .for_each(|(k, v)| {
57                        headers.insert(k, v);
58                    });
59            }
60        }
61        (false, false) if !keys_lower.contains("user-agent") => {
62            headers.insert("User-Agent".into(), default_user_agent());
63        }
64        _ => {}
65    }
66
67    headers
68}
69
70/// Maps a human-friendly impersonation name (e.g., `"chrome"`, `"firefox135"`) to
71/// the corresponding [`wreq_util::Emulation`] profile. Returns `None` if the name
72/// is not recognized. Unversioned names like `"chrome"` resolve to the latest
73/// available version.
74fn resolve_emulation(name: &str) -> Option<Emulation> {
75    match name.to_lowercase().as_str() {
76        "chrome" | "chrome145" => Some(Emulation::Chrome145),
77        "chrome100" => Some(Emulation::Chrome100),
78        "chrome120" => Some(Emulation::Chrome120),
79        "chrome124" => Some(Emulation::Chrome124),
80        "chrome131" => Some(Emulation::Chrome131),
81        "chrome136" => Some(Emulation::Chrome136),
82        "chrome140" => Some(Emulation::Chrome140),
83        "chrome142" => Some(Emulation::Chrome142),
84        "chrome143" => Some(Emulation::Chrome143),
85        "chrome144" => Some(Emulation::Chrome144),
86        "edge" | "edge145" => Some(Emulation::Edge145),
87        "edge140" => Some(Emulation::Edge140),
88        "edge134" => Some(Emulation::Edge134),
89        "safari" | "safari26" => Some(Emulation::Safari26),
90        "safari18" => Some(Emulation::Safari18_5),
91        "firefox" | "firefox135" => Some(Emulation::Firefox135),
92        "firefox133" => Some(Emulation::Firefox133),
93        "firefox128" => Some(Emulation::Firefox128),
94        _ => None,
95    }
96}
97
98// ---------------------------------------------------------------------------
99// Request-level overrides
100// ---------------------------------------------------------------------------
101
102/// Per-request configuration overrides that take precedence over [`FetcherConfig`] defaults.
103///
104/// Every field is `Option` -- when `None`, the corresponding value from the fetcher's
105/// [`FetcherConfig`] is used instead. This lets you customize individual requests
106/// (e.g., use a different proxy or longer timeout) without affecting the global config.
107///
108/// Pass this to methods like [`Fetcher::get()`] and [`FetcherSession::post()`].
109#[derive(Debug, Default)]
110pub struct RequestConfig {
111    /// Custom headers for this request. These are merged with the fetcher's default
112    /// headers, with per-request values taking precedence on name collisions.
113    pub headers: Option<HashMap<String, String>>,
114    /// Cookies to send with this request, serialized into a single `Cookie` header.
115    /// In a [`FetcherSession`], the session's cookie jar is used in addition to these.
116    pub cookies: Option<HashMap<String, String>>,
117    /// URL query parameters to append to the request URL.
118    pub params: Option<HashMap<String, String>>,
119    /// Request timeout override in seconds. When set, this replaces the fetcher's
120    /// default timeout for this single request.
121    pub timeout_secs: Option<u64>,
122    /// Redirect policy override for this request.
123    pub follow_redirects: Option<FollowRedirects>,
124    /// Maximum redirects override for this request.
125    pub max_redirects: Option<usize>,
126    /// Retry count override for this request. Set to `Some(1)` to disable retries.
127    pub retries: Option<u32>,
128    /// Retry delay override in seconds for this request.
129    pub retry_delay_secs: Option<u64>,
130    /// Proxy override for this request. Overrides both the static proxy and the
131    /// proxy rotator for this single request.
132    pub proxy: Option<Proxy>,
133    /// TLS verification override for this request.
134    pub verify: Option<bool>,
135    /// Browser impersonation override for this request.
136    pub impersonate: Option<Impersonate>,
137    /// Stealth headers override for this request.
138    pub stealthy_headers: Option<bool>,
139    /// Raw request body bytes. Mutually exclusive with `json` -- if both are set,
140    /// `json` takes precedence.
141    pub data: Option<Vec<u8>>,
142    /// JSON request body. Automatically serialized and sent with a
143    /// `Content-Type: application/json` header. Takes precedence over `data`.
144    pub json: Option<Value>,
145    /// HTTP basic authentication credentials as `(username, password)`.
146    pub auth: Option<(String, String)>,
147}
148
149// ---------------------------------------------------------------------------
150// Fetcher — async, creates a new wreq client per request
151// ---------------------------------------------------------------------------
152
153/// Stateless async HTTP fetcher that creates a new wreq client per request.
154///
155/// Because a fresh client is built for each request, there is no shared state between
156/// calls -- no cookie jar, no persistent connections. This is the right choice for
157/// simple scraping tasks, parallel crawling from different IP addresses, or when you
158/// want maximum isolation between requests.
159///
160/// For login flows or multi-step interactions that need cookies, use [`FetcherSession`].
161pub struct Fetcher {
162    config: FetcherConfig,
163    proxy_rotator: Option<ProxyRotator>,
164    parser_config: ParserConfig,
165}
166
167impl Fetcher {
168    /// Creates a new fetcher with default configuration (Chrome impersonation, 30s
169    /// timeout, 3 retries, stealth headers enabled).
170    pub fn new() -> Self {
171        Self {
172            config: FetcherConfig::default(),
173            proxy_rotator: None,
174            parser_config: ParserConfig::default(),
175        }
176    }
177
178    /// Creates a new fetcher with the given configuration. Use this when you want
179    /// full control over the config without going through the builder.
180    pub fn with_config(config: FetcherConfig) -> Self {
181        Self {
182            config,
183            proxy_rotator: None,
184            parser_config: ParserConfig::default(),
185        }
186    }
187
188    /// Returns a new [`FetcherConfigBuilder`] for constructing a validated config.
189    /// This is a convenience shortcut for `FetcherConfigBuilder::new()`.
190    pub fn builder() -> FetcherConfigBuilder {
191        FetcherConfigBuilder::new()
192    }
193
194    /// Constructs a fetcher from a completed builder. The builder is consumed and
195    /// validated. Returns an error if the builder configuration is invalid (e.g., both
196    /// a static proxy and a proxy rotator are set).
197    pub fn from_builder(builder: FetcherConfigBuilder) -> Result<Self> {
198        let (config, rotator) = builder.build()?;
199        Ok(Self {
200            config,
201            proxy_rotator: rotator,
202            parser_config: ParserConfig::default(),
203        })
204    }
205
206    /// Sets the proxy rotator for distributing requests across proxies. Each request
207    /// will use the next proxy from the rotator according to its rotation strategy.
208    pub fn set_proxy_rotator(&mut self, rotator: ProxyRotator) {
209        self.proxy_rotator = Some(rotator);
210    }
211
212    /// Sets the parser configuration for HTML processing. This controls adaptive
213    /// parsing behavior on the [`Response`] objects returned by this fetcher.
214    pub fn set_parser_config(&mut self, parser_config: ParserConfig) {
215        self.parser_config = parser_config;
216    }
217
218    /// Returns a reference to the current fetcher configuration. Useful for
219    /// inspecting defaults or logging the active settings.
220    pub fn config(&self) -> &FetcherConfig {
221        &self.config
222    }
223
224    /// Sends an HTTP GET request to the given URL. Pass `None` for `req` to use
225    /// the fetcher's default configuration, or pass a [`RequestConfig`] to override
226    /// specific settings for this request.
227    pub async fn get(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
228        self.request("GET", url, req.unwrap_or_default()).await
229    }
230
231    /// Sends an HTTP POST request to the given URL. Use [`RequestConfig::json`] or
232    /// [`RequestConfig::data`] to attach a request body.
233    pub async fn post(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
234        self.request("POST", url, req.unwrap_or_default()).await
235    }
236
237    /// Sends an HTTP PUT request to the given URL. Use [`RequestConfig::json`] or
238    /// [`RequestConfig::data`] to attach a request body.
239    pub async fn put(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
240        self.request("PUT", url, req.unwrap_or_default()).await
241    }
242
243    /// Sends an HTTP DELETE request to the given URL. Some APIs accept a body with
244    /// DELETE requests -- use [`RequestConfig::data`] or [`RequestConfig::json`] if needed.
245    pub async fn delete(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
246        self.request("DELETE", url, req.unwrap_or_default()).await
247    }
248
249    async fn request(&self, method: &str, url: &str, req: RequestConfig) -> Result<Response> {
250        let max_retries = req.retries.unwrap_or(self.config.retries);
251        let retry_delay = req.retry_delay_secs.unwrap_or(self.config.retry_delay_secs);
252        let static_proxy = req.proxy.clone();
253
254        let mut last_error: Option<FetchError> = None;
255
256        for attempt in 0..max_retries {
257            let proxy = match (&self.proxy_rotator, &static_proxy) {
258                (Some(rotator), None) => Some(rotator.get_proxy()),
259                _ => static_proxy.clone().or_else(|| self.config.proxy.clone()),
260            };
261
262            match self
263                .execute_request(method, url, &req, proxy.as_ref())
264                .await
265            {
266                Ok(response) => return Ok(response),
267                Err(e) => {
268                    match attempt < max_retries - 1 {
269                        true => {
270                            warn!(attempt = attempt + 1, error = %e, "request failed, retrying in {retry_delay}s");
271                            tokio::time::sleep(Duration::from_secs(retry_delay)).await;
272                        }
273                        false => {
274                            error!(attempts = max_retries, error = %e, "all retries exhausted");
275                        }
276                    }
277                    last_error = Some(e);
278                }
279            }
280        }
281
282        Err(FetchError::MaxRetriesExceeded {
283            attempts: max_retries,
284            last_error: Box::new(last_error.unwrap_or(FetchError::Other("unknown error".into()))),
285        })
286    }
287
288    async fn execute_request(
289        &self,
290        method: &str,
291        url: &str,
292        req: &RequestConfig,
293        proxy: Option<&Proxy>,
294    ) -> Result<Response> {
295        let stealth = req.stealthy_headers.unwrap_or(self.config.stealthy_headers);
296        let impersonate = req.impersonate.as_ref().unwrap_or(&self.config.impersonate);
297        let impersonate_selected = impersonate.select();
298        let timeout = req.timeout_secs.unwrap_or(self.config.timeout_secs);
299        let follow = req.follow_redirects.unwrap_or(self.config.follow_redirects);
300        let max_redirects = req.max_redirects.unwrap_or(self.config.max_redirects);
301        let verify = req.verify.unwrap_or(self.config.verify);
302
303        let final_headers = merge_headers(
304            &self.config.headers,
305            req,
306            stealth,
307            impersonate_selected.is_some(),
308        );
309
310        // Build wreq client
311        let mut client_builder = wreq::Client::builder().timeout(Duration::from_secs(timeout));
312
313        if !verify {
314            client_builder = client_builder.cert_verification(false);
315        }
316
317        match follow {
318            FollowRedirects::None => {
319                client_builder = client_builder.redirect(wreq::redirect::Policy::none());
320            }
321            FollowRedirects::All | FollowRedirects::Safe => {
322                client_builder =
323                    client_builder.redirect(wreq::redirect::Policy::limited(max_redirects));
324            }
325        }
326
327        if let Some(p) = proxy {
328            let rp = wreq::Proxy::all(p.server())
329                .map_err(|e| FetchError::InvalidProxy(e.to_string()))?;
330            client_builder = client_builder.proxy(rp);
331        }
332
333        let client = client_builder.build()?;
334
335        // Build request with emulation
336        let http_method: wreq::Method = method
337            .parse()
338            .map_err(|_| FetchError::Other(format!("invalid HTTP method: {method}")))?;
339
340        let mut full_url = url::Url::parse(url)?;
341        if let Some(params) = &req.params {
342            let mut pairs = full_url.query_pairs_mut();
343            params.iter().for_each(|(k, v)| {
344                pairs.append_pair(k, v);
345            });
346        }
347
348        let mut request_builder = client.request(http_method, full_url.as_str());
349
350        // Apply browser emulation
351        if let Some(browser_name) = impersonate_selected {
352            if let Some(emulation) = resolve_emulation(browser_name) {
353                request_builder = request_builder.emulation(emulation);
354            }
355        }
356
357        // Headers
358        for (k, v) in &final_headers {
359            request_builder = request_builder.header(k.as_str(), v.as_str());
360        }
361
362        // Cookies
363        if let Some(cookies) = &req.cookies {
364            let cookie_str = cookies
365                .iter()
366                .map(|(k, v)| format!("{k}={v}"))
367                .collect::<Vec<_>>()
368                .join("; ");
369            request_builder = request_builder.header("cookie", cookie_str);
370        }
371
372        // Auth
373        if let Some((user, pass)) = &req.auth {
374            request_builder = request_builder.basic_auth(user, Some(pass));
375        }
376
377        // Body
378        if let Some(json_body) = &req.json {
379            request_builder = request_builder
380                .header("content-type", "application/json")
381                .body(serde_json::to_vec(json_body)?);
382        } else if let Some(data) = &req.data {
383            request_builder = request_builder.body(data.clone());
384        }
385
386        let request_headers_map = final_headers;
387
388        debug!(method, url, "sending request via wreq");
389
390        let resp = request_builder.send().await?;
391
392        let mut meta = HashMap::new();
393        if let Some(p) = proxy {
394            meta.insert("proxy".to_owned(), Value::String(p.server().to_owned()));
395        }
396
397        build_response_async(resp, request_headers_map, method, meta).await
398    }
399}
400
401impl Default for Fetcher {
402    fn default() -> Self {
403        Self::new()
404    }
405}
406
407// ---------------------------------------------------------------------------
408// FetcherSession — persistent client with cookie store
409// ---------------------------------------------------------------------------
410
411/// Session-based async HTTP fetcher that reuses a persistent client with cookie storage.
412///
413/// Unlike [`Fetcher`], this struct maintains a single wreq client across requests,
414/// which means cookies set by one response are automatically sent with subsequent
415/// requests. This is essential for login flows, CSRF-protected forms, and any
416/// multi-step interaction where server-side session state matters.
417///
418/// Lifecycle: create with [`new()`](Self::new), call [`open()`](Self::open) to start
419/// the session, make requests, then call [`close()`](Self::close) (or just drop it).
420pub struct FetcherSession {
421    config: FetcherConfig,
422    proxy_rotator: Option<ProxyRotator>,
423    parser_config: ParserConfig,
424    client: Option<wreq::Client>,
425}
426
427impl FetcherSession {
428    /// Creates a new session with the given configuration. The session is not yet
429    /// active -- you must call [`open()`](Self::open) before making requests.
430    pub fn new(config: FetcherConfig) -> Self {
431        Self {
432            config,
433            proxy_rotator: None,
434            parser_config: ParserConfig::default(),
435            client: None,
436        }
437    }
438
439    /// Attaches a proxy rotator to the session. Must be called before
440    /// [`open()`](Self::open) since the proxy is configured on the underlying client.
441    pub fn with_rotator(mut self, rotator: ProxyRotator) -> Self {
442        self.proxy_rotator = Some(rotator);
443        self
444    }
445
446    /// Sets the parser configuration for the session. Controls how responses from
447    /// this session parse and interpret HTML.
448    pub fn with_parser_config(mut self, parser_config: ParserConfig) -> Self {
449        self.parser_config = parser_config;
450        self
451    }
452
453    /// Opens the session by creating the underlying HTTP client with a cookie store.
454    /// Returns an error if the session is already active. After this call, you can
455    /// make requests with [`get()`](Self::get), [`post()`](Self::post), etc.
456    pub fn open(&mut self) -> Result<()> {
457        if self.client.is_some() {
458            return Err(FetchError::SessionAlreadyActive);
459        }
460
461        let mut builder = wreq::Client::builder()
462            .timeout(Duration::from_secs(self.config.timeout_secs))
463            .cookie_store(true);
464
465        if !self.config.verify {
466            builder = builder.cert_verification(false);
467        }
468
469        match self.config.follow_redirects {
470            FollowRedirects::None => {
471                builder = builder.redirect(wreq::redirect::Policy::none());
472            }
473            FollowRedirects::All | FollowRedirects::Safe => {
474                builder =
475                    builder.redirect(wreq::redirect::Policy::limited(self.config.max_redirects));
476            }
477        }
478
479        if let Some(ref p) = self.config.proxy {
480            let rp = wreq::Proxy::all(p.server())
481                .map_err(|e| FetchError::InvalidProxy(e.to_string()))?;
482            builder = builder.proxy(rp);
483        }
484
485        self.client = Some(builder.build()?);
486        Ok(())
487    }
488
489    /// Closes the session and drops the underlying HTTP client. All cookies and
490    /// connection state are discarded. The session can be re-opened with [`open()`](Self::open).
491    pub fn close(&mut self) {
492        self.client = None;
493    }
494
495    /// Returns `true` if the session is currently active (i.e., [`open()`](Self::open)
496    /// has been called and [`close()`](Self::close) has not).
497    pub fn is_active(&self) -> bool {
498        self.client.is_some()
499    }
500
501    /// Sends an HTTP GET request using the session client. Cookies from prior
502    /// responses are automatically included. Returns an error if the session is not active.
503    pub async fn get(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
504        self.request("GET", url, req.unwrap_or_default()).await
505    }
506
507    /// Sends an HTTP POST request using the session client. Use [`RequestConfig::json`]
508    /// or [`RequestConfig::data`] to attach a body.
509    pub async fn post(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
510        self.request("POST", url, req.unwrap_or_default()).await
511    }
512
513    /// Sends an HTTP PUT request using the session client. Use [`RequestConfig::json`]
514    /// or [`RequestConfig::data`] to attach a body.
515    pub async fn put(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
516        self.request("PUT", url, req.unwrap_or_default()).await
517    }
518
519    /// Sends an HTTP DELETE request using the session client.
520    pub async fn delete(&self, url: &str, req: Option<RequestConfig>) -> Result<Response> {
521        self.request("DELETE", url, req.unwrap_or_default()).await
522    }
523
524    async fn request(&self, method: &str, url: &str, req: RequestConfig) -> Result<Response> {
525        let client = self.client.as_ref().ok_or(FetchError::SessionNotActive)?;
526
527        let stealth = req.stealthy_headers.unwrap_or(self.config.stealthy_headers);
528        let impersonate = req.impersonate.as_ref().unwrap_or(&self.config.impersonate);
529        let impersonate_selected = impersonate.select();
530
531        let final_headers = merge_headers(
532            &self.config.headers,
533            &req,
534            stealth,
535            impersonate_selected.is_some(),
536        );
537
538        let http_method: wreq::Method = method
539            .parse()
540            .map_err(|_| FetchError::Other(format!("invalid HTTP method: {method}")))?;
541
542        let mut full_url = url::Url::parse(url)?;
543        if let Some(params) = &req.params {
544            let mut pairs = full_url.query_pairs_mut();
545            params.iter().for_each(|(k, v)| {
546                pairs.append_pair(k, v);
547            });
548        }
549
550        let mut request_builder = client.request(http_method, full_url.as_str());
551
552        if let Some(browser_name) = impersonate_selected {
553            if let Some(emulation) = resolve_emulation(browser_name) {
554                request_builder = request_builder.emulation(emulation);
555            }
556        }
557
558        for (k, v) in &final_headers {
559            request_builder = request_builder.header(k.as_str(), v.as_str());
560        }
561
562        if let Some(cookies) = &req.cookies {
563            let cookie_str = cookies
564                .iter()
565                .map(|(k, v)| format!("{k}={v}"))
566                .collect::<Vec<_>>()
567                .join("; ");
568            request_builder = request_builder.header("cookie", cookie_str);
569        }
570
571        if let Some((user, pass)) = &req.auth {
572            request_builder = request_builder.basic_auth(user, Some(pass));
573        }
574
575        if let Some(json_body) = &req.json {
576            request_builder = request_builder
577                .header("content-type", "application/json")
578                .body(serde_json::to_vec(json_body)?);
579        } else if let Some(data) = &req.data {
580            request_builder = request_builder.body(data.clone());
581        }
582
583        debug!(method, url, "sending request via wreq session");
584
585        let resp = request_builder.send().await?;
586
587        build_response_async(resp, final_headers, method, HashMap::new()).await
588    }
589}
590
591impl Drop for FetcherSession {
592    fn drop(&mut self) {
593        self.close();
594    }
595}