Skip to main content

scrapling_fetch/
config.rs

1//! Configuration types for the HTTP fetcher.
2//!
3//! This module contains all the knobs you can turn to control how requests are made.
4//! The central type is [`FetcherConfig`], which holds defaults for timeouts, retries,
5//! proxies, browser impersonation, and redirect behavior. Use [`FetcherConfigBuilder`]
6//! to construct a validated config with a fluent API.
7//!
8//! [`ParserConfig`] is a separate, smaller struct that controls how the HTML parser
9//! behaves (e.g., whether adaptive parsing is enabled).
10
11use std::collections::HashMap;
12
13use crate::proxy::{Proxy, ProxyRotator};
14
/// Policy for following HTTP redirects.
///
/// Decides whether the client automatically follows 3xx responses. The default --
/// also what [`Default::default`] returns -- is [`Safe`](FollowRedirects::Safe), which
/// only follows redirects for GET and HEAD requests so that a POST body is never
/// accidentally re-submitted to a new URL.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum FollowRedirects {
    /// Never follow redirects. The caller gets the raw 3xx response back and has to
    /// act on the `Location` header itself.
    None,
    /// Follow redirects only for safe (non-mutating) HTTP methods such as GET and
    /// HEAD. This is the default and usually what you want.
    #[default]
    Safe,
    /// Follow every redirect regardless of HTTP method, including POST and PUT.
    /// Use with caution -- request bodies may be re-submitted to unexpected URLs.
    All,
}
32
33/// Configuration for the HTTP fetcher.
34///
35/// This struct holds the default settings applied to every request made by a
36/// [`Fetcher`](crate::Fetcher) or [`FetcherSession`](crate::FetcherSession).
37/// Individual requests can override most of these via [`RequestConfig`](crate::RequestConfig).
38/// Use [`FetcherConfigBuilder`] for a validated, fluent construction path.
39#[derive(Debug, Clone)]
40pub struct FetcherConfig {
41    /// The browser impersonation profile to use. Controls which TLS and HTTP/2
42    /// fingerprint the client presents to the server. Defaults to Chrome.
43    pub impersonate: Impersonate,
44    /// Whether to inject stealth headers (Referer, Sec-Ch-Ua, etc.) that make
45    /// requests look like they come from a real browser. Enabled by default.
46    pub stealthy_headers: bool,
47    /// An optional static proxy to route all requests through. If you need to
48    /// rotate across multiple proxies, use a [`ProxyRotator`](crate::ProxyRotator) instead.
49    pub proxy: Option<Proxy>,
50    /// Request timeout in seconds. Defaults to 30. Applies to the entire request
51    /// lifecycle including DNS resolution, connection, and response body download.
52    pub timeout_secs: u64,
53    /// Default headers to include with every request. These are merged with
54    /// per-request headers, with per-request values taking precedence on conflict.
55    pub headers: HashMap<String, String>,
56    /// Maximum number of retry attempts per request. Defaults to 3. Set to 1 to
57    /// disable retries entirely.
58    pub retries: u32,
59    /// Delay in seconds between retry attempts. Defaults to 1. This is a fixed
60    /// delay, not exponential backoff.
61    pub retry_delay_secs: u64,
62    /// The redirect-following policy. Defaults to [`FollowRedirects::Safe`].
63    pub follow_redirects: FollowRedirects,
64    /// Maximum number of redirects to follow before giving up. Defaults to 30.
65    /// Only applies when `follow_redirects` is not [`FollowRedirects::None`].
66    pub max_redirects: usize,
67    /// Whether to verify TLS certificates. Defaults to `true`. Set to `false`
68    /// only for testing against self-signed certificates -- never in production.
69    pub verify: bool,
70}
71
72impl Default for FetcherConfig {
73    fn default() -> Self {
74        Self {
75            impersonate: Impersonate::default(),
76            stealthy_headers: true,
77            proxy: None,
78            timeout_secs: 30,
79            headers: HashMap::new(),
80            retries: 3,
81            retry_delay_secs: 1,
82            follow_redirects: FollowRedirects::Safe,
83            max_redirects: 30,
84            verify: true,
85        }
86    }
87}
88
/// Browser impersonation strategy for TLS/HTTP fingerprinting.
///
/// Modern bot-detection services fingerprint the TLS ClientHello and HTTP/2 settings
/// to tell real browsers apart from HTTP libraries. This enum selects which browser
/// profile the underlying wreq client emulates. The default is `Single("chrome")`.
///
/// Values compare by variant and payload, so two strategies are equal only when they
/// name the same profile(s) in the same order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Impersonate {
    /// No browser impersonation. The client uses wreq's default TLS settings, which
    /// sophisticated bot-detection systems may flag as non-browser traffic.
    None,
    /// Impersonate one specific browser profile for all requests. Pass a string
    /// like `"chrome"`, `"firefox"`, or `"safari"` (see [`client::resolve_emulation`](crate::client)
    /// for the full list of supported names).
    Single(String),
    /// Pick randomly from a list of browser profiles on each request. This adds
    /// diversity to your fingerprint, which can help avoid detection when scraping
    /// at scale.
    Random(Vec<String>),
}
108
109impl Default for Impersonate {
110    fn default() -> Self {
111        Self::Single("chrome".to_owned())
112    }
113}
114
115impl Impersonate {
116    /// Returns the browser profile name to use for the current request, or `None`
117    /// if impersonation is disabled.
118    ///
119    /// For [`Impersonate::Random`], a new profile is selected each time this method
120    /// is called, so consecutive calls may return different values.
121    pub fn select(&self) -> Option<&str> {
122        match self {
123            Self::None => None,
124            Self::Single(s) => Some(s.as_str()),
125            Self::Random(list) => {
126                if list.is_empty() {
127                    None
128                } else {
129                    use rand::Rng;
130                    let idx = rand::thread_rng().gen_range(0..list.len());
131                    Some(list[idx].as_str())
132                }
133            }
134        }
135    }
136}
137
138/// Builder for constructing a [`FetcherConfig`] with validation.
139///
140/// The builder provides a fluent API for setting configuration options and catches
141/// invalid combinations at build time (e.g., setting both a static proxy and a proxy
142/// rotator). Call [`build()`](FetcherConfigBuilder::build) to get the validated config.
143///
144/// ```rust,no_run
145/// # use scrapling_fetch::config::*;
146/// let (config, rotator) = FetcherConfigBuilder::new()
147///     .timeout_secs(10)
148///     .retries(5)
149///     .follow_redirects(FollowRedirects::All)
150///     .build()
151///     .unwrap();
152/// ```
153pub struct FetcherConfigBuilder {
154    config: FetcherConfig,
155    proxy_rotator: Option<ProxyRotator>,
156}
157
158impl FetcherConfigBuilder {
159    /// Creates a new builder pre-populated with the same defaults as
160    /// [`FetcherConfig::default()`] -- 30s timeout, 3 retries, Chrome impersonation, etc.
161    pub fn new() -> Self {
162        Self {
163            config: FetcherConfig::default(),
164            proxy_rotator: None,
165        }
166    }
167
168    /// Sets the browser impersonation profile. See [`Impersonate`] for the available
169    /// strategies (none, single browser, or random rotation).
170    pub fn impersonate(mut self, imp: Impersonate) -> Self {
171        self.config.impersonate = imp;
172        self
173    }
174
175    /// Enables or disables stealth header injection. When enabled, the fetcher adds
176    /// browser-like headers (Referer, Sec-Fetch-*, etc.) to help bypass bot detection.
177    pub fn stealthy_headers(mut self, enabled: bool) -> Self {
178        self.config.stealthy_headers = enabled;
179        self
180    }
181
182    /// Sets a static proxy for all requests. Cannot be combined with
183    /// [`proxy_rotator()`](Self::proxy_rotator) -- the builder will return an error on
184    /// [`build()`](Self::build) if both are set.
185    pub fn proxy(mut self, proxy: Proxy) -> Self {
186        self.config.proxy = Some(proxy);
187        self
188    }
189
190    /// Sets the request timeout in seconds. This covers the entire request lifecycle
191    /// including DNS, TLS handshake, and body download.
192    pub fn timeout_secs(mut self, secs: u64) -> Self {
193        self.config.timeout_secs = secs;
194        self
195    }
196
197    /// Adds a single default header that will be sent with every request.
198    /// Call multiple times to add several headers. Per-request headers in
199    /// [`RequestConfig`](crate::RequestConfig) take precedence over these.
200    pub fn header(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
201        self.config.headers.insert(name.into(), value.into());
202        self
203    }
204
205    /// Replaces all default headers with the given map. Any headers previously
206    /// added with [`header()`](Self::header) are discarded.
207    pub fn headers(mut self, headers: HashMap<String, String>) -> Self {
208        self.config.headers = headers;
209        self
210    }
211
212    /// Sets the maximum number of retry attempts. A value of 1 means the request is
213    /// tried once with no retries. The default is 3.
214    pub fn retries(mut self, retries: u32) -> Self {
215        self.config.retries = retries;
216        self
217    }
218
219    /// Sets the fixed delay in seconds between retries. There is no exponential
220    /// backoff -- each retry waits exactly this long.
221    pub fn retry_delay_secs(mut self, secs: u64) -> Self {
222        self.config.retry_delay_secs = secs;
223        self
224    }
225
226    /// Sets the redirect-following policy. See [`FollowRedirects`] for the options.
227    pub fn follow_redirects(mut self, policy: FollowRedirects) -> Self {
228        self.config.follow_redirects = policy;
229        self
230    }
231
232    /// Sets the maximum number of redirects to follow. If this limit is exceeded,
233    /// the request fails with an error rather than looping indefinitely.
234    pub fn max_redirects(mut self, max: usize) -> Self {
235        self.config.max_redirects = max;
236        self
237    }
238
239    /// Enables or disables TLS certificate verification. Disabling this is a
240    /// security risk and should only be used for testing with self-signed certs.
241    pub fn verify(mut self, verify: bool) -> Self {
242        self.config.verify = verify;
243        self
244    }
245
246    /// Sets a proxy rotator for distributing requests across multiple proxies.
247    /// Cannot be combined with [`proxy()`](Self::proxy) -- the builder will return
248    /// an error on [`build()`](Self::build) if both are set.
249    pub fn proxy_rotator(mut self, rotator: ProxyRotator) -> Self {
250        self.proxy_rotator = Some(rotator);
251        self
252    }
253
254    /// Validates and builds the configuration, returning a tuple of the config and
255    /// an optional proxy rotator. Returns an error if both a static proxy and a proxy
256    /// rotator were configured, since those options are mutually exclusive.
257    pub fn build(self) -> crate::error::Result<(FetcherConfig, Option<ProxyRotator>)> {
258        if self.proxy_rotator.is_some() && self.config.proxy.is_some() {
259            return Err(crate::error::FetchError::InvalidProxy(
260                "cannot use proxy_rotator together with static proxy".into(),
261            ));
262        }
263        Ok((self.config, self.proxy_rotator))
264    }
265}
266
267impl Default for FetcherConfigBuilder {
268    fn default() -> Self {
269        Self::new()
270    }
271}
272
/// Settings for the HTML parser.
///
/// These toggle optional features layered on top of the core scrapling selector
/// engine. At the moment the only such feature is adaptive parsing, which remembers
/// page structure from earlier crawls to make extraction more reliable.
///
/// The derived [`Default`] leaves adaptive parsing off with an empty domain.
#[derive(Debug, Clone, Default)]
pub struct ParserConfig {
    /// Turns on adaptive parsing based on prior page structure. When set, the
    /// parser stores structural fingerprints of previously-seen pages and uses
    /// them to locate elements even after the HTML layout changes.
    pub adaptive: bool,
    /// Domain that adaptive parsing is scoped to, so structural data gathered on
    /// one site does not bleed into the parsing heuristics for another.
    pub adaptive_domain: String,
}