scrapling_fetch/config.rs
1//! Configuration types for the HTTP fetcher.
2//!
3//! This module contains all the knobs you can turn to control how requests are made.
4//! The central type is [`FetcherConfig`], which holds defaults for timeouts, retries,
5//! proxies, browser impersonation, and redirect behavior. Use [`FetcherConfigBuilder`]
6//! to construct a validated config with a fluent API.
7//!
8//! [`ParserConfig`] is a separate, smaller struct that controls how the HTML parser
9//! behaves (e.g., whether adaptive parsing is enabled).
10
11use std::collections::HashMap;
12
13use crate::proxy::{Proxy, ProxyRotator};
14
/// Policy for following HTTP redirects.
///
/// Decides whether the client follows 3xx responses on its own. The
/// [`Default`] value is [`Safe`](FollowRedirects::Safe), which only follows
/// redirects for GET and HEAD requests -- this prevents accidentally
/// re-submitting POST bodies to a new URL.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum FollowRedirects {
    /// Do not follow any redirects. The caller receives the raw 3xx response and is
    /// responsible for handling the `Location` header manually.
    None,
    /// Follow redirects only for safe (non-mutating) HTTP methods like GET and HEAD.
    /// This is the default and usually what you want.
    #[default]
    Safe,
    /// Follow all redirects regardless of HTTP method, including POST and PUT.
    /// Use with caution -- this can re-submit request bodies to unexpected URLs.
    All,
}
32
33/// Configuration for the HTTP fetcher.
34///
35/// This struct holds the default settings applied to every request made by a
36/// [`Fetcher`](crate::Fetcher) or [`FetcherSession`](crate::FetcherSession).
37/// Individual requests can override most of these via [`RequestConfig`](crate::RequestConfig).
38/// Use [`FetcherConfigBuilder`] for a validated, fluent construction path.
39#[derive(Debug, Clone)]
40pub struct FetcherConfig {
41 /// The browser impersonation profile to use. Controls which TLS and HTTP/2
42 /// fingerprint the client presents to the server. Defaults to Chrome.
43 pub impersonate: Impersonate,
44 /// Whether to inject stealth headers (Referer, Sec-Ch-Ua, etc.) that make
45 /// requests look like they come from a real browser. Enabled by default.
46 pub stealthy_headers: bool,
47 /// An optional static proxy to route all requests through. If you need to
48 /// rotate across multiple proxies, use a [`ProxyRotator`](crate::ProxyRotator) instead.
49 pub proxy: Option<Proxy>,
50 /// Request timeout in seconds. Defaults to 30. Applies to the entire request
51 /// lifecycle including DNS resolution, connection, and response body download.
52 pub timeout_secs: u64,
53 /// Default headers to include with every request. These are merged with
54 /// per-request headers, with per-request values taking precedence on conflict.
55 pub headers: HashMap<String, String>,
56 /// Maximum number of retry attempts per request. Defaults to 3. Set to 1 to
57 /// disable retries entirely.
58 pub retries: u32,
59 /// Delay in seconds between retry attempts. Defaults to 1. This is a fixed
60 /// delay, not exponential backoff.
61 pub retry_delay_secs: u64,
62 /// The redirect-following policy. Defaults to [`FollowRedirects::Safe`].
63 pub follow_redirects: FollowRedirects,
64 /// Maximum number of redirects to follow before giving up. Defaults to 30.
65 /// Only applies when `follow_redirects` is not [`FollowRedirects::None`].
66 pub max_redirects: usize,
67 /// Whether to verify TLS certificates. Defaults to `true`. Set to `false`
68 /// only for testing against self-signed certificates -- never in production.
69 pub verify: bool,
70}
71
72impl Default for FetcherConfig {
73 fn default() -> Self {
74 Self {
75 impersonate: Impersonate::default(),
76 stealthy_headers: true,
77 proxy: None,
78 timeout_secs: 30,
79 headers: HashMap::new(),
80 retries: 3,
81 retry_delay_secs: 1,
82 follow_redirects: FollowRedirects::Safe,
83 max_redirects: 30,
84 verify: true,
85 }
86 }
87}
88
/// Browser impersonation strategy for TLS/HTTP fingerprinting.
///
/// Bot-detection services fingerprint the TLS ClientHello and the HTTP/2
/// settings to tell real browsers apart from HTTP libraries. This enum picks
/// which browser profile the underlying wreq client emulates. The default is
/// `Single("chrome")`.
#[derive(Clone, Debug)]
pub enum Impersonate {
    /// Impersonation is off. The client uses wreq's default TLS settings, which
    /// sophisticated bot-detection systems may flag as non-browser traffic.
    None,
    /// Use one specific browser profile for all requests. Pass a string such as
    /// `"chrome"`, `"firefox"`, or `"safari"` (see
    /// [`client::resolve_emulation`](crate::client) for the full list of
    /// supported names).
    Single(String),
    /// Pick a profile at random from this list on each request. The added
    /// fingerprint diversity can help avoid detection when scraping at scale.
    Random(Vec<String>),
}
108
109impl Default for Impersonate {
110 fn default() -> Self {
111 Self::Single("chrome".to_owned())
112 }
113}
114
115impl Impersonate {
116 /// Returns the browser profile name to use for the current request, or `None`
117 /// if impersonation is disabled.
118 ///
119 /// For [`Impersonate::Random`], a new profile is selected each time this method
120 /// is called, so consecutive calls may return different values.
121 pub fn select(&self) -> Option<&str> {
122 match self {
123 Self::None => None,
124 Self::Single(s) => Some(s.as_str()),
125 Self::Random(list) => {
126 if list.is_empty() {
127 None
128 } else {
129 use rand::Rng;
130 let idx = rand::thread_rng().gen_range(0..list.len());
131 Some(list[idx].as_str())
132 }
133 }
134 }
135 }
136}
137
138/// Builder for constructing a [`FetcherConfig`] with validation.
139///
140/// The builder provides a fluent API for setting configuration options and catches
141/// invalid combinations at build time (e.g., setting both a static proxy and a proxy
142/// rotator). Call [`build()`](FetcherConfigBuilder::build) to get the validated config.
143///
144/// ```rust,no_run
145/// # use scrapling_fetch::config::*;
146/// let (config, rotator) = FetcherConfigBuilder::new()
147/// .timeout_secs(10)
148/// .retries(5)
149/// .follow_redirects(FollowRedirects::All)
150/// .build()
151/// .unwrap();
152/// ```
153pub struct FetcherConfigBuilder {
154 config: FetcherConfig,
155 proxy_rotator: Option<ProxyRotator>,
156}
157
158impl FetcherConfigBuilder {
159 /// Creates a new builder pre-populated with the same defaults as
160 /// [`FetcherConfig::default()`] -- 30s timeout, 3 retries, Chrome impersonation, etc.
161 pub fn new() -> Self {
162 Self {
163 config: FetcherConfig::default(),
164 proxy_rotator: None,
165 }
166 }
167
168 /// Sets the browser impersonation profile. See [`Impersonate`] for the available
169 /// strategies (none, single browser, or random rotation).
170 pub fn impersonate(mut self, imp: Impersonate) -> Self {
171 self.config.impersonate = imp;
172 self
173 }
174
175 /// Enables or disables stealth header injection. When enabled, the fetcher adds
176 /// browser-like headers (Referer, Sec-Fetch-*, etc.) to help bypass bot detection.
177 pub fn stealthy_headers(mut self, enabled: bool) -> Self {
178 self.config.stealthy_headers = enabled;
179 self
180 }
181
182 /// Sets a static proxy for all requests. Cannot be combined with
183 /// [`proxy_rotator()`](Self::proxy_rotator) -- the builder will return an error on
184 /// [`build()`](Self::build) if both are set.
185 pub fn proxy(mut self, proxy: Proxy) -> Self {
186 self.config.proxy = Some(proxy);
187 self
188 }
189
190 /// Sets the request timeout in seconds. This covers the entire request lifecycle
191 /// including DNS, TLS handshake, and body download.
192 pub fn timeout_secs(mut self, secs: u64) -> Self {
193 self.config.timeout_secs = secs;
194 self
195 }
196
197 /// Adds a single default header that will be sent with every request.
198 /// Call multiple times to add several headers. Per-request headers in
199 /// [`RequestConfig`](crate::RequestConfig) take precedence over these.
200 pub fn header(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
201 self.config.headers.insert(name.into(), value.into());
202 self
203 }
204
205 /// Replaces all default headers with the given map. Any headers previously
206 /// added with [`header()`](Self::header) are discarded.
207 pub fn headers(mut self, headers: HashMap<String, String>) -> Self {
208 self.config.headers = headers;
209 self
210 }
211
212 /// Sets the maximum number of retry attempts. A value of 1 means the request is
213 /// tried once with no retries. The default is 3.
214 pub fn retries(mut self, retries: u32) -> Self {
215 self.config.retries = retries;
216 self
217 }
218
219 /// Sets the fixed delay in seconds between retries. There is no exponential
220 /// backoff -- each retry waits exactly this long.
221 pub fn retry_delay_secs(mut self, secs: u64) -> Self {
222 self.config.retry_delay_secs = secs;
223 self
224 }
225
226 /// Sets the redirect-following policy. See [`FollowRedirects`] for the options.
227 pub fn follow_redirects(mut self, policy: FollowRedirects) -> Self {
228 self.config.follow_redirects = policy;
229 self
230 }
231
232 /// Sets the maximum number of redirects to follow. If this limit is exceeded,
233 /// the request fails with an error rather than looping indefinitely.
234 pub fn max_redirects(mut self, max: usize) -> Self {
235 self.config.max_redirects = max;
236 self
237 }
238
239 /// Enables or disables TLS certificate verification. Disabling this is a
240 /// security risk and should only be used for testing with self-signed certs.
241 pub fn verify(mut self, verify: bool) -> Self {
242 self.config.verify = verify;
243 self
244 }
245
246 /// Sets a proxy rotator for distributing requests across multiple proxies.
247 /// Cannot be combined with [`proxy()`](Self::proxy) -- the builder will return
248 /// an error on [`build()`](Self::build) if both are set.
249 pub fn proxy_rotator(mut self, rotator: ProxyRotator) -> Self {
250 self.proxy_rotator = Some(rotator);
251 self
252 }
253
254 /// Validates and builds the configuration, returning a tuple of the config and
255 /// an optional proxy rotator. Returns an error if both a static proxy and a proxy
256 /// rotator were configured, since those options are mutually exclusive.
257 pub fn build(self) -> crate::error::Result<(FetcherConfig, Option<ProxyRotator>)> {
258 if self.proxy_rotator.is_some() && self.config.proxy.is_some() {
259 return Err(crate::error::FetchError::InvalidProxy(
260 "cannot use proxy_rotator together with static proxy".into(),
261 ));
262 }
263 Ok((self.config, self.proxy_rotator))
264 }
265}
266
267impl Default for FetcherConfigBuilder {
268 fn default() -> Self {
269 Self::new()
270 }
271}
272
/// Settings for the HTML parser.
///
/// Governs optional parsing features layered on top of the core scrapling
/// selector engine. At present that means adaptive parsing only, which remembers
/// page structure from prior crawls to improve extraction reliability.
#[derive(Clone, Debug, Default)]
pub struct ParserConfig {
    /// Turns adaptive parsing on. When on, the parser stores structural
    /// fingerprints of previously-seen pages and uses them to locate elements
    /// even when the HTML layout changes. The derived default is `false`.
    pub adaptive: bool,
    /// Domain that adaptive parsing is scoped to, so structural data from one
    /// site does not bleed into parsing heuristics for another. The derived
    /// default is the empty string.
    pub adaptive_domain: String,
}