adler_core/client/builder.rs
1//! `ClientBuilder` — public configuration surface for [`Client`].
2//!
3//! Every CLI flag that affects HTTP behaviour (timeout / retries /
4//! proxy / Tor / UA rotation / egress pool / sessions / browser
5//! backend / escalation budget) maps onto a method here. The builder
6//! pattern lets `.build()` enforce derived invariants (the impersonate
7//! transport must initialise before the client is returned) without
8//! exposing them as fallible setters.
9
10use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::Duration;
14
15use reqwest::redirect;
16
17use crate::access::{EgressPool, EgressSpec, SessionStore};
18use crate::browser::{BrowserBackend, BrowserBudget};
19use crate::error::{Error, Result};
20use crate::retry::RetryPolicy;
21use crate::robots::RobotsCache;
22use crate::throttle::HostThrottle;
23use crate::transport::HttpFetcher;
24#[cfg(feature = "impersonate")]
25use crate::transport::ImpersonateFetcher;
26
27use super::util::default_user_agent;
28use super::{
29 Client, DEFAULT_CONNECT_TIMEOUT, DEFAULT_PER_HOST_INTERVAL, DEFAULT_REDIRECT_LIMIT,
30 DEFAULT_TIMEOUT,
31};
32
33/// Builder for [`Client`].
34#[derive(Clone)]
35#[must_use = "ClientBuilder does nothing until `.build()` is called"]
36// A configuration builder accumulates many small flags; the four bool
37// fields here are semantically independent (redirect / enrich /
38// respect-robots / escalation), so collapsing them into a state machine
39// or enum would obscure rather than clarify.
40#[allow(clippy::struct_excessive_bools)]
41pub struct ClientBuilder {
42 timeout: Duration,
43 connect_timeout: Duration,
44 user_agent: String,
45 follow_redirects: bool,
46 redirect_limit: usize,
47 min_request_interval: Duration,
48 max_rps: Option<NonZeroU32>,
49 retry: RetryPolicy,
50 proxy: Option<String>,
51 user_agents: Vec<String>,
52 enrich: bool,
53 respect_robots: bool,
54 browser: Option<Arc<dyn BrowserBackend>>,
55 browser_budget: usize,
56 egress: Vec<EgressSpec>,
57 sessions: SessionStore,
58 escalation_budget: usize,
59 escalation_enabled: bool,
60}
61
62impl Default for ClientBuilder {
63 fn default() -> Self {
64 Self {
65 timeout: DEFAULT_TIMEOUT,
66 connect_timeout: DEFAULT_CONNECT_TIMEOUT,
67 user_agent: default_user_agent(),
68 follow_redirects: true,
69 redirect_limit: DEFAULT_REDIRECT_LIMIT,
70 min_request_interval: DEFAULT_PER_HOST_INTERVAL,
71 max_rps: None,
72 retry: RetryPolicy::default(),
73 proxy: None,
74 user_agents: Vec::new(),
75 enrich: false,
76 respect_robots: false,
77 browser: None,
78 browser_budget: DEFAULT_BROWSER_BUDGET,
79 egress: Vec::new(),
80 sessions: SessionStore::new(),
81 escalation_budget: DEFAULT_ESCALATION_BUDGET,
82 escalation_enabled: true,
83 }
84 }
85}
86
87impl ClientBuilder {
88 /// Per-request timeout (covers connect, headers, and body read).
89 pub fn timeout(mut self, timeout: Duration) -> Self {
90 self.timeout = timeout;
91 self
92 }
93
94 /// TCP-connect timeout, applied independently of the request timeout.
95 pub fn connect_timeout(mut self, timeout: Duration) -> Self {
96 self.connect_timeout = timeout;
97 self
98 }
99
100 /// Override the `User-Agent` header sent on every request.
101 pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
102 self.user_agent = user_agent.into();
103 self
104 }
105
106 /// Toggle automatic redirect following. Defaults to `true`; disable when
107 /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
108 pub fn follow_redirects(mut self, follow: bool) -> Self {
109 self.follow_redirects = follow;
110 self
111 }
112
113 /// Minimum time between consecutive requests to the same host.
114 ///
115 /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
116 /// rate-limit responses on common OSINT targets while keeping fan-out
117 /// across many sites fast.
118 pub fn min_request_interval(mut self, interval: Duration) -> Self {
119 self.min_request_interval = interval;
120 self
121 }
122
123 /// Cap the total request rate across *all* hosts to `rps` requests per
124 /// second. Independent of (and composed with) the per-host interval —
125 /// useful on a metered connection or behind a shared-quota proxy.
126 /// Uncapped by default.
127 pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
128 self.max_rps = Some(rps);
129 self
130 }
131
132 /// Maximum retry attempts after a transient ban response. Defaults to 2
133 /// (so up to 3 total tries). Set to `0` to disable retry entirely.
134 pub fn max_retries(mut self, n: u32) -> Self {
135 self.retry.max_retries = n;
136 self
137 }
138
139 /// Base delay for the first retry. Subsequent retries double until
140 /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
141 pub fn base_backoff_delay(mut self, d: Duration) -> Self {
142 self.retry.base_delay = d;
143 self
144 }
145
146 /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
147 pub fn max_backoff_delay(mut self, d: Duration) -> Self {
148 self.retry.max_delay = d;
149 self
150 }
151
152 /// Route all requests through a proxy. Accepts `http://`, `https://`,
153 /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
154 pub fn proxy(mut self, url: impl Into<String>) -> Self {
155 self.proxy = Some(url.into());
156 self
157 }
158
159 /// Rotate the `User-Agent` header per request, picking uniformly at
160 /// random from `agents`. An empty list (the default) keeps the single
161 /// fixed User-Agent. Useful for reducing trivial fingerprinting.
162 pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
163 self.user_agents = agents;
164 self
165 }
166
167 /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
168 /// pages. Off by default; enables an extra body read for matching sites.
169 pub fn enrich(mut self, enrich: bool) -> Self {
170 self.enrich = enrich;
171 self
172 }
173
174 /// Honor each host's `robots.txt`: probes to disallowed paths are
175 /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
176 /// default. Adds one cached `robots.txt` fetch per origin.
177 pub fn respect_robots(mut self, respect: bool) -> Self {
178 self.respect_robots = respect;
179 self
180 }
181
182 /// Attach a browser backend. Sites tagged `bot-protected` will be
183 /// routed through it instead of the raw HTTP path, up to the
184 /// [`browser_budget`](Self::browser_budget) cap.
185 pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
186 self.browser = Some(backend);
187 self
188 }
189
190 /// Per-scan cap on how many `bot-protected` sites are allowed to use
191 /// the browser backend. Once exhausted, the rest fall back to
192 /// `Uncertain(BrowserBudget)`. Defaults to
193 /// [`DEFAULT_BROWSER_BUDGET`].
194 pub const fn browser_budget(mut self, cap: usize) -> Self {
195 self.browser_budget = cap;
196 self
197 }
198
199 /// Per-scan cap on automatic escalations from the cheap transport
200 /// (HTTP / impersonate) to the browser when the cheap path returns
201 /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
202 /// [`browser_budget`](Self::browser_budget). Defaults to
203 /// [`DEFAULT_ESCALATION_BUDGET`]. `cap = 0` is equivalent to
204 /// [`disable_escalation`](Self::disable_escalation).
205 pub const fn escalation_budget(mut self, cap: usize) -> Self {
206 self.escalation_budget = cap;
207 self
208 }
209
210 /// Disable automatic escalation entirely — the cheap transport's
211 /// outcome is returned verbatim, even when its `Uncertain` reason is
212 /// one a browser fetch would resolve. Useful for benchmarking the
213 /// raw HTTP signals without the access-engine lift on top.
214 pub const fn disable_escalation(mut self) -> Self {
215 self.escalation_enabled = false;
216 self
217 }
218
219 /// Configure the egress pool: proxies tagged by country / IP type
220 /// that sites with an `access` policy can require. Sites without a
221 /// policy are unaffected (they use the default egress / `--proxy`).
222 /// Replaces any previously set pool.
223 pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
224 self.egress = egress;
225 self
226 }
227
228 /// Supply operator authenticated sessions. A site whose `access`
229 /// policy names a session has that session's headers (cookies /
230 /// tokens) applied to its probe; a named-but-missing session yields
231 /// `Uncertain(SessionRequired)` rather than a login-wall false
232 /// negative. Replaces any previously set store.
233 pub fn sessions(mut self, sessions: SessionStore) -> Self {
234 self.sessions = sessions;
235 self
236 }
237
238 /// Build a [`Client`].
239 pub fn build(self) -> Result<Client> {
240 let inner = build_reqwest(
241 &self.user_agent,
242 self.timeout,
243 self.connect_timeout,
244 self.follow_redirects,
245 self.redirect_limit,
246 self.proxy.as_deref(),
247 )?;
248
249 // One HTTP client per configured egress — `reqwest` bakes the
250 // proxy in at build time, so geo / IP-type routing means a
251 // distinct client per proxy, paired with its match metadata.
252 let mut egress_entries = Vec::with_capacity(self.egress.len());
253 for spec in &self.egress {
254 let client = build_reqwest(
255 &self.user_agent,
256 self.timeout,
257 self.connect_timeout,
258 self.follow_redirects,
259 self.redirect_limit,
260 Some(&spec.url),
261 )?;
262 egress_entries.push((
263 spec.name.clone(),
264 spec.country.clone(),
265 spec.kind,
266 Arc::new(HttpFetcher::new(client)),
267 ));
268 }
269
270 let global_throttle = self.max_rps.map(|rps| {
271 // Min spacing between any two requests = 1s / rps.
272 let interval = Duration::from_secs(1) / rps.get();
273 HostThrottle::new(interval)
274 });
275 let robots = self
276 .respect_robots
277 .then(|| RobotsCache::new(inner.clone(), "adler"));
278 // Build the impersonate fetcher up front when the feature is on;
279 // surface a wreq init failure as `HttpSetup` so the caller sees
280 // it the same way they'd see a bad `--proxy` URL.
281 #[cfg(feature = "impersonate")]
282 let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
283 Ok(Client {
284 http: Arc::new(HttpFetcher::new(inner)),
285 egress: Arc::new(EgressPool::new(egress_entries)),
286 sessions: Arc::new(self.sessions),
287 throttle: HostThrottle::new(self.min_request_interval),
288 global_throttle,
289 retry: self.retry,
290 user_agents: Arc::from(self.user_agents),
291 enrich: self.enrich,
292 robots,
293 browser: self.browser,
294 browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
295 escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
296 self.escalation_budget,
297 )),
298 escalation_enabled: self.escalation_enabled,
299 #[cfg(feature = "impersonate")]
300 impersonate,
301 })
302 }
303}
304
305/// Build a configured `reqwest::Client`, optionally routed through a
306/// proxy. Shared by the default client and every egress in the pool so
307/// they get identical timeout / redirect / User-Agent settings.
308fn build_reqwest(
309 user_agent: &str,
310 timeout: Duration,
311 connect_timeout: Duration,
312 follow_redirects: bool,
313 redirect_limit: usize,
314 proxy: Option<&str>,
315) -> Result<reqwest::Client> {
316 let redirect_policy = if follow_redirects {
317 redirect::Policy::limited(redirect_limit)
318 } else {
319 redirect::Policy::none()
320 };
321 let mut builder = reqwest::Client::builder()
322 .user_agent(user_agent.to_owned())
323 .timeout(timeout)
324 .connect_timeout(connect_timeout)
325 .redirect(redirect_policy);
326 if let Some(proxy_url) = proxy {
327 // reqwest treats a schemeless string (e.g. "not-a-url") as a host
328 // and silently defaults it to http://, so every probe would fail
329 // confusingly. Require an explicit, supported scheme up front.
330 const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
331 if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
332 return Err(Error::HttpSetup {
333 message: format!(
334 "invalid proxy {proxy_url:?}: must start with one of {}",
335 SCHEMES.join(", ")
336 ),
337 });
338 }
339 let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
340 message: format!("invalid proxy {proxy_url:?}: {e}"),
341 })?;
342 builder = builder.proxy(proxy);
343 }
344 builder.build().map_err(|e| Error::HttpSetup {
345 message: e.to_string(),
346 })
347}
348
/// Number of browser-backed probes a scan may perform when the caller
/// does not set a budget explicitly.
///
/// Roughly 5× the typical `bot-protected` registry subset: generous
/// headroom in normal use, yet still a guardrail that stops a
/// misconfigured flag from burning through a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
356
/// Number of *automatic escalation* fetches a scan may perform by default
/// (HTTP / impersonate → browser, triggered when the cheap path returns
/// `Uncertain(CloudflareChallenge | RateLimited)`).
///
/// This budget is separate from [`DEFAULT_BROWSER_BUDGET`]. A
/// `bot-protected` site that goes straight to the browser spends browser
/// budget only, whereas a site that was not pre-tagged and escalates from
/// HTTP to the browser spends one unit of each. The value is sized so
/// that an escalation rate of a few percent across a typical registry
/// stays below the cap without any tuning.
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
367
368impl fmt::Debug for ClientBuilder {
369 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
370 f.debug_struct("ClientBuilder")
371 .field("timeout", &self.timeout)
372 .field("connect_timeout", &self.connect_timeout)
373 .field("user_agent", &self.user_agent)
374 .field("follow_redirects", &self.follow_redirects)
375 .field("redirect_limit", &self.redirect_limit)
376 .field("min_request_interval", &self.min_request_interval)
377 .field("max_rps", &self.max_rps)
378 .field("retry", &self.retry)
379 .field("proxy", &self.proxy)
380 .field("user_agents", &self.user_agents)
381 .field("enrich", &self.enrich)
382 .field("respect_robots", &self.respect_robots)
383 .field("browser", &self.browser.is_some())
384 .field("browser_budget", &self.browser_budget)
385 .field("egress", &self.egress)
386 .field("sessions", &self.sessions)
387 .field("escalation_budget", &self.escalation_budget)
388 .field("escalation_enabled", &self.escalation_enabled)
389 .finish()
390 }
391}