Skip to main content

adler_core/client/
builder.rs

1//! `ClientBuilder` — public configuration surface for [`Client`].
2//!
3//! Every CLI flag that affects HTTP behaviour (timeout / retries /
4//! proxy / Tor / UA rotation / egress pool / sessions / browser
5//! backend / escalation budget) maps onto a method here. The builder
6//! pattern lets `.build()` enforce derived invariants (the impersonate
7//! transport must initialise before the client is returned) without
8//! exposing them as fallible setters.
9
10use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::Duration;
14
15use reqwest::redirect;
16
17use crate::access::{EgressPool, EgressSpec, SessionStore};
18use crate::browser::{BrowserBackend, BrowserBudget};
19use crate::error::{Error, Result};
20use crate::retry::RetryPolicy;
21use crate::robots::RobotsCache;
22use crate::throttle::HostThrottle;
23use crate::transport::HttpFetcher;
24#[cfg(feature = "impersonate")]
25use crate::transport::ImpersonateFetcher;
26
27use super::util::default_user_agent;
28use super::{
29    Client, DEFAULT_CONNECT_TIMEOUT, DEFAULT_PER_HOST_INTERVAL, DEFAULT_REDIRECT_LIMIT,
30    DEFAULT_TIMEOUT,
31};
32
33/// Builder for [`Client`].
34#[derive(Clone)]
35#[must_use = "ClientBuilder does nothing until `.build()` is called"]
36// A configuration builder accumulates many small flags; the four bool
37// fields here are semantically independent (redirect / enrich /
38// respect-robots / escalation), so collapsing them into a state machine
39// or enum would obscure rather than clarify.
40#[allow(clippy::struct_excessive_bools)]
41pub struct ClientBuilder {
42    timeout: Duration,
43    connect_timeout: Duration,
44    user_agent: String,
45    follow_redirects: bool,
46    redirect_limit: usize,
47    min_request_interval: Duration,
48    max_rps: Option<NonZeroU32>,
49    retry: RetryPolicy,
50    proxy: Option<String>,
51    user_agents: Vec<String>,
52    enrich: bool,
53    respect_robots: bool,
54    browser: Option<Arc<dyn BrowserBackend>>,
55    browser_budget: usize,
56    egress: Vec<EgressSpec>,
57    sessions: SessionStore,
58    escalation_budget: usize,
59    escalation_enabled: bool,
60}
61
62impl Default for ClientBuilder {
63    fn default() -> Self {
64        Self {
65            timeout: DEFAULT_TIMEOUT,
66            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
67            user_agent: default_user_agent(),
68            follow_redirects: true,
69            redirect_limit: DEFAULT_REDIRECT_LIMIT,
70            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
71            max_rps: None,
72            retry: RetryPolicy::default(),
73            proxy: None,
74            user_agents: Vec::new(),
75            enrich: false,
76            respect_robots: false,
77            browser: None,
78            browser_budget: DEFAULT_BROWSER_BUDGET,
79            egress: Vec::new(),
80            sessions: SessionStore::new(),
81            escalation_budget: DEFAULT_ESCALATION_BUDGET,
82            escalation_enabled: true,
83        }
84    }
85}
86
87impl ClientBuilder {
88    /// Per-request timeout (covers connect, headers, and body read).
89    pub fn timeout(mut self, timeout: Duration) -> Self {
90        self.timeout = timeout;
91        self
92    }
93
94    /// TCP-connect timeout, applied independently of the request timeout.
95    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
96        self.connect_timeout = timeout;
97        self
98    }
99
100    /// Override the `User-Agent` header sent on every request.
101    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
102        self.user_agent = user_agent.into();
103        self
104    }
105
106    /// Toggle automatic redirect following. Defaults to `true`; disable when
107    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
108    pub fn follow_redirects(mut self, follow: bool) -> Self {
109        self.follow_redirects = follow;
110        self
111    }
112
113    /// Minimum time between consecutive requests to the same host.
114    ///
115    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
116    /// rate-limit responses on common OSINT targets while keeping fan-out
117    /// across many sites fast.
118    pub fn min_request_interval(mut self, interval: Duration) -> Self {
119        self.min_request_interval = interval;
120        self
121    }
122
123    /// Cap the total request rate across *all* hosts to `rps` requests per
124    /// second. Independent of (and composed with) the per-host interval —
125    /// useful on a metered connection or behind a shared-quota proxy.
126    /// Uncapped by default.
127    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
128        self.max_rps = Some(rps);
129        self
130    }
131
132    /// Maximum retry attempts after a transient ban response. Defaults to 2
133    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
134    pub fn max_retries(mut self, n: u32) -> Self {
135        self.retry.max_retries = n;
136        self
137    }
138
139    /// Base delay for the first retry. Subsequent retries double until
140    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
141    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
142        self.retry.base_delay = d;
143        self
144    }
145
146    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
147    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
148        self.retry.max_delay = d;
149        self
150    }
151
152    /// Route all requests through a proxy. Accepts `http://`, `https://`,
153    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
154    pub fn proxy(mut self, url: impl Into<String>) -> Self {
155        self.proxy = Some(url.into());
156        self
157    }
158
159    /// Rotate the `User-Agent` header per request, picking uniformly at
160    /// random from `agents`. An empty list (the default) keeps the single
161    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
162    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
163        self.user_agents = agents;
164        self
165    }
166
167    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
168    /// pages. Off by default; enables an extra body read for matching sites.
169    pub fn enrich(mut self, enrich: bool) -> Self {
170        self.enrich = enrich;
171        self
172    }
173
174    /// Honor each host's `robots.txt`: probes to disallowed paths are
175    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
176    /// default. Adds one cached `robots.txt` fetch per origin.
177    pub fn respect_robots(mut self, respect: bool) -> Self {
178        self.respect_robots = respect;
179        self
180    }
181
182    /// Attach a browser backend. Sites tagged `bot-protected` will be
183    /// routed through it instead of the raw HTTP path, up to the
184    /// [`browser_budget`](Self::browser_budget) cap.
185    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
186        self.browser = Some(backend);
187        self
188    }
189
190    /// Per-scan cap on how many `bot-protected` sites are allowed to use
191    /// the browser backend. Once exhausted, the rest fall back to
192    /// `Uncertain(BrowserBudget)`. Defaults to
193    /// [`DEFAULT_BROWSER_BUDGET`].
194    pub const fn browser_budget(mut self, cap: usize) -> Self {
195        self.browser_budget = cap;
196        self
197    }
198
199    /// Per-scan cap on automatic escalations from the cheap transport
200    /// (HTTP / impersonate) to the browser when the cheap path returns
201    /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
202    /// [`browser_budget`](Self::browser_budget). Defaults to
203    /// [`DEFAULT_ESCALATION_BUDGET`]. `cap = 0` is equivalent to
204    /// [`disable_escalation`](Self::disable_escalation).
205    pub const fn escalation_budget(mut self, cap: usize) -> Self {
206        self.escalation_budget = cap;
207        self
208    }
209
210    /// Disable automatic escalation entirely — the cheap transport's
211    /// outcome is returned verbatim, even when its `Uncertain` reason is
212    /// one a browser fetch would resolve. Useful for benchmarking the
213    /// raw HTTP signals without the access-engine lift on top.
214    pub const fn disable_escalation(mut self) -> Self {
215        self.escalation_enabled = false;
216        self
217    }
218
219    /// Configure the egress pool: proxies tagged by country / IP type
220    /// that sites with an `access` policy can require. Sites without a
221    /// policy are unaffected (they use the default egress / `--proxy`).
222    /// Replaces any previously set pool.
223    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
224        self.egress = egress;
225        self
226    }
227
228    /// Supply operator authenticated sessions. A site whose `access`
229    /// policy names a session has that session's headers (cookies /
230    /// tokens) applied to its probe; a named-but-missing session yields
231    /// `Uncertain(SessionRequired)` rather than a login-wall false
232    /// negative. Replaces any previously set store.
233    pub fn sessions(mut self, sessions: SessionStore) -> Self {
234        self.sessions = sessions;
235        self
236    }
237
238    /// Build a [`Client`].
239    pub fn build(self) -> Result<Client> {
240        let inner = build_reqwest(
241            &self.user_agent,
242            self.timeout,
243            self.connect_timeout,
244            self.follow_redirects,
245            self.redirect_limit,
246            self.proxy.as_deref(),
247        )?;
248
249        // One HTTP client per configured egress — `reqwest` bakes the
250        // proxy in at build time, so geo / IP-type routing means a
251        // distinct client per proxy, paired with its match metadata.
252        let mut egress_entries = Vec::with_capacity(self.egress.len());
253        for spec in &self.egress {
254            let client = build_reqwest(
255                &self.user_agent,
256                self.timeout,
257                self.connect_timeout,
258                self.follow_redirects,
259                self.redirect_limit,
260                Some(&spec.url),
261            )?;
262            egress_entries.push((
263                spec.name.clone(),
264                spec.country.clone(),
265                spec.kind,
266                Arc::new(HttpFetcher::new(client)),
267            ));
268        }
269
270        let global_throttle = self.max_rps.map(|rps| {
271            // Min spacing between any two requests = 1s / rps.
272            let interval = Duration::from_secs(1) / rps.get();
273            HostThrottle::new(interval)
274        });
275        let robots = self
276            .respect_robots
277            .then(|| RobotsCache::new(inner.clone(), "adler"));
278        // Build the impersonate fetcher up front when the feature is on;
279        // surface a wreq init failure as `HttpSetup` so the caller sees
280        // it the same way they'd see a bad `--proxy` URL.
281        #[cfg(feature = "impersonate")]
282        let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
283        Ok(Client {
284            http: Arc::new(HttpFetcher::new(inner)),
285            egress: Arc::new(EgressPool::new(egress_entries)),
286            sessions: Arc::new(self.sessions),
287            throttle: HostThrottle::new(self.min_request_interval),
288            global_throttle,
289            retry: self.retry,
290            user_agents: Arc::from(self.user_agents),
291            enrich: self.enrich,
292            robots,
293            browser: self.browser,
294            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
295            escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
296                self.escalation_budget,
297            )),
298            escalation_enabled: self.escalation_enabled,
299            #[cfg(feature = "impersonate")]
300            impersonate,
301        })
302    }
303}
304
305/// Build a configured `reqwest::Client`, optionally routed through a
306/// proxy. Shared by the default client and every egress in the pool so
307/// they get identical timeout / redirect / User-Agent settings.
308fn build_reqwest(
309    user_agent: &str,
310    timeout: Duration,
311    connect_timeout: Duration,
312    follow_redirects: bool,
313    redirect_limit: usize,
314    proxy: Option<&str>,
315) -> Result<reqwest::Client> {
316    let redirect_policy = if follow_redirects {
317        redirect::Policy::limited(redirect_limit)
318    } else {
319        redirect::Policy::none()
320    };
321    let mut builder = reqwest::Client::builder()
322        .user_agent(user_agent.to_owned())
323        .timeout(timeout)
324        .connect_timeout(connect_timeout)
325        .redirect(redirect_policy);
326    if let Some(proxy_url) = proxy {
327        // reqwest treats a schemeless string (e.g. "not-a-url") as a host
328        // and silently defaults it to http://, so every probe would fail
329        // confusingly. Require an explicit, supported scheme up front.
330        const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
331        if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
332            return Err(Error::HttpSetup {
333                message: format!(
334                    "invalid proxy {proxy_url:?}: must start with one of {}",
335                    SCHEMES.join(", ")
336                ),
337            });
338        }
339        let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
340            message: format!("invalid proxy {proxy_url:?}: {e}"),
341        })?;
342        builder = builder.proxy(proxy);
343    }
344    builder.build().map_err(|e| Error::HttpSetup {
345        message: e.to_string(),
346    })
347}
348
/// Browser-backed probes allowed per scan unless overridden with
/// [`ClientBuilder::browser_budget`].
///
/// Roughly 5× the typical `bot-protected` registry subset: plenty of
/// headroom in normal use, yet still a guardrail that stops a
/// misconfigured flag from burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;

/// *Automatic escalation* fetches allowed per scan (HTTP / impersonate →
/// browser after the cheap path returns
/// `Uncertain(CloudflareChallenge | RateLimited)`) unless overridden with
/// [`ClientBuilder::escalation_budget`].
///
/// Tracked separately from [`DEFAULT_BROWSER_BUDGET`]. A `bot-protected`
/// site that goes straight to the browser spends browser budget only; a
/// site that was not pre-tagged and escalates from HTTP to the browser
/// spends one unit of each. The value is chosen so that an escalation
/// rate of a few percent across a typical registry stays below the cap
/// without any tuning.
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
367
368impl fmt::Debug for ClientBuilder {
369    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
370        f.debug_struct("ClientBuilder")
371            .field("timeout", &self.timeout)
372            .field("connect_timeout", &self.connect_timeout)
373            .field("user_agent", &self.user_agent)
374            .field("follow_redirects", &self.follow_redirects)
375            .field("redirect_limit", &self.redirect_limit)
376            .field("min_request_interval", &self.min_request_interval)
377            .field("max_rps", &self.max_rps)
378            .field("retry", &self.retry)
379            .field("proxy", &self.proxy)
380            .field("user_agents", &self.user_agents)
381            .field("enrich", &self.enrich)
382            .field("respect_robots", &self.respect_robots)
383            .field("browser", &self.browser.is_some())
384            .field("browser_budget", &self.browser_budget)
385            .field("egress", &self.egress)
386            .field("sessions", &self.sessions)
387            .field("escalation_budget", &self.escalation_budget)
388            .field("escalation_enabled", &self.escalation_enabled)
389            .finish()
390    }
391}