Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use reqwest::redirect;
16
17use crate::ban;
18use crate::browser::{BrowserBackend, BrowserBudget, RenderedPage};
19use crate::check::{CheckOutcome, MatchKind, UncertainReason};
20use crate::error::{Error, Result};
21use crate::retry::{self, RetryPolicy};
22use crate::robots::RobotsCache;
23use crate::site::{Probe, Signal, SignalVerdict, Site, aggregate};
24use crate::throttle::HostThrottle;
25use crate::username::Username;
26
/// Default per-request timeout used by [`ClientBuilder::default`].
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Default connect timeout used by [`ClientBuilder::default`].
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect cap handed to `reqwest` when redirect following is enabled.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default minimum spacing between two requests to the same host.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single fixed key for the global rate limiter (it gates all hosts).
const GLOBAL_THROTTLE_KEY: &str = "*global*";
33
34/// HTTP client used to probe sites.
35///
36/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
37/// internally, and the throttle is `Arc`-backed, so cloning is the
38/// recommended way to share a client between tasks. Cloned clients share
39/// throttle state, which is what you want: a fan-out scan must not
40/// accidentally exceed a per-host budget by spawning more clients.
41#[derive(Clone)]
42pub struct Client {
43    inner: reqwest::Client,
44    throttle: HostThrottle,
45    /// Global RPS cap applied across all hosts. `None` → uncapped.
46    global_throttle: Option<HostThrottle>,
47    retry: RetryPolicy,
48    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
49    /// `Arc<[String]>` so cloning a client per task stays cheap.
50    user_agents: Arc<[String]>,
51    /// Extract profile fields from `Found` pages that declare extractors.
52    enrich: bool,
53    /// When set, skip probes disallowed by the host's `robots.txt`.
54    robots: Option<RobotsCache>,
55    /// Browser backend used for `bot-protected` sites. `None` → those sites
56    /// stay on the raw HTTP path and typically end up `Uncertain`.
57    browser: Option<Arc<dyn BrowserBackend>>,
58    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
59    /// for a single scan, so several tasks compete for the same budget.
60    browser_budget: Arc<BrowserBudget>,
61}
62
63impl Client {
64    /// Start configuring a new client.
65    pub fn builder() -> ClientBuilder {
66        ClientBuilder::default()
67    }
68
69    /// Probe a single site for `username`, retrying on transient bans.
70    ///
71    /// Network failures, timeouts, and unexpected response shapes all yield
72    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
73    /// returns an error: at the executor level we want a partial result for
74    /// every site, not abort-on-first-failure semantics.
75    ///
76    /// When ban detection classifies a response as `rate_limited` /
77    /// `cloudflare_challenge`, the call is retried with jittered exponential
78    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
79    /// Uncertain (network errors, body read failures) is **not** retried —
80    /// those failures rarely fix themselves in the seconds-to-minutes window
81    /// we'd block for.
82    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
83    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
84        let mut attempt: u32 = 0;
85        loop {
86            let outcome = self.probe_once(site, username).await;
87            if !retry::should_retry(&outcome, attempt, &self.retry) {
88                return outcome;
89            }
90            let delay = retry::backoff_delay(attempt, &self.retry);
91            tracing::info!(
92                site = %site.name,
93                attempt = attempt + 1,
94                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
95                ?delay,
96                "transient ban, retrying",
97            );
98            tokio::time::sleep(delay).await;
99            attempt += 1;
100        }
101    }
102
103    /// Fetch a URL and return raw response data (status, final URL, body)
104    /// with the same throttle / User-Agent / proxy machinery as `check`,
105    /// but without signal evaluation or retry.
106    ///
107    /// Returns `None` on any network/transport error. Intended for
108    /// diagnostics such as `adler --doctor --fix`, which diffs the
109    /// responses for a known-present and a nonsense user to derive a
110    /// signature.
111    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
112        let host = host_of(url);
113        if let Some(global) = &self.global_throttle {
114            global.wait(GLOBAL_THROTTLE_KEY).await;
115        }
116        self.throttle.wait(&host).await;
117        let mut request = self.inner.get(url);
118        if let Some(ua) = self.pick_user_agent() {
119            request = request.header(reqwest::header::USER_AGENT, ua);
120        }
121        let response = request.send().await.ok()?;
122        let status = response.status().as_u16();
123        let final_url = response.url().to_string();
124        let body = response.text().await.unwrap_or_default();
125        Some(RawResponse {
126            status,
127            final_url,
128            body,
129        })
130    }
131
132    /// Same as [`Self::fetch`] but routes through the configured browser
133    /// backend when the site is tagged `bot-protected` and a backend is
134    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
135    /// so that the diff-derivation works against the JS-rendered page
136    /// (login wall vs. real profile) rather than two identical raw-HTTP
137    /// shells.
138    ///
139    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
140    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
141    /// callers get the same `Option<RawResponse>` shape either way.
142    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
143        if let Some(backend) = self.browser.as_deref() {
144            if site
145                .tags
146                .iter()
147                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
148            {
149                let parsed = url::Url::parse(url).ok()?;
150                match backend
151                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
152                    .await
153                {
154                    Ok(page) => {
155                        return Some(RawResponse {
156                            status: page.status,
157                            final_url: page.final_url.to_string(),
158                            body: page.body,
159                        });
160                    }
161                    Err(err) => {
162                        tracing::warn!(
163                            site = %site.name, %url, error = %err,
164                            "browser fetch failed in doctor; falling back to raw HTTP",
165                        );
166                    }
167                }
168            }
169        }
170        self.fetch(url).await
171    }
172
173    /// Pick a User-Agent for the next request from the rotation pool, or
174    /// `None` to fall back on the client's fixed header.
175    fn pick_user_agent(&self) -> Option<&str> {
176        match self.user_agents.len() {
177            0 => None,
178            1 => Some(&self.user_agents[0]),
179            n => Some(&self.user_agents[fastrand::usize(0..n)]),
180        }
181    }
182
183    // Splitting probe_once into helpers would scatter the request/response
184    // flow that has to read top-to-bottom; one long function reads better.
185    #[allow(clippy::too_many_lines)]
186    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
187        let url = site.url_for(username);
188
189        // Site-level username constraint (Sherlock's `regexCheck`).
190        // Mismatch → skip the probe entirely. Saves a request and
191        // sidesteps the false-positive class where a site 404s on
192        // illegal usernames in a way our signal can't distinguish
193        // from a missing account. If the pattern fails to compile
194        // (Sherlock occasionally uses lookarounds, which our `regex`
195        // crate can't express), we let validate's warn-log stand
196        // and silently fall through — the rest of the probe still
197        // works.
198        if let Some(pat) = &site.regex_check {
199            if let Ok(re) = regex::Regex::new(pat) {
200                if !re.is_match(username.as_str()) {
201                    return uncertain(
202                        &site.name,
203                        url,
204                        Instant::now(),
205                        UncertainReason::UsernameNotAllowed,
206                    );
207                }
208            }
209        }
210
211        // Auto-route bot-protected sites through the browser backend when
212        // one is configured. Raw HTTP can't see past their JS/login wall,
213        // so this is the only way they ever produce a Found verdict.
214        if let Some(backend) = self.browser.as_deref() {
215            if site
216                .tags
217                .iter()
218                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
219            {
220                if self.browser_budget.try_consume() {
221                    return self.probe_with_browser(site, &url, backend).await;
222                }
223                tracing::warn!(site = %site.name, "browser budget exhausted");
224                return uncertain(
225                    &site.name,
226                    url,
227                    Instant::now(),
228                    UncertainReason::BrowserBudget,
229                );
230            }
231        }
232
233        let host = host_of(&url);
234
235        // robots.txt gate, before consuming a throttle slot or probing.
236        if let Some(robots) = &self.robots {
237            if let Some((origin, path)) = origin_and_path(&url) {
238                if !robots.allowed(&origin, &path).await {
239                    tracing::debug!(%url, "skipped by robots.txt");
240                    return uncertain(
241                        &site.name,
242                        url,
243                        Instant::now(),
244                        UncertainReason::RobotsDisallowed,
245                    );
246                }
247            }
248        }
249
250        // Global cap first (gates every request), then per-host spacing.
251        if let Some(global) = &self.global_throttle {
252            global.wait(GLOBAL_THROTTLE_KEY).await;
253        }
254        self.throttle.wait(&host).await;
255        let started = Instant::now();
256        tracing::debug!(%url, %host, "probing");
257
258        let mut request = self.inner.get(&url);
259        if let Some(ua) = self.pick_user_agent() {
260            request = request.header(reqwest::header::USER_AGENT, ua);
261        }
262        let response = match request.send().await {
263            Ok(r) => r,
264            Err(err) => {
265                tracing::debug!(error = %err, "request failed");
266                return uncertain(
267                    &site.name,
268                    url,
269                    started,
270                    UncertainReason::Network(err.to_string()),
271                );
272            }
273        };
274
275        let status = response.status().as_u16();
276        let final_url = response.url().to_string();
277
278        if let Some(reason) = ban::detect_pre_body(status, response.headers()) {
279            tracing::warn!(%host, status, %reason, "ban-like response");
280            return uncertain(&site.name, url, started, reason);
281        }
282
283        // Read the body if a signal needs it, or if enrichment is on and the
284        // site has extractor rules (extraction needs the body).
285        let want_enrich = self.enrich && !site.extract.is_empty();
286        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
287        let body = if needs_body {
288            match response.text().await {
289                Ok(b) => b,
290                Err(err) => {
291                    return uncertain(
292                        &site.name,
293                        url,
294                        started,
295                        UncertainReason::BodyRead(err.to_string()),
296                    );
297                }
298            }
299        } else {
300            String::new()
301        };
302
303        if !body.is_empty() {
304            if let Some(reason) = ban::detect_in_body(&body) {
305                tracing::warn!(%host, %reason, "ban-like body");
306                return uncertain(&site.name, url, started, reason);
307            }
308        }
309
310        let probe = Probe {
311            status,
312            final_url: &final_url,
313            body: &body,
314        };
315        let votes: Vec<(&Signal, SignalVerdict)> = site
316            .signals
317            .iter()
318            .map(|s| (s, s.evaluate(&probe)))
319            .collect();
320        let kind = aggregate(votes.iter().map(|(_, v)| *v));
321        let mut result = outcome(&site.name, url, started, kind);
322        // Record which signals produced the verdict (the winning polarity).
323        let winning = match kind {
324            MatchKind::Found => Some(SignalVerdict::Found),
325            MatchKind::NotFound => Some(SignalVerdict::NotFound),
326            MatchKind::Uncertain => None,
327        };
328        if let Some(want) = winning {
329            result.evidence = votes
330                .iter()
331                .filter(|(_, v)| *v == want)
332                .map(|(s, _)| s.describe_match(&probe))
333                .collect();
334        }
335        if want_enrich && kind == MatchKind::Found {
336            result.enrichment = crate::enrich::extract(&body, &site.extract);
337        }
338        result
339    }
340
341    /// Render `url` through the configured [`BrowserBackend`] and run the
342    /// same signal pipeline on the result. Per-fetch failures (timeout,
343    /// navigation error, etc.) surface as `Uncertain(BrowserFailed)` so
344    /// one flaky bot-protected site can't abort the scan.
345    async fn probe_with_browser(
346        &self,
347        site: &Site,
348        url: &str,
349        backend: &dyn BrowserBackend,
350    ) -> CheckOutcome {
351        let started = Instant::now();
352        let parsed = match url::Url::parse(url) {
353            Ok(u) => u,
354            Err(err) => {
355                return uncertain(
356                    &site.name,
357                    url.to_owned(),
358                    started,
359                    UncertainReason::Other(format!("invalid url: {err}")),
360                );
361            }
362        };
363
364        let page: RenderedPage = match backend
365            .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
366            .await
367        {
368            Ok(p) => p,
369            Err(err) => {
370                tracing::warn!(site = %site.name, %url, error = %err, "browser fetch failed");
371                return uncertain(
372                    &site.name,
373                    url.to_owned(),
374                    started,
375                    UncertainReason::BrowserFailed(err.to_string()),
376                );
377            }
378        };
379
380        let final_url_str = page.final_url.as_str().to_owned();
381        let probe = Probe {
382            status: page.status,
383            final_url: &final_url_str,
384            body: &page.body,
385        };
386        let votes: Vec<(&Signal, SignalVerdict)> = site
387            .signals
388            .iter()
389            .map(|s| (s, s.evaluate(&probe)))
390            .collect();
391        let kind = aggregate(votes.iter().map(|(_, v)| *v));
392        let mut result = outcome(&site.name, url.to_owned(), started, kind);
393        let winning = match kind {
394            MatchKind::Found => Some(SignalVerdict::Found),
395            MatchKind::NotFound => Some(SignalVerdict::NotFound),
396            MatchKind::Uncertain => None,
397        };
398        if let Some(want) = winning {
399            result.evidence = votes
400                .iter()
401                .filter(|(_, v)| *v == want)
402                .map(|(s, _)| s.describe_match(&probe))
403                .collect();
404        }
405        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
406            result.enrichment = crate::enrich::extract(&page.body, &site.extract);
407        }
408        result
409    }
410}
411
/// Raw response data returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`] for diagnostics. No signal evaluation has
/// been applied to it.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code.
    pub status: u16,
    /// Final URL after redirects.
    pub final_url: String,
    /// Decoded response body (the rendered page body on the browser path).
    pub body: String,
}
422
/// Builder for [`Client`].
///
/// Obtain one via [`Client::builder`], chain the setters, then call
/// [`ClientBuilder::build`].
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
pub struct ClientBuilder {
    /// Per-request timeout handed to `reqwest`.
    timeout: Duration,
    /// Connect timeout handed to `reqwest`.
    connect_timeout: Duration,
    /// Fixed `User-Agent` configured on the underlying client.
    user_agent: String,
    /// Whether redirects are followed at all.
    follow_redirects: bool,
    /// Redirect cap used when `follow_redirects` is `true`.
    redirect_limit: usize,
    /// Minimum spacing between requests to the same host.
    min_request_interval: Duration,
    /// Optional global requests-per-second cap. `None` → uncapped.
    max_rps: Option<NonZeroU32>,
    /// Retry/backoff settings copied into the built client.
    retry: RetryPolicy,
    /// Optional proxy URL; validated in `build`.
    proxy: Option<String>,
    /// User-Agent rotation pool. Empty → the fixed `user_agent` is used.
    user_agents: Vec<String>,
    /// Whether profile-field extraction is enabled.
    enrich: bool,
    /// Whether a `robots.txt` cache is attached to the built client.
    respect_robots: bool,
    /// Optional browser backend for `bot-protected` sites.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Cap on browser-backed probes for the built client.
    browser_budget: usize,
}
442
443impl Default for ClientBuilder {
444    fn default() -> Self {
445        Self {
446            timeout: DEFAULT_TIMEOUT,
447            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
448            user_agent: default_user_agent(),
449            follow_redirects: true,
450            redirect_limit: DEFAULT_REDIRECT_LIMIT,
451            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
452            max_rps: None,
453            retry: RetryPolicy::default(),
454            proxy: None,
455            user_agents: Vec::new(),
456            enrich: false,
457            respect_robots: false,
458            browser: None,
459            browser_budget: DEFAULT_BROWSER_BUDGET,
460        }
461    }
462}
463
464impl ClientBuilder {
465    /// Per-request timeout (covers connect, headers, and body read).
466    pub fn timeout(mut self, timeout: Duration) -> Self {
467        self.timeout = timeout;
468        self
469    }
470
471    /// TCP-connect timeout, applied independently of the request timeout.
472    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
473        self.connect_timeout = timeout;
474        self
475    }
476
477    /// Override the `User-Agent` header sent on every request.
478    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
479        self.user_agent = user_agent.into();
480        self
481    }
482
483    /// Toggle automatic redirect following. Defaults to `true`; disable when
484    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
485    pub fn follow_redirects(mut self, follow: bool) -> Self {
486        self.follow_redirects = follow;
487        self
488    }
489
490    /// Minimum time between consecutive requests to the same host.
491    ///
492    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
493    /// rate-limit responses on common OSINT targets while keeping fan-out
494    /// across many sites fast.
495    pub fn min_request_interval(mut self, interval: Duration) -> Self {
496        self.min_request_interval = interval;
497        self
498    }
499
500    /// Cap the total request rate across *all* hosts to `rps` requests per
501    /// second. Independent of (and composed with) the per-host interval —
502    /// useful on a metered connection or behind a shared-quota proxy.
503    /// Uncapped by default.
504    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
505        self.max_rps = Some(rps);
506        self
507    }
508
509    /// Maximum retry attempts after a transient ban response. Defaults to 2
510    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
511    pub fn max_retries(mut self, n: u32) -> Self {
512        self.retry.max_retries = n;
513        self
514    }
515
516    /// Base delay for the first retry. Subsequent retries double until
517    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
518    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
519        self.retry.base_delay = d;
520        self
521    }
522
523    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
524    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
525        self.retry.max_delay = d;
526        self
527    }
528
529    /// Route all requests through a proxy. Accepts `http://`, `https://`,
530    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
531    pub fn proxy(mut self, url: impl Into<String>) -> Self {
532        self.proxy = Some(url.into());
533        self
534    }
535
536    /// Rotate the `User-Agent` header per request, picking uniformly at
537    /// random from `agents`. An empty list (the default) keeps the single
538    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
539    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
540        self.user_agents = agents;
541        self
542    }
543
544    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
545    /// pages. Off by default; enables an extra body read for matching sites.
546    pub fn enrich(mut self, enrich: bool) -> Self {
547        self.enrich = enrich;
548        self
549    }
550
551    /// Honor each host's `robots.txt`: probes to disallowed paths are
552    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
553    /// default. Adds one cached `robots.txt` fetch per origin.
554    pub fn respect_robots(mut self, respect: bool) -> Self {
555        self.respect_robots = respect;
556        self
557    }
558
559    /// Attach a browser backend. Sites tagged `bot-protected` will be
560    /// routed through it instead of the raw HTTP path, up to the
561    /// [`browser_budget`](Self::browser_budget) cap.
562    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
563        self.browser = Some(backend);
564        self
565    }
566
    /// Per-scan cap on how many `bot-protected` sites are allowed to use
    /// the browser backend. Once the budget is exhausted, the remaining
    /// `bot-protected` probes are not attempted over raw HTTP; they are
    /// reported as `Uncertain(BrowserBudget)`. Defaults to
    /// [`DEFAULT_BROWSER_BUDGET`].
    pub const fn browser_budget(mut self, cap: usize) -> Self {
        self.browser_budget = cap;
        self
    }
575
576    /// Build a [`Client`].
577    pub fn build(self) -> Result<Client> {
578        let redirect_policy = if self.follow_redirects {
579            redirect::Policy::limited(self.redirect_limit)
580        } else {
581            redirect::Policy::none()
582        };
583        let mut builder = reqwest::Client::builder()
584            .user_agent(self.user_agent)
585            .timeout(self.timeout)
586            .connect_timeout(self.connect_timeout)
587            .redirect(redirect_policy);
588        if let Some(proxy_url) = &self.proxy {
589            // reqwest treats a schemeless string (e.g. "not-a-url") as a host
590            // and silently defaults it to http://, so every probe would fail
591            // confusingly. Require an explicit, supported scheme up front.
592            const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
593            if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
594                return Err(Error::HttpSetup {
595                    message: format!(
596                        "invalid proxy {proxy_url:?}: must start with one of {}",
597                        SCHEMES.join(", ")
598                    ),
599                });
600            }
601            let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
602                message: format!("invalid proxy {proxy_url:?}: {e}"),
603            })?;
604            builder = builder.proxy(proxy);
605        }
606        let inner = builder.build().map_err(|e| Error::HttpSetup {
607            message: e.to_string(),
608        })?;
609        let global_throttle = self.max_rps.map(|rps| {
610            // Min spacing between any two requests = 1s / rps.
611            let interval = Duration::from_secs(1) / rps.get();
612            HostThrottle::new(interval)
613        });
614        let robots = self
615            .respect_robots
616            .then(|| RobotsCache::new(inner.clone(), "adler"));
617        Ok(Client {
618            inner,
619            throttle: HostThrottle::new(self.min_request_interval),
620            global_throttle,
621            retry: self.retry,
622            user_agents: Arc::from(self.user_agents),
623            enrich: self.enrich,
624            robots,
625            browser: self.browser,
626            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
627        })
628    }
629}
630
/// Default ceiling on browser-backed probes per scan when no other value
/// is specified. Override it per client with
/// [`ClientBuilder::browser_budget`].
///
/// Sized as ~5× the typical `bot-protected` registry subset — comfortable
/// headroom while still being a guardrail against a misconfigured flag
/// burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
638
639impl fmt::Debug for Client {
640    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
641        f.debug_struct("Client")
642            .field("throttle", &self.throttle)
643            .field("global_throttle", &self.global_throttle)
644            .field("retry", &self.retry)
645            .field("user_agents", &self.user_agents)
646            .field("enrich", &self.enrich)
647            .field("robots", &self.robots.is_some())
648            .field("browser", &self.browser.is_some())
649            .field("browser_budget", &self.browser_budget)
650            .finish_non_exhaustive()
651    }
652}
653
654impl fmt::Debug for ClientBuilder {
655    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
656        f.debug_struct("ClientBuilder")
657            .field("timeout", &self.timeout)
658            .field("connect_timeout", &self.connect_timeout)
659            .field("user_agent", &self.user_agent)
660            .field("follow_redirects", &self.follow_redirects)
661            .field("redirect_limit", &self.redirect_limit)
662            .field("min_request_interval", &self.min_request_interval)
663            .field("max_rps", &self.max_rps)
664            .field("retry", &self.retry)
665            .field("proxy", &self.proxy)
666            .field("user_agents", &self.user_agents)
667            .field("enrich", &self.enrich)
668            .field("respect_robots", &self.respect_robots)
669            .field("browser", &self.browser.is_some())
670            .field("browser_budget", &self.browser_budget)
671            .finish()
672    }
673}
674
/// Per-fetch timeout passed to [`BrowserBackend::fetch`]. Browser fetches
/// (JS execution + waits) are inherently slower than raw HTTP, so this is
/// generous on purpose.
const BROWSER_TIMEOUT: Duration = Duration::from_secs(60);

/// Site tag that routes a probe through the browser backend when one is
/// configured. Tags are compared against it ASCII-case-insensitively.
const BOT_PROTECTED_TAG: &str = "bot-protected";
681
682fn default_user_agent() -> String {
683    format!("adler/{}", env!("CARGO_PKG_VERSION"))
684}
685
686fn host_of(url: &str) -> String {
687    reqwest::Url::parse(url)
688        .ok()
689        .and_then(|u| u.host_str().map(str::to_owned))
690        .unwrap_or_else(|| "unknown".into())
691}
692
693/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
694/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
695fn origin_and_path(url: &str) -> Option<(String, String)> {
696    let parsed = reqwest::Url::parse(url).ok()?;
697    let host = parsed.host_str()?;
698    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
699    let origin = format!("{}://{host}{port}", parsed.scheme());
700    let path = parsed.query().map_or_else(
701        || parsed.path().to_owned(),
702        |q| format!("{}?{q}", parsed.path()),
703    );
704    Some((origin, path))
705}
706
707fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
708    CheckOutcome {
709        site: site.to_owned(),
710        url,
711        kind,
712        reason: None,
713        elapsed_ms: elapsed_ms(started),
714        enrichment: std::collections::BTreeMap::new(),
715        evidence: Vec::new(),
716    }
717}
718
719fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
720    CheckOutcome {
721        site: site.to_owned(),
722        url,
723        kind: MatchKind::Uncertain,
724        reason: Some(reason),
725        elapsed_ms: elapsed_ms(started),
726        enrichment: std::collections::BTreeMap::new(),
727        evidence: Vec::new(),
728    }
729}
730
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// if the count does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    millis.try_into().unwrap_or(u64::MAX)
}
734
735#[cfg(test)]
736mod tests {
737    use super::*;
738    use crate::site::{Signal, UrlTemplate};
739    use wiremock::matchers::{method, path};
740    use wiremock::{Mock, MockServer, ResponseTemplate};
741
742    fn build_client() -> Client {
743        Client::builder()
744            .timeout(Duration::from_secs(2))
745            // Tests share `127.0.0.1` as host — keep throttle out of the
746            // way for everything but the dedicated throttle test below.
747            .min_request_interval(Duration::ZERO)
748            // Default retry would re-hit ban-test mocks; tests opt in
749            // explicitly when they want to exercise the retry path.
750            .max_retries(0)
751            .build()
752            .expect("client builds")
753    }
754
755    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
756        Site {
757            name: "Mock".into(),
758            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
759            signals,
760            known_present: None,
761            known_absent: None,
762            extract: Vec::new(),
763            tags: Vec::new(),
764            request_headers: std::collections::BTreeMap::new(),
765            regex_check: None,
766            engine: None,
767        }
768    }
769
    /// Fixed username ("alice") used by the tests in this module.
    fn user() -> Username {
        Username::new("alice").unwrap()
    }
773
774    #[tokio::test]
775    async fn regex_check_short_circuits_before_any_request() {
776        // Stand up a mock that would 200 on *anything* — if probe_once
777        // failed to short-circuit on regex mismatch, the username
778        // "alice" (5 chars) would resolve to Found here.
779        let server = MockServer::start().await;
780        Mock::given(method("GET"))
781            .respond_with(ResponseTemplate::new(200))
782            .mount(&server)
783            .await;
784        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
785        // The site only accepts usernames of 8+ chars; "alice" is 5.
786        site.regex_check = Some("^[A-Za-z]{8,}$".into());
787        let outcome = build_client().check(&site, &user()).await;
788        assert_eq!(outcome.kind, MatchKind::Uncertain);
789        assert!(
790            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
791            "expected UsernameNotAllowed, got {:?}",
792            outcome.reason,
793        );
794        // No request should have hit the mock — assert by counting
795        // received_requests on the wiremock server.
796        let recvd = server.received_requests().await.unwrap_or_default();
797        assert_eq!(
798            recvd.len(),
799            0,
800            "regex_check mismatch must skip the HTTP request entirely"
801        );
802    }
803
804    #[tokio::test]
805    async fn regex_check_pass_proceeds_to_probe() {
806        let server = MockServer::start().await;
807        Mock::given(method("GET"))
808            .and(path("/alice"))
809            .respond_with(ResponseTemplate::new(200))
810            .mount(&server)
811            .await;
812        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
813        // Pattern that matches "alice".
814        site.regex_check = Some("^[a-z]{3,}$".into());
815        let outcome = build_client().check(&site, &user()).await;
816        assert_eq!(outcome.kind, MatchKind::Found);
817    }
818
819    #[tokio::test]
820    async fn status_signal_reports_found_on_match() {
821        let server = MockServer::start().await;
822        Mock::given(method("GET"))
823            .and(path("/alice"))
824            .respond_with(ResponseTemplate::new(200))
825            .mount(&server)
826            .await;
827        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
828        let outcome = build_client().check(&site, &user()).await;
829        assert_eq!(outcome.kind, MatchKind::Found);
830        assert!(outcome.url.ends_with("/alice"));
831        assert!(outcome.reason.is_none());
832        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
833    }
834
835    #[tokio::test]
836    async fn status_signal_pair_reports_not_found_on_404() {
837        let server = MockServer::start().await;
838        Mock::given(method("GET"))
839            .and(path("/alice"))
840            .respond_with(ResponseTemplate::new(404))
841            .mount(&server)
842            .await;
843        let site = site_with(
844            &server,
845            vec![
846                Signal::StatusFound { codes: vec![200] },
847                Signal::StatusNotFound { codes: vec![404] },
848            ],
849        );
850        let outcome = build_client().check(&site, &user()).await;
851        assert_eq!(outcome.kind, MatchKind::NotFound);
852        // Only the NotFound-voting signal is cited as evidence.
853        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
854    }
855
856    #[tokio::test]
857    async fn body_absent_signal_detects_missing_account() {
858        let server = MockServer::start().await;
859        Mock::given(method("GET"))
860            .and(path("/alice"))
861            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
862            .mount(&server)
863            .await;
864        let site = site_with(
865            &server,
866            vec![Signal::BodyAbsent {
867                text: "Profile not found".into(),
868            }],
869        );
870        let outcome = build_client().check(&site, &user()).await;
871        assert_eq!(outcome.kind, MatchKind::NotFound);
872    }
873
874    #[tokio::test]
875    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
876        // Phase 2 semantics: absence of an absence-marker is not evidence
877        // of presence — it just means we have no signal that fired.
878        let server = MockServer::start().await;
879        Mock::given(method("GET"))
880            .and(path("/alice"))
881            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
882            .mount(&server)
883            .await;
884        let site = site_with(
885            &server,
886            vec![Signal::BodyAbsent {
887                text: "Profile not found".into(),
888            }],
889        );
890        let outcome = build_client().check(&site, &user()).await;
891        assert_eq!(outcome.kind, MatchKind::Uncertain);
892    }
893
894    #[tokio::test]
895    async fn body_present_plus_absent_resolve_to_found() {
896        let server = MockServer::start().await;
897        Mock::given(method("GET"))
898            .and(path("/alice"))
899            .respond_with(
900                ResponseTemplate::new(200)
901                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
902            )
903            .mount(&server)
904            .await;
905        let site = site_with(
906            &server,
907            vec![
908                Signal::BodyPresent {
909                    text: "profile-card".into(),
910                },
911                Signal::BodyAbsent {
912                    text: "Profile not found".into(),
913                },
914            ],
915        );
916        let outcome = build_client().check(&site, &user()).await;
917        assert_eq!(outcome.kind, MatchKind::Found);
918    }
919
920    #[tokio::test]
921    async fn redirect_absent_signal_detects_missing_account() {
922        let server = MockServer::start().await;
923        Mock::given(method("GET"))
924            .and(path("/alice"))
925            .respond_with(
926                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
927            )
928            .mount(&server)
929            .await;
930        Mock::given(method("GET"))
931            .and(path("/login"))
932            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
933            .mount(&server)
934            .await;
935        let site = site_with(
936            &server,
937            vec![Signal::RedirectAbsent {
938                fragment: "/login".into(),
939            }],
940        );
941        let outcome = build_client().check(&site, &user()).await;
942        assert_eq!(outcome.kind, MatchKind::NotFound);
943    }
944
945    #[tokio::test]
946    async fn negative_signal_wins_over_positive() {
947        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
948        // (error marker appears). Negative-priority aggregation → NotFound.
949        // This is the canonical Sherlock "message" pattern: a site that
950        // returns 200 for everyone and differentiates via an error string.
951        let server = MockServer::start().await;
952        Mock::given(method("GET"))
953            .and(path("/alice"))
954            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
955            .mount(&server)
956            .await;
957        let site = site_with(
958            &server,
959            vec![
960                Signal::StatusFound { codes: vec![200] },
961                Signal::BodyAbsent {
962                    text: "Profile not found".into(),
963                },
964            ],
965        );
966        let outcome = build_client().check(&site, &user()).await;
967        assert_eq!(outcome.kind, MatchKind::NotFound);
968    }
969
970    #[tokio::test]
971    async fn network_failure_yields_uncertain() {
972        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
973        let port = listener.local_addr().unwrap().port();
974        drop(listener);
975
976        let site = Site {
977            name: "Dead".into(),
978            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
979            signals: vec![Signal::StatusFound { codes: vec![200] }],
980            known_present: None,
981            known_absent: None,
982            extract: Vec::new(),
983            tags: Vec::new(),
984            request_headers: std::collections::BTreeMap::new(),
985            regex_check: None,
986            engine: None,
987        };
988        let client = Client::builder()
989            .timeout(Duration::from_millis(500))
990            .connect_timeout(Duration::from_millis(500))
991            .max_retries(0)
992            .build()
993            .unwrap();
994        let outcome = client.check(&site, &user()).await;
995        assert_eq!(outcome.kind, MatchKind::Uncertain);
996        assert!(outcome.reason.is_some());
997    }
998
999    #[tokio::test]
1000    async fn throttle_spaces_consecutive_calls_to_same_host() {
1001        let server = MockServer::start().await;
1002        Mock::given(method("GET"))
1003            .and(path("/alice"))
1004            .respond_with(ResponseTemplate::new(200))
1005            .mount(&server)
1006            .await;
1007        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1008        // Interval is intentionally much larger than typical wiremock latency
1009        // (≤10 ms locally, can spike under heavy parallel test load). Any
1010        // value too close to HTTP latency would let the first request burn
1011        // through the throttle window and make the assertion flaky.
1012        let client = Client::builder()
1013            .timeout(Duration::from_secs(2))
1014            .min_request_interval(Duration::from_millis(300))
1015            .build()
1016            .unwrap();
1017
1018        client.check(&site, &user()).await;
1019        let started = Instant::now();
1020        client.check(&site, &user()).await;
1021        let elapsed = started.elapsed();
1022        assert!(
1023            elapsed >= Duration::from_millis(200),
1024            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1025        );
1026    }
1027
1028    #[tokio::test]
1029    async fn builder_overrides_user_agent() {
1030        let server = MockServer::start().await;
1031        Mock::given(method("GET"))
1032            .and(path("/alice"))
1033            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1034            .respond_with(ResponseTemplate::new(200))
1035            .mount(&server)
1036            .await;
1037        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1038        let client = Client::builder()
1039            .user_agent("adler-test/1.0")
1040            .build()
1041            .unwrap();
1042        let outcome = client.check(&site, &user()).await;
1043        assert_eq!(outcome.kind, MatchKind::Found);
1044    }
1045
1046    #[tokio::test]
1047    async fn rate_limit_429_yields_uncertain_with_note() {
1048        let server = MockServer::start().await;
1049        Mock::given(method("GET"))
1050            .and(path("/alice"))
1051            .respond_with(ResponseTemplate::new(429))
1052            .mount(&server)
1053            .await;
1054        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1055        let outcome = build_client().check(&site, &user()).await;
1056        assert_eq!(outcome.kind, MatchKind::Uncertain);
1057        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1058    }
1059
1060    #[tokio::test]
1061    async fn cloudflare_server_header_yields_uncertain() {
1062        let server = MockServer::start().await;
1063        Mock::given(method("GET"))
1064            .and(path("/alice"))
1065            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1066            .mount(&server)
1067            .await;
1068        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1069        let outcome = build_client().check(&site, &user()).await;
1070        assert_eq!(outcome.kind, MatchKind::Uncertain);
1071        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1072    }
1073
1074    #[tokio::test]
1075    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1076        // Body-based ban detection only runs when a signal already needs
1077        // the body — this site uses BodyAbsent so the body is read.
1078        let server = MockServer::start().await;
1079        Mock::given(method("GET"))
1080            .and(path("/alice"))
1081            .respond_with(
1082                ResponseTemplate::new(200)
1083                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1084            )
1085            .mount(&server)
1086            .await;
1087        let site = site_with(
1088            &server,
1089            vec![Signal::BodyAbsent {
1090                text: "Profile not found".into(),
1091            }],
1092        );
1093        let outcome = build_client().check(&site, &user()).await;
1094        assert_eq!(outcome.kind, MatchKind::Uncertain);
1095        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1096    }
1097
1098    #[tokio::test]
1099    async fn ban_detection_does_not_fire_on_legitimate_403() {
1100        let server = MockServer::start().await;
1101        Mock::given(method("GET"))
1102            .and(path("/alice"))
1103            .respond_with(ResponseTemplate::new(403))
1104            .mount(&server)
1105            .await;
1106        let site = site_with(
1107            &server,
1108            vec![
1109                Signal::StatusFound { codes: vec![200] },
1110                Signal::StatusNotFound { codes: vec![403] },
1111            ],
1112        );
1113        let outcome = build_client().check(&site, &user()).await;
1114        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1115        assert_eq!(outcome.kind, MatchKind::NotFound);
1116        assert!(outcome.reason.is_none());
1117    }
1118
1119    #[tokio::test]
1120    async fn retry_recovers_after_transient_429() {
1121        let server = MockServer::start().await;
1122        // First request: 429. Subsequent: 200.
1123        Mock::given(method("GET"))
1124            .and(path("/alice"))
1125            .respond_with(ResponseTemplate::new(429))
1126            .up_to_n_times(1)
1127            .mount(&server)
1128            .await;
1129        Mock::given(method("GET"))
1130            .and(path("/alice"))
1131            .respond_with(ResponseTemplate::new(200))
1132            .mount(&server)
1133            .await;
1134        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1135        let client = Client::builder()
1136            .timeout(Duration::from_secs(2))
1137            .min_request_interval(Duration::ZERO)
1138            .max_retries(2)
1139            .base_backoff_delay(Duration::from_millis(20))
1140            .max_backoff_delay(Duration::from_millis(100))
1141            .build()
1142            .unwrap();
1143        let outcome = client.check(&site, &user()).await;
1144        assert_eq!(outcome.kind, MatchKind::Found);
1145        assert!(outcome.reason.is_none());
1146    }
1147
1148    #[tokio::test]
1149    async fn retry_exhausts_and_returns_uncertain() {
1150        let server = MockServer::start().await;
1151        Mock::given(method("GET"))
1152            .and(path("/alice"))
1153            .respond_with(ResponseTemplate::new(429))
1154            .mount(&server)
1155            .await;
1156        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1157        let client = Client::builder()
1158            .timeout(Duration::from_secs(2))
1159            .min_request_interval(Duration::ZERO)
1160            .max_retries(2)
1161            .base_backoff_delay(Duration::from_millis(10))
1162            .max_backoff_delay(Duration::from_millis(50))
1163            .build()
1164            .unwrap();
1165        let outcome = client.check(&site, &user()).await;
1166        assert_eq!(outcome.kind, MatchKind::Uncertain);
1167        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1168    }
1169
1170    #[tokio::test]
1171    async fn retry_does_not_fire_on_network_error() {
1172        // Connection refused → Uncertain note starts with "request:", not a
1173        // ban marker. We must NOT retry — otherwise a single dead site
1174        // burns the full backoff budget before reporting.
1175        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1176        let port = listener.local_addr().unwrap().port();
1177        drop(listener);
1178        let site = Site {
1179            name: "Dead".into(),
1180            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1181            signals: vec![Signal::StatusFound { codes: vec![200] }],
1182            known_present: None,
1183            known_absent: None,
1184            extract: Vec::new(),
1185            tags: Vec::new(),
1186            request_headers: std::collections::BTreeMap::new(),
1187            regex_check: None,
1188            engine: None,
1189        };
1190        let client = Client::builder()
1191            .timeout(Duration::from_millis(500))
1192            .connect_timeout(Duration::from_millis(500))
1193            .min_request_interval(Duration::ZERO)
1194            .max_retries(3)
1195            .base_backoff_delay(Duration::from_secs(60))
1196            .build()
1197            .unwrap();
1198        let started = Instant::now();
1199        let outcome = client.check(&site, &user()).await;
1200        // If retry fired, we'd be sleeping minutes; instead this returns
1201        // promptly with an Uncertain.
1202        assert!(started.elapsed() < Duration::from_secs(5));
1203        assert_eq!(outcome.kind, MatchKind::Uncertain);
1204        assert!(
1205            matches!(outcome.reason, Some(UncertainReason::Network(_))),
1206            "got {:?}",
1207            outcome.reason,
1208        );
1209    }
1210
1211    #[tokio::test]
1212    async fn rotates_user_agent_per_request() {
1213        // The mock only matches when the request carries one of the pooled
1214        // UAs; if rotation weren't applied, the default adler/x.y UA would
1215        // miss and the verdict would be NotFound.
1216        let server = MockServer::start().await;
1217        Mock::given(method("GET"))
1218            .and(path("/alice"))
1219            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1220            .respond_with(ResponseTemplate::new(200))
1221            .mount(&server)
1222            .await;
1223        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1224        let client = Client::builder()
1225            .min_request_interval(Duration::ZERO)
1226            .max_retries(0)
1227            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1228            .build()
1229            .unwrap();
1230        let outcome = client.check(&site, &user()).await;
1231        assert_eq!(outcome.kind, MatchKind::Found);
1232    }
1233
1234    #[test]
1235    fn invalid_proxy_url_fails_build() {
1236        let err = Client::builder().proxy("not a url").build().unwrap_err();
1237        assert!(matches!(err, Error::HttpSetup { .. }));
1238    }
1239
1240    #[test]
1241    fn schemeless_proxy_is_rejected_up_front() {
1242        // reqwest would silently treat this as a host; we require a scheme.
1243        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1244        let Error::HttpSetup { message } = err else {
1245            panic!("expected HttpSetup, got {err:?}");
1246        };
1247        assert!(message.contains("must start with"), "{message}");
1248    }
1249
1250    #[test]
1251    fn socks5_proxy_scheme_is_accepted() {
1252        // Valid scheme + endpoint builds fine (no connection is attempted).
1253        assert!(
1254            Client::builder()
1255                .proxy("socks5://127.0.0.1:9050")
1256                .build()
1257                .is_ok()
1258        );
1259    }
1260
1261    #[tokio::test]
1262    async fn global_rps_cap_spaces_requests_across_hosts() {
1263        // Two distinct host paths; per-host throttle is disabled, so any
1264        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1265        let server = MockServer::start().await;
1266        Mock::given(method("GET"))
1267            .respond_with(ResponseTemplate::new(200))
1268            .mount(&server)
1269            .await;
1270        let site_a = Site {
1271            name: "A".into(),
1272            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1273            signals: vec![Signal::StatusFound { codes: vec![200] }],
1274            known_present: None,
1275            known_absent: None,
1276            extract: Vec::new(),
1277            tags: Vec::new(),
1278            request_headers: std::collections::BTreeMap::new(),
1279            regex_check: None,
1280            engine: None,
1281        };
1282        let site_b = Site {
1283            name: "B".into(),
1284            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1285            signals: vec![Signal::StatusFound { codes: vec![200] }],
1286            known_present: None,
1287            known_absent: None,
1288            extract: Vec::new(),
1289            tags: Vec::new(),
1290            request_headers: std::collections::BTreeMap::new(),
1291            regex_check: None,
1292            engine: None,
1293        };
1294        // 2 RPS → ~500 ms between requests. A large interval keeps the
1295        // assertion robust even when the first probe's own duration (which
1296        // eats into the measured gap) is inflated by test instrumentation
1297        // such as coverage tooling.
1298        let client = Client::builder()
1299            .min_request_interval(Duration::ZERO)
1300            .max_retries(0)
1301            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1302            .build()
1303            .unwrap();
1304        // First request consumes the slot at t≈0; second waits ~500 ms even
1305        // though it targets a different host.
1306        client.check(&site_a, &user()).await;
1307        let started = Instant::now();
1308        client.check(&site_b, &user()).await;
1309        assert!(
1310            started.elapsed() >= Duration::from_millis(350),
1311            "global cap should space cross-host requests, got {:?}",
1312            started.elapsed(),
1313        );
1314    }
1315
1316    #[tokio::test]
1317    async fn respect_robots_skips_disallowed_paths() {
1318        let server = MockServer::start().await;
1319        Mock::given(method("GET"))
1320            .and(path("/robots.txt"))
1321            .respond_with(
1322                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1323            )
1324            .mount(&server)
1325            .await;
1326        Mock::given(method("GET"))
1327            .and(path("/no/alice"))
1328            .respond_with(ResponseTemplate::new(200))
1329            .mount(&server)
1330            .await;
1331        Mock::given(method("GET"))
1332            .and(path("/yes/alice"))
1333            .respond_with(ResponseTemplate::new(200))
1334            .mount(&server)
1335            .await;
1336        let client = Client::builder()
1337            .min_request_interval(Duration::ZERO)
1338            .max_retries(0)
1339            .respect_robots(true)
1340            .build()
1341            .unwrap();
1342
1343        let disallowed = Site {
1344            name: "No".into(),
1345            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1346            signals: vec![Signal::StatusFound { codes: vec![200] }],
1347            known_present: None,
1348            known_absent: None,
1349            extract: Vec::new(),
1350            tags: Vec::new(),
1351            request_headers: std::collections::BTreeMap::new(),
1352            regex_check: None,
1353            engine: None,
1354        };
1355        let allowed = Site {
1356            name: "Yes".into(),
1357            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1358            signals: vec![Signal::StatusFound { codes: vec![200] }],
1359            known_present: None,
1360            known_absent: None,
1361            extract: Vec::new(),
1362            tags: Vec::new(),
1363            request_headers: std::collections::BTreeMap::new(),
1364            regex_check: None,
1365            engine: None,
1366        };
1367
1368        let no = client.check(&disallowed, &user()).await;
1369        assert_eq!(no.kind, MatchKind::Uncertain);
1370        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1371
1372        let yes = client.check(&allowed, &user()).await;
1373        assert_eq!(yes.kind, MatchKind::Found);
1374    }
1375
1376    #[tokio::test]
1377    async fn body_read_skipped_when_no_body_signal_needed() {
1378        // Mock returns body that would fail a body_absent check — but since
1379        // we only have a status signal, body is never read.
1380        let server = MockServer::start().await;
1381        Mock::given(method("GET"))
1382            .and(path("/alice"))
1383            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1384            .mount(&server)
1385            .await;
1386        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1387        let outcome = build_client().check(&site, &user()).await;
1388        assert_eq!(outcome.kind, MatchKind::Found);
1389    }
1390
1391    // ===== Browser routing =====
1392
1393    /// Test backend that returns a canned page and counts calls. Lets the
1394    /// routing tests assert "Client did/did not invoke the browser" without
1395    /// involving a real Chrome process.
1396    #[derive(Debug)]
1397    struct RecordingBackend {
1398        page: RenderedPage,
1399        calls: std::sync::atomic::AtomicUsize,
1400    }
1401
1402    impl RecordingBackend {
1403        fn with_page(page: RenderedPage) -> Self {
1404            Self {
1405                page,
1406                calls: std::sync::atomic::AtomicUsize::new(0),
1407            }
1408        }
1409        fn call_count(&self) -> usize {
1410            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1411        }
1412    }
1413
1414    #[async_trait::async_trait]
1415    impl BrowserBackend for RecordingBackend {
1416        async fn fetch(
1417            &self,
1418            _url: &url::Url,
1419            _headers: &std::collections::BTreeMap<String, String>,
1420            _timeout: Duration,
1421        ) -> Result<RenderedPage> {
1422            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1423            Ok(self.page.clone())
1424        }
1425    }
1426
1427    fn site_bot_protected(server: &MockServer) -> Site {
1428        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1429        s.tags = vec!["bot-protected".into()];
1430        s
1431    }
1432
1433    #[tokio::test]
1434    async fn browser_routes_bot_protected_sites() {
1435        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1436        // returns its canned page directly.
1437        let server = MockServer::start().await;
1438        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1439            status: 200,
1440            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1441            body: "<html></html>".into(),
1442            elapsed_ms: 42,
1443        }));
1444        let client = Client::builder()
1445            .min_request_interval(Duration::ZERO)
1446            .max_retries(0)
1447            .browser(backend.clone())
1448            .build()
1449            .unwrap();
1450        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1451        assert_eq!(outcome.kind, MatchKind::Found);
1452        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1453    }
1454
1455    #[tokio::test]
1456    async fn non_bot_protected_sites_skip_browser() {
1457        let server = MockServer::start().await;
1458        Mock::given(method("GET"))
1459            .and(path("/alice"))
1460            .respond_with(ResponseTemplate::new(200))
1461            .mount(&server)
1462            .await;
1463        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1464            status: 500, // would make wiremock case fail if browser was taken
1465            final_url: url::Url::parse("https://x/").unwrap(),
1466            body: String::new(),
1467            elapsed_ms: 0,
1468        }));
1469        let client = Client::builder()
1470            .min_request_interval(Duration::ZERO)
1471            .max_retries(0)
1472            .browser(backend.clone())
1473            .build()
1474            .unwrap();
1475        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1476        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1477        let outcome = client.check(&site, &user()).await;
1478        assert_eq!(outcome.kind, MatchKind::Found);
1479        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1480    }
1481
1482    #[tokio::test]
1483    async fn browser_budget_exhaust_yields_uncertain() {
1484        let server = MockServer::start().await;
1485        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1486            status: 200,
1487            final_url: url::Url::parse("https://x/").unwrap(),
1488            body: String::new(),
1489            elapsed_ms: 0,
1490        }));
1491        let client = Client::builder()
1492            .min_request_interval(Duration::ZERO)
1493            .max_retries(0)
1494            .browser(backend.clone())
1495            .browser_budget(1)
1496            .build()
1497            .unwrap();
1498        let site = site_bot_protected(&server);
1499        // First call consumes the only slot.
1500        let first = client.check(&site, &user()).await;
1501        assert_eq!(first.kind, MatchKind::Found);
1502        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1503        let second = client.check(&site, &user()).await;
1504        assert_eq!(second.kind, MatchKind::Uncertain);
1505        assert!(matches!(
1506            second.reason,
1507            Some(UncertainReason::BrowserBudget)
1508        ));
1509        assert_eq!(
1510            backend.call_count(),
1511            1,
1512            "second call must not invoke backend"
1513        );
1514    }
1515
1516    #[tokio::test]
1517    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1518        struct FailingBackend;
1519        #[async_trait::async_trait]
1520        impl BrowserBackend for FailingBackend {
1521            async fn fetch(
1522                &self,
1523                _url: &url::Url,
1524                _headers: &std::collections::BTreeMap<String, String>,
1525                _timeout: Duration,
1526            ) -> Result<RenderedPage> {
1527                Err(Error::BrowserSetup {
1528                    message: "simulated crash".into(),
1529                })
1530            }
1531        }
1532        impl std::fmt::Debug for FailingBackend {
1533            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1534                f.write_str("FailingBackend")
1535            }
1536        }
1537
1538        let server = MockServer::start().await;
1539        let client = Client::builder()
1540            .min_request_interval(Duration::ZERO)
1541            .max_retries(0)
1542            .browser(Arc::new(FailingBackend))
1543            .build()
1544            .unwrap();
1545        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1546        assert_eq!(outcome.kind, MatchKind::Uncertain);
1547        match outcome.reason {
1548            Some(UncertainReason::BrowserFailed(msg)) => {
1549                assert!(msg.contains("simulated crash"), "got: {msg}");
1550            }
1551            other => panic!("expected BrowserFailed, got {other:?}"),
1552        }
1553    }
1554}