Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::borrow::Cow;
11use std::collections::BTreeMap;
12use std::fmt;
13use std::num::NonZeroU32;
14use std::sync::Arc;
15use std::time::{Duration, Instant};
16
17use reqwest::redirect;
18
19use crate::access::{EgressChoice, EgressPool, EgressSpec, SessionStore};
20use crate::browser::{BrowserBackend, BrowserBudget};
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::error::{Error, Result};
23use crate::retry::{self, RetryPolicy};
24use crate::robots::RobotsCache;
25use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
26use crate::throttle::HostThrottle;
27#[cfg(feature = "impersonate")]
28use crate::transport::ImpersonateFetcher;
29use crate::transport::{
30    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
31};
32use crate::username::Username;
33
/// Whole-request timeout a fresh [`ClientBuilder`] starts with.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connect timeout a fresh [`ClientBuilder`] starts with.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect cap a fresh [`ClientBuilder`] starts with; only applied when
/// redirect following is enabled.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Minimum spacing between two requests to the same host that a fresh
/// [`ClientBuilder`] starts with.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single fixed key for the global rate limiter (it gates all hosts).
const GLOBAL_THROTTLE_KEY: &str = "*global*";
40
/// HTTP client used to probe sites.
///
/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
/// internally, and the throttle is `Arc`-backed, so cloning is the
/// recommended way to share a client between tasks. Cloned clients share
/// throttle state, which is what you want: a fan-out scan must not
/// accidentally exceed a per-host budget by spawning more clients.
#[derive(Clone)]
pub struct Client {
    /// Default HTTP transport. Used by `fetch`, and by probes whenever
    /// the egress pool selects the default egress.
    http: Arc<HttpFetcher>,
    /// Geo / IP-type egress pool for sites whose `access` policy needs a
    /// specific proxy. Empty by default → every site uses `http`.
    egress: Arc<EgressPool>,
    /// Operator-supplied sessions, keyed by the name a site references
    /// via `access.session`. Empty by default.
    sessions: Arc<SessionStore>,
    /// Per-host request spacing; waited on with the URL's host before
    /// each raw-HTTP request.
    throttle: HostThrottle,
    /// Global RPS cap applied across all hosts. `None` → uncapped.
    global_throttle: Option<HostThrottle>,
    /// Retry policy consulted by `check` after every probe attempt.
    retry: RetryPolicy,
    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
    /// `Arc<[String]>` so cloning a client per task stays cheap.
    user_agents: Arc<[String]>,
    /// Extract profile fields from `Found` pages that declare extractors.
    enrich: bool,
    /// When set, skip probes disallowed by the host's `robots.txt`.
    robots: Option<RobotsCache>,
    /// Browser backend used for `bot-protected` sites. `None` → those sites
    /// stay on the raw HTTP path and typically end up `Uncertain`.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// TLS-fingerprint-impersonating HTTP client (`wreq`). Built when
    /// the `impersonate` Cargo feature is on; routes sites whose
    /// `protection` is exactly `TlsFingerprint`.
    ///
    /// NOTE(review): `probe_once` tries the browser route first, and that
    /// route takes every site with a non-empty `protection`, so this
    /// transport is only reached when no browser backend is configured.
    #[cfg(feature = "impersonate")]
    impersonate: Option<Arc<ImpersonateFetcher>>,
    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
    /// for a single scan, so several tasks compete for the same budget.
    browser_budget: Arc<BrowserBudget>,
}
80
81impl Client {
82    /// Start configuring a new client.
83    pub fn builder() -> ClientBuilder {
84        ClientBuilder::default()
85    }
86
87    /// Probe a single site for `username`, retrying on transient bans.
88    ///
89    /// Network failures, timeouts, and unexpected response shapes all yield
90    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
91    /// returns an error: at the executor level we want a partial result for
92    /// every site, not abort-on-first-failure semantics.
93    ///
94    /// When ban detection classifies a response as `rate_limited` /
95    /// `cloudflare_challenge`, the call is retried with jittered exponential
96    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
97    /// Uncertain (network errors, body read failures) is **not** retried —
98    /// those failures rarely fix themselves in the seconds-to-minutes window
99    /// we'd block for.
100    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
101    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
102        let mut attempt: u32 = 0;
103        loop {
104            let outcome = self.probe_once(site, username).await;
105            if !retry::should_retry(&outcome, attempt, &self.retry) {
106                return outcome;
107            }
108            let delay = retry::backoff_delay(attempt, &self.retry);
109            tracing::info!(
110                site = %site.name,
111                attempt = attempt + 1,
112                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
113                ?delay,
114                "transient ban, retrying",
115            );
116            tokio::time::sleep(delay).await;
117            attempt += 1;
118        }
119    }
120
121    /// Fetch a URL and return raw response data (status, final URL, body)
122    /// with the same throttle / User-Agent / proxy machinery as `check`,
123    /// but without signal evaluation or retry.
124    ///
125    /// Returns `None` on any network/transport error. Intended for
126    /// diagnostics such as `adler --doctor --fix`, which diffs the
127    /// responses for a known-present and a nonsense user to derive a
128    /// signature.
129    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
130        let host = host_of(url);
131        if let Some(global) = &self.global_throttle {
132            global.wait(GLOBAL_THROTTLE_KEY).await;
133        }
134        self.throttle.wait(&host).await;
135        let mut request = self.http.client().get(url);
136        if let Some(ua) = self.pick_user_agent() {
137            request = request.header(reqwest::header::USER_AGENT, ua);
138        }
139        let response = request.send().await.ok()?;
140        let status = response.status().as_u16();
141        let final_url = response.url().to_string();
142        let body = response.text().await.unwrap_or_default();
143        Some(RawResponse {
144            status,
145            final_url,
146            body,
147        })
148    }
149
150    /// Same as [`Self::fetch`] but routes through the configured browser
151    /// backend when the site is tagged `bot-protected` and a backend is
152    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
153    /// so that the diff-derivation works against the JS-rendered page
154    /// (login wall vs. real profile) rather than two identical raw-HTTP
155    /// shells.
156    ///
157    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
158    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
159    /// callers get the same `Option<RawResponse>` shape either way.
160    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
161        if let Some(backend) = self.browser.as_deref() {
162            let has_tag = site
163                .tags
164                .iter()
165                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
166            if has_tag || !site.protection.is_empty() {
167                let parsed = url::Url::parse(url).ok()?;
168                match backend
169                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
170                    .await
171                {
172                    Ok(page) => {
173                        return Some(RawResponse {
174                            status: page.status,
175                            final_url: page.final_url.to_string(),
176                            body: page.body,
177                        });
178                    }
179                    Err(err) => {
180                        tracing::warn!(
181                            site = %site.name, %url, error = %err,
182                            "browser fetch failed in doctor; falling back to raw HTTP",
183                        );
184                    }
185                }
186            }
187        }
188        self.fetch(url).await
189    }
190
191    /// Pick a User-Agent for the next request from the rotation pool, or
192    /// `None` to fall back on the client's fixed header.
193    fn pick_user_agent(&self) -> Option<&str> {
194        match self.user_agents.len() {
195            0 => None,
196            1 => Some(&self.user_agents[0]),
197            n => Some(&self.user_agents[fastrand::usize(0..n)]),
198        }
199    }
200
    /// Run a single probe attempt for `username` against `site`.
    ///
    /// The steps run in this order, each able to short-circuit with an
    /// `Uncertain` outcome:
    ///
    /// 1. username constraint (`regex_check`),
    /// 2. operator session lookup,
    /// 3. browser route (needs a configured backend and remaining budget),
    /// 4. impersonating route (`impersonate` feature only),
    /// 5. egress selection, robots.txt gate, throttles, raw HTTP fetch.
    ///
    /// Successful fetches on any transport are evaluated by `finish`;
    /// transport errors become `Uncertain` with the fetcher's reason.
    // Splitting probe_once into helpers would scatter the request/response
    // flow that has to read top-to-bottom; one long function reads better.
    #[allow(clippy::too_many_lines)]
    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
        let url = site.url_for(username);

        // Site-level username constraint (Sherlock's `regexCheck`).
        // Mismatch → skip the probe entirely. Saves a request and
        // sidesteps the false-positive class where a site 404s on
        // illegal usernames in a way our signal can't distinguish
        // from a missing account. If the pattern fails to compile
        // (Sherlock occasionally uses lookarounds, which our `regex`
        // crate can't express), we let validate's warn-log stand
        // and silently fall through — the rest of the probe still
        // works.
        // NOTE(review): the pattern is compiled on every attempt,
        // including retries; consider caching the compiled regex.
        if let Some(pat) = &site.regex_check {
            if let Ok(re) = regex::Regex::new(pat) {
                if !re.is_match(username.as_str()) {
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::UsernameNotAllowed,
                    );
                }
            }
        }

        // Resolve an operator session if the site's access policy names
        // one, and fold its headers (cookies / tokens) over the site's
        // own. A named-but-missing session is reported rather than sent
        // unauthenticated into a login wall — which reads identically
        // for an existing and a missing account. Applies to both the
        // HTTP and browser transports.
        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
            None => Cow::Borrowed(&site.request_headers),
            Some(name) => match self.sessions.get(name) {
                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
                None => {
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::SessionRequired,
                    );
                }
            },
        };
        let headers: &BTreeMap<String, String> = &session_headers;

        // Auto-route bot-protected sites through the browser backend when
        // one is configured. Raw HTTP can't see past their JS/login wall,
        // so this is the only way they ever produce a Found verdict.
        // A site is "bot-protected" in the routing sense if it carries
        // the legacy tag OR declares any specific protection mechanism
        // via the new `protection` field — either signal is enough.
        //
        // This route always returns, so the robots.txt gate and both
        // throttles further down are never applied to it, and the
        // request is sent with `body: None` whatever the site's method.
        if let Some(backend) = &self.browser {
            let has_tag = site
                .tags
                .iter()
                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if has_tag || !site.protection.is_empty() {
                if self.browser_budget.try_consume() {
                    let started = Instant::now();
                    let req = FetchRequest {
                        method: site.request_method,
                        url: &url,
                        body: None,
                        user_agent: None,
                        headers,
                        want_body: true,
                    };
                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
                    return match fetcher.fetch(&req).await {
                        Ok(resp) => self.finish(site, url, started, &resp),
                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
                    };
                }
                // Budget spent: report it rather than falling back to a
                // raw-HTTP probe.
                tracing::warn!(site = %site.name, "browser budget exhausted");
                return uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::BrowserBudget,
                );
            }
        }

        // Phase 2: route pure-`TlsFingerprint` sites through the
        // impersonating transport — a real BoringSSL TLS handshake from
        // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
        // protection tag, at a fraction of the cost of a real browser.
        // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
        // keep going through the browser path above, where they were.
        //
        // NOTE(review): a pure-TLS site has a non-empty `protection`, so
        // whenever a browser backend is configured the block above has
        // already returned and this route is only reached without a
        // browser. Confirm that ordering is intended. Like the browser
        // route, this one returns before the robots.txt gate and the
        // throttles, and sends `body: None`.
        #[cfg(feature = "impersonate")]
        if let Some(fetcher) = &self.impersonate {
            let pure_tls = site.protection.len() == 1
                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
                && !site
                    .tags
                    .iter()
                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if pure_tls {
                let started = Instant::now();
                let req = FetchRequest {
                    method: site.request_method,
                    url: &url,
                    body: None,
                    user_agent: self.pick_user_agent(),
                    headers,
                    want_body: true,
                };
                return match fetcher.fetch(&req).await {
                    Ok(resp) => self.finish(site, url, started, &resp),
                    Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
                };
            }
        }

        // Egress selection: route the HTTP path through a geo / IP-type
        // matching proxy when the site's access policy demands one. An
        // unconstrained policy uses the default egress; a constrained
        // policy with no matching egress is reported `GeoUnavailable`
        // rather than fetched from the wrong location (a false
        // `NotFound` would be worse than an honest `Uncertain`).
        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
            EgressChoice::Default => Arc::clone(&self.http),
            EgressChoice::Use(fetcher) => fetcher,
            EgressChoice::Unavailable => {
                return uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::GeoUnavailable,
                );
            }
        };

        let host = host_of(&url);

        // robots.txt gate, before consuming a throttle slot or probing.
        // A URL that `origin_and_path` cannot split is not gated.
        if let Some(robots) = &self.robots {
            if let Some((origin, path)) = origin_and_path(&url) {
                if !robots.allowed(&origin, &path).await {
                    tracing::debug!(%url, "skipped by robots.txt");
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::RobotsDisallowed,
                    );
                }
            }
        }

        // Global cap first (gates every request), then per-host spacing.
        if let Some(global) = &self.global_throttle {
            global.wait(GLOBAL_THROTTLE_KEY).await;
        }
        self.throttle.wait(&host).await;
        // The clock starts after the throttle waits, so they are not
        // counted against the probe's start time.
        let started = Instant::now();
        tracing::debug!(%url, %host, "probing");

        // Read the body only if a signal needs it, or enrichment is on
        // and the site declares extractor rules (extraction needs it).
        let want_enrich = self.enrich && !site.extract.is_empty();
        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);

        // POST sites carry their own body payload (the username goes in
        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
        // `{username}` in `Site::request_body` is substituted here,
        // mirroring URL substitution.
        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
            const USERNAME_PH: &str = "{username}";
            site.request_body
                .as_deref()
                .map(|t| t.replace(USERNAME_PH, username.as_str()))
        } else {
            None
        };

        let req = FetchRequest {
            method: site.request_method,
            url: &url,
            body: body_for_post.as_deref(),
            user_agent: self.pick_user_agent(),
            headers,
            want_body: needs_body,
        };
        match egress.fetch(&req).await {
            Ok(resp) => self.finish(site, url, started, &resp),
            Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
        }
    }
395
396    /// Evaluate a fetched response against the site's signals and build
397    /// the outcome. Shared by the HTTP and browser transports so the
398    /// verdict / evidence / enrichment logic lives in exactly one place.
399    fn finish(
400        &self,
401        site: &Site,
402        url: String,
403        started: Instant,
404        resp: &crate::transport::FetchResponse,
405    ) -> CheckOutcome {
406        let probe = Probe {
407            status: resp.status,
408            final_url: &resp.final_url,
409            body: &resp.body,
410        };
411        let votes: Vec<(&Signal, SignalVerdict)> = site
412            .signals
413            .iter()
414            .map(|s| (s, s.evaluate(&probe)))
415            .collect();
416        let kind = aggregate(votes.iter().map(|(_, v)| *v));
417        let mut result = outcome(&site.name, url, started, kind);
418        // Record which signals produced the verdict (the winning polarity).
419        let winning = match kind {
420            MatchKind::Found => Some(SignalVerdict::Found),
421            MatchKind::NotFound => Some(SignalVerdict::NotFound),
422            MatchKind::Uncertain => None,
423        };
424        if let Some(want) = winning {
425            result.evidence = votes
426                .iter()
427                .filter(|(_, v)| *v == want)
428                .map(|(s, _)| s.describe_match(&probe))
429                .collect();
430        }
431        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
432            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
433        }
434        result
435    }
436}
437
/// Raw response data returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`] for diagnostics.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code.
    pub status: u16,
    /// Final URL after redirects.
    pub final_url: String,
    /// Decoded response body. [`Client::fetch`] leaves this empty when
    /// the body could not be read.
    pub body: String,
}
448
/// Builder for [`Client`].
///
/// Every setter consumes and returns the builder, so calls chain;
/// nothing is constructed until [`ClientBuilder::build`] runs.
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
pub struct ClientBuilder {
    /// Per-request timeout.
    timeout: Duration,
    /// Connect timeout, applied independently of `timeout`.
    connect_timeout: Duration,
    /// Fixed `User-Agent` baked into every HTTP client that gets built.
    user_agent: String,
    /// Whether redirects are followed at all.
    follow_redirects: bool,
    /// Redirect cap used when `follow_redirects` is on.
    redirect_limit: usize,
    /// Minimum spacing between requests to the same host.
    min_request_interval: Duration,
    /// Optional cap on requests per second across all hosts.
    max_rps: Option<NonZeroU32>,
    /// Retry / backoff settings handed to the client.
    retry: RetryPolicy,
    /// Optional proxy URL for the default HTTP client.
    proxy: Option<String>,
    /// User-Agent rotation pool; empty keeps the fixed `user_agent`.
    user_agents: Vec<String>,
    /// Whether `Found` pages get profile-field extraction.
    enrich: bool,
    /// Whether a `robots.txt` cache is attached to the client.
    respect_robots: bool,
    /// Optional browser backend for protected sites.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Cap on browser-backed fetches.
    browser_budget: usize,
    /// Egress proxies; one HTTP client is built per entry.
    egress: Vec<EgressSpec>,
    /// Operator-supplied sessions made available to probes.
    sessions: SessionStore,
}
470
impl Default for ClientBuilder {
    /// Baseline configuration: 10 s request / 5 s connect timeouts, the
    /// `adler/<version>` User-Agent, redirects followed up to the default
    /// limit, 100 ms per-host spacing, no global rate cap, no proxy, no
    /// User-Agent rotation, enrichment and robots.txt handling off, no
    /// browser backend, and empty egress pool and session store.
    fn default() -> Self {
        Self {
            timeout: DEFAULT_TIMEOUT,
            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
            user_agent: default_user_agent(),
            follow_redirects: true,
            redirect_limit: DEFAULT_REDIRECT_LIMIT,
            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
            // `None` leaves the global request rate uncapped.
            max_rps: None,
            retry: RetryPolicy::default(),
            proxy: None,
            // An empty pool disables User-Agent rotation.
            user_agents: Vec::new(),
            enrich: false,
            respect_robots: false,
            browser: None,
            browser_budget: DEFAULT_BROWSER_BUDGET,
            egress: Vec::new(),
            sessions: SessionStore::new(),
        }
    }
}
493
494impl ClientBuilder {
495    /// Per-request timeout (covers connect, headers, and body read).
496    pub fn timeout(mut self, timeout: Duration) -> Self {
497        self.timeout = timeout;
498        self
499    }
500
501    /// TCP-connect timeout, applied independently of the request timeout.
502    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
503        self.connect_timeout = timeout;
504        self
505    }
506
507    /// Override the `User-Agent` header sent on every request.
508    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
509        self.user_agent = user_agent.into();
510        self
511    }
512
513    /// Toggle automatic redirect following. Defaults to `true`; disable when
514    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
515    pub fn follow_redirects(mut self, follow: bool) -> Self {
516        self.follow_redirects = follow;
517        self
518    }
519
520    /// Minimum time between consecutive requests to the same host.
521    ///
522    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
523    /// rate-limit responses on common OSINT targets while keeping fan-out
524    /// across many sites fast.
525    pub fn min_request_interval(mut self, interval: Duration) -> Self {
526        self.min_request_interval = interval;
527        self
528    }
529
530    /// Cap the total request rate across *all* hosts to `rps` requests per
531    /// second. Independent of (and composed with) the per-host interval —
532    /// useful on a metered connection or behind a shared-quota proxy.
533    /// Uncapped by default.
534    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
535        self.max_rps = Some(rps);
536        self
537    }
538
539    /// Maximum retry attempts after a transient ban response. Defaults to 2
540    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
541    pub fn max_retries(mut self, n: u32) -> Self {
542        self.retry.max_retries = n;
543        self
544    }
545
546    /// Base delay for the first retry. Subsequent retries double until
547    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
548    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
549        self.retry.base_delay = d;
550        self
551    }
552
553    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
554    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
555        self.retry.max_delay = d;
556        self
557    }
558
559    /// Route all requests through a proxy. Accepts `http://`, `https://`,
560    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
561    pub fn proxy(mut self, url: impl Into<String>) -> Self {
562        self.proxy = Some(url.into());
563        self
564    }
565
566    /// Rotate the `User-Agent` header per request, picking uniformly at
567    /// random from `agents`. An empty list (the default) keeps the single
568    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
569    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
570        self.user_agents = agents;
571        self
572    }
573
574    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
575    /// pages. Off by default; enables an extra body read for matching sites.
576    pub fn enrich(mut self, enrich: bool) -> Self {
577        self.enrich = enrich;
578        self
579    }
580
581    /// Honor each host's `robots.txt`: probes to disallowed paths are
582    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
583    /// default. Adds one cached `robots.txt` fetch per origin.
584    pub fn respect_robots(mut self, respect: bool) -> Self {
585        self.respect_robots = respect;
586        self
587    }
588
589    /// Attach a browser backend. Sites tagged `bot-protected` will be
590    /// routed through it instead of the raw HTTP path, up to the
591    /// [`browser_budget`](Self::browser_budget) cap.
592    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
593        self.browser = Some(backend);
594        self
595    }
596
597    /// Per-scan cap on how many `bot-protected` sites are allowed to use
598    /// the browser backend. Once exhausted, the rest fall back to
599    /// `Uncertain(BrowserBudget)`. Defaults to
600    /// [`DEFAULT_BROWSER_BUDGET`].
601    pub const fn browser_budget(mut self, cap: usize) -> Self {
602        self.browser_budget = cap;
603        self
604    }
605
606    /// Configure the egress pool: proxies tagged by country / IP type
607    /// that sites with an `access` policy can require. Sites without a
608    /// policy are unaffected (they use the default egress / `--proxy`).
609    /// Replaces any previously set pool.
610    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
611        self.egress = egress;
612        self
613    }
614
615    /// Supply operator authenticated sessions. A site whose `access`
616    /// policy names a session has that session's headers (cookies /
617    /// tokens) applied to its probe; a named-but-missing session yields
618    /// `Uncertain(SessionRequired)` rather than a login-wall false
619    /// negative. Replaces any previously set store.
620    pub fn sessions(mut self, sessions: SessionStore) -> Self {
621        self.sessions = sessions;
622        self
623    }
624
625    /// Build a [`Client`].
626    pub fn build(self) -> Result<Client> {
627        let inner = build_reqwest(
628            &self.user_agent,
629            self.timeout,
630            self.connect_timeout,
631            self.follow_redirects,
632            self.redirect_limit,
633            self.proxy.as_deref(),
634        )?;
635
636        // One HTTP client per configured egress — `reqwest` bakes the
637        // proxy in at build time, so geo / IP-type routing means a
638        // distinct client per proxy, paired with its match metadata.
639        let mut egress_entries = Vec::with_capacity(self.egress.len());
640        for spec in &self.egress {
641            let client = build_reqwest(
642                &self.user_agent,
643                self.timeout,
644                self.connect_timeout,
645                self.follow_redirects,
646                self.redirect_limit,
647                Some(&spec.url),
648            )?;
649            egress_entries.push((
650                spec.country.clone(),
651                spec.kind,
652                Arc::new(HttpFetcher::new(client)),
653            ));
654        }
655
656        let global_throttle = self.max_rps.map(|rps| {
657            // Min spacing between any two requests = 1s / rps.
658            let interval = Duration::from_secs(1) / rps.get();
659            HostThrottle::new(interval)
660        });
661        let robots = self
662            .respect_robots
663            .then(|| RobotsCache::new(inner.clone(), "adler"));
664        // Build the impersonate fetcher up front when the feature is on;
665        // surface a wreq init failure as `HttpSetup` so the caller sees
666        // it the same way they'd see a bad `--proxy` URL.
667        #[cfg(feature = "impersonate")]
668        let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
669        Ok(Client {
670            http: Arc::new(HttpFetcher::new(inner)),
671            egress: Arc::new(EgressPool::new(egress_entries)),
672            sessions: Arc::new(self.sessions),
673            throttle: HostThrottle::new(self.min_request_interval),
674            global_throttle,
675            retry: self.retry,
676            user_agents: Arc::from(self.user_agents),
677            enrich: self.enrich,
678            robots,
679            browser: self.browser,
680            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
681            #[cfg(feature = "impersonate")]
682            impersonate,
683        })
684    }
685}
686
687/// Build a configured `reqwest::Client`, optionally routed through a
688/// proxy. Shared by the default client and every egress in the pool so
689/// they get identical timeout / redirect / User-Agent settings.
690fn build_reqwest(
691    user_agent: &str,
692    timeout: Duration,
693    connect_timeout: Duration,
694    follow_redirects: bool,
695    redirect_limit: usize,
696    proxy: Option<&str>,
697) -> Result<reqwest::Client> {
698    let redirect_policy = if follow_redirects {
699        redirect::Policy::limited(redirect_limit)
700    } else {
701        redirect::Policy::none()
702    };
703    let mut builder = reqwest::Client::builder()
704        .user_agent(user_agent.to_owned())
705        .timeout(timeout)
706        .connect_timeout(connect_timeout)
707        .redirect(redirect_policy);
708    if let Some(proxy_url) = proxy {
709        // reqwest treats a schemeless string (e.g. "not-a-url") as a host
710        // and silently defaults it to http://, so every probe would fail
711        // confusingly. Require an explicit, supported scheme up front.
712        const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
713        if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
714            return Err(Error::HttpSetup {
715                message: format!(
716                    "invalid proxy {proxy_url:?}: must start with one of {}",
717                    SCHEMES.join(", ")
718                ),
719            });
720        }
721        let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
722            message: format!("invalid proxy {proxy_url:?}: {e}"),
723        })?;
724        builder = builder.proxy(proxy);
725    }
726    builder.build().map_err(|e| Error::HttpSetup {
727        message: e.to_string(),
728    })
729}
730
/// Default ceiling on browser-backed probes per scan when no other value
/// is specified; this is what a fresh [`ClientBuilder`] starts with.
///
/// Sized as ~5× the typical `bot-protected` registry subset — comfortable
/// headroom while still being a guardrail against a misconfigured flag
/// burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
738
739impl fmt::Debug for Client {
740    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
741        f.debug_struct("Client")
742            .field("throttle", &self.throttle)
743            .field("global_throttle", &self.global_throttle)
744            .field("retry", &self.retry)
745            .field("user_agents", &self.user_agents)
746            .field("enrich", &self.enrich)
747            .field("robots", &self.robots.is_some())
748            .field("browser", &self.browser.is_some())
749            .field("browser_budget", &self.browser_budget)
750            .finish_non_exhaustive()
751    }
752}
753
754impl fmt::Debug for ClientBuilder {
755    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
756        f.debug_struct("ClientBuilder")
757            .field("timeout", &self.timeout)
758            .field("connect_timeout", &self.connect_timeout)
759            .field("user_agent", &self.user_agent)
760            .field("follow_redirects", &self.follow_redirects)
761            .field("redirect_limit", &self.redirect_limit)
762            .field("min_request_interval", &self.min_request_interval)
763            .field("max_rps", &self.max_rps)
764            .field("retry", &self.retry)
765            .field("proxy", &self.proxy)
766            .field("user_agents", &self.user_agents)
767            .field("enrich", &self.enrich)
768            .field("respect_robots", &self.respect_robots)
769            .field("browser", &self.browser.is_some())
770            .field("browser_budget", &self.browser_budget)
771            .field("egress", &self.egress)
772            .field("sessions", &self.sessions)
773            .finish()
774    }
775}
776
/// Tag string `bot-protected`.
///
/// NOTE(review): presumably compared against a site's `tags` to pick out
/// the sites that need browser-backed probing — confirm at the use site.
const BOT_PROTECTED_TAG: &str = "bot-protected";
778
779fn default_user_agent() -> String {
780    format!("adler/{}", env!("CARGO_PKG_VERSION"))
781}
782
783fn host_of(url: &str) -> String {
784    reqwest::Url::parse(url)
785        .ok()
786        .and_then(|u| u.host_str().map(str::to_owned))
787        .unwrap_or_else(|| "unknown".into())
788}
789
790/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
791/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
792fn origin_and_path(url: &str) -> Option<(String, String)> {
793    let parsed = reqwest::Url::parse(url).ok()?;
794    let host = parsed.host_str()?;
795    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
796    let origin = format!("{}://{host}{port}", parsed.scheme());
797    let path = parsed.query().map_or_else(
798        || parsed.path().to_owned(),
799        |q| format!("{}?{q}", parsed.path()),
800    );
801    Some((origin, path))
802}
803
804fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
805    CheckOutcome {
806        site: site.to_owned(),
807        url,
808        kind,
809        reason: None,
810        elapsed_ms: elapsed_ms(started),
811        enrichment: std::collections::BTreeMap::new(),
812        evidence: Vec::new(),
813    }
814}
815
816fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
817    CheckOutcome {
818        site: site.to_owned(),
819        url,
820        kind: MatchKind::Uncertain,
821        reason: Some(reason),
822        elapsed_ms: elapsed_ms(started),
823        enrichment: std::collections::BTreeMap::new(),
824        evidence: Vec::new(),
825    }
826}
827
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// if the `u128` millisecond count does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    u64::try_from(millis).unwrap_or(u64::MAX)
}
831
832#[cfg(test)]
833mod tests {
834    use super::*;
835    use crate::browser::RenderedPage;
836    use crate::site::{Signal, UrlTemplate};
837    use wiremock::matchers::{any, method, path};
838    use wiremock::{Mock, MockServer, ResponseTemplate};
839
840    fn build_client() -> Client {
841        Client::builder()
842            .timeout(Duration::from_secs(2))
843            // Tests share `127.0.0.1` as host — keep throttle out of the
844            // way for everything but the dedicated throttle test below.
845            .min_request_interval(Duration::ZERO)
846            // Default retry would re-hit ban-test mocks; tests opt in
847            // explicitly when they want to exercise the retry path.
848            .max_retries(0)
849            .build()
850            .expect("client builds")
851    }
852
853    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
854        Site {
855            name: "Mock".into(),
856            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
857            signals,
858            known_present: None,
859            known_absent: None,
860            extract: Vec::new(),
861            tags: Vec::new(),
862            request_headers: std::collections::BTreeMap::new(),
863            regex_check: None,
864            engine: None,
865            strip_bad_char: None,
866            request_method: crate::site::HttpMethod::Get,
867            request_body: None,
868            protection: Vec::new(),
869            disabled: false,
870            source: None,
871            popularity: None,
872            access: crate::AccessPolicy::default(),
873        }
874    }
875
876    fn user() -> Username {
877        Username::new("alice").unwrap()
878    }
879
880    #[tokio::test]
881    async fn regex_check_short_circuits_before_any_request() {
882        // Stand up a mock that would 200 on *anything* — if probe_once
883        // failed to short-circuit on regex mismatch, the username
884        // "alice" (5 chars) would resolve to Found here.
885        let server = MockServer::start().await;
886        Mock::given(any())
887            .respond_with(ResponseTemplate::new(200))
888            .mount(&server)
889            .await;
890        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
891        // The site only accepts usernames of 8+ chars; "alice" is 5.
892        site.regex_check = Some("^[A-Za-z]{8,}$".into());
893        let outcome = build_client().check(&site, &user()).await;
894        assert_eq!(outcome.kind, MatchKind::Uncertain);
895        assert!(
896            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
897            "expected UsernameNotAllowed, got {:?}",
898            outcome.reason,
899        );
900        // No request should have hit the mock — assert by counting
901        // received_requests on the wiremock server.
902        let recvd = server.received_requests().await.unwrap_or_default();
903        assert_eq!(
904            recvd.len(),
905            0,
906            "regex_check mismatch must skip the HTTP request entirely"
907        );
908    }
909
910    #[tokio::test]
911    async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
912        // A mock that would 200 on anything — if the geo gate failed to
913        // short-circuit, "alice" would resolve to Found here.
914        let server = MockServer::start().await;
915        Mock::given(any())
916            .respond_with(ResponseTemplate::new(200))
917            .mount(&server)
918            .await;
919        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
920        // Require a Polish egress; the default client has no egress pool,
921        // so nothing can satisfy it.
922        site.access = crate::access::AccessPolicy {
923            geo: vec![crate::access::CountryCode::new("pl").unwrap()],
924            ip_type: None,
925            session: None,
926        };
927        let outcome = build_client().check(&site, &user()).await;
928        assert_eq!(outcome.kind, MatchKind::Uncertain);
929        assert!(
930            matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
931            "expected GeoUnavailable, got {:?}",
932            outcome.reason,
933        );
934        // The site must NOT have been probed — an unreachable geo is not
935        // evidence of absence, and we don't fetch from the wrong location.
936        let recvd = server.received_requests().await.unwrap_or_default();
937        assert_eq!(
938            recvd.len(),
939            0,
940            "geo-unavailable must skip the HTTP request entirely"
941        );
942    }
943
944    #[tokio::test]
945    async fn session_headers_are_sent_on_probe() {
946        // Only respond 200 when the request carries the session cookie,
947        // so a Found verdict proves the header was actually applied.
948        let server = MockServer::start().await;
949        Mock::given(any())
950            .and(wiremock::matchers::header("cookie", "sessionid=real"))
951            .respond_with(ResponseTemplate::new(200))
952            .mount(&server)
953            .await;
954        let mut headers = std::collections::BTreeMap::new();
955        headers.insert("Cookie".to_string(), "sessionid=real".to_string());
956        let mut store = SessionStore::new();
957        store.insert("acct", crate::access::Session::from_headers(headers));
958        let client = Client::builder()
959            .timeout(Duration::from_secs(2))
960            .min_request_interval(Duration::ZERO)
961            .max_retries(0)
962            .sessions(store)
963            .build()
964            .expect("client builds");
965        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
966        site.access.session = Some("acct".to_string());
967        let outcome = client.check(&site, &user()).await;
968        assert_eq!(
969            outcome.kind,
970            MatchKind::Found,
971            "session cookie should unlock the 200 (got {:?})",
972            outcome.reason,
973        );
974    }
975
976    #[tokio::test]
977    async fn missing_named_session_is_session_required() {
978        let server = MockServer::start().await;
979        Mock::given(any())
980            .respond_with(ResponseTemplate::new(200))
981            .mount(&server)
982            .await;
983        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
984        // Names a session the (empty) store doesn't have.
985        site.access.session = Some("not-configured".to_string());
986        let outcome = build_client().check(&site, &user()).await;
987        assert_eq!(outcome.kind, MatchKind::Uncertain);
988        assert!(
989            matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
990            "expected SessionRequired, got {:?}",
991            outcome.reason,
992        );
993        let recvd = server.received_requests().await.unwrap_or_default();
994        assert_eq!(
995            recvd.len(),
996            0,
997            "a missing session must skip the request, not probe unauthenticated"
998        );
999    }
1000
1001    #[cfg(feature = "impersonate")]
1002    #[tokio::test]
1003    async fn impersonate_routes_pure_tls_fingerprint_site() {
1004        let server = MockServer::start().await;
1005        Mock::given(any())
1006            .respond_with(ResponseTemplate::new(200))
1007            .mount(&server)
1008            .await;
1009        let client = Client::builder()
1010            .timeout(Duration::from_secs(2))
1011            .min_request_interval(Duration::ZERO)
1012            .max_retries(0)
1013            .build()
1014            .expect("client builds with impersonate");
1015        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1016        // Pure TLS-fingerprint protection — exactly the shape that
1017        // routes to the impersonate fetcher.
1018        site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];
1019        let outcome = client.check(&site, &user()).await;
1020        assert_eq!(
1021            outcome.kind,
1022            MatchKind::Found,
1023            "expected Found (reason {:?})",
1024            outcome.reason,
1025        );
1026        // wreq's Chrome-134 emulation sets a Chrome-shaped User-Agent —
1027        // observable proof that the request came from the impersonate
1028        // path and not the default `adler/<version>` HTTP fetcher.
1029        let recvd = server.received_requests().await.expect("received requests");
1030        assert_eq!(recvd.len(), 1, "expected exactly one request");
1031        let ua = recvd[0]
1032            .headers
1033            .get("user-agent")
1034            .and_then(|v| v.to_str().ok())
1035            .unwrap_or("");
1036        assert!(
1037            ua.contains("Chrome/"),
1038            "expected Chrome-shaped UA from wreq, got {ua:?}"
1039        );
1040    }
1041
1042    #[tokio::test]
1043    async fn regex_check_pass_proceeds_to_probe() {
1044        let server = MockServer::start().await;
1045        Mock::given(any())
1046            .and(path("/alice"))
1047            .respond_with(ResponseTemplate::new(200))
1048            .mount(&server)
1049            .await;
1050        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1051        // Pattern that matches "alice".
1052        site.regex_check = Some("^[a-z]{3,}$".into());
1053        let outcome = build_client().check(&site, &user()).await;
1054        assert_eq!(outcome.kind, MatchKind::Found);
1055    }
1056
1057    #[tokio::test]
1058    async fn status_signal_reports_found_on_match() {
1059        let server = MockServer::start().await;
1060        Mock::given(any())
1061            .and(path("/alice"))
1062            .respond_with(ResponseTemplate::new(200))
1063            .mount(&server)
1064            .await;
1065        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1066        let outcome = build_client().check(&site, &user()).await;
1067        assert_eq!(outcome.kind, MatchKind::Found);
1068        assert!(outcome.url.ends_with("/alice"));
1069        assert!(outcome.reason.is_none());
1070        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
1071    }
1072
1073    #[tokio::test]
1074    async fn status_signal_pair_reports_not_found_on_404() {
1075        let server = MockServer::start().await;
1076        Mock::given(any())
1077            .and(path("/alice"))
1078            .respond_with(ResponseTemplate::new(404))
1079            .mount(&server)
1080            .await;
1081        let site = site_with(
1082            &server,
1083            vec![
1084                Signal::StatusFound { codes: vec![200] },
1085                Signal::StatusNotFound { codes: vec![404] },
1086            ],
1087        );
1088        let outcome = build_client().check(&site, &user()).await;
1089        assert_eq!(outcome.kind, MatchKind::NotFound);
1090        // Only the NotFound-voting signal is cited as evidence.
1091        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
1092    }
1093
1094    #[tokio::test]
1095    async fn body_absent_signal_detects_missing_account() {
1096        let server = MockServer::start().await;
1097        Mock::given(any())
1098            .and(path("/alice"))
1099            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
1100            .mount(&server)
1101            .await;
1102        let site = site_with(
1103            &server,
1104            vec![Signal::BodyAbsent {
1105                text: "Profile not found".into(),
1106            }],
1107        );
1108        let outcome = build_client().check(&site, &user()).await;
1109        assert_eq!(outcome.kind, MatchKind::NotFound);
1110    }
1111
1112    #[tokio::test]
1113    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
1114        // Phase 2 semantics: absence of an absence-marker is not evidence
1115        // of presence — it just means we have no signal that fired.
1116        let server = MockServer::start().await;
1117        Mock::given(any())
1118            .and(path("/alice"))
1119            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
1120            .mount(&server)
1121            .await;
1122        let site = site_with(
1123            &server,
1124            vec![Signal::BodyAbsent {
1125                text: "Profile not found".into(),
1126            }],
1127        );
1128        let outcome = build_client().check(&site, &user()).await;
1129        assert_eq!(outcome.kind, MatchKind::Uncertain);
1130    }
1131
1132    #[tokio::test]
1133    async fn body_present_plus_absent_resolve_to_found() {
1134        let server = MockServer::start().await;
1135        Mock::given(any())
1136            .and(path("/alice"))
1137            .respond_with(
1138                ResponseTemplate::new(200)
1139                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
1140            )
1141            .mount(&server)
1142            .await;
1143        let site = site_with(
1144            &server,
1145            vec![
1146                Signal::BodyPresent {
1147                    text: "profile-card".into(),
1148                },
1149                Signal::BodyAbsent {
1150                    text: "Profile not found".into(),
1151                },
1152            ],
1153        );
1154        let outcome = build_client().check(&site, &user()).await;
1155        assert_eq!(outcome.kind, MatchKind::Found);
1156    }
1157
1158    #[tokio::test]
1159    async fn redirect_absent_signal_detects_missing_account() {
1160        let server = MockServer::start().await;
1161        Mock::given(any())
1162            .and(path("/alice"))
1163            .respond_with(
1164                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1165            )
1166            .mount(&server)
1167            .await;
1168        Mock::given(any())
1169            .and(path("/login"))
1170            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1171            .mount(&server)
1172            .await;
1173        let site = site_with(
1174            &server,
1175            vec![Signal::RedirectAbsent {
1176                fragment: "/login".into(),
1177            }],
1178        );
1179        let outcome = build_client().check(&site, &user()).await;
1180        assert_eq!(outcome.kind, MatchKind::NotFound);
1181    }
1182
1183    #[tokio::test]
1184    async fn negative_signal_wins_over_positive() {
1185        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
1186        // (error marker appears). Negative-priority aggregation → NotFound.
1187        // This is the canonical Sherlock "message" pattern: a site that
1188        // returns 200 for everyone and differentiates via an error string.
1189        let server = MockServer::start().await;
1190        Mock::given(any())
1191            .and(path("/alice"))
1192            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1193            .mount(&server)
1194            .await;
1195        let site = site_with(
1196            &server,
1197            vec![
1198                Signal::StatusFound { codes: vec![200] },
1199                Signal::BodyAbsent {
1200                    text: "Profile not found".into(),
1201                },
1202            ],
1203        );
1204        let outcome = build_client().check(&site, &user()).await;
1205        assert_eq!(outcome.kind, MatchKind::NotFound);
1206    }
1207
1208    #[tokio::test]
1209    async fn network_failure_yields_uncertain() {
1210        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1211        let port = listener.local_addr().unwrap().port();
1212        drop(listener);
1213
1214        let site = Site {
1215            name: "Dead".into(),
1216            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1217            signals: vec![Signal::StatusFound { codes: vec![200] }],
1218            known_present: None,
1219            known_absent: None,
1220            extract: Vec::new(),
1221            tags: Vec::new(),
1222            request_headers: std::collections::BTreeMap::new(),
1223            regex_check: None,
1224            engine: None,
1225            strip_bad_char: None,
1226            request_method: crate::site::HttpMethod::Get,
1227            request_body: None,
1228            protection: Vec::new(),
1229            disabled: false,
1230            source: None,
1231            popularity: None,
1232            access: crate::AccessPolicy::default(),
1233        };
1234        let client = Client::builder()
1235            .timeout(Duration::from_millis(500))
1236            .connect_timeout(Duration::from_millis(500))
1237            .max_retries(0)
1238            .build()
1239            .unwrap();
1240        let outcome = client.check(&site, &user()).await;
1241        assert_eq!(outcome.kind, MatchKind::Uncertain);
1242        assert!(outcome.reason.is_some());
1243    }
1244
1245    #[tokio::test]
1246    async fn throttle_spaces_consecutive_calls_to_same_host() {
1247        let server = MockServer::start().await;
1248        Mock::given(any())
1249            .and(path("/alice"))
1250            .respond_with(ResponseTemplate::new(200))
1251            .mount(&server)
1252            .await;
1253        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1254        // Interval is intentionally much larger than typical wiremock latency
1255        // (≤10 ms locally, can spike under heavy parallel test load). Any
1256        // value too close to HTTP latency would let the first request burn
1257        // through the throttle window and make the assertion flaky.
1258        let client = Client::builder()
1259            .timeout(Duration::from_secs(2))
1260            .min_request_interval(Duration::from_millis(300))
1261            .build()
1262            .unwrap();
1263
1264        client.check(&site, &user()).await;
1265        let started = Instant::now();
1266        client.check(&site, &user()).await;
1267        let elapsed = started.elapsed();
1268        assert!(
1269            elapsed >= Duration::from_millis(200),
1270            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1271        );
1272    }
1273
1274    #[tokio::test]
1275    async fn builder_overrides_user_agent() {
1276        let server = MockServer::start().await;
1277        Mock::given(any())
1278            .and(path("/alice"))
1279            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1280            .respond_with(ResponseTemplate::new(200))
1281            .mount(&server)
1282            .await;
1283        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1284        let client = Client::builder()
1285            .user_agent("adler-test/1.0")
1286            .build()
1287            .unwrap();
1288        let outcome = client.check(&site, &user()).await;
1289        assert_eq!(outcome.kind, MatchKind::Found);
1290    }
1291
1292    #[tokio::test]
1293    async fn rate_limit_429_yields_uncertain_with_note() {
1294        let server = MockServer::start().await;
1295        Mock::given(any())
1296            .and(path("/alice"))
1297            .respond_with(ResponseTemplate::new(429))
1298            .mount(&server)
1299            .await;
1300        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1301        let outcome = build_client().check(&site, &user()).await;
1302        assert_eq!(outcome.kind, MatchKind::Uncertain);
1303        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1304    }
1305
1306    #[tokio::test]
1307    async fn cloudflare_server_header_yields_uncertain() {
1308        let server = MockServer::start().await;
1309        Mock::given(any())
1310            .and(path("/alice"))
1311            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1312            .mount(&server)
1313            .await;
1314        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1315        let outcome = build_client().check(&site, &user()).await;
1316        assert_eq!(outcome.kind, MatchKind::Uncertain);
1317        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1318    }
1319
1320    #[tokio::test]
1321    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1322        // Body-based ban detection only runs when a signal already needs
1323        // the body — this site uses BodyAbsent so the body is read.
1324        let server = MockServer::start().await;
1325        Mock::given(any())
1326            .and(path("/alice"))
1327            .respond_with(
1328                ResponseTemplate::new(200)
1329                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1330            )
1331            .mount(&server)
1332            .await;
1333        let site = site_with(
1334            &server,
1335            vec![Signal::BodyAbsent {
1336                text: "Profile not found".into(),
1337            }],
1338        );
1339        let outcome = build_client().check(&site, &user()).await;
1340        assert_eq!(outcome.kind, MatchKind::Uncertain);
1341        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1342    }
1343
1344    #[tokio::test]
1345    async fn ban_detection_does_not_fire_on_legitimate_403() {
1346        let server = MockServer::start().await;
1347        Mock::given(any())
1348            .and(path("/alice"))
1349            .respond_with(ResponseTemplate::new(403))
1350            .mount(&server)
1351            .await;
1352        let site = site_with(
1353            &server,
1354            vec![
1355                Signal::StatusFound { codes: vec![200] },
1356                Signal::StatusNotFound { codes: vec![403] },
1357            ],
1358        );
1359        let outcome = build_client().check(&site, &user()).await;
1360        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1361        assert_eq!(outcome.kind, MatchKind::NotFound);
1362        assert!(outcome.reason.is_none());
1363    }
1364
1365    #[tokio::test]
1366    async fn retry_recovers_after_transient_429() {
1367        let server = MockServer::start().await;
1368        // First request: 429. Subsequent: 200.
1369        Mock::given(any())
1370            .and(path("/alice"))
1371            .respond_with(ResponseTemplate::new(429))
1372            .up_to_n_times(1)
1373            .mount(&server)
1374            .await;
1375        Mock::given(any())
1376            .and(path("/alice"))
1377            .respond_with(ResponseTemplate::new(200))
1378            .mount(&server)
1379            .await;
1380        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1381        let client = Client::builder()
1382            .timeout(Duration::from_secs(2))
1383            .min_request_interval(Duration::ZERO)
1384            .max_retries(2)
1385            .base_backoff_delay(Duration::from_millis(20))
1386            .max_backoff_delay(Duration::from_millis(100))
1387            .build()
1388            .unwrap();
1389        let outcome = client.check(&site, &user()).await;
1390        assert_eq!(outcome.kind, MatchKind::Found);
1391        assert!(outcome.reason.is_none());
1392    }
1393
1394    #[tokio::test]
1395    async fn retry_exhausts_and_returns_uncertain() {
1396        let server = MockServer::start().await;
1397        Mock::given(any())
1398            .and(path("/alice"))
1399            .respond_with(ResponseTemplate::new(429))
1400            .mount(&server)
1401            .await;
1402        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1403        let client = Client::builder()
1404            .timeout(Duration::from_secs(2))
1405            .min_request_interval(Duration::ZERO)
1406            .max_retries(2)
1407            .base_backoff_delay(Duration::from_millis(10))
1408            .max_backoff_delay(Duration::from_millis(50))
1409            .build()
1410            .unwrap();
1411        let outcome = client.check(&site, &user()).await;
1412        assert_eq!(outcome.kind, MatchKind::Uncertain);
1413        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1414    }
1415
1416    #[tokio::test]
1417    async fn retry_does_not_fire_on_network_error() {
1418        // Connection refused → Uncertain note starts with "request:", not a
1419        // ban marker. We must NOT retry — otherwise a single dead site
1420        // burns the full backoff budget before reporting.
1421        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1422        let port = listener.local_addr().unwrap().port();
1423        drop(listener);
1424        let site = Site {
1425            name: "Dead".into(),
1426            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1427            signals: vec![Signal::StatusFound { codes: vec![200] }],
1428            known_present: None,
1429            known_absent: None,
1430            extract: Vec::new(),
1431            tags: Vec::new(),
1432            request_headers: std::collections::BTreeMap::new(),
1433            regex_check: None,
1434            engine: None,
1435            strip_bad_char: None,
1436            request_method: crate::site::HttpMethod::Get,
1437            request_body: None,
1438            protection: Vec::new(),
1439            disabled: false,
1440            source: None,
1441            popularity: None,
1442            access: crate::AccessPolicy::default(),
1443        };
1444        let client = Client::builder()
1445            .timeout(Duration::from_millis(500))
1446            .connect_timeout(Duration::from_millis(500))
1447            .min_request_interval(Duration::ZERO)
1448            .max_retries(3)
1449            .base_backoff_delay(Duration::from_secs(60))
1450            .build()
1451            .unwrap();
1452        let started = Instant::now();
1453        let outcome = client.check(&site, &user()).await;
1454        // If retry fired, we'd be sleeping minutes; instead this returns
1455        // promptly with an Uncertain.
1456        assert!(started.elapsed() < Duration::from_secs(5));
1457        assert_eq!(outcome.kind, MatchKind::Uncertain);
1458        assert!(
1459            matches!(outcome.reason, Some(UncertainReason::Network(_))),
1460            "got {:?}",
1461            outcome.reason,
1462        );
1463    }
1464
1465    #[tokio::test]
1466    async fn rotates_user_agent_per_request() {
1467        // The mock only matches when the request carries one of the pooled
1468        // UAs; if rotation weren't applied, the default adler/x.y UA would
1469        // miss and the verdict would be NotFound.
1470        let server = MockServer::start().await;
1471        Mock::given(any())
1472            .and(path("/alice"))
1473            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1474            .respond_with(ResponseTemplate::new(200))
1475            .mount(&server)
1476            .await;
1477        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1478        let client = Client::builder()
1479            .min_request_interval(Duration::ZERO)
1480            .max_retries(0)
1481            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1482            .build()
1483            .unwrap();
1484        let outcome = client.check(&site, &user()).await;
1485        assert_eq!(outcome.kind, MatchKind::Found);
1486    }
1487
1488    #[test]
1489    fn invalid_proxy_url_fails_build() {
1490        let err = Client::builder().proxy("not a url").build().unwrap_err();
1491        assert!(matches!(err, Error::HttpSetup { .. }));
1492    }
1493
1494    #[test]
1495    fn schemeless_proxy_is_rejected_up_front() {
1496        // reqwest would silently treat this as a host; we require a scheme.
1497        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1498        let Error::HttpSetup { message } = err else {
1499            panic!("expected HttpSetup, got {err:?}");
1500        };
1501        assert!(message.contains("must start with"), "{message}");
1502    }
1503
1504    #[test]
1505    fn socks5_proxy_scheme_is_accepted() {
1506        // Valid scheme + endpoint builds fine (no connection is attempted).
1507        assert!(
1508            Client::builder()
1509                .proxy("socks5://127.0.0.1:9050")
1510                .build()
1511                .is_ok()
1512        );
1513    }
1514
1515    #[tokio::test]
1516    async fn global_rps_cap_spaces_requests_across_hosts() {
1517        // Two distinct host paths; per-host throttle is disabled, so any
1518        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1519        let server = MockServer::start().await;
1520        Mock::given(any())
1521            .respond_with(ResponseTemplate::new(200))
1522            .mount(&server)
1523            .await;
1524        let site_a = Site {
1525            name: "A".into(),
1526            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1527            signals: vec![Signal::StatusFound { codes: vec![200] }],
1528            known_present: None,
1529            known_absent: None,
1530            extract: Vec::new(),
1531            tags: Vec::new(),
1532            request_headers: std::collections::BTreeMap::new(),
1533            regex_check: None,
1534            engine: None,
1535            strip_bad_char: None,
1536            request_method: crate::site::HttpMethod::Get,
1537            request_body: None,
1538            protection: Vec::new(),
1539            disabled: false,
1540            source: None,
1541            popularity: None,
1542            access: crate::AccessPolicy::default(),
1543        };
1544        let site_b = Site {
1545            name: "B".into(),
1546            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1547            signals: vec![Signal::StatusFound { codes: vec![200] }],
1548            known_present: None,
1549            known_absent: None,
1550            extract: Vec::new(),
1551            tags: Vec::new(),
1552            request_headers: std::collections::BTreeMap::new(),
1553            regex_check: None,
1554            engine: None,
1555            strip_bad_char: None,
1556            request_method: crate::site::HttpMethod::Get,
1557            request_body: None,
1558            protection: Vec::new(),
1559            disabled: false,
1560            source: None,
1561            popularity: None,
1562            access: crate::AccessPolicy::default(),
1563        };
1564        // 2 RPS → ~500 ms between requests. A large interval keeps the
1565        // assertion robust even when the first probe's own duration (which
1566        // eats into the measured gap) is inflated by test instrumentation
1567        // such as coverage tooling.
1568        let client = Client::builder()
1569            .min_request_interval(Duration::ZERO)
1570            .max_retries(0)
1571            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1572            .build()
1573            .unwrap();
1574        // First request consumes the slot at t≈0; second waits ~500 ms even
1575        // though it targets a different host.
1576        client.check(&site_a, &user()).await;
1577        let started = Instant::now();
1578        client.check(&site_b, &user()).await;
1579        assert!(
1580            started.elapsed() >= Duration::from_millis(350),
1581            "global cap should space cross-host requests, got {:?}",
1582            started.elapsed(),
1583        );
1584    }
1585
1586    #[tokio::test]
1587    async fn respect_robots_skips_disallowed_paths() {
1588        let server = MockServer::start().await;
1589        Mock::given(any())
1590            .and(path("/robots.txt"))
1591            .respond_with(
1592                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1593            )
1594            .mount(&server)
1595            .await;
1596        Mock::given(any())
1597            .and(path("/no/alice"))
1598            .respond_with(ResponseTemplate::new(200))
1599            .mount(&server)
1600            .await;
1601        Mock::given(any())
1602            .and(path("/yes/alice"))
1603            .respond_with(ResponseTemplate::new(200))
1604            .mount(&server)
1605            .await;
1606        let client = Client::builder()
1607            .min_request_interval(Duration::ZERO)
1608            .max_retries(0)
1609            .respect_robots(true)
1610            .build()
1611            .unwrap();
1612
1613        let disallowed = Site {
1614            name: "No".into(),
1615            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1616            signals: vec![Signal::StatusFound { codes: vec![200] }],
1617            known_present: None,
1618            known_absent: None,
1619            extract: Vec::new(),
1620            tags: Vec::new(),
1621            request_headers: std::collections::BTreeMap::new(),
1622            regex_check: None,
1623            engine: None,
1624            strip_bad_char: None,
1625            request_method: crate::site::HttpMethod::Get,
1626            request_body: None,
1627            protection: Vec::new(),
1628            disabled: false,
1629            source: None,
1630            popularity: None,
1631            access: crate::AccessPolicy::default(),
1632        };
1633        let allowed = Site {
1634            name: "Yes".into(),
1635            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1636            signals: vec![Signal::StatusFound { codes: vec![200] }],
1637            known_present: None,
1638            known_absent: None,
1639            extract: Vec::new(),
1640            tags: Vec::new(),
1641            request_headers: std::collections::BTreeMap::new(),
1642            regex_check: None,
1643            engine: None,
1644            strip_bad_char: None,
1645            request_method: crate::site::HttpMethod::Get,
1646            request_body: None,
1647            protection: Vec::new(),
1648            disabled: false,
1649            source: None,
1650            popularity: None,
1651            access: crate::AccessPolicy::default(),
1652        };
1653
1654        let no = client.check(&disallowed, &user()).await;
1655        assert_eq!(no.kind, MatchKind::Uncertain);
1656        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1657
1658        let yes = client.check(&allowed, &user()).await;
1659        assert_eq!(yes.kind, MatchKind::Found);
1660    }
1661
1662    #[tokio::test]
1663    async fn body_read_skipped_when_no_body_signal_needed() {
1664        // Mock returns body that would fail a body_absent check — but since
1665        // we only have a status signal, body is never read.
1666        let server = MockServer::start().await;
1667        Mock::given(any())
1668            .and(path("/alice"))
1669            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1670            .mount(&server)
1671            .await;
1672        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1673        let outcome = build_client().check(&site, &user()).await;
1674        assert_eq!(outcome.kind, MatchKind::Found);
1675    }
1676
1677    // ===== Browser routing =====
1678
1679    /// Test backend that returns a canned page and counts calls. Lets the
1680    /// routing tests assert "Client did/did not invoke the browser" without
1681    /// involving a real Chrome process.
1682    #[derive(Debug)]
1683    struct RecordingBackend {
1684        page: RenderedPage,
1685        calls: std::sync::atomic::AtomicUsize,
1686    }
1687
1688    impl RecordingBackend {
1689        fn with_page(page: RenderedPage) -> Self {
1690            Self {
1691                page,
1692                calls: std::sync::atomic::AtomicUsize::new(0),
1693            }
1694        }
1695        fn call_count(&self) -> usize {
1696            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1697        }
1698    }
1699
1700    #[async_trait::async_trait]
1701    impl BrowserBackend for RecordingBackend {
1702        async fn fetch(
1703            &self,
1704            _url: &url::Url,
1705            _headers: &std::collections::BTreeMap<String, String>,
1706            _timeout: Duration,
1707        ) -> Result<RenderedPage> {
1708            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1709            Ok(self.page.clone())
1710        }
1711    }
1712
1713    fn site_bot_protected(server: &MockServer) -> Site {
1714        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1715        s.tags = vec!["bot-protected".into()];
1716        s
1717    }
1718
1719    #[tokio::test]
1720    async fn browser_routes_bot_protected_sites() {
1721        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1722        // returns its canned page directly.
1723        let server = MockServer::start().await;
1724        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1725            status: 200,
1726            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1727            body: "<html></html>".into(),
1728            elapsed_ms: 42,
1729        }));
1730        let client = Client::builder()
1731            .min_request_interval(Duration::ZERO)
1732            .max_retries(0)
1733            .browser(backend.clone())
1734            .build()
1735            .unwrap();
1736        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1737        assert_eq!(outcome.kind, MatchKind::Found);
1738        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1739    }
1740
1741    #[tokio::test]
1742    async fn non_bot_protected_sites_skip_browser() {
1743        let server = MockServer::start().await;
1744        Mock::given(any())
1745            .and(path("/alice"))
1746            .respond_with(ResponseTemplate::new(200))
1747            .mount(&server)
1748            .await;
1749        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1750            status: 500, // would make wiremock case fail if browser was taken
1751            final_url: url::Url::parse("https://x/").unwrap(),
1752            body: String::new(),
1753            elapsed_ms: 0,
1754        }));
1755        let client = Client::builder()
1756            .min_request_interval(Duration::ZERO)
1757            .max_retries(0)
1758            .browser(backend.clone())
1759            .build()
1760            .unwrap();
1761        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1762        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1763        let outcome = client.check(&site, &user()).await;
1764        assert_eq!(outcome.kind, MatchKind::Found);
1765        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1766    }
1767
1768    #[tokio::test]
1769    async fn browser_budget_exhaust_yields_uncertain() {
1770        let server = MockServer::start().await;
1771        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1772            status: 200,
1773            final_url: url::Url::parse("https://x/").unwrap(),
1774            body: String::new(),
1775            elapsed_ms: 0,
1776        }));
1777        let client = Client::builder()
1778            .min_request_interval(Duration::ZERO)
1779            .max_retries(0)
1780            .browser(backend.clone())
1781            .browser_budget(1)
1782            .build()
1783            .unwrap();
1784        let site = site_bot_protected(&server);
1785        // First call consumes the only slot.
1786        let first = client.check(&site, &user()).await;
1787        assert_eq!(first.kind, MatchKind::Found);
1788        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1789        let second = client.check(&site, &user()).await;
1790        assert_eq!(second.kind, MatchKind::Uncertain);
1791        assert!(matches!(
1792            second.reason,
1793            Some(UncertainReason::BrowserBudget)
1794        ));
1795        assert_eq!(
1796            backend.call_count(),
1797            1,
1798            "second call must not invoke backend"
1799        );
1800    }
1801
1802    #[tokio::test]
1803    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1804        struct FailingBackend;
1805        #[async_trait::async_trait]
1806        impl BrowserBackend for FailingBackend {
1807            async fn fetch(
1808                &self,
1809                _url: &url::Url,
1810                _headers: &std::collections::BTreeMap<String, String>,
1811                _timeout: Duration,
1812            ) -> Result<RenderedPage> {
1813                Err(Error::BrowserSetup {
1814                    message: "simulated crash".into(),
1815                })
1816            }
1817        }
1818        impl std::fmt::Debug for FailingBackend {
1819            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1820                f.write_str("FailingBackend")
1821            }
1822        }
1823
1824        let server = MockServer::start().await;
1825        let client = Client::builder()
1826            .min_request_interval(Duration::ZERO)
1827            .max_retries(0)
1828            .browser(Arc::new(FailingBackend))
1829            .build()
1830            .unwrap();
1831        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1832        assert_eq!(outcome.kind, MatchKind::Uncertain);
1833        match outcome.reason {
1834            Some(UncertainReason::BrowserFailed(msg)) => {
1835                assert!(msg.contains("simulated crash"), "got: {msg}");
1836            }
1837            other => panic!("expected BrowserFailed, got {other:?}"),
1838        }
1839    }
1840
1841    #[tokio::test]
1842    async fn status_only_site_uses_head_request() {
1843        // Site with only status signals (no body markers, no enrichment)
1844        // should be probed with HEAD — saves the body download on
1845        // ~30% of the registry.
1846        let server = MockServer::start().await;
1847        Mock::given(method("HEAD"))
1848            .and(path("/alice"))
1849            .respond_with(ResponseTemplate::new(200))
1850            .mount(&server)
1851            .await;
1852        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1853        let outcome = build_client().check(&site, &user()).await;
1854        assert_eq!(outcome.kind, MatchKind::Found);
1855        let recvd = server.received_requests().await.unwrap_or_default();
1856        assert_eq!(recvd.len(), 1);
1857        assert_eq!(recvd[0].method.as_str(), "HEAD");
1858    }
1859
1860    #[tokio::test]
1861    async fn body_signal_site_uses_get_request() {
1862        // Same baseline plus a body-marker signal — must still GET so
1863        // the body actually arrives for matching.
1864        let server = MockServer::start().await;
1865        Mock::given(any())
1866            .and(path("/alice"))
1867            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
1868            .mount(&server)
1869            .await;
1870        let site = site_with(
1871            &server,
1872            vec![Signal::BodyPresent {
1873                text: "hello".into(),
1874            }],
1875        );
1876        let outcome = build_client().check(&site, &user()).await;
1877        assert_eq!(outcome.kind, MatchKind::Found);
1878        let recvd = server.received_requests().await.unwrap_or_default();
1879        assert_eq!(recvd[0].method.as_str(), "GET");
1880    }
1881
1882    #[tokio::test]
1883    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
1884        // A site that declares `protection: [Cloudflare]` but doesn't
1885        // carry the legacy `bot-protected` tag should still route
1886        // through the browser backend — the new structured field is
1887        // an additional signal, not a tag replacement.
1888        let server = MockServer::start().await;
1889        Mock::given(any())
1890            .respond_with(ResponseTemplate::new(200))
1891            .mount(&server)
1892            .await;
1893        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1894        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
1895        // No bot-protected tag — pure structured-field test.
1896        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1897            status: 200,
1898            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1899            body: String::new(),
1900            elapsed_ms: 0,
1901        }));
1902        let client = Client::builder()
1903            .min_request_interval(Duration::ZERO)
1904            .max_retries(0)
1905            .browser(backend)
1906            .build()
1907            .unwrap();
1908        let outcome = client.check(&site, &user()).await;
1909        // The recording backend always returns a synthetic 200, so
1910        // Found means we went through the browser path.
1911        assert_eq!(outcome.kind, MatchKind::Found);
1912        // No raw HTTP probe should have hit the mock server.
1913        let recvd = server.received_requests().await.unwrap_or_default();
1914        assert_eq!(
1915            recvd.len(),
1916            0,
1917            "structured protection must skip the raw HTTP path"
1918        );
1919    }
1920
1921    #[tokio::test]
1922    async fn post_method_sends_body_with_username_substituted() {
1923        // A POST-probed site (e.g. Anilist GraphQL) — the username
1924        // goes in the body, not the URL. Adler should substitute
1925        // `{username}` and send a POST with the rendered payload.
1926        let server = MockServer::start().await;
1927        Mock::given(method("POST"))
1928            .and(path("/api"))
1929            .respond_with(ResponseTemplate::new(200))
1930            .mount(&server)
1931            .await;
1932        // URL substitution still requires the `{username}` placeholder,
1933        // even for POST sites where the username also lives in the
1934        // body. Most real POST endpoints encode the username in both
1935        // (e.g. query string + body); we mirror that.
1936        let site = Site {
1937            name: "ApiPost".into(),
1938            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
1939            signals: vec![Signal::StatusFound { codes: vec![200] }],
1940            known_present: None,
1941            known_absent: None,
1942            extract: Vec::new(),
1943            tags: Vec::new(),
1944            request_headers: std::collections::BTreeMap::new(),
1945            regex_check: None,
1946            engine: None,
1947            strip_bad_char: None,
1948            request_method: HttpMethod::Post,
1949            request_body: Some(r#"{"name":"{username}"}"#.into()),
1950            protection: Vec::new(),
1951            disabled: false,
1952            source: None,
1953            popularity: None,
1954            access: crate::AccessPolicy::default(),
1955        };
1956        let outcome = build_client().check(&site, &user()).await;
1957        assert_eq!(outcome.kind, MatchKind::Found);
1958        let recvd = server.received_requests().await.unwrap_or_default();
1959        assert_eq!(recvd.len(), 1);
1960        assert_eq!(recvd[0].method.as_str(), "POST");
1961        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
1962        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
1963    }
1964
1965    #[tokio::test]
1966    async fn head_405_falls_back_to_get() {
1967        // A server that rejects HEAD with 405 — Adler should silently
1968        // retry with GET so the optimisation can never cost accuracy.
1969        let server = MockServer::start().await;
1970        Mock::given(method("HEAD"))
1971            .and(path("/alice"))
1972            .respond_with(ResponseTemplate::new(405))
1973            .mount(&server)
1974            .await;
1975        Mock::given(any())
1976            .and(path("/alice"))
1977            .respond_with(ResponseTemplate::new(200))
1978            .mount(&server)
1979            .await;
1980        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1981        let outcome = build_client().check(&site, &user()).await;
1982        assert_eq!(outcome.kind, MatchKind::Found);
1983        let recvd = server.received_requests().await.unwrap_or_default();
1984        assert_eq!(recvd.len(), 2);
1985        assert_eq!(recvd[0].method.as_str(), "HEAD");
1986        assert_eq!(recvd[1].method.as_str(), "GET");
1987    }
1988}