Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::borrow::Cow;
11use std::collections::BTreeMap;
12use std::fmt;
13use std::num::NonZeroU32;
14use std::sync::Arc;
15use std::time::{Duration, Instant};
16
17use reqwest::redirect;
18
19use crate::access::{EgressChoice, EgressPool, EgressSpec, SessionStore};
20use crate::browser::{BrowserBackend, BrowserBudget};
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::error::{Error, Result};
23use crate::retry::{self, RetryPolicy};
24use crate::robots::RobotsCache;
25use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
26use crate::throttle::HostThrottle;
27#[cfg(feature = "impersonate")]
28use crate::transport::ImpersonateFetcher;
29use crate::transport::{
30    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
31};
32use crate::username::Username;
33
/// Default whole-request timeout used by [`ClientBuilder`] (connect,
/// headers, and body read together).
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Default TCP-connect timeout used by [`ClientBuilder`].
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Default value of the builder's `redirect_limit` field.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default minimum spacing between two requests to the same host
/// (100 ms ≈ 10 requests per second per host).
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// The global rate limiter is a throttle with exactly one bucket; this is
/// the fixed key of that bucket, so it gates requests to all hosts.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
40
41/// HTTP client used to probe sites.
42///
43/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
44/// internally, and the throttle is `Arc`-backed, so cloning is the
45/// recommended way to share a client between tasks. Cloned clients share
46/// throttle state, which is what you want: a fan-out scan must not
47/// accidentally exceed a per-host budget by spawning more clients.
48#[derive(Clone)]
49pub struct Client {
50    http: Arc<HttpFetcher>,
51    /// Geo / IP-type egress pool for sites whose `access` policy needs a
52    /// specific proxy. Empty by default → every site uses `http`.
53    egress: Arc<EgressPool>,
54    /// Operator-supplied sessions, keyed by the name a site references
55    /// via `access.session`. Empty by default.
56    sessions: Arc<SessionStore>,
57    throttle: HostThrottle,
58    /// Global RPS cap applied across all hosts. `None` → uncapped.
59    global_throttle: Option<HostThrottle>,
60    retry: RetryPolicy,
61    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
62    /// `Arc<[String]>` so cloning a client per task stays cheap.
63    user_agents: Arc<[String]>,
64    /// Extract profile fields from `Found` pages that declare extractors.
65    enrich: bool,
66    /// When set, skip probes disallowed by the host's `robots.txt`.
67    robots: Option<RobotsCache>,
68    /// Browser backend used for `bot-protected` sites. `None` → those sites
69    /// stay on the raw HTTP path and typically end up `Uncertain`.
70    browser: Option<Arc<dyn BrowserBackend>>,
71    /// TLS-fingerprint-impersonating HTTP client (`wreq`). Built when
72    /// the `impersonate` Cargo feature is on; routes sites whose
73    /// `protection` is exactly `TlsFingerprint`.
74    #[cfg(feature = "impersonate")]
75    impersonate: Option<Arc<ImpersonateFetcher>>,
76    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
77    /// for a single scan, so several tasks compete for the same budget.
78    browser_budget: Arc<BrowserBudget>,
79    /// Per-scan cap on *automatic escalations* from a cheap transport to
80    /// the browser when the cheap path returns
81    /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
82    /// `browser_budget` so the pre-tagged `bot-protected` subset and the
83    /// long-tail escalation subset don't fight over the same number.
84    escalation_budget: Arc<crate::escalation::EscalationBudget>,
85    /// Whether automatic escalation runs at all. `false` keeps the cheap
86    /// transport's outcome verbatim — useful for benchmarking the raw
87    /// signals without the access-engine lift on top.
88    escalation_enabled: bool,
89}
90
91impl Client {
92    /// Start configuring a new client.
93    pub fn builder() -> ClientBuilder {
94        ClientBuilder::default()
95    }
96
97    /// Read-only view of the configured egress pool — `(country, kind)`
98    /// for every registered proxy, in the order they were declared.
99    /// Proxy URLs are not surfaced (they typically carry credentials),
100    /// so this is safe to serialise to a JSON response.
101    #[must_use]
102    pub fn egress_summary(&self) -> Vec<crate::access::EgressSummary> {
103        self.egress.summary()
104    }
105
106    /// Names of the configured sessions (sorted lexicographically),
107    /// without any header values. Useful for a UI listing which session
108    /// keys an operator can reference via `access.session` on a site.
109    #[must_use]
110    pub fn session_names(&self) -> Vec<String> {
111        self.sessions.names()
112    }
113
114    /// Names of the configured egresses (in registration order, only
115    /// those that supplied a name). Used by the server to validate
116    /// per-scan `egress_names` against the loaded pool.
117    #[must_use]
118    pub fn egress_names(&self) -> Vec<String> {
119        self.egress.names()
120    }
121
122    /// Returns a new client identical to this one except its egress
123    /// pool is restricted to entries whose `name` matches one of
124    /// `names`. An empty `names` slice is treated as "no filter" and
125    /// returns a clone of the full pool.
126    ///
127    /// Cheap to call repeatedly: all shared state (HTTP clients,
128    /// throttle, sessions, budgets, browser backend, …) is
129    /// `Arc`-cloned so the returned client shares the parent's
130    /// per-scan caps (browser budget, escalation budget, throttle
131    /// state) rather than each subset getting a fresh one. This is the
132    /// right behaviour for a single web-server instance handing out
133    /// per-request clients.
134    #[must_use]
135    pub fn with_egress_subset(&self, names: &[String]) -> Self {
136        Self {
137            http: Arc::clone(&self.http),
138            egress: Arc::new(self.egress.subset(names)),
139            sessions: Arc::clone(&self.sessions),
140            throttle: self.throttle.clone(),
141            global_throttle: self.global_throttle.clone(),
142            retry: self.retry.clone(),
143            user_agents: Arc::clone(&self.user_agents),
144            enrich: self.enrich,
145            robots: self.robots.clone(),
146            browser: self.browser.clone(),
147            #[cfg(feature = "impersonate")]
148            impersonate: self.impersonate.clone(),
149            browser_budget: Arc::clone(&self.browser_budget),
150            escalation_budget: Arc::clone(&self.escalation_budget),
151            escalation_enabled: self.escalation_enabled,
152        }
153    }
154
155    /// Probe a single site for `username`, retrying on transient bans.
156    ///
157    /// Network failures, timeouts, and unexpected response shapes all yield
158    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
159    /// returns an error: at the executor level we want a partial result for
160    /// every site, not abort-on-first-failure semantics.
161    ///
162    /// When ban detection classifies a response as `rate_limited` /
163    /// `cloudflare_challenge`, the call is retried with jittered exponential
164    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
165    /// Uncertain (network errors, body read failures) is **not** retried —
166    /// those failures rarely fix themselves in the seconds-to-minutes window
167    /// we'd block for.
168    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
169    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
170        let mut attempt: u32 = 0;
171        loop {
172            let outcome = self.probe_once(site, username).await;
173            if !retry::should_retry(&outcome, attempt, &self.retry) {
174                return outcome;
175            }
176            let delay = retry::backoff_delay(attempt, &self.retry);
177            tracing::info!(
178                site = %site.name,
179                attempt = attempt + 1,
180                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
181                ?delay,
182                "transient ban, retrying",
183            );
184            tokio::time::sleep(delay).await;
185            attempt += 1;
186        }
187    }
188
189    /// Fetch a URL and return raw response data (status, final URL, body)
190    /// with the same throttle / User-Agent / proxy machinery as `check`,
191    /// but without signal evaluation or retry.
192    ///
193    /// Returns `None` on any network/transport error. Intended for
194    /// diagnostics such as `adler --doctor --fix`, which diffs the
195    /// responses for a known-present and a nonsense user to derive a
196    /// signature.
197    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
198        let host = host_of(url);
199        if let Some(global) = &self.global_throttle {
200            global.wait(GLOBAL_THROTTLE_KEY).await;
201        }
202        self.throttle.wait(&host).await;
203        let mut request = self.http.client().get(url);
204        if let Some(ua) = self.pick_user_agent() {
205            request = request.header(reqwest::header::USER_AGENT, ua);
206        }
207        let response = request.send().await.ok()?;
208        let status = response.status().as_u16();
209        let final_url = response.url().to_string();
210        let body = response.text().await.unwrap_or_default();
211        Some(RawResponse {
212            status,
213            final_url,
214            body,
215        })
216    }
217
218    /// Same as [`Self::fetch`] but routes through the configured browser
219    /// backend when the site is tagged `bot-protected` and a backend is
220    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
221    /// so that the diff-derivation works against the JS-rendered page
222    /// (login wall vs. real profile) rather than two identical raw-HTTP
223    /// shells.
224    ///
225    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
226    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
227    /// callers get the same `Option<RawResponse>` shape either way.
228    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
229        if let Some(backend) = self.browser.as_deref() {
230            let has_tag = site
231                .tags
232                .iter()
233                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
234            if has_tag || !site.protection.is_empty() {
235                let parsed = url::Url::parse(url).ok()?;
236                match backend
237                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
238                    .await
239                {
240                    Ok(page) => {
241                        return Some(RawResponse {
242                            status: page.status,
243                            final_url: page.final_url.to_string(),
244                            body: page.body,
245                        });
246                    }
247                    Err(err) => {
248                        tracing::warn!(
249                            site = %site.name, %url, error = %err,
250                            "browser fetch failed in doctor; falling back to raw HTTP",
251                        );
252                    }
253                }
254            }
255        }
256        self.fetch(url).await
257    }
258
259    /// Pick a User-Agent for the next request from the rotation pool, or
260    /// `None` to fall back on the client's fixed header.
261    fn pick_user_agent(&self) -> Option<&str> {
262        match self.user_agents.len() {
263            0 => None,
264            1 => Some(&self.user_agents[0]),
265            n => Some(&self.user_agents[fastrand::usize(0..n)]),
266        }
267    }
268
269    // Splitting probe_once into helpers would scatter the request/response
270    // flow that has to read top-to-bottom; one long function reads better.
271    #[allow(clippy::too_many_lines)]
272    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
273        let url = site.url_for(username);
274
275        // Site-level username constraint (Sherlock's `regexCheck`).
276        // Mismatch → skip the probe entirely. Saves a request and
277        // sidesteps the false-positive class where a site 404s on
278        // illegal usernames in a way our signal can't distinguish
279        // from a missing account. If the pattern fails to compile
280        // (Sherlock occasionally uses lookarounds, which our `regex`
281        // crate can't express), we let validate's warn-log stand
282        // and silently fall through — the rest of the probe still
283        // works.
284        if let Some(pat) = &site.regex_check {
285            if let Ok(re) = regex::Regex::new(pat) {
286                if !re.is_match(username.as_str()) {
287                    return uncertain(
288                        &site.name,
289                        url,
290                        Instant::now(),
291                        UncertainReason::UsernameNotAllowed,
292                    );
293                }
294            }
295        }
296
297        // Resolve an operator session if the site's access policy names
298        // one, and fold its headers (cookies / tokens) over the site's
299        // own. A named-but-missing session is reported rather than sent
300        // unauthenticated into a login wall — which reads identically
301        // for an existing and a missing account. Applies to both the
302        // HTTP and browser transports.
303        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
304            None => Cow::Borrowed(&site.request_headers),
305            Some(name) => match self.sessions.get(name) {
306                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
307                None => {
308                    return uncertain(
309                        &site.name,
310                        url,
311                        Instant::now(),
312                        UncertainReason::SessionRequired,
313                    );
314                }
315            },
316        };
317        let headers: &BTreeMap<String, String> = &session_headers;
318
319        // Auto-route bot-protected sites through the browser backend when
320        // one is configured. Raw HTTP can't see past their JS/login wall,
321        // so this is the only way they ever produce a Found verdict.
322        // A site is "bot-protected" in the routing sense if it carries
323        // the legacy tag OR declares any specific protection mechanism
324        // via the new `protection` field — either signal is enough.
325        if let Some(backend) = &self.browser {
326            let has_tag = site
327                .tags
328                .iter()
329                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
330            if has_tag || !site.protection.is_empty() {
331                if self.browser_budget.try_consume() {
332                    let started = Instant::now();
333                    let req = FetchRequest {
334                        method: site.request_method,
335                        url: &url,
336                        body: None,
337                        user_agent: None,
338                        headers,
339                        want_body: true,
340                    };
341                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
342                    let mut outcome = match fetcher.fetch(&req).await {
343                        Ok(resp) => self.finish(site, url, started, &resp),
344                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
345                    };
346                    outcome.transport = Some(crate::escalation::TransportTier::Browser);
347                    return outcome;
348                }
349                tracing::warn!(site = %site.name, "browser budget exhausted");
350                let mut outcome = uncertain(
351                    &site.name,
352                    url,
353                    Instant::now(),
354                    UncertainReason::BrowserBudget,
355                );
356                outcome.transport = Some(crate::escalation::TransportTier::Browser);
357                return outcome;
358            }
359        }
360
361        // Phase 2: route pure-`TlsFingerprint` sites through the
362        // impersonating transport — a real BoringSSL TLS handshake from
363        // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
364        // protection tag, at a fraction of the cost of a real browser.
365        // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
366        // keep going through the browser path above, where they were.
367        #[cfg(feature = "impersonate")]
368        if let Some(fetcher) = &self.impersonate {
369            let pure_tls = site.protection.len() == 1
370                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
371                && !site
372                    .tags
373                    .iter()
374                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
375            if pure_tls {
376                let started = Instant::now();
377                let req = FetchRequest {
378                    method: site.request_method,
379                    url: &url,
380                    body: None,
381                    user_agent: self.pick_user_agent(),
382                    headers,
383                    want_body: true,
384                };
385                let mut primary = match fetcher.fetch(&req).await {
386                    Ok(resp) => self.finish(site, url.clone(), started, &resp),
387                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
388                };
389                primary.transport = Some(crate::escalation::TransportTier::Impersonate);
390                return self.maybe_escalate(site, &url, headers, primary).await;
391            }
392        }
393
394        // Egress selection: route the HTTP path through a geo / IP-type
395        // matching proxy when the site's access policy demands one. An
396        // unconstrained policy uses the default egress; a constrained
397        // policy with no matching egress is reported `GeoUnavailable`
398        // rather than fetched from the wrong location (a false
399        // `NotFound` would be worse than an honest `Uncertain`).
400        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
401            EgressChoice::Default => Arc::clone(&self.http),
402            EgressChoice::Use(fetcher) => fetcher,
403            EgressChoice::Unavailable => {
404                return uncertain(
405                    &site.name,
406                    url,
407                    Instant::now(),
408                    UncertainReason::GeoUnavailable,
409                );
410            }
411        };
412
413        let host = host_of(&url);
414
415        // robots.txt gate, before consuming a throttle slot or probing.
416        if let Some(robots) = &self.robots {
417            if let Some((origin, path)) = origin_and_path(&url) {
418                if !robots.allowed(&origin, &path).await {
419                    tracing::debug!(%url, "skipped by robots.txt");
420                    return uncertain(
421                        &site.name,
422                        url,
423                        Instant::now(),
424                        UncertainReason::RobotsDisallowed,
425                    );
426                }
427            }
428        }
429
430        // Global cap first (gates every request), then per-host spacing.
431        if let Some(global) = &self.global_throttle {
432            global.wait(GLOBAL_THROTTLE_KEY).await;
433        }
434        self.throttle.wait(&host).await;
435        let started = Instant::now();
436        tracing::debug!(%url, %host, "probing");
437
438        // Read the body only if a signal needs it, or enrichment is on
439        // and the site declares extractor rules (extraction needs it).
440        let want_enrich = self.enrich && !site.extract.is_empty();
441        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
442
443        // POST sites carry their own body payload (the username goes in
444        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
445        // `{username}` in `Site::request_body` is substituted here,
446        // mirroring URL substitution.
447        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
448            const USERNAME_PH: &str = "{username}";
449            site.request_body
450                .as_deref()
451                .map(|t| t.replace(USERNAME_PH, username.as_str()))
452        } else {
453            None
454        };
455
456        let req = FetchRequest {
457            method: site.request_method,
458            url: &url,
459            body: body_for_post.as_deref(),
460            user_agent: self.pick_user_agent(),
461            headers,
462            want_body: needs_body,
463        };
464        let mut primary = match egress.fetch(&req).await {
465            Ok(resp) => self.finish(site, url.clone(), started, &resp),
466            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
467        };
468        primary.transport = Some(crate::escalation::TransportTier::Http);
469        self.maybe_escalate(site, &url, headers, primary).await
470    }
471
472    /// If the cheap transport returned an `Uncertain` reason a browser
473    /// fetch could plausibly resolve, retry through the browser backend
474    /// and stamp the new outcome as escalated. Bounded by
475    /// [`escalation_budget`](ClientBuilder::escalation_budget).
476    async fn maybe_escalate(
477        &self,
478        site: &Site,
479        url: &str,
480        headers: &BTreeMap<String, String>,
481        primary: CheckOutcome,
482    ) -> CheckOutcome {
483        if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
484            return primary;
485        }
486        let Some(reason) = &primary.reason else {
487            return primary;
488        };
489        if !crate::escalation::should_escalate(reason) {
490            return primary;
491        }
492        let Some(backend) = &self.browser else {
493            return primary;
494        };
495        if !self.escalation_budget.try_consume() {
496            tracing::debug!(site = %site.name, "escalation budget exhausted");
497            return primary;
498        }
499
500        tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
501        let started = Instant::now();
502        let req = FetchRequest {
503            method: site.request_method,
504            url,
505            body: None,
506            user_agent: None,
507            headers,
508            want_body: true,
509        };
510        let fetcher = BrowserFetcher::new(Arc::clone(backend));
511        let mut escalated = match fetcher.fetch(&req).await {
512            Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
513            Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
514        };
515        escalated.transport = Some(crate::escalation::TransportTier::Browser);
516        escalated.escalations = 1;
517        escalated
518    }
519
520    /// Evaluate a fetched response against the site's signals and build
521    /// the outcome. Shared by the HTTP and browser transports so the
522    /// verdict / evidence / enrichment logic lives in exactly one place.
523    fn finish(
524        &self,
525        site: &Site,
526        url: String,
527        started: Instant,
528        resp: &crate::transport::FetchResponse,
529    ) -> CheckOutcome {
530        let probe = Probe {
531            status: resp.status,
532            final_url: &resp.final_url,
533            body: &resp.body,
534        };
535        let votes: Vec<(&Signal, SignalVerdict)> = site
536            .signals
537            .iter()
538            .map(|s| (s, s.evaluate(&probe)))
539            .collect();
540        let kind = aggregate(votes.iter().map(|(_, v)| *v));
541        let mut result = outcome(&site.name, url, started, kind);
542        // Record which signals produced the verdict (the winning polarity).
543        let winning = match kind {
544            MatchKind::Found => Some(SignalVerdict::Found),
545            MatchKind::NotFound => Some(SignalVerdict::NotFound),
546            MatchKind::Uncertain => None,
547        };
548        if let Some(want) = winning {
549            result.evidence = votes
550                .iter()
551                .filter(|(_, v)| *v == want)
552                .map(|(s, _)| s.describe_match(&probe))
553                .collect();
554        }
555        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
556            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
557        }
558        result
559    }
560}
561
/// Unevaluated response handed back by [`Client::fetch`] for diagnostic
/// use: no signal evaluation has been applied to it.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// Numeric HTTP status code of the response.
    pub status: u16,
    /// URL the response was ultimately served from, after redirects.
    pub final_url: String,
    /// Response body, decoded to text.
    pub body: String,
}
572
573/// Builder for [`Client`].
574#[derive(Clone)]
575#[must_use = "ClientBuilder does nothing until `.build()` is called"]
576// A configuration builder accumulates many small flags; the four bool
577// fields here are semantically independent (redirect / enrich /
578// respect-robots / escalation), so collapsing them into a state machine
579// or enum would obscure rather than clarify.
580#[allow(clippy::struct_excessive_bools)]
581pub struct ClientBuilder {
582    timeout: Duration,
583    connect_timeout: Duration,
584    user_agent: String,
585    follow_redirects: bool,
586    redirect_limit: usize,
587    min_request_interval: Duration,
588    max_rps: Option<NonZeroU32>,
589    retry: RetryPolicy,
590    proxy: Option<String>,
591    user_agents: Vec<String>,
592    enrich: bool,
593    respect_robots: bool,
594    browser: Option<Arc<dyn BrowserBackend>>,
595    browser_budget: usize,
596    egress: Vec<EgressSpec>,
597    sessions: SessionStore,
598    escalation_budget: usize,
599    escalation_enabled: bool,
600}
601
602impl Default for ClientBuilder {
603    fn default() -> Self {
604        Self {
605            timeout: DEFAULT_TIMEOUT,
606            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
607            user_agent: default_user_agent(),
608            follow_redirects: true,
609            redirect_limit: DEFAULT_REDIRECT_LIMIT,
610            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
611            max_rps: None,
612            retry: RetryPolicy::default(),
613            proxy: None,
614            user_agents: Vec::new(),
615            enrich: false,
616            respect_robots: false,
617            browser: None,
618            browser_budget: DEFAULT_BROWSER_BUDGET,
619            egress: Vec::new(),
620            sessions: SessionStore::new(),
621            escalation_budget: DEFAULT_ESCALATION_BUDGET,
622            escalation_enabled: true,
623        }
624    }
625}
626
627impl ClientBuilder {
628    /// Per-request timeout (covers connect, headers, and body read).
629    pub fn timeout(mut self, timeout: Duration) -> Self {
630        self.timeout = timeout;
631        self
632    }
633
634    /// TCP-connect timeout, applied independently of the request timeout.
635    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
636        self.connect_timeout = timeout;
637        self
638    }
639
640    /// Override the `User-Agent` header sent on every request.
641    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
642        self.user_agent = user_agent.into();
643        self
644    }
645
646    /// Toggle automatic redirect following. Defaults to `true`; disable when
647    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
648    pub fn follow_redirects(mut self, follow: bool) -> Self {
649        self.follow_redirects = follow;
650        self
651    }
652
653    /// Minimum time between consecutive requests to the same host.
654    ///
655    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
656    /// rate-limit responses on common OSINT targets while keeping fan-out
657    /// across many sites fast.
658    pub fn min_request_interval(mut self, interval: Duration) -> Self {
659        self.min_request_interval = interval;
660        self
661    }
662
663    /// Cap the total request rate across *all* hosts to `rps` requests per
664    /// second. Independent of (and composed with) the per-host interval —
665    /// useful on a metered connection or behind a shared-quota proxy.
666    /// Uncapped by default.
667    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
668        self.max_rps = Some(rps);
669        self
670    }
671
672    /// Maximum retry attempts after a transient ban response. Defaults to 2
673    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
674    pub fn max_retries(mut self, n: u32) -> Self {
675        self.retry.max_retries = n;
676        self
677    }
678
679    /// Base delay for the first retry. Subsequent retries double until
680    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
681    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
682        self.retry.base_delay = d;
683        self
684    }
685
686    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
687    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
688        self.retry.max_delay = d;
689        self
690    }
691
692    /// Route all requests through a proxy. Accepts `http://`, `https://`,
693    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
694    pub fn proxy(mut self, url: impl Into<String>) -> Self {
695        self.proxy = Some(url.into());
696        self
697    }
698
699    /// Rotate the `User-Agent` header per request, picking uniformly at
700    /// random from `agents`. An empty list (the default) keeps the single
701    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
702    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
703        self.user_agents = agents;
704        self
705    }
706
707    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
708    /// pages. Off by default; enables an extra body read for matching sites.
709    pub fn enrich(mut self, enrich: bool) -> Self {
710        self.enrich = enrich;
711        self
712    }
713
714    /// Honor each host's `robots.txt`: probes to disallowed paths are
715    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
716    /// default. Adds one cached `robots.txt` fetch per origin.
717    pub fn respect_robots(mut self, respect: bool) -> Self {
718        self.respect_robots = respect;
719        self
720    }
721
722    /// Attach a browser backend. Sites tagged `bot-protected` will be
723    /// routed through it instead of the raw HTTP path, up to the
724    /// [`browser_budget`](Self::browser_budget) cap.
725    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
726        self.browser = Some(backend);
727        self
728    }
729
730    /// Per-scan cap on how many `bot-protected` sites are allowed to use
731    /// the browser backend. Once exhausted, the rest fall back to
732    /// `Uncertain(BrowserBudget)`. Defaults to
733    /// [`DEFAULT_BROWSER_BUDGET`].
734    pub const fn browser_budget(mut self, cap: usize) -> Self {
735        self.browser_budget = cap;
736        self
737    }
738
739    /// Per-scan cap on automatic escalations from the cheap transport
740    /// (HTTP / impersonate) to the browser when the cheap path returns
741    /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
742    /// [`browser_budget`](Self::browser_budget). Defaults to
743    /// [`DEFAULT_ESCALATION_BUDGET`]. `cap = 0` is equivalent to
744    /// [`disable_escalation`](Self::disable_escalation).
745    pub const fn escalation_budget(mut self, cap: usize) -> Self {
746        self.escalation_budget = cap;
747        self
748    }
749
750    /// Disable automatic escalation entirely — the cheap transport's
751    /// outcome is returned verbatim, even when its `Uncertain` reason is
752    /// one a browser fetch would resolve. Useful for benchmarking the
753    /// raw HTTP signals without the access-engine lift on top.
754    pub const fn disable_escalation(mut self) -> Self {
755        self.escalation_enabled = false;
756        self
757    }
758
759    /// Configure the egress pool: proxies tagged by country / IP type
760    /// that sites with an `access` policy can require. Sites without a
761    /// policy are unaffected (they use the default egress / `--proxy`).
762    /// Replaces any previously set pool.
763    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
764        self.egress = egress;
765        self
766    }
767
768    /// Supply operator authenticated sessions. A site whose `access`
769    /// policy names a session has that session's headers (cookies /
770    /// tokens) applied to its probe; a named-but-missing session yields
771    /// `Uncertain(SessionRequired)` rather than a login-wall false
772    /// negative. Replaces any previously set store.
773    pub fn sessions(mut self, sessions: SessionStore) -> Self {
774        self.sessions = sessions;
775        self
776    }
777
778    /// Build a [`Client`].
779    pub fn build(self) -> Result<Client> {
780        let inner = build_reqwest(
781            &self.user_agent,
782            self.timeout,
783            self.connect_timeout,
784            self.follow_redirects,
785            self.redirect_limit,
786            self.proxy.as_deref(),
787        )?;
788
789        // One HTTP client per configured egress — `reqwest` bakes the
790        // proxy in at build time, so geo / IP-type routing means a
791        // distinct client per proxy, paired with its match metadata.
792        let mut egress_entries = Vec::with_capacity(self.egress.len());
793        for spec in &self.egress {
794            let client = build_reqwest(
795                &self.user_agent,
796                self.timeout,
797                self.connect_timeout,
798                self.follow_redirects,
799                self.redirect_limit,
800                Some(&spec.url),
801            )?;
802            egress_entries.push((
803                spec.name.clone(),
804                spec.country.clone(),
805                spec.kind,
806                Arc::new(HttpFetcher::new(client)),
807            ));
808        }
809
810        let global_throttle = self.max_rps.map(|rps| {
811            // Min spacing between any two requests = 1s / rps.
812            let interval = Duration::from_secs(1) / rps.get();
813            HostThrottle::new(interval)
814        });
815        let robots = self
816            .respect_robots
817            .then(|| RobotsCache::new(inner.clone(), "adler"));
818        // Build the impersonate fetcher up front when the feature is on;
819        // surface a wreq init failure as `HttpSetup` so the caller sees
820        // it the same way they'd see a bad `--proxy` URL.
821        #[cfg(feature = "impersonate")]
822        let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
823        Ok(Client {
824            http: Arc::new(HttpFetcher::new(inner)),
825            egress: Arc::new(EgressPool::new(egress_entries)),
826            sessions: Arc::new(self.sessions),
827            throttle: HostThrottle::new(self.min_request_interval),
828            global_throttle,
829            retry: self.retry,
830            user_agents: Arc::from(self.user_agents),
831            enrich: self.enrich,
832            robots,
833            browser: self.browser,
834            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
835            escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
836                self.escalation_budget,
837            )),
838            escalation_enabled: self.escalation_enabled,
839            #[cfg(feature = "impersonate")]
840            impersonate,
841        })
842    }
843}
844
845/// Build a configured `reqwest::Client`, optionally routed through a
846/// proxy. Shared by the default client and every egress in the pool so
847/// they get identical timeout / redirect / User-Agent settings.
848fn build_reqwest(
849    user_agent: &str,
850    timeout: Duration,
851    connect_timeout: Duration,
852    follow_redirects: bool,
853    redirect_limit: usize,
854    proxy: Option<&str>,
855) -> Result<reqwest::Client> {
856    let redirect_policy = if follow_redirects {
857        redirect::Policy::limited(redirect_limit)
858    } else {
859        redirect::Policy::none()
860    };
861    let mut builder = reqwest::Client::builder()
862        .user_agent(user_agent.to_owned())
863        .timeout(timeout)
864        .connect_timeout(connect_timeout)
865        .redirect(redirect_policy);
866    if let Some(proxy_url) = proxy {
867        // reqwest treats a schemeless string (e.g. "not-a-url") as a host
868        // and silently defaults it to http://, so every probe would fail
869        // confusingly. Require an explicit, supported scheme up front.
870        const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
871        if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
872            return Err(Error::HttpSetup {
873                message: format!(
874                    "invalid proxy {proxy_url:?}: must start with one of {}",
875                    SCHEMES.join(", ")
876                ),
877            });
878        }
879        let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
880            message: format!("invalid proxy {proxy_url:?}: {e}"),
881        })?;
882        builder = builder.proxy(proxy);
883    }
884    builder.build().map_err(|e| Error::HttpSetup {
885        message: e.to_string(),
886    })
887}
888
/// Default ceiling on browser-backed probes per scan when no other value
/// is specified. This is the value the builder's `browser_budget` setting
/// documents as its default.
///
/// Sized as ~5× the typical `bot-protected` registry subset — comfortable
/// headroom while still being a guardrail against a misconfigured flag
/// burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
896
/// Default ceiling on *automatic escalation* fetches per scan (HTTP /
/// impersonate → browser when the cheap path returns
/// `Uncertain(CloudflareChallenge | RateLimited)`). This is the value the
/// builder's `escalation_budget` setting documents as its default.
///
/// Independent of [`DEFAULT_BROWSER_BUDGET`]: a `bot-protected` site that
/// goes straight to the browser consumes browser budget; a non-pre-tagged
/// site that escalates from HTTP to browser consumes one of each. Sized so
/// a few-percent escalation rate across a typical registry stays under the
/// cap without thinking about it.
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
907
908impl fmt::Debug for Client {
909    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
910        f.debug_struct("Client")
911            .field("throttle", &self.throttle)
912            .field("global_throttle", &self.global_throttle)
913            .field("retry", &self.retry)
914            .field("user_agents", &self.user_agents)
915            .field("enrich", &self.enrich)
916            .field("robots", &self.robots.is_some())
917            .field("browser", &self.browser.is_some())
918            .field("browser_budget", &self.browser_budget)
919            .field("escalation_budget", &self.escalation_budget)
920            .field("escalation_enabled", &self.escalation_enabled)
921            .finish_non_exhaustive()
922    }
923}
924
925impl fmt::Debug for ClientBuilder {
926    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
927        f.debug_struct("ClientBuilder")
928            .field("timeout", &self.timeout)
929            .field("connect_timeout", &self.connect_timeout)
930            .field("user_agent", &self.user_agent)
931            .field("follow_redirects", &self.follow_redirects)
932            .field("redirect_limit", &self.redirect_limit)
933            .field("min_request_interval", &self.min_request_interval)
934            .field("max_rps", &self.max_rps)
935            .field("retry", &self.retry)
936            .field("proxy", &self.proxy)
937            .field("user_agents", &self.user_agents)
938            .field("enrich", &self.enrich)
939            .field("respect_robots", &self.respect_robots)
940            .field("browser", &self.browser.is_some())
941            .field("browser_budget", &self.browser_budget)
942            .field("egress", &self.egress)
943            .field("sessions", &self.sessions)
944            .field("escalation_budget", &self.escalation_budget)
945            .field("escalation_enabled", &self.escalation_enabled)
946            .finish()
947    }
948}
949
/// Tag string for bot-protected sites. The builder docs describe sites
/// tagged `bot-protected` as the ones routed through the browser backend
/// instead of the raw HTTP path.
// NOTE(review): presumably compared against a site's `tags`; the use site
// is outside this part of the file — confirm.
const BOT_PROTECTED_TAG: &str = "bot-protected";
951
952fn default_user_agent() -> String {
953    format!("adler/{}", env!("CARGO_PKG_VERSION"))
954}
955
956fn host_of(url: &str) -> String {
957    reqwest::Url::parse(url)
958        .ok()
959        .and_then(|u| u.host_str().map(str::to_owned))
960        .unwrap_or_else(|| "unknown".into())
961}
962
963/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
964/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
965fn origin_and_path(url: &str) -> Option<(String, String)> {
966    let parsed = reqwest::Url::parse(url).ok()?;
967    let host = parsed.host_str()?;
968    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
969    let origin = format!("{}://{host}{port}", parsed.scheme());
970    let path = parsed.query().map_or_else(
971        || parsed.path().to_owned(),
972        |q| format!("{}?{q}", parsed.path()),
973    );
974    Some((origin, path))
975}
976
977fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
978    CheckOutcome {
979        site: site.to_owned(),
980        url,
981        kind,
982        reason: None,
983        elapsed_ms: elapsed_ms(started),
984        enrichment: std::collections::BTreeMap::new(),
985        evidence: Vec::new(),
986        transport: None,
987        escalations: 0,
988    }
989}
990
991fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
992    CheckOutcome {
993        site: site.to_owned(),
994        url,
995        kind: MatchKind::Uncertain,
996        reason: Some(reason),
997        elapsed_ms: elapsed_ms(started),
998        enrichment: std::collections::BTreeMap::new(),
999        evidence: Vec::new(),
1000        transport: None,
1001        escalations: 0,
1002    }
1003}
1004
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// if the count does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    match u64::try_from(started.elapsed().as_millis()) {
        Ok(ms) => ms,
        Err(_) => u64::MAX,
    }
}
1008
1009#[cfg(test)]
1010mod tests {
1011    use super::*;
1012    use crate::browser::RenderedPage;
1013    use crate::site::{Signal, UrlTemplate};
1014    use wiremock::matchers::{any, method, path};
1015    use wiremock::{Mock, MockServer, ResponseTemplate};
1016
1017    fn build_client() -> Client {
1018        Client::builder()
1019            .timeout(Duration::from_secs(2))
1020            // Tests share `127.0.0.1` as host — keep throttle out of the
1021            // way for everything but the dedicated throttle test below.
1022            .min_request_interval(Duration::ZERO)
1023            // Default retry would re-hit ban-test mocks; tests opt in
1024            // explicitly when they want to exercise the retry path.
1025            .max_retries(0)
1026            .build()
1027            .expect("client builds")
1028    }
1029
1030    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
1031        Site {
1032            name: "Mock".into(),
1033            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
1034            signals,
1035            known_present: None,
1036            known_absent: None,
1037            extract: Vec::new(),
1038            tags: Vec::new(),
1039            request_headers: std::collections::BTreeMap::new(),
1040            regex_check: None,
1041            engine: None,
1042            strip_bad_char: None,
1043            request_method: crate::site::HttpMethod::Get,
1044            request_body: None,
1045            protection: Vec::new(),
1046            disabled: false,
1047            disabled_reason: None,
1048            source: None,
1049            popularity: None,
1050            access: crate::AccessPolicy::default(),
1051        }
1052    }
1053
1054    fn user() -> Username {
1055        Username::new("alice").unwrap()
1056    }
1057
1058    #[tokio::test]
1059    async fn regex_check_short_circuits_before_any_request() {
1060        // Stand up a mock that would 200 on *anything* — if probe_once
1061        // failed to short-circuit on regex mismatch, the username
1062        // "alice" (5 chars) would resolve to Found here.
1063        let server = MockServer::start().await;
1064        Mock::given(any())
1065            .respond_with(ResponseTemplate::new(200))
1066            .mount(&server)
1067            .await;
1068        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1069        // The site only accepts usernames of 8+ chars; "alice" is 5.
1070        site.regex_check = Some("^[A-Za-z]{8,}$".into());
1071        let outcome = build_client().check(&site, &user()).await;
1072        assert_eq!(outcome.kind, MatchKind::Uncertain);
1073        assert!(
1074            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
1075            "expected UsernameNotAllowed, got {:?}",
1076            outcome.reason,
1077        );
1078        // No request should have hit the mock — assert by counting
1079        // received_requests on the wiremock server.
1080        let recvd = server.received_requests().await.unwrap_or_default();
1081        assert_eq!(
1082            recvd.len(),
1083            0,
1084            "regex_check mismatch must skip the HTTP request entirely"
1085        );
1086    }
1087
1088    #[tokio::test]
1089    async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
1090        // A mock that would 200 on anything — if the geo gate failed to
1091        // short-circuit, "alice" would resolve to Found here.
1092        let server = MockServer::start().await;
1093        Mock::given(any())
1094            .respond_with(ResponseTemplate::new(200))
1095            .mount(&server)
1096            .await;
1097        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1098        // Require a Polish egress; the default client has no egress pool,
1099        // so nothing can satisfy it.
1100        site.access = crate::access::AccessPolicy {
1101            geo: vec![crate::access::CountryCode::new("pl").unwrap()],
1102            ..crate::access::AccessPolicy::default()
1103        };
1104        let outcome = build_client().check(&site, &user()).await;
1105        assert_eq!(outcome.kind, MatchKind::Uncertain);
1106        assert!(
1107            matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
1108            "expected GeoUnavailable, got {:?}",
1109            outcome.reason,
1110        );
1111        // The site must NOT have been probed — an unreachable geo is not
1112        // evidence of absence, and we don't fetch from the wrong location.
1113        let recvd = server.received_requests().await.unwrap_or_default();
1114        assert_eq!(
1115            recvd.len(),
1116            0,
1117            "geo-unavailable must skip the HTTP request entirely"
1118        );
1119    }
1120
1121    #[tokio::test]
1122    async fn session_headers_are_sent_on_probe() {
1123        // Only respond 200 when the request carries the session cookie,
1124        // so a Found verdict proves the header was actually applied.
1125        let server = MockServer::start().await;
1126        Mock::given(any())
1127            .and(wiremock::matchers::header("cookie", "sessionid=real"))
1128            .respond_with(ResponseTemplate::new(200))
1129            .mount(&server)
1130            .await;
1131        let mut headers = std::collections::BTreeMap::new();
1132        headers.insert("Cookie".to_string(), "sessionid=real".to_string());
1133        let mut store = SessionStore::new();
1134        store.insert("acct", crate::access::Session::from_headers(headers));
1135        let client = Client::builder()
1136            .timeout(Duration::from_secs(2))
1137            .min_request_interval(Duration::ZERO)
1138            .max_retries(0)
1139            .sessions(store)
1140            .build()
1141            .expect("client builds");
1142        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1143        site.access.session = Some("acct".to_string());
1144        let outcome = client.check(&site, &user()).await;
1145        assert_eq!(
1146            outcome.kind,
1147            MatchKind::Found,
1148            "session cookie should unlock the 200 (got {:?})",
1149            outcome.reason,
1150        );
1151    }
1152
1153    #[tokio::test]
1154    async fn missing_named_session_is_session_required() {
1155        let server = MockServer::start().await;
1156        Mock::given(any())
1157            .respond_with(ResponseTemplate::new(200))
1158            .mount(&server)
1159            .await;
1160        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1161        // Names a session the (empty) store doesn't have.
1162        site.access.session = Some("not-configured".to_string());
1163        let outcome = build_client().check(&site, &user()).await;
1164        assert_eq!(outcome.kind, MatchKind::Uncertain);
1165        assert!(
1166            matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
1167            "expected SessionRequired, got {:?}",
1168            outcome.reason,
1169        );
1170        let recvd = server.received_requests().await.unwrap_or_default();
1171        assert_eq!(
1172            recvd.len(),
1173            0,
1174            "a missing session must skip the request, not probe unauthenticated"
1175        );
1176    }
1177
1178    #[cfg(feature = "impersonate")]
1179    #[tokio::test]
1180    async fn impersonate_routes_pure_tls_fingerprint_site() {
1181        let server = MockServer::start().await;
1182        Mock::given(any())
1183            .respond_with(ResponseTemplate::new(200))
1184            .mount(&server)
1185            .await;
1186        let client = Client::builder()
1187            .timeout(Duration::from_secs(2))
1188            .min_request_interval(Duration::ZERO)
1189            .max_retries(0)
1190            .build()
1191            .expect("client builds with impersonate");
1192        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1193        // Pure TLS-fingerprint protection — exactly the shape that
1194        // routes to the impersonate fetcher.
1195        site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];
1196        let outcome = client.check(&site, &user()).await;
1197        assert_eq!(
1198            outcome.kind,
1199            MatchKind::Found,
1200            "expected Found (reason {:?})",
1201            outcome.reason,
1202        );
1203        // wreq's Chrome-134 emulation sets a Chrome-shaped User-Agent —
1204        // observable proof that the request came from the impersonate
1205        // path and not the default `adler/<version>` HTTP fetcher.
1206        let recvd = server.received_requests().await.expect("received requests");
1207        assert_eq!(recvd.len(), 1, "expected exactly one request");
1208        let ua = recvd[0]
1209            .headers
1210            .get("user-agent")
1211            .and_then(|v| v.to_str().ok())
1212            .unwrap_or("");
1213        assert!(
1214            ua.contains("Chrome/"),
1215            "expected Chrome-shaped UA from wreq, got {ua:?}"
1216        );
1217    }
1218
1219    #[tokio::test]
1220    async fn regex_check_pass_proceeds_to_probe() {
1221        let server = MockServer::start().await;
1222        Mock::given(any())
1223            .and(path("/alice"))
1224            .respond_with(ResponseTemplate::new(200))
1225            .mount(&server)
1226            .await;
1227        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1228        // Pattern that matches "alice".
1229        site.regex_check = Some("^[a-z]{3,}$".into());
1230        let outcome = build_client().check(&site, &user()).await;
1231        assert_eq!(outcome.kind, MatchKind::Found);
1232    }
1233
1234    #[tokio::test]
1235    async fn status_signal_reports_found_on_match() {
1236        let server = MockServer::start().await;
1237        Mock::given(any())
1238            .and(path("/alice"))
1239            .respond_with(ResponseTemplate::new(200))
1240            .mount(&server)
1241            .await;
1242        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1243        let outcome = build_client().check(&site, &user()).await;
1244        assert_eq!(outcome.kind, MatchKind::Found);
1245        assert!(outcome.url.ends_with("/alice"));
1246        assert!(outcome.reason.is_none());
1247        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
1248    }
1249
1250    #[tokio::test]
1251    async fn status_signal_pair_reports_not_found_on_404() {
1252        let server = MockServer::start().await;
1253        Mock::given(any())
1254            .and(path("/alice"))
1255            .respond_with(ResponseTemplate::new(404))
1256            .mount(&server)
1257            .await;
1258        let site = site_with(
1259            &server,
1260            vec![
1261                Signal::StatusFound { codes: vec![200] },
1262                Signal::StatusNotFound { codes: vec![404] },
1263            ],
1264        );
1265        let outcome = build_client().check(&site, &user()).await;
1266        assert_eq!(outcome.kind, MatchKind::NotFound);
1267        // Only the NotFound-voting signal is cited as evidence.
1268        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
1269    }
1270
1271    #[tokio::test]
1272    async fn body_absent_signal_detects_missing_account() {
1273        let server = MockServer::start().await;
1274        Mock::given(any())
1275            .and(path("/alice"))
1276            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
1277            .mount(&server)
1278            .await;
1279        let site = site_with(
1280            &server,
1281            vec![Signal::BodyAbsent {
1282                text: "Profile not found".into(),
1283            }],
1284        );
1285        let outcome = build_client().check(&site, &user()).await;
1286        assert_eq!(outcome.kind, MatchKind::NotFound);
1287    }
1288
1289    #[tokio::test]
1290    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
1291        // Phase 2 semantics: absence of an absence-marker is not evidence
1292        // of presence — it just means we have no signal that fired.
1293        let server = MockServer::start().await;
1294        Mock::given(any())
1295            .and(path("/alice"))
1296            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
1297            .mount(&server)
1298            .await;
1299        let site = site_with(
1300            &server,
1301            vec![Signal::BodyAbsent {
1302                text: "Profile not found".into(),
1303            }],
1304        );
1305        let outcome = build_client().check(&site, &user()).await;
1306        assert_eq!(outcome.kind, MatchKind::Uncertain);
1307    }
1308
1309    #[tokio::test]
1310    async fn body_present_plus_absent_resolve_to_found() {
1311        let server = MockServer::start().await;
1312        Mock::given(any())
1313            .and(path("/alice"))
1314            .respond_with(
1315                ResponseTemplate::new(200)
1316                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
1317            )
1318            .mount(&server)
1319            .await;
1320        let site = site_with(
1321            &server,
1322            vec![
1323                Signal::BodyPresent {
1324                    text: "profile-card".into(),
1325                },
1326                Signal::BodyAbsent {
1327                    text: "Profile not found".into(),
1328                },
1329            ],
1330        );
1331        let outcome = build_client().check(&site, &user()).await;
1332        assert_eq!(outcome.kind, MatchKind::Found);
1333    }
1334
1335    #[tokio::test]
1336    async fn redirect_absent_signal_detects_missing_account() {
1337        let server = MockServer::start().await;
1338        Mock::given(any())
1339            .and(path("/alice"))
1340            .respond_with(
1341                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1342            )
1343            .mount(&server)
1344            .await;
1345        Mock::given(any())
1346            .and(path("/login"))
1347            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1348            .mount(&server)
1349            .await;
1350        let site = site_with(
1351            &server,
1352            vec![Signal::RedirectAbsent {
1353                fragment: "/login".into(),
1354            }],
1355        );
1356        let outcome = build_client().check(&site, &user()).await;
1357        assert_eq!(outcome.kind, MatchKind::NotFound);
1358    }
1359
1360    #[tokio::test]
1361    async fn negative_signal_wins_over_positive() {
1362        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
1363        // (error marker appears). Negative-priority aggregation → NotFound.
1364        // This is the canonical Sherlock "message" pattern: a site that
1365        // returns 200 for everyone and differentiates via an error string.
1366        let server = MockServer::start().await;
1367        Mock::given(any())
1368            .and(path("/alice"))
1369            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1370            .mount(&server)
1371            .await;
1372        let site = site_with(
1373            &server,
1374            vec![
1375                Signal::StatusFound { codes: vec![200] },
1376                Signal::BodyAbsent {
1377                    text: "Profile not found".into(),
1378                },
1379            ],
1380        );
1381        let outcome = build_client().check(&site, &user()).await;
1382        assert_eq!(outcome.kind, MatchKind::NotFound);
1383    }
1384
1385    #[tokio::test]
1386    async fn network_failure_yields_uncertain() {
1387        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1388        let port = listener.local_addr().unwrap().port();
1389        drop(listener);
1390
1391        let site = Site {
1392            name: "Dead".into(),
1393            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1394            signals: vec![Signal::StatusFound { codes: vec![200] }],
1395            known_present: None,
1396            known_absent: None,
1397            extract: Vec::new(),
1398            tags: Vec::new(),
1399            request_headers: std::collections::BTreeMap::new(),
1400            regex_check: None,
1401            engine: None,
1402            strip_bad_char: None,
1403            request_method: crate::site::HttpMethod::Get,
1404            request_body: None,
1405            protection: Vec::new(),
1406            disabled: false,
1407            disabled_reason: None,
1408            source: None,
1409            popularity: None,
1410            access: crate::AccessPolicy::default(),
1411        };
1412        let client = Client::builder()
1413            .timeout(Duration::from_millis(500))
1414            .connect_timeout(Duration::from_millis(500))
1415            .max_retries(0)
1416            .build()
1417            .unwrap();
1418        let outcome = client.check(&site, &user()).await;
1419        assert_eq!(outcome.kind, MatchKind::Uncertain);
1420        assert!(outcome.reason.is_some());
1421    }
1422
1423    #[tokio::test]
1424    async fn throttle_spaces_consecutive_calls_to_same_host() {
1425        let server = MockServer::start().await;
1426        Mock::given(any())
1427            .and(path("/alice"))
1428            .respond_with(ResponseTemplate::new(200))
1429            .mount(&server)
1430            .await;
1431        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1432        // Interval is intentionally much larger than typical wiremock latency
1433        // (≤10 ms locally, can spike under heavy parallel test load). Any
1434        // value too close to HTTP latency would let the first request burn
1435        // through the throttle window and make the assertion flaky.
1436        let client = Client::builder()
1437            .timeout(Duration::from_secs(2))
1438            .min_request_interval(Duration::from_millis(300))
1439            .build()
1440            .unwrap();
1441
1442        client.check(&site, &user()).await;
1443        let started = Instant::now();
1444        client.check(&site, &user()).await;
1445        let elapsed = started.elapsed();
1446        assert!(
1447            elapsed >= Duration::from_millis(200),
1448            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1449        );
1450    }
1451
1452    #[tokio::test]
1453    async fn builder_overrides_user_agent() {
1454        let server = MockServer::start().await;
1455        Mock::given(any())
1456            .and(path("/alice"))
1457            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1458            .respond_with(ResponseTemplate::new(200))
1459            .mount(&server)
1460            .await;
1461        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1462        let client = Client::builder()
1463            .user_agent("adler-test/1.0")
1464            .build()
1465            .unwrap();
1466        let outcome = client.check(&site, &user()).await;
1467        assert_eq!(outcome.kind, MatchKind::Found);
1468    }
1469
1470    #[tokio::test]
1471    async fn rate_limit_429_yields_uncertain_with_note() {
1472        let server = MockServer::start().await;
1473        Mock::given(any())
1474            .and(path("/alice"))
1475            .respond_with(ResponseTemplate::new(429))
1476            .mount(&server)
1477            .await;
1478        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1479        let outcome = build_client().check(&site, &user()).await;
1480        assert_eq!(outcome.kind, MatchKind::Uncertain);
1481        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1482    }
1483
1484    #[tokio::test]
1485    async fn cloudflare_server_header_yields_uncertain() {
1486        let server = MockServer::start().await;
1487        Mock::given(any())
1488            .and(path("/alice"))
1489            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1490            .mount(&server)
1491            .await;
1492        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1493        let outcome = build_client().check(&site, &user()).await;
1494        assert_eq!(outcome.kind, MatchKind::Uncertain);
1495        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1496    }
1497
1498    #[tokio::test]
1499    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1500        // Body-based ban detection only runs when a signal already needs
1501        // the body — this site uses BodyAbsent so the body is read.
1502        let server = MockServer::start().await;
1503        Mock::given(any())
1504            .and(path("/alice"))
1505            .respond_with(
1506                ResponseTemplate::new(200)
1507                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1508            )
1509            .mount(&server)
1510            .await;
1511        let site = site_with(
1512            &server,
1513            vec![Signal::BodyAbsent {
1514                text: "Profile not found".into(),
1515            }],
1516        );
1517        let outcome = build_client().check(&site, &user()).await;
1518        assert_eq!(outcome.kind, MatchKind::Uncertain);
1519        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1520    }
1521
1522    #[tokio::test]
1523    async fn ban_detection_does_not_fire_on_legitimate_403() {
1524        let server = MockServer::start().await;
1525        Mock::given(any())
1526            .and(path("/alice"))
1527            .respond_with(ResponseTemplate::new(403))
1528            .mount(&server)
1529            .await;
1530        let site = site_with(
1531            &server,
1532            vec![
1533                Signal::StatusFound { codes: vec![200] },
1534                Signal::StatusNotFound { codes: vec![403] },
1535            ],
1536        );
1537        let outcome = build_client().check(&site, &user()).await;
1538        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1539        assert_eq!(outcome.kind, MatchKind::NotFound);
1540        assert!(outcome.reason.is_none());
1541    }
1542
1543    #[tokio::test]
1544    async fn retry_recovers_after_transient_429() {
1545        let server = MockServer::start().await;
1546        // First request: 429. Subsequent: 200.
1547        Mock::given(any())
1548            .and(path("/alice"))
1549            .respond_with(ResponseTemplate::new(429))
1550            .up_to_n_times(1)
1551            .mount(&server)
1552            .await;
1553        Mock::given(any())
1554            .and(path("/alice"))
1555            .respond_with(ResponseTemplate::new(200))
1556            .mount(&server)
1557            .await;
1558        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1559        let client = Client::builder()
1560            .timeout(Duration::from_secs(2))
1561            .min_request_interval(Duration::ZERO)
1562            .max_retries(2)
1563            .base_backoff_delay(Duration::from_millis(20))
1564            .max_backoff_delay(Duration::from_millis(100))
1565            .build()
1566            .unwrap();
1567        let outcome = client.check(&site, &user()).await;
1568        assert_eq!(outcome.kind, MatchKind::Found);
1569        assert!(outcome.reason.is_none());
1570    }
1571
1572    #[tokio::test]
1573    async fn retry_exhausts_and_returns_uncertain() {
1574        let server = MockServer::start().await;
1575        Mock::given(any())
1576            .and(path("/alice"))
1577            .respond_with(ResponseTemplate::new(429))
1578            .mount(&server)
1579            .await;
1580        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1581        let client = Client::builder()
1582            .timeout(Duration::from_secs(2))
1583            .min_request_interval(Duration::ZERO)
1584            .max_retries(2)
1585            .base_backoff_delay(Duration::from_millis(10))
1586            .max_backoff_delay(Duration::from_millis(50))
1587            .build()
1588            .unwrap();
1589        let outcome = client.check(&site, &user()).await;
1590        assert_eq!(outcome.kind, MatchKind::Uncertain);
1591        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1592    }
1593
1594    #[tokio::test]
1595    async fn retry_does_not_fire_on_network_error() {
1596        // Connection refused → Uncertain note starts with "request:", not a
1597        // ban marker. We must NOT retry — otherwise a single dead site
1598        // burns the full backoff budget before reporting.
1599        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1600        let port = listener.local_addr().unwrap().port();
1601        drop(listener);
1602        let site = Site {
1603            name: "Dead".into(),
1604            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1605            signals: vec![Signal::StatusFound { codes: vec![200] }],
1606            known_present: None,
1607            known_absent: None,
1608            extract: Vec::new(),
1609            tags: Vec::new(),
1610            request_headers: std::collections::BTreeMap::new(),
1611            regex_check: None,
1612            engine: None,
1613            strip_bad_char: None,
1614            request_method: crate::site::HttpMethod::Get,
1615            request_body: None,
1616            protection: Vec::new(),
1617            disabled: false,
1618            disabled_reason: None,
1619            source: None,
1620            popularity: None,
1621            access: crate::AccessPolicy::default(),
1622        };
1623        let client = Client::builder()
1624            .timeout(Duration::from_millis(500))
1625            .connect_timeout(Duration::from_millis(500))
1626            .min_request_interval(Duration::ZERO)
1627            .max_retries(3)
1628            .base_backoff_delay(Duration::from_secs(60))
1629            .build()
1630            .unwrap();
1631        let started = Instant::now();
1632        let outcome = client.check(&site, &user()).await;
1633        // If retry fired, we'd be sleeping minutes; instead this returns
1634        // promptly with an Uncertain.
1635        assert!(started.elapsed() < Duration::from_secs(5));
1636        assert_eq!(outcome.kind, MatchKind::Uncertain);
1637        assert!(
1638            matches!(outcome.reason, Some(UncertainReason::Network(_))),
1639            "got {:?}",
1640            outcome.reason,
1641        );
1642    }
1643
1644    #[tokio::test]
1645    async fn rotates_user_agent_per_request() {
1646        // The mock only matches when the request carries one of the pooled
1647        // UAs; if rotation weren't applied, the default adler/x.y UA would
1648        // miss and the verdict would be NotFound.
1649        let server = MockServer::start().await;
1650        Mock::given(any())
1651            .and(path("/alice"))
1652            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1653            .respond_with(ResponseTemplate::new(200))
1654            .mount(&server)
1655            .await;
1656        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1657        let client = Client::builder()
1658            .min_request_interval(Duration::ZERO)
1659            .max_retries(0)
1660            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1661            .build()
1662            .unwrap();
1663        let outcome = client.check(&site, &user()).await;
1664        assert_eq!(outcome.kind, MatchKind::Found);
1665    }
1666
1667    #[test]
1668    fn invalid_proxy_url_fails_build() {
1669        let err = Client::builder().proxy("not a url").build().unwrap_err();
1670        assert!(matches!(err, Error::HttpSetup { .. }));
1671    }
1672
1673    #[test]
1674    fn schemeless_proxy_is_rejected_up_front() {
1675        // reqwest would silently treat this as a host; we require a scheme.
1676        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1677        let Error::HttpSetup { message } = err else {
1678            panic!("expected HttpSetup, got {err:?}");
1679        };
1680        assert!(message.contains("must start with"), "{message}");
1681    }
1682
1683    #[test]
1684    fn socks5_proxy_scheme_is_accepted() {
1685        // Valid scheme + endpoint builds fine (no connection is attempted).
1686        assert!(
1687            Client::builder()
1688                .proxy("socks5://127.0.0.1:9050")
1689                .build()
1690                .is_ok()
1691        );
1692    }
1693
1694    #[tokio::test]
1695    async fn global_rps_cap_spaces_requests_across_hosts() {
1696        // Two distinct host paths; per-host throttle is disabled, so any
1697        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1698        let server = MockServer::start().await;
1699        Mock::given(any())
1700            .respond_with(ResponseTemplate::new(200))
1701            .mount(&server)
1702            .await;
1703        let site_a = Site {
1704            name: "A".into(),
1705            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1706            signals: vec![Signal::StatusFound { codes: vec![200] }],
1707            known_present: None,
1708            known_absent: None,
1709            extract: Vec::new(),
1710            tags: Vec::new(),
1711            request_headers: std::collections::BTreeMap::new(),
1712            regex_check: None,
1713            engine: None,
1714            strip_bad_char: None,
1715            request_method: crate::site::HttpMethod::Get,
1716            request_body: None,
1717            protection: Vec::new(),
1718            disabled: false,
1719            disabled_reason: None,
1720            source: None,
1721            popularity: None,
1722            access: crate::AccessPolicy::default(),
1723        };
1724        let site_b = Site {
1725            name: "B".into(),
1726            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1727            signals: vec![Signal::StatusFound { codes: vec![200] }],
1728            known_present: None,
1729            known_absent: None,
1730            extract: Vec::new(),
1731            tags: Vec::new(),
1732            request_headers: std::collections::BTreeMap::new(),
1733            regex_check: None,
1734            engine: None,
1735            strip_bad_char: None,
1736            request_method: crate::site::HttpMethod::Get,
1737            request_body: None,
1738            protection: Vec::new(),
1739            disabled: false,
1740            disabled_reason: None,
1741            source: None,
1742            popularity: None,
1743            access: crate::AccessPolicy::default(),
1744        };
1745        // 2 RPS → ~500 ms between requests. A large interval keeps the
1746        // assertion robust even when the first probe's own duration (which
1747        // eats into the measured gap) is inflated by test instrumentation
1748        // such as coverage tooling.
1749        let client = Client::builder()
1750            .min_request_interval(Duration::ZERO)
1751            .max_retries(0)
1752            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1753            .build()
1754            .unwrap();
1755        // First request consumes the slot at t≈0; second waits ~500 ms even
1756        // though it targets a different host.
1757        client.check(&site_a, &user()).await;
1758        let started = Instant::now();
1759        client.check(&site_b, &user()).await;
1760        assert!(
1761            started.elapsed() >= Duration::from_millis(350),
1762            "global cap should space cross-host requests, got {:?}",
1763            started.elapsed(),
1764        );
1765    }
1766
1767    #[tokio::test]
1768    async fn respect_robots_skips_disallowed_paths() {
1769        let server = MockServer::start().await;
1770        Mock::given(any())
1771            .and(path("/robots.txt"))
1772            .respond_with(
1773                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1774            )
1775            .mount(&server)
1776            .await;
1777        Mock::given(any())
1778            .and(path("/no/alice"))
1779            .respond_with(ResponseTemplate::new(200))
1780            .mount(&server)
1781            .await;
1782        Mock::given(any())
1783            .and(path("/yes/alice"))
1784            .respond_with(ResponseTemplate::new(200))
1785            .mount(&server)
1786            .await;
1787        let client = Client::builder()
1788            .min_request_interval(Duration::ZERO)
1789            .max_retries(0)
1790            .respect_robots(true)
1791            .build()
1792            .unwrap();
1793
1794        let disallowed = Site {
1795            name: "No".into(),
1796            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1797            signals: vec![Signal::StatusFound { codes: vec![200] }],
1798            known_present: None,
1799            known_absent: None,
1800            extract: Vec::new(),
1801            tags: Vec::new(),
1802            request_headers: std::collections::BTreeMap::new(),
1803            regex_check: None,
1804            engine: None,
1805            strip_bad_char: None,
1806            request_method: crate::site::HttpMethod::Get,
1807            request_body: None,
1808            protection: Vec::new(),
1809            disabled: false,
1810            disabled_reason: None,
1811            source: None,
1812            popularity: None,
1813            access: crate::AccessPolicy::default(),
1814        };
1815        let allowed = Site {
1816            name: "Yes".into(),
1817            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1818            signals: vec![Signal::StatusFound { codes: vec![200] }],
1819            known_present: None,
1820            known_absent: None,
1821            extract: Vec::new(),
1822            tags: Vec::new(),
1823            request_headers: std::collections::BTreeMap::new(),
1824            regex_check: None,
1825            engine: None,
1826            strip_bad_char: None,
1827            request_method: crate::site::HttpMethod::Get,
1828            request_body: None,
1829            protection: Vec::new(),
1830            disabled: false,
1831            disabled_reason: None,
1832            source: None,
1833            popularity: None,
1834            access: crate::AccessPolicy::default(),
1835        };
1836
1837        let no = client.check(&disallowed, &user()).await;
1838        assert_eq!(no.kind, MatchKind::Uncertain);
1839        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1840
1841        let yes = client.check(&allowed, &user()).await;
1842        assert_eq!(yes.kind, MatchKind::Found);
1843    }
1844
1845    #[tokio::test]
1846    async fn body_read_skipped_when_no_body_signal_needed() {
1847        // Mock returns body that would fail a body_absent check — but since
1848        // we only have a status signal, body is never read.
1849        let server = MockServer::start().await;
1850        Mock::given(any())
1851            .and(path("/alice"))
1852            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1853            .mount(&server)
1854            .await;
1855        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1856        let outcome = build_client().check(&site, &user()).await;
1857        assert_eq!(outcome.kind, MatchKind::Found);
1858    }
1859
1860    // ===== Browser routing =====
1861
1862    /// Test backend that returns a canned page and counts calls. Lets the
1863    /// routing tests assert "Client did/did not invoke the browser" without
1864    /// involving a real Chrome process.
1865    #[derive(Debug)]
1866    struct RecordingBackend {
1867        page: RenderedPage,
1868        calls: std::sync::atomic::AtomicUsize,
1869    }
1870
1871    impl RecordingBackend {
1872        fn with_page(page: RenderedPage) -> Self {
1873            Self {
1874                page,
1875                calls: std::sync::atomic::AtomicUsize::new(0),
1876            }
1877        }
1878        fn call_count(&self) -> usize {
1879            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1880        }
1881    }
1882
1883    #[async_trait::async_trait]
1884    impl BrowserBackend for RecordingBackend {
1885        async fn fetch(
1886            &self,
1887            _url: &url::Url,
1888            _headers: &std::collections::BTreeMap<String, String>,
1889            _timeout: Duration,
1890        ) -> Result<RenderedPage> {
1891            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1892            Ok(self.page.clone())
1893        }
1894    }
1895
1896    fn site_bot_protected(server: &MockServer) -> Site {
1897        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1898        s.tags = vec!["bot-protected".into()];
1899        s
1900    }
1901
1902    #[tokio::test]
1903    async fn browser_routes_bot_protected_sites() {
1904        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1905        // returns its canned page directly.
1906        let server = MockServer::start().await;
1907        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1908            status: 200,
1909            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1910            body: "<html></html>".into(),
1911            elapsed_ms: 42,
1912        }));
1913        let client = Client::builder()
1914            .min_request_interval(Duration::ZERO)
1915            .max_retries(0)
1916            .browser(backend.clone())
1917            .build()
1918            .unwrap();
1919        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1920        assert_eq!(outcome.kind, MatchKind::Found);
1921        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1922    }
1923
1924    #[tokio::test]
1925    async fn non_bot_protected_sites_skip_browser() {
1926        let server = MockServer::start().await;
1927        Mock::given(any())
1928            .and(path("/alice"))
1929            .respond_with(ResponseTemplate::new(200))
1930            .mount(&server)
1931            .await;
1932        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1933            status: 500, // would make wiremock case fail if browser was taken
1934            final_url: url::Url::parse("https://x/").unwrap(),
1935            body: String::new(),
1936            elapsed_ms: 0,
1937        }));
1938        let client = Client::builder()
1939            .min_request_interval(Duration::ZERO)
1940            .max_retries(0)
1941            .browser(backend.clone())
1942            .build()
1943            .unwrap();
1944        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1945        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1946        let outcome = client.check(&site, &user()).await;
1947        assert_eq!(outcome.kind, MatchKind::Found);
1948        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1949    }
1950
1951    #[tokio::test]
1952    async fn browser_budget_exhaust_yields_uncertain() {
1953        let server = MockServer::start().await;
1954        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1955            status: 200,
1956            final_url: url::Url::parse("https://x/").unwrap(),
1957            body: String::new(),
1958            elapsed_ms: 0,
1959        }));
1960        let client = Client::builder()
1961            .min_request_interval(Duration::ZERO)
1962            .max_retries(0)
1963            .browser(backend.clone())
1964            .browser_budget(1)
1965            .build()
1966            .unwrap();
1967        let site = site_bot_protected(&server);
1968        // First call consumes the only slot.
1969        let first = client.check(&site, &user()).await;
1970        assert_eq!(first.kind, MatchKind::Found);
1971        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1972        let second = client.check(&site, &user()).await;
1973        assert_eq!(second.kind, MatchKind::Uncertain);
1974        assert!(matches!(
1975            second.reason,
1976            Some(UncertainReason::BrowserBudget)
1977        ));
1978        assert_eq!(
1979            backend.call_count(),
1980            1,
1981            "second call must not invoke backend"
1982        );
1983    }
1984
1985    #[tokio::test]
1986    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1987        struct FailingBackend;
1988        #[async_trait::async_trait]
1989        impl BrowserBackend for FailingBackend {
1990            async fn fetch(
1991                &self,
1992                _url: &url::Url,
1993                _headers: &std::collections::BTreeMap<String, String>,
1994                _timeout: Duration,
1995            ) -> Result<RenderedPage> {
1996                Err(Error::BrowserSetup {
1997                    message: "simulated crash".into(),
1998                })
1999            }
2000        }
2001        impl std::fmt::Debug for FailingBackend {
2002            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2003                f.write_str("FailingBackend")
2004            }
2005        }
2006
2007        let server = MockServer::start().await;
2008        let client = Client::builder()
2009            .min_request_interval(Duration::ZERO)
2010            .max_retries(0)
2011            .browser(Arc::new(FailingBackend))
2012            .build()
2013            .unwrap();
2014        let outcome = client.check(&site_bot_protected(&server), &user()).await;
2015        assert_eq!(outcome.kind, MatchKind::Uncertain);
2016        match outcome.reason {
2017            Some(UncertainReason::BrowserFailed(msg)) => {
2018                assert!(msg.contains("simulated crash"), "got: {msg}");
2019            }
2020            other => panic!("expected BrowserFailed, got {other:?}"),
2021        }
2022    }
2023
2024    #[tokio::test]
2025    async fn status_only_site_uses_head_request() {
2026        // Site with only status signals (no body markers, no enrichment)
2027        // should be probed with HEAD — saves the body download on
2028        // ~30% of the registry.
2029        let server = MockServer::start().await;
2030        Mock::given(method("HEAD"))
2031            .and(path("/alice"))
2032            .respond_with(ResponseTemplate::new(200))
2033            .mount(&server)
2034            .await;
2035        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2036        let outcome = build_client().check(&site, &user()).await;
2037        assert_eq!(outcome.kind, MatchKind::Found);
2038        let recvd = server.received_requests().await.unwrap_or_default();
2039        assert_eq!(recvd.len(), 1);
2040        assert_eq!(recvd[0].method.as_str(), "HEAD");
2041    }
2042
2043    #[tokio::test]
2044    async fn body_signal_site_uses_get_request() {
2045        // Same baseline plus a body-marker signal — must still GET so
2046        // the body actually arrives for matching.
2047        let server = MockServer::start().await;
2048        Mock::given(any())
2049            .and(path("/alice"))
2050            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
2051            .mount(&server)
2052            .await;
2053        let site = site_with(
2054            &server,
2055            vec![Signal::BodyPresent {
2056                text: "hello".into(),
2057            }],
2058        );
2059        let outcome = build_client().check(&site, &user()).await;
2060        assert_eq!(outcome.kind, MatchKind::Found);
2061        let recvd = server.received_requests().await.unwrap_or_default();
2062        assert_eq!(recvd[0].method.as_str(), "GET");
2063    }
2064
2065    #[tokio::test]
2066    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
2067        // A site that declares `protection: [Cloudflare]` but doesn't
2068        // carry the legacy `bot-protected` tag should still route
2069        // through the browser backend — the new structured field is
2070        // an additional signal, not a tag replacement.
2071        let server = MockServer::start().await;
2072        Mock::given(any())
2073            .respond_with(ResponseTemplate::new(200))
2074            .mount(&server)
2075            .await;
2076        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2077        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
2078        // No bot-protected tag — pure structured-field test.
2079        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2080            status: 200,
2081            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2082            body: String::new(),
2083            elapsed_ms: 0,
2084        }));
2085        let client = Client::builder()
2086            .min_request_interval(Duration::ZERO)
2087            .max_retries(0)
2088            .browser(backend)
2089            .build()
2090            .unwrap();
2091        let outcome = client.check(&site, &user()).await;
2092        // The recording backend always returns a synthetic 200, so
2093        // Found means we went through the browser path.
2094        assert_eq!(outcome.kind, MatchKind::Found);
2095        // No raw HTTP probe should have hit the mock server.
2096        let recvd = server.received_requests().await.unwrap_or_default();
2097        assert_eq!(
2098            recvd.len(),
2099            0,
2100            "structured protection must skip the raw HTTP path"
2101        );
2102    }
2103
2104    #[tokio::test]
2105    async fn post_method_sends_body_with_username_substituted() {
2106        // A POST-probed site (e.g. Anilist GraphQL) — the username
2107        // goes in the body, not the URL. Adler should substitute
2108        // `{username}` and send a POST with the rendered payload.
2109        let server = MockServer::start().await;
2110        Mock::given(method("POST"))
2111            .and(path("/api"))
2112            .respond_with(ResponseTemplate::new(200))
2113            .mount(&server)
2114            .await;
2115        // URL substitution still requires the `{username}` placeholder,
2116        // even for POST sites where the username also lives in the
2117        // body. Most real POST endpoints encode the username in both
2118        // (e.g. query string + body); we mirror that.
2119        let site = Site {
2120            name: "ApiPost".into(),
2121            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
2122            signals: vec![Signal::StatusFound { codes: vec![200] }],
2123            known_present: None,
2124            known_absent: None,
2125            extract: Vec::new(),
2126            tags: Vec::new(),
2127            request_headers: std::collections::BTreeMap::new(),
2128            regex_check: None,
2129            engine: None,
2130            strip_bad_char: None,
2131            request_method: HttpMethod::Post,
2132            request_body: Some(r#"{"name":"{username}"}"#.into()),
2133            protection: Vec::new(),
2134            disabled: false,
2135            disabled_reason: None,
2136            source: None,
2137            popularity: None,
2138            access: crate::AccessPolicy::default(),
2139        };
2140        let outcome = build_client().check(&site, &user()).await;
2141        assert_eq!(outcome.kind, MatchKind::Found);
2142        let recvd = server.received_requests().await.unwrap_or_default();
2143        assert_eq!(recvd.len(), 1);
2144        assert_eq!(recvd[0].method.as_str(), "POST");
2145        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
2146        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
2147    }
2148
2149    #[tokio::test]
2150    async fn head_405_falls_back_to_get() {
2151        // A server that rejects HEAD with 405 — Adler should silently
2152        // retry with GET so the optimisation can never cost accuracy.
2153        let server = MockServer::start().await;
2154        Mock::given(method("HEAD"))
2155            .and(path("/alice"))
2156            .respond_with(ResponseTemplate::new(405))
2157            .mount(&server)
2158            .await;
2159        Mock::given(any())
2160            .and(path("/alice"))
2161            .respond_with(ResponseTemplate::new(200))
2162            .mount(&server)
2163            .await;
2164        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2165        let outcome = build_client().check(&site, &user()).await;
2166        assert_eq!(outcome.kind, MatchKind::Found);
2167        let recvd = server.received_requests().await.unwrap_or_default();
2168        assert_eq!(recvd.len(), 2);
2169        assert_eq!(recvd[0].method.as_str(), "HEAD");
2170        assert_eq!(recvd[1].method.as_str(), "GET");
2171    }
2172
2173    // ------------------------------------------------------------------
2174    // Phase 4 — automatic escalation when the cheap transport hits a
2175    // Cloudflare / rate-limit Uncertain that the browser could resolve.
2176    // ------------------------------------------------------------------
2177
2178    /// Mocked HTTP that always responds with a Cloudflare 503 (server
2179    /// header + 503 status — what the pre-body ban detector turns into
2180    /// `Uncertain(CloudflareChallenge)`).
2181    async fn cloudflare_503_server() -> MockServer {
2182        let server = MockServer::start().await;
2183        Mock::given(any())
2184            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
2185            .mount(&server)
2186            .await;
2187        server
2188    }
2189
2190    #[tokio::test]
2191    async fn http_success_stamps_http_transport_no_escalations() {
2192        let server = MockServer::start().await;
2193        Mock::given(any())
2194            .respond_with(ResponseTemplate::new(200))
2195            .mount(&server)
2196            .await;
2197        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2198        let outcome = build_client().check(&site, &user()).await;
2199        assert_eq!(outcome.kind, MatchKind::Found);
2200        assert_eq!(
2201            outcome.transport,
2202            Some(crate::escalation::TransportTier::Http),
2203            "successful HTTP probe must stamp Http transport"
2204        );
2205        assert_eq!(outcome.escalations, 0, "no escalation on the happy path");
2206    }
2207
2208    #[tokio::test]
2209    async fn escalates_cloudflare_uncertain_to_browser_and_stamps_one() {
2210        let server = cloudflare_503_server().await;
2211        // Browser returns a 200 that the StatusFound signal turns into Found.
2212        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2213            status: 200,
2214            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2215            body: String::new(),
2216            elapsed_ms: 5,
2217        }));
2218        let client = Client::builder()
2219            .min_request_interval(Duration::ZERO)
2220            .max_retries(0)
2221            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2222            .build()
2223            .unwrap();
2224        // Non-bot-protected site — HTTP path runs first, hits Cloudflare,
2225        // escalation routes to the browser, browser's 200 → Found.
2226        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2227        let outcome = client.check(&site, &user()).await;
2228        assert_eq!(
2229            outcome.kind,
2230            MatchKind::Found,
2231            "escalation should flip CF challenge to Found via browser (reason {:?})",
2232            outcome.reason
2233        );
2234        assert_eq!(
2235            outcome.transport,
2236            Some(crate::escalation::TransportTier::Browser),
2237            "escalated outcome must be stamped Browser"
2238        );
2239        assert_eq!(
2240            outcome.escalations, 1,
2241            "exactly one escalation should have fired"
2242        );
2243        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
2244    }
2245
2246    #[tokio::test]
2247    async fn disable_escalation_leaves_cloudflare_uncertain_untouched() {
2248        let server = cloudflare_503_server().await;
2249        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2250            status: 200,
2251            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2252            body: String::new(),
2253            elapsed_ms: 0,
2254        }));
2255        let client = Client::builder()
2256            .min_request_interval(Duration::ZERO)
2257            .max_retries(0)
2258            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2259            .disable_escalation()
2260            .build()
2261            .unwrap();
2262        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2263        let outcome = client.check(&site, &user()).await;
2264        assert_eq!(outcome.kind, MatchKind::Uncertain);
2265        assert!(matches!(
2266            outcome.reason,
2267            Some(UncertainReason::CloudflareChallenge)
2268        ));
2269        assert_eq!(
2270            outcome.transport,
2271            Some(crate::escalation::TransportTier::Http),
2272            "primary transport must still be stamped"
2273        );
2274        assert_eq!(outcome.escalations, 0);
2275        assert_eq!(
2276            backend.call_count(),
2277            0,
2278            "browser must not be touched when --no-escalation"
2279        );
2280    }
2281
2282    #[tokio::test]
2283    async fn escalation_budget_zero_keeps_browser_untouched() {
2284        let server = cloudflare_503_server().await;
2285        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2286            status: 200,
2287            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2288            body: String::new(),
2289            elapsed_ms: 0,
2290        }));
2291        let client = Client::builder()
2292            .min_request_interval(Duration::ZERO)
2293            .max_retries(0)
2294            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2295            .escalation_budget(0)
2296            .build()
2297            .unwrap();
2298        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2299        let outcome = client.check(&site, &user()).await;
2300        assert_eq!(outcome.kind, MatchKind::Uncertain);
2301        assert!(matches!(
2302            outcome.reason,
2303            Some(UncertainReason::CloudflareChallenge)
2304        ));
2305        assert_eq!(outcome.escalations, 0);
2306        assert_eq!(
2307            backend.call_count(),
2308            0,
2309            "zero budget must deny every escalation"
2310        );
2311    }
2312
2313    #[tokio::test]
2314    async fn escalation_consumes_budget_then_stops() {
2315        let server = cloudflare_503_server().await;
2316        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2317            status: 200,
2318            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2319            body: String::new(),
2320            elapsed_ms: 0,
2321        }));
2322        let client = Client::builder()
2323            .min_request_interval(Duration::ZERO)
2324            .max_retries(0)
2325            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2326            .escalation_budget(1)
2327            .build()
2328            .unwrap();
2329        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2330        // First call burns the only escalation slot.
2331        let first = client.check(&site, &user()).await;
2332        assert_eq!(first.kind, MatchKind::Found);
2333        assert_eq!(first.escalations, 1);
2334        // Second call's escalation is denied → cheap-path Uncertain survives.
2335        let second = client.check(&site, &user()).await;
2336        assert_eq!(second.kind, MatchKind::Uncertain);
2337        assert!(matches!(
2338            second.reason,
2339            Some(UncertainReason::CloudflareChallenge)
2340        ));
2341        assert_eq!(second.escalations, 0);
2342        assert_eq!(backend.call_count(), 1, "browser called exactly once total");
2343    }
2344}