Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::borrow::Cow;
11use std::collections::BTreeMap;
12use std::fmt;
13use std::num::NonZeroU32;
14use std::sync::Arc;
15use std::time::{Duration, Instant};
16
17use reqwest::redirect;
18
19use crate::access::{EgressChoice, EgressPool, EgressSpec, SessionStore};
20use crate::browser::{BrowserBackend, BrowserBudget};
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::error::{Error, Result};
23use crate::retry::{self, RetryPolicy};
24use crate::robots::RobotsCache;
25use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
26use crate::throttle::HostThrottle;
27#[cfg(feature = "impersonate")]
28use crate::transport::ImpersonateFetcher;
29use crate::transport::{
30    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
31};
32use crate::username::Username;
33
/// Per-request timeout that `ClientBuilder::default` installs.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connect timeout that `ClientBuilder::default` installs.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Value `ClientBuilder::default` uses for the builder's `redirect_limit`.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default minimum spacing between consecutive requests to one host
/// (the builder's `min_request_interval`).
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single fixed key for the global rate limiter (it gates all hosts).
const GLOBAL_THROTTLE_KEY: &str = "*global*";
40
/// HTTP client used to probe sites.
///
/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
/// internally, and the throttle is `Arc`-backed, so cloning is the
/// recommended way to share a client between tasks. Cloned clients share
/// throttle state, which is what you want: a fan-out scan must not
/// accidentally exceed a per-host budget by spawning more clients.
#[derive(Clone)]
pub struct Client {
    /// Default HTTP transport. Used by `fetch`, and by the probe path
    /// whenever the egress pool answers `EgressChoice::Default`.
    http: Arc<HttpFetcher>,
    /// Geo / IP-type egress pool for sites whose `access` policy needs a
    /// specific proxy. Empty by default → every site uses `http`.
    egress: Arc<EgressPool>,
    /// Operator-supplied sessions, keyed by the name a site references
    /// via `access.session`. Empty by default.
    sessions: Arc<SessionStore>,
    /// Per-host request spacing. Awaited, keyed by the target host,
    /// before `fetch` and the HTTP probe path send a request.
    throttle: HostThrottle,
    /// Global RPS cap applied across all hosts. `None` → uncapped.
    global_throttle: Option<HostThrottle>,
    /// Retry policy consulted by `check` (through `retry::should_retry`
    /// and `retry::backoff_delay`) after every probe attempt.
    retry: RetryPolicy,
    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
    /// `Arc<[String]>` so cloning a client per task stays cheap.
    user_agents: Arc<[String]>,
    /// Extract profile fields from `Found` pages that declare extractors.
    enrich: bool,
    /// When set, skip probes disallowed by the host's `robots.txt`.
    robots: Option<RobotsCache>,
    /// Browser backend used for `bot-protected` sites. `None` → those sites
    /// stay on the raw HTTP path and typically end up `Uncertain`.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// TLS-fingerprint-impersonating HTTP client (`wreq`). Built when
    /// the `impersonate` Cargo feature is on; routes sites whose
    /// `protection` is exactly `TlsFingerprint`.
    #[cfg(feature = "impersonate")]
    impersonate: Option<Arc<ImpersonateFetcher>>,
    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
    /// for a single scan, so several tasks compete for the same budget.
    browser_budget: Arc<BrowserBudget>,
    /// Per-scan cap on *automatic escalations* from a cheap transport to
    /// the browser when the cheap path returns
    /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
    /// `browser_budget` so the pre-tagged `bot-protected` subset and the
    /// long-tail escalation subset don't fight over the same number.
    escalation_budget: Arc<crate::escalation::EscalationBudget>,
    /// Whether automatic escalation runs at all. `false` keeps the cheap
    /// transport's outcome verbatim — useful for benchmarking the raw
    /// signals without the access-engine lift on top.
    escalation_enabled: bool,
}
90
91impl Client {
92    /// Start configuring a new client.
93    pub fn builder() -> ClientBuilder {
94        ClientBuilder::default()
95    }
96
97    /// Read-only view of the configured egress pool — `(country, kind)`
98    /// for every registered proxy, in the order they were declared.
99    /// Proxy URLs are not surfaced (they typically carry credentials),
100    /// so this is safe to serialise to a JSON response.
101    #[must_use]
102    pub fn egress_summary(&self) -> Vec<crate::access::EgressSummary> {
103        self.egress.summary()
104    }
105
106    /// Names of the configured sessions (sorted lexicographically),
107    /// without any header values. Useful for a UI listing which session
108    /// keys an operator can reference via `access.session` on a site.
109    #[must_use]
110    pub fn session_names(&self) -> Vec<String> {
111        self.sessions.names()
112    }
113
114    /// Names of the configured egresses (in registration order, only
115    /// those that supplied a name). Used by the server to validate
116    /// per-scan `egress_names` against the loaded pool.
117    #[must_use]
118    pub fn egress_names(&self) -> Vec<String> {
119        self.egress.names()
120    }
121
122    /// Returns a new client identical to this one except its egress
123    /// pool is restricted to entries whose `name` matches one of
124    /// `names`. An empty `names` slice is treated as "no filter" and
125    /// returns a clone of the full pool.
126    ///
127    /// Cheap to call repeatedly: all shared state (HTTP clients,
128    /// throttle, sessions, budgets, browser backend, …) is
129    /// `Arc`-cloned so the returned client shares the parent's
130    /// per-scan caps (browser budget, escalation budget, throttle
131    /// state) rather than each subset getting a fresh one. This is the
132    /// right behaviour for a single web-server instance handing out
133    /// per-request clients.
134    #[must_use]
135    pub fn with_egress_subset(&self, names: &[String]) -> Self {
136        Self {
137            http: Arc::clone(&self.http),
138            egress: Arc::new(self.egress.subset(names)),
139            sessions: Arc::clone(&self.sessions),
140            throttle: self.throttle.clone(),
141            global_throttle: self.global_throttle.clone(),
142            retry: self.retry.clone(),
143            user_agents: Arc::clone(&self.user_agents),
144            enrich: self.enrich,
145            robots: self.robots.clone(),
146            browser: self.browser.clone(),
147            #[cfg(feature = "impersonate")]
148            impersonate: self.impersonate.clone(),
149            browser_budget: Arc::clone(&self.browser_budget),
150            escalation_budget: Arc::clone(&self.escalation_budget),
151            escalation_enabled: self.escalation_enabled,
152        }
153    }
154
155    /// Probe a single site for `username`, retrying on transient bans.
156    ///
157    /// Network failures, timeouts, and unexpected response shapes all yield
158    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
159    /// returns an error: at the executor level we want a partial result for
160    /// every site, not abort-on-first-failure semantics.
161    ///
162    /// When ban detection classifies a response as `rate_limited` /
163    /// `cloudflare_challenge`, the call is retried with jittered exponential
164    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
165    /// Uncertain (network errors, body read failures) is **not** retried —
166    /// those failures rarely fix themselves in the seconds-to-minutes window
167    /// we'd block for.
168    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
169    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
170        let mut attempt: u32 = 0;
171        loop {
172            let outcome = self.probe_once(site, username).await;
173            if !retry::should_retry(&outcome, attempt, &self.retry) {
174                return outcome;
175            }
176            let delay = retry::backoff_delay(attempt, &self.retry);
177            tracing::info!(
178                site = %site.name,
179                attempt = attempt + 1,
180                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
181                ?delay,
182                "transient ban, retrying",
183            );
184            tokio::time::sleep(delay).await;
185            attempt += 1;
186        }
187    }
188
189    /// Fetch a URL and return raw response data (status, final URL, body)
190    /// with the same throttle / User-Agent / proxy machinery as `check`,
191    /// but without signal evaluation or retry.
192    ///
193    /// Returns `None` on any network/transport error. Intended for
194    /// diagnostics such as `adler --doctor --fix`, which diffs the
195    /// responses for a known-present and a nonsense user to derive a
196    /// signature.
197    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
198        let host = host_of(url);
199        if let Some(global) = &self.global_throttle {
200            global.wait(GLOBAL_THROTTLE_KEY).await;
201        }
202        self.throttle.wait(&host).await;
203        let mut request = self.http.client().get(url);
204        if let Some(ua) = self.pick_user_agent() {
205            request = request.header(reqwest::header::USER_AGENT, ua);
206        }
207        let response = request.send().await.ok()?;
208        let status = response.status().as_u16();
209        let final_url = response.url().to_string();
210        let body = response.text().await.unwrap_or_default();
211        Some(RawResponse {
212            status,
213            final_url,
214            body,
215        })
216    }
217
218    /// Same as [`Self::fetch`] but routes through the configured browser
219    /// backend when the site is tagged `bot-protected` and a backend is
220    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
221    /// so that the diff-derivation works against the JS-rendered page
222    /// (login wall vs. real profile) rather than two identical raw-HTTP
223    /// shells.
224    ///
225    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
226    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
227    /// callers get the same `Option<RawResponse>` shape either way.
228    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
229        if let Some(backend) = self.browser.as_deref() {
230            let has_tag = site
231                .tags
232                .iter()
233                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
234            if has_tag || !site.protection.is_empty() {
235                let parsed = url::Url::parse(url).ok()?;
236                match backend
237                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
238                    .await
239                {
240                    Ok(page) => {
241                        return Some(RawResponse {
242                            status: page.status,
243                            final_url: page.final_url.to_string(),
244                            body: page.body,
245                        });
246                    }
247                    Err(err) => {
248                        tracing::warn!(
249                            site = %site.name, %url, error = %err,
250                            "browser fetch failed in doctor; falling back to raw HTTP",
251                        );
252                    }
253                }
254            }
255        }
256        self.fetch(url).await
257    }
258
259    /// Pick a User-Agent for the next request from the rotation pool, or
260    /// `None` to fall back on the client's fixed header.
261    fn pick_user_agent(&self) -> Option<&str> {
262        match self.user_agents.len() {
263            0 => None,
264            1 => Some(&self.user_agents[0]),
265            n => Some(&self.user_agents[fastrand::usize(0..n)]),
266        }
267    }
268
269    // Splitting probe_once into helpers would scatter the request/response
270    // flow that has to read top-to-bottom; one long function reads better.
271    #[allow(clippy::too_many_lines)]
272    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
273        let url = site.url_for(username);
274
275        // Site-level username constraint (Sherlock's `regexCheck`).
276        // Mismatch → skip the probe entirely. Saves a request and
277        // sidesteps the false-positive class where a site 404s on
278        // illegal usernames in a way our signal can't distinguish
279        // from a missing account. If the pattern fails to compile
280        // (Sherlock occasionally uses lookarounds, which our `regex`
281        // crate can't express), we let validate's warn-log stand
282        // and silently fall through — the rest of the probe still
283        // works.
284        if let Some(pat) = &site.regex_check {
285            if let Ok(re) = regex::Regex::new(pat) {
286                if !re.is_match(username.as_str()) {
287                    return uncertain(
288                        &site.name,
289                        url,
290                        Instant::now(),
291                        UncertainReason::UsernameNotAllowed,
292                    );
293                }
294            }
295        }
296
297        // Resolve an operator session if the site's access policy names
298        // one, and fold its headers (cookies / tokens) over the site's
299        // own. A named-but-missing session is reported rather than sent
300        // unauthenticated into a login wall — which reads identically
301        // for an existing and a missing account. Applies to both the
302        // HTTP and browser transports.
303        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
304            None => Cow::Borrowed(&site.request_headers),
305            Some(name) => match self.sessions.get(name) {
306                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
307                None => {
308                    return uncertain(
309                        &site.name,
310                        url,
311                        Instant::now(),
312                        UncertainReason::SessionRequired,
313                    );
314                }
315            },
316        };
317        let headers: &BTreeMap<String, String> = &session_headers;
318
319        // Auto-route bot-protected sites through the browser backend when
320        // one is configured. Raw HTTP can't see past their JS/login wall,
321        // so this is the only way they ever produce a Found verdict.
322        // A site is "bot-protected" in the routing sense if it carries
323        // the legacy tag OR declares any specific protection mechanism
324        // via the new `protection` field — either signal is enough.
325        if let Some(backend) = &self.browser {
326            let has_tag = site
327                .tags
328                .iter()
329                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
330            if has_tag || !site.protection.is_empty() {
331                if self.browser_budget.try_consume() {
332                    let started = Instant::now();
333                    let req = FetchRequest {
334                        method: site.request_method,
335                        url: &url,
336                        body: None,
337                        user_agent: None,
338                        headers,
339                        want_body: true,
340                    };
341                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
342                    let mut outcome = match fetcher.fetch(&req).await {
343                        Ok(resp) => self.finish(site, url, started, &resp),
344                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
345                    };
346                    outcome.transport = Some(crate::escalation::TransportTier::Browser);
347                    return outcome;
348                }
349                tracing::warn!(site = %site.name, "browser budget exhausted");
350                let mut outcome = uncertain(
351                    &site.name,
352                    url,
353                    Instant::now(),
354                    UncertainReason::BrowserBudget,
355                );
356                outcome.transport = Some(crate::escalation::TransportTier::Browser);
357                return outcome;
358            }
359        }
360
361        // Phase 2: route pure-`TlsFingerprint` sites through the
362        // impersonating transport — a real BoringSSL TLS handshake from
363        // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
364        // protection tag, at a fraction of the cost of a real browser.
365        // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
366        // keep going through the browser path above, where they were.
367        #[cfg(feature = "impersonate")]
368        if let Some(fetcher) = &self.impersonate {
369            let pure_tls = site.protection.len() == 1
370                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
371                && !site
372                    .tags
373                    .iter()
374                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
375            if pure_tls {
376                let started = Instant::now();
377                let req = FetchRequest {
378                    method: site.request_method,
379                    url: &url,
380                    body: None,
381                    user_agent: self.pick_user_agent(),
382                    headers,
383                    want_body: true,
384                };
385                let mut primary = match fetcher.fetch(&req).await {
386                    Ok(resp) => self.finish(site, url.clone(), started, &resp),
387                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
388                };
389                primary.transport = Some(crate::escalation::TransportTier::Impersonate);
390                return self.maybe_escalate(site, &url, headers, primary).await;
391            }
392        }
393
394        // Egress selection: route the HTTP path through a geo / IP-type
395        // matching proxy when the site's access policy demands one. An
396        // unconstrained policy uses the default egress; a constrained
397        // policy with no matching egress is reported `GeoUnavailable`
398        // rather than fetched from the wrong location (a false
399        // `NotFound` would be worse than an honest `Uncertain`).
400        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
401            EgressChoice::Default => Arc::clone(&self.http),
402            EgressChoice::Use(fetcher) => fetcher,
403            EgressChoice::Unavailable => {
404                return uncertain(
405                    &site.name,
406                    url,
407                    Instant::now(),
408                    UncertainReason::GeoUnavailable,
409                );
410            }
411        };
412
413        let host = host_of(&url);
414
415        // robots.txt gate, before consuming a throttle slot or probing.
416        if let Some(robots) = &self.robots {
417            if let Some((origin, path)) = origin_and_path(&url) {
418                if !robots.allowed(&origin, &path).await {
419                    tracing::debug!(%url, "skipped by robots.txt");
420                    return uncertain(
421                        &site.name,
422                        url,
423                        Instant::now(),
424                        UncertainReason::RobotsDisallowed,
425                    );
426                }
427            }
428        }
429
430        // Global cap first (gates every request), then per-host spacing.
431        if let Some(global) = &self.global_throttle {
432            global.wait(GLOBAL_THROTTLE_KEY).await;
433        }
434        self.throttle.wait(&host).await;
435        let started = Instant::now();
436        tracing::debug!(%url, %host, "probing");
437
438        // Read the body only if a signal needs it, or enrichment is on
439        // and the site declares extractor rules (extraction needs it).
440        let want_enrich = self.enrich && !site.extract.is_empty();
441        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
442
443        // POST sites carry their own body payload (the username goes in
444        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
445        // `{username}` in `Site::request_body` is substituted here,
446        // mirroring URL substitution.
447        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
448            const USERNAME_PH: &str = "{username}";
449            site.request_body
450                .as_deref()
451                .map(|t| t.replace(USERNAME_PH, username.as_str()))
452        } else {
453            None
454        };
455
456        let req = FetchRequest {
457            method: site.request_method,
458            url: &url,
459            body: body_for_post.as_deref(),
460            user_agent: self.pick_user_agent(),
461            headers,
462            want_body: needs_body,
463        };
464        let mut primary = match egress.fetch(&req).await {
465            Ok(resp) => self.finish(site, url.clone(), started, &resp),
466            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
467        };
468        primary.transport = Some(crate::escalation::TransportTier::Http);
469        self.maybe_escalate(site, &url, headers, primary).await
470    }
471
472    /// If the cheap transport returned an `Uncertain` reason a browser
473    /// fetch could plausibly resolve, retry through the browser backend
474    /// and stamp the new outcome as escalated. Bounded by
475    /// [`escalation_budget`](ClientBuilder::escalation_budget).
476    async fn maybe_escalate(
477        &self,
478        site: &Site,
479        url: &str,
480        headers: &BTreeMap<String, String>,
481        primary: CheckOutcome,
482    ) -> CheckOutcome {
483        if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
484            return primary;
485        }
486        let Some(reason) = &primary.reason else {
487            return primary;
488        };
489        if !crate::escalation::should_escalate(reason) {
490            return primary;
491        }
492        let Some(backend) = &self.browser else {
493            return primary;
494        };
495        if !self.escalation_budget.try_consume() {
496            tracing::debug!(site = %site.name, "escalation budget exhausted");
497            return primary;
498        }
499
500        tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
501        let started = Instant::now();
502        let req = FetchRequest {
503            method: site.request_method,
504            url,
505            body: None,
506            user_agent: None,
507            headers,
508            want_body: true,
509        };
510        let fetcher = BrowserFetcher::new(Arc::clone(backend));
511        let mut escalated = match fetcher.fetch(&req).await {
512            Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
513            Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
514        };
515        escalated.transport = Some(crate::escalation::TransportTier::Browser);
516        escalated.escalations = 1;
517        escalated
518    }
519
520    /// Evaluate a fetched response against the site's signals and build
521    /// the outcome. Shared by the HTTP and browser transports so the
522    /// verdict / evidence / enrichment logic lives in exactly one place.
523    fn finish(
524        &self,
525        site: &Site,
526        url: String,
527        started: Instant,
528        resp: &crate::transport::FetchResponse,
529    ) -> CheckOutcome {
530        let probe = Probe {
531            status: resp.status,
532            final_url: &resp.final_url,
533            body: &resp.body,
534        };
535        let votes: Vec<(&Signal, SignalVerdict)> = site
536            .signals
537            .iter()
538            .map(|s| (s, s.evaluate(&probe)))
539            .collect();
540        let kind = aggregate(votes.iter().map(|(_, v)| *v));
541        let mut result = outcome(&site.name, url, started, kind);
542        // Record which signals produced the verdict (the winning polarity).
543        let winning = match kind {
544            MatchKind::Found => Some(SignalVerdict::Found),
545            MatchKind::NotFound => Some(SignalVerdict::NotFound),
546            MatchKind::Uncertain => None,
547        };
548        if let Some(want) = winning {
549            result.evidence = votes
550                .iter()
551                .filter(|(_, v)| *v == want)
552                .map(|(s, _)| s.describe_match(&probe))
553                .collect();
554        }
555        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
556            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
557        }
558        result
559    }
560}
561
/// Raw response data returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`] for diagnostics.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code.
    pub status: u16,
    /// Final URL after redirects.
    pub final_url: String,
    /// Decoded response body.
    pub body: String,
}
572
/// Builder for [`Client`].
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
// A configuration builder accumulates many small flags; the four bool
// fields here are semantically independent (redirect / enrich /
// respect-robots / escalation), so collapsing them into a state machine
// or enum would obscure rather than clarify.
#[allow(clippy::struct_excessive_bools)]
pub struct ClientBuilder {
    /// Per-request timeout; set via `timeout`.
    timeout: Duration,
    /// Connect timeout; set via `connect_timeout`.
    connect_timeout: Duration,
    /// Fixed `User-Agent` header; set via `user_agent`.
    user_agent: String,
    /// Whether redirects are followed; set via `follow_redirects`.
    follow_redirects: bool,
    /// Redirect cap; defaults to `DEFAULT_REDIRECT_LIMIT`.
    redirect_limit: usize,
    /// Minimum spacing between requests to one host; set via
    /// `min_request_interval`.
    min_request_interval: Duration,
    /// Global requests-per-second cap; `None` → uncapped. Set via `max_rps`.
    max_rps: Option<NonZeroU32>,
    /// Retry policy; tuned via `max_retries`, `base_backoff_delay` and
    /// `max_backoff_delay`.
    retry: RetryPolicy,
    /// Proxy URL for all requests; `None` → no proxy. Set via `proxy`.
    proxy: Option<String>,
    /// User-Agent rotation pool; empty → fixed `user_agent`. Set via
    /// `rotate_user_agents`.
    user_agents: Vec<String>,
    /// Whether profile-field extraction is on; set via `enrich`.
    enrich: bool,
    /// Whether `robots.txt` is honored; set via `respect_robots`.
    respect_robots: bool,
    /// Optional browser backend; set via `browser`.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Per-scan cap on browser fetches; set via `browser_budget`.
    browser_budget: usize,
    /// Egress pool specification (proxies that sites with an `access`
    /// policy can require).
    egress: Vec<EgressSpec>,
    /// Operator-supplied sessions; starts out as `SessionStore::new()`.
    sessions: SessionStore,
    /// Per-scan cap on automatic escalations; set via `escalation_budget`.
    escalation_budget: usize,
    /// Whether automatic escalation runs; cleared by `disable_escalation`.
    escalation_enabled: bool,
}
601
602impl Default for ClientBuilder {
603    fn default() -> Self {
604        Self {
605            timeout: DEFAULT_TIMEOUT,
606            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
607            user_agent: default_user_agent(),
608            follow_redirects: true,
609            redirect_limit: DEFAULT_REDIRECT_LIMIT,
610            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
611            max_rps: None,
612            retry: RetryPolicy::default(),
613            proxy: None,
614            user_agents: Vec::new(),
615            enrich: false,
616            respect_robots: false,
617            browser: None,
618            browser_budget: DEFAULT_BROWSER_BUDGET,
619            egress: Vec::new(),
620            sessions: SessionStore::new(),
621            escalation_budget: DEFAULT_ESCALATION_BUDGET,
622            escalation_enabled: true,
623        }
624    }
625}
626
627impl ClientBuilder {
628    /// Per-request timeout (covers connect, headers, and body read).
629    pub fn timeout(mut self, timeout: Duration) -> Self {
630        self.timeout = timeout;
631        self
632    }
633
634    /// TCP-connect timeout, applied independently of the request timeout.
635    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
636        self.connect_timeout = timeout;
637        self
638    }
639
640    /// Override the `User-Agent` header sent on every request.
641    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
642        self.user_agent = user_agent.into();
643        self
644    }
645
646    /// Toggle automatic redirect following. Defaults to `true`; disable when
647    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
648    pub fn follow_redirects(mut self, follow: bool) -> Self {
649        self.follow_redirects = follow;
650        self
651    }
652
653    /// Minimum time between consecutive requests to the same host.
654    ///
655    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
656    /// rate-limit responses on common OSINT targets while keeping fan-out
657    /// across many sites fast.
658    pub fn min_request_interval(mut self, interval: Duration) -> Self {
659        self.min_request_interval = interval;
660        self
661    }
662
663    /// Cap the total request rate across *all* hosts to `rps` requests per
664    /// second. Independent of (and composed with) the per-host interval —
665    /// useful on a metered connection or behind a shared-quota proxy.
666    /// Uncapped by default.
667    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
668        self.max_rps = Some(rps);
669        self
670    }
671
672    /// Maximum retry attempts after a transient ban response. Defaults to 2
673    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
674    pub fn max_retries(mut self, n: u32) -> Self {
675        self.retry.max_retries = n;
676        self
677    }
678
679    /// Base delay for the first retry. Subsequent retries double until
680    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
681    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
682        self.retry.base_delay = d;
683        self
684    }
685
686    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
687    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
688        self.retry.max_delay = d;
689        self
690    }
691
692    /// Route all requests through a proxy. Accepts `http://`, `https://`,
693    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
694    pub fn proxy(mut self, url: impl Into<String>) -> Self {
695        self.proxy = Some(url.into());
696        self
697    }
698
699    /// Rotate the `User-Agent` header per request, picking uniformly at
700    /// random from `agents`. An empty list (the default) keeps the single
701    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
702    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
703        self.user_agents = agents;
704        self
705    }
706
707    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
708    /// pages. Off by default; enables an extra body read for matching sites.
709    pub fn enrich(mut self, enrich: bool) -> Self {
710        self.enrich = enrich;
711        self
712    }
713
714    /// Honor each host's `robots.txt`: probes to disallowed paths are
715    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
716    /// default. Adds one cached `robots.txt` fetch per origin.
717    pub fn respect_robots(mut self, respect: bool) -> Self {
718        self.respect_robots = respect;
719        self
720    }
721
722    /// Attach a browser backend. Sites tagged `bot-protected` will be
723    /// routed through it instead of the raw HTTP path, up to the
724    /// [`browser_budget`](Self::browser_budget) cap.
725    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
726        self.browser = Some(backend);
727        self
728    }
729
730    /// Per-scan cap on how many `bot-protected` sites are allowed to use
731    /// the browser backend. Once exhausted, the rest fall back to
732    /// `Uncertain(BrowserBudget)`. Defaults to
733    /// [`DEFAULT_BROWSER_BUDGET`].
734    pub const fn browser_budget(mut self, cap: usize) -> Self {
735        self.browser_budget = cap;
736        self
737    }
738
739    /// Per-scan cap on automatic escalations from the cheap transport
740    /// (HTTP / impersonate) to the browser when the cheap path returns
741    /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
742    /// [`browser_budget`](Self::browser_budget). Defaults to
743    /// [`DEFAULT_ESCALATION_BUDGET`]. `cap = 0` is equivalent to
744    /// [`disable_escalation`](Self::disable_escalation).
745    pub const fn escalation_budget(mut self, cap: usize) -> Self {
746        self.escalation_budget = cap;
747        self
748    }
749
750    /// Disable automatic escalation entirely — the cheap transport's
751    /// outcome is returned verbatim, even when its `Uncertain` reason is
752    /// one a browser fetch would resolve. Useful for benchmarking the
753    /// raw HTTP signals without the access-engine lift on top.
754    pub const fn disable_escalation(mut self) -> Self {
755        self.escalation_enabled = false;
756        self
757    }
758
759    /// Configure the egress pool: proxies tagged by country / IP type
760    /// that sites with an `access` policy can require. Sites without a
761    /// policy are unaffected (they use the default egress / `--proxy`).
762    /// Replaces any previously set pool.
763    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
764        self.egress = egress;
765        self
766    }
767
768    /// Supply operator authenticated sessions. A site whose `access`
769    /// policy names a session has that session's headers (cookies /
770    /// tokens) applied to its probe; a named-but-missing session yields
771    /// `Uncertain(SessionRequired)` rather than a login-wall false
772    /// negative. Replaces any previously set store.
773    pub fn sessions(mut self, sessions: SessionStore) -> Self {
774        self.sessions = sessions;
775        self
776    }
777
778    /// Build a [`Client`].
779    pub fn build(self) -> Result<Client> {
780        let inner = build_reqwest(
781            &self.user_agent,
782            self.timeout,
783            self.connect_timeout,
784            self.follow_redirects,
785            self.redirect_limit,
786            self.proxy.as_deref(),
787        )?;
788
789        // One HTTP client per configured egress — `reqwest` bakes the
790        // proxy in at build time, so geo / IP-type routing means a
791        // distinct client per proxy, paired with its match metadata.
792        let mut egress_entries = Vec::with_capacity(self.egress.len());
793        for spec in &self.egress {
794            let client = build_reqwest(
795                &self.user_agent,
796                self.timeout,
797                self.connect_timeout,
798                self.follow_redirects,
799                self.redirect_limit,
800                Some(&spec.url),
801            )?;
802            egress_entries.push((
803                spec.name.clone(),
804                spec.country.clone(),
805                spec.kind,
806                Arc::new(HttpFetcher::new(client)),
807            ));
808        }
809
810        let global_throttle = self.max_rps.map(|rps| {
811            // Min spacing between any two requests = 1s / rps.
812            let interval = Duration::from_secs(1) / rps.get();
813            HostThrottle::new(interval)
814        });
815        let robots = self
816            .respect_robots
817            .then(|| RobotsCache::new(inner.clone(), "adler"));
818        // Build the impersonate fetcher up front when the feature is on;
819        // surface a wreq init failure as `HttpSetup` so the caller sees
820        // it the same way they'd see a bad `--proxy` URL.
821        #[cfg(feature = "impersonate")]
822        let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
823        Ok(Client {
824            http: Arc::new(HttpFetcher::new(inner)),
825            egress: Arc::new(EgressPool::new(egress_entries)),
826            sessions: Arc::new(self.sessions),
827            throttle: HostThrottle::new(self.min_request_interval),
828            global_throttle,
829            retry: self.retry,
830            user_agents: Arc::from(self.user_agents),
831            enrich: self.enrich,
832            robots,
833            browser: self.browser,
834            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
835            escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
836                self.escalation_budget,
837            )),
838            escalation_enabled: self.escalation_enabled,
839            #[cfg(feature = "impersonate")]
840            impersonate,
841        })
842    }
843}
844
845/// Build a configured `reqwest::Client`, optionally routed through a
846/// proxy. Shared by the default client and every egress in the pool so
847/// they get identical timeout / redirect / User-Agent settings.
848fn build_reqwest(
849    user_agent: &str,
850    timeout: Duration,
851    connect_timeout: Duration,
852    follow_redirects: bool,
853    redirect_limit: usize,
854    proxy: Option<&str>,
855) -> Result<reqwest::Client> {
856    let redirect_policy = if follow_redirects {
857        redirect::Policy::limited(redirect_limit)
858    } else {
859        redirect::Policy::none()
860    };
861    let mut builder = reqwest::Client::builder()
862        .user_agent(user_agent.to_owned())
863        .timeout(timeout)
864        .connect_timeout(connect_timeout)
865        .redirect(redirect_policy);
866    if let Some(proxy_url) = proxy {
867        // reqwest treats a schemeless string (e.g. "not-a-url") as a host
868        // and silently defaults it to http://, so every probe would fail
869        // confusingly. Require an explicit, supported scheme up front.
870        const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
871        if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
872            return Err(Error::HttpSetup {
873                message: format!(
874                    "invalid proxy {proxy_url:?}: must start with one of {}",
875                    SCHEMES.join(", ")
876                ),
877            });
878        }
879        let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
880            message: format!("invalid proxy {proxy_url:?}: {e}"),
881        })?;
882        builder = builder.proxy(proxy);
883    }
884    builder.build().map_err(|e| Error::HttpSetup {
885        message: e.to_string(),
886    })
887}
888
/// Default ceiling on browser-backed probes per scan when no other value
/// is specified.
///
/// Documented as the default for the builder's `browser_budget` cap.
///
/// Sized as ~5× the typical `bot-protected` registry subset — comfortable
/// headroom while still being a guardrail against a misconfigured flag
/// burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
896
/// Default ceiling on *automatic escalation* fetches per scan (HTTP /
/// impersonate → browser when the cheap path returns
/// `Uncertain(CloudflareChallenge | RateLimited)`).
///
/// Documented as the default for the builder's `escalation_budget` cap.
///
/// Independent of [`DEFAULT_BROWSER_BUDGET`]: a `bot-protected` site that
/// goes straight to the browser consumes browser budget; a non-pre-tagged
/// site that escalates from HTTP to browser consumes one of each. Sized so
/// a few-percent escalation rate across a typical registry stays under the
/// cap without thinking about it.
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
907
908impl fmt::Debug for Client {
909    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
910        f.debug_struct("Client")
911            .field("throttle", &self.throttle)
912            .field("global_throttle", &self.global_throttle)
913            .field("retry", &self.retry)
914            .field("user_agents", &self.user_agents)
915            .field("enrich", &self.enrich)
916            .field("robots", &self.robots.is_some())
917            .field("browser", &self.browser.is_some())
918            .field("browser_budget", &self.browser_budget)
919            .field("escalation_budget", &self.escalation_budget)
920            .field("escalation_enabled", &self.escalation_enabled)
921            .finish_non_exhaustive()
922    }
923}
924
925impl fmt::Debug for ClientBuilder {
926    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
927        f.debug_struct("ClientBuilder")
928            .field("timeout", &self.timeout)
929            .field("connect_timeout", &self.connect_timeout)
930            .field("user_agent", &self.user_agent)
931            .field("follow_redirects", &self.follow_redirects)
932            .field("redirect_limit", &self.redirect_limit)
933            .field("min_request_interval", &self.min_request_interval)
934            .field("max_rps", &self.max_rps)
935            .field("retry", &self.retry)
936            .field("proxy", &self.proxy)
937            .field("user_agents", &self.user_agents)
938            .field("enrich", &self.enrich)
939            .field("respect_robots", &self.respect_robots)
940            .field("browser", &self.browser.is_some())
941            .field("browser_budget", &self.browser_budget)
942            .field("egress", &self.egress)
943            .field("sessions", &self.sessions)
944            .field("escalation_budget", &self.escalation_budget)
945            .field("escalation_enabled", &self.escalation_enabled)
946            .finish()
947    }
948}
949
/// Site tag literal `bot-protected`. The builder docs describe sites
/// carrying this tag as being routed through the browser backend.
// NOTE(review): the use site is outside this part of the file — confirm.
const BOT_PROTECTED_TAG: &str = "bot-protected";
951
952fn default_user_agent() -> String {
953    format!("adler/{}", env!("CARGO_PKG_VERSION"))
954}
955
956fn host_of(url: &str) -> String {
957    reqwest::Url::parse(url)
958        .ok()
959        .and_then(|u| u.host_str().map(str::to_owned))
960        .unwrap_or_else(|| "unknown".into())
961}
962
963/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
964/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
965fn origin_and_path(url: &str) -> Option<(String, String)> {
966    let parsed = reqwest::Url::parse(url).ok()?;
967    let host = parsed.host_str()?;
968    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
969    let origin = format!("{}://{host}{port}", parsed.scheme());
970    let path = parsed.query().map_or_else(
971        || parsed.path().to_owned(),
972        |q| format!("{}?{q}", parsed.path()),
973    );
974    Some((origin, path))
975}
976
977fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
978    CheckOutcome {
979        site: site.to_owned(),
980        url,
981        kind,
982        reason: None,
983        elapsed_ms: elapsed_ms(started),
984        enrichment: std::collections::BTreeMap::new(),
985        evidence: Vec::new(),
986        transport: None,
987        escalations: 0,
988    }
989}
990
991fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
992    CheckOutcome {
993        site: site.to_owned(),
994        url,
995        kind: MatchKind::Uncertain,
996        reason: Some(reason),
997        elapsed_ms: elapsed_ms(started),
998        enrichment: std::collections::BTreeMap::new(),
999        evidence: Vec::new(),
1000        transport: None,
1001        escalations: 0,
1002    }
1003}
1004
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// if the value does not fit into a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    millis.try_into().unwrap_or(u64::MAX)
}
1008
1009#[cfg(test)]
1010mod tests {
1011    use super::*;
1012    use crate::browser::RenderedPage;
1013    use crate::site::{Signal, UrlTemplate};
1014    use wiremock::matchers::{any, method, path};
1015    use wiremock::{Mock, MockServer, ResponseTemplate};
1016
1017    fn build_client() -> Client {
1018        Client::builder()
1019            .timeout(Duration::from_secs(2))
1020            // Tests share `127.0.0.1` as host — keep throttle out of the
1021            // way for everything but the dedicated throttle test below.
1022            .min_request_interval(Duration::ZERO)
1023            // Default retry would re-hit ban-test mocks; tests opt in
1024            // explicitly when they want to exercise the retry path.
1025            .max_retries(0)
1026            .build()
1027            .expect("client builds")
1028    }
1029
1030    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
1031        Site {
1032            name: "Mock".into(),
1033            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
1034            signals,
1035            known_present: None,
1036            known_absent: None,
1037            extract: Vec::new(),
1038            tags: Vec::new(),
1039            request_headers: std::collections::BTreeMap::new(),
1040            regex_check: None,
1041            engine: None,
1042            strip_bad_char: None,
1043            request_method: crate::site::HttpMethod::Get,
1044            request_body: None,
1045            protection: Vec::new(),
1046            disabled: false,
1047            source: None,
1048            popularity: None,
1049            access: crate::AccessPolicy::default(),
1050        }
1051    }
1052
1053    fn user() -> Username {
1054        Username::new("alice").unwrap()
1055    }
1056
1057    #[tokio::test]
1058    async fn regex_check_short_circuits_before_any_request() {
1059        // Stand up a mock that would 200 on *anything* — if probe_once
1060        // failed to short-circuit on regex mismatch, the username
1061        // "alice" (5 chars) would resolve to Found here.
1062        let server = MockServer::start().await;
1063        Mock::given(any())
1064            .respond_with(ResponseTemplate::new(200))
1065            .mount(&server)
1066            .await;
1067        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1068        // The site only accepts usernames of 8+ chars; "alice" is 5.
1069        site.regex_check = Some("^[A-Za-z]{8,}$".into());
1070        let outcome = build_client().check(&site, &user()).await;
1071        assert_eq!(outcome.kind, MatchKind::Uncertain);
1072        assert!(
1073            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
1074            "expected UsernameNotAllowed, got {:?}",
1075            outcome.reason,
1076        );
1077        // No request should have hit the mock — assert by counting
1078        // received_requests on the wiremock server.
1079        let recvd = server.received_requests().await.unwrap_or_default();
1080        assert_eq!(
1081            recvd.len(),
1082            0,
1083            "regex_check mismatch must skip the HTTP request entirely"
1084        );
1085    }
1086
1087    #[tokio::test]
1088    async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
1089        // A mock that would 200 on anything — if the geo gate failed to
1090        // short-circuit, "alice" would resolve to Found here.
1091        let server = MockServer::start().await;
1092        Mock::given(any())
1093            .respond_with(ResponseTemplate::new(200))
1094            .mount(&server)
1095            .await;
1096        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1097        // Require a Polish egress; the default client has no egress pool,
1098        // so nothing can satisfy it.
1099        site.access = crate::access::AccessPolicy {
1100            geo: vec![crate::access::CountryCode::new("pl").unwrap()],
1101            ..crate::access::AccessPolicy::default()
1102        };
1103        let outcome = build_client().check(&site, &user()).await;
1104        assert_eq!(outcome.kind, MatchKind::Uncertain);
1105        assert!(
1106            matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
1107            "expected GeoUnavailable, got {:?}",
1108            outcome.reason,
1109        );
1110        // The site must NOT have been probed — an unreachable geo is not
1111        // evidence of absence, and we don't fetch from the wrong location.
1112        let recvd = server.received_requests().await.unwrap_or_default();
1113        assert_eq!(
1114            recvd.len(),
1115            0,
1116            "geo-unavailable must skip the HTTP request entirely"
1117        );
1118    }
1119
1120    #[tokio::test]
1121    async fn session_headers_are_sent_on_probe() {
1122        // Only respond 200 when the request carries the session cookie,
1123        // so a Found verdict proves the header was actually applied.
1124        let server = MockServer::start().await;
1125        Mock::given(any())
1126            .and(wiremock::matchers::header("cookie", "sessionid=real"))
1127            .respond_with(ResponseTemplate::new(200))
1128            .mount(&server)
1129            .await;
1130        let mut headers = std::collections::BTreeMap::new();
1131        headers.insert("Cookie".to_string(), "sessionid=real".to_string());
1132        let mut store = SessionStore::new();
1133        store.insert("acct", crate::access::Session::from_headers(headers));
1134        let client = Client::builder()
1135            .timeout(Duration::from_secs(2))
1136            .min_request_interval(Duration::ZERO)
1137            .max_retries(0)
1138            .sessions(store)
1139            .build()
1140            .expect("client builds");
1141        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1142        site.access.session = Some("acct".to_string());
1143        let outcome = client.check(&site, &user()).await;
1144        assert_eq!(
1145            outcome.kind,
1146            MatchKind::Found,
1147            "session cookie should unlock the 200 (got {:?})",
1148            outcome.reason,
1149        );
1150    }
1151
1152    #[tokio::test]
1153    async fn missing_named_session_is_session_required() {
1154        let server = MockServer::start().await;
1155        Mock::given(any())
1156            .respond_with(ResponseTemplate::new(200))
1157            .mount(&server)
1158            .await;
1159        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1160        // Names a session the (empty) store doesn't have.
1161        site.access.session = Some("not-configured".to_string());
1162        let outcome = build_client().check(&site, &user()).await;
1163        assert_eq!(outcome.kind, MatchKind::Uncertain);
1164        assert!(
1165            matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
1166            "expected SessionRequired, got {:?}",
1167            outcome.reason,
1168        );
1169        let recvd = server.received_requests().await.unwrap_or_default();
1170        assert_eq!(
1171            recvd.len(),
1172            0,
1173            "a missing session must skip the request, not probe unauthenticated"
1174        );
1175    }
1176
#[cfg(feature = "impersonate")]
#[tokio::test]
async fn impersonate_routes_pure_tls_fingerprint_site() {
    let server = MockServer::start().await;
    let catch_all = Mock::given(any()).respond_with(ResponseTemplate::new(200));
    catch_all.mount(&server).await;

    let client = Client::builder()
        .timeout(Duration::from_secs(2))
        .min_request_interval(Duration::ZERO)
        .max_retries(0)
        .build()
        .expect("client builds with impersonate");

    // TLS-fingerprint protection and nothing else: exactly the shape
    // that is routed to the impersonate fetcher.
    let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
    site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];

    let result = client.check(&site, &user()).await;
    assert_eq!(
        result.kind,
        MatchKind::Found,
        "expected Found (reason {:?})",
        result.reason,
    );

    // wreq's Chrome-134 emulation sends a Chrome-shaped User-Agent. That
    // is observable proof the request took the impersonate path and not
    // the default `adler/<version>` HTTP fetcher.
    let recvd = server.received_requests().await.expect("received requests");
    assert_eq!(recvd.len(), 1, "expected exactly one request");
    let only_request = &recvd[0];
    let ua = only_request
        .headers
        .get("user-agent")
        .and_then(|value| value.to_str().ok())
        .unwrap_or("");
    assert!(
        ua.contains("Chrome/"),
        "expected Chrome-shaped UA from wreq, got {ua:?}"
    );
}
1217
1218    #[tokio::test]
1219    async fn regex_check_pass_proceeds_to_probe() {
1220        let server = MockServer::start().await;
1221        Mock::given(any())
1222            .and(path("/alice"))
1223            .respond_with(ResponseTemplate::new(200))
1224            .mount(&server)
1225            .await;
1226        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1227        // Pattern that matches "alice".
1228        site.regex_check = Some("^[a-z]{3,}$".into());
1229        let outcome = build_client().check(&site, &user()).await;
1230        assert_eq!(outcome.kind, MatchKind::Found);
1231    }
1232
1233    #[tokio::test]
1234    async fn status_signal_reports_found_on_match() {
1235        let server = MockServer::start().await;
1236        Mock::given(any())
1237            .and(path("/alice"))
1238            .respond_with(ResponseTemplate::new(200))
1239            .mount(&server)
1240            .await;
1241        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1242        let outcome = build_client().check(&site, &user()).await;
1243        assert_eq!(outcome.kind, MatchKind::Found);
1244        assert!(outcome.url.ends_with("/alice"));
1245        assert!(outcome.reason.is_none());
1246        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
1247    }
1248
1249    #[tokio::test]
1250    async fn status_signal_pair_reports_not_found_on_404() {
1251        let server = MockServer::start().await;
1252        Mock::given(any())
1253            .and(path("/alice"))
1254            .respond_with(ResponseTemplate::new(404))
1255            .mount(&server)
1256            .await;
1257        let site = site_with(
1258            &server,
1259            vec![
1260                Signal::StatusFound { codes: vec![200] },
1261                Signal::StatusNotFound { codes: vec![404] },
1262            ],
1263        );
1264        let outcome = build_client().check(&site, &user()).await;
1265        assert_eq!(outcome.kind, MatchKind::NotFound);
1266        // Only the NotFound-voting signal is cited as evidence.
1267        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
1268    }
1269
1270    #[tokio::test]
1271    async fn body_absent_signal_detects_missing_account() {
1272        let server = MockServer::start().await;
1273        Mock::given(any())
1274            .and(path("/alice"))
1275            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
1276            .mount(&server)
1277            .await;
1278        let site = site_with(
1279            &server,
1280            vec![Signal::BodyAbsent {
1281                text: "Profile not found".into(),
1282            }],
1283        );
1284        let outcome = build_client().check(&site, &user()).await;
1285        assert_eq!(outcome.kind, MatchKind::NotFound);
1286    }
1287
1288    #[tokio::test]
1289    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
1290        // Phase 2 semantics: absence of an absence-marker is not evidence
1291        // of presence — it just means we have no signal that fired.
1292        let server = MockServer::start().await;
1293        Mock::given(any())
1294            .and(path("/alice"))
1295            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
1296            .mount(&server)
1297            .await;
1298        let site = site_with(
1299            &server,
1300            vec![Signal::BodyAbsent {
1301                text: "Profile not found".into(),
1302            }],
1303        );
1304        let outcome = build_client().check(&site, &user()).await;
1305        assert_eq!(outcome.kind, MatchKind::Uncertain);
1306    }
1307
1308    #[tokio::test]
1309    async fn body_present_plus_absent_resolve_to_found() {
1310        let server = MockServer::start().await;
1311        Mock::given(any())
1312            .and(path("/alice"))
1313            .respond_with(
1314                ResponseTemplate::new(200)
1315                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
1316            )
1317            .mount(&server)
1318            .await;
1319        let site = site_with(
1320            &server,
1321            vec![
1322                Signal::BodyPresent {
1323                    text: "profile-card".into(),
1324                },
1325                Signal::BodyAbsent {
1326                    text: "Profile not found".into(),
1327                },
1328            ],
1329        );
1330        let outcome = build_client().check(&site, &user()).await;
1331        assert_eq!(outcome.kind, MatchKind::Found);
1332    }
1333
1334    #[tokio::test]
1335    async fn redirect_absent_signal_detects_missing_account() {
1336        let server = MockServer::start().await;
1337        Mock::given(any())
1338            .and(path("/alice"))
1339            .respond_with(
1340                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1341            )
1342            .mount(&server)
1343            .await;
1344        Mock::given(any())
1345            .and(path("/login"))
1346            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1347            .mount(&server)
1348            .await;
1349        let site = site_with(
1350            &server,
1351            vec![Signal::RedirectAbsent {
1352                fragment: "/login".into(),
1353            }],
1354        );
1355        let outcome = build_client().check(&site, &user()).await;
1356        assert_eq!(outcome.kind, MatchKind::NotFound);
1357    }
1358
1359    #[tokio::test]
1360    async fn negative_signal_wins_over_positive() {
1361        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
1362        // (error marker appears). Negative-priority aggregation → NotFound.
1363        // This is the canonical Sherlock "message" pattern: a site that
1364        // returns 200 for everyone and differentiates via an error string.
1365        let server = MockServer::start().await;
1366        Mock::given(any())
1367            .and(path("/alice"))
1368            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1369            .mount(&server)
1370            .await;
1371        let site = site_with(
1372            &server,
1373            vec![
1374                Signal::StatusFound { codes: vec![200] },
1375                Signal::BodyAbsent {
1376                    text: "Profile not found".into(),
1377                },
1378            ],
1379        );
1380        let outcome = build_client().check(&site, &user()).await;
1381        assert_eq!(outcome.kind, MatchKind::NotFound);
1382    }
1383
1384    #[tokio::test]
1385    async fn network_failure_yields_uncertain() {
1386        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1387        let port = listener.local_addr().unwrap().port();
1388        drop(listener);
1389
1390        let site = Site {
1391            name: "Dead".into(),
1392            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1393            signals: vec![Signal::StatusFound { codes: vec![200] }],
1394            known_present: None,
1395            known_absent: None,
1396            extract: Vec::new(),
1397            tags: Vec::new(),
1398            request_headers: std::collections::BTreeMap::new(),
1399            regex_check: None,
1400            engine: None,
1401            strip_bad_char: None,
1402            request_method: crate::site::HttpMethod::Get,
1403            request_body: None,
1404            protection: Vec::new(),
1405            disabled: false,
1406            source: None,
1407            popularity: None,
1408            access: crate::AccessPolicy::default(),
1409        };
1410        let client = Client::builder()
1411            .timeout(Duration::from_millis(500))
1412            .connect_timeout(Duration::from_millis(500))
1413            .max_retries(0)
1414            .build()
1415            .unwrap();
1416        let outcome = client.check(&site, &user()).await;
1417        assert_eq!(outcome.kind, MatchKind::Uncertain);
1418        assert!(outcome.reason.is_some());
1419    }
1420
1421    #[tokio::test]
1422    async fn throttle_spaces_consecutive_calls_to_same_host() {
1423        let server = MockServer::start().await;
1424        Mock::given(any())
1425            .and(path("/alice"))
1426            .respond_with(ResponseTemplate::new(200))
1427            .mount(&server)
1428            .await;
1429        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1430        // Interval is intentionally much larger than typical wiremock latency
1431        // (≤10 ms locally, can spike under heavy parallel test load). Any
1432        // value too close to HTTP latency would let the first request burn
1433        // through the throttle window and make the assertion flaky.
1434        let client = Client::builder()
1435            .timeout(Duration::from_secs(2))
1436            .min_request_interval(Duration::from_millis(300))
1437            .build()
1438            .unwrap();
1439
1440        client.check(&site, &user()).await;
1441        let started = Instant::now();
1442        client.check(&site, &user()).await;
1443        let elapsed = started.elapsed();
1444        assert!(
1445            elapsed >= Duration::from_millis(200),
1446            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1447        );
1448    }
1449
1450    #[tokio::test]
1451    async fn builder_overrides_user_agent() {
1452        let server = MockServer::start().await;
1453        Mock::given(any())
1454            .and(path("/alice"))
1455            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1456            .respond_with(ResponseTemplate::new(200))
1457            .mount(&server)
1458            .await;
1459        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1460        let client = Client::builder()
1461            .user_agent("adler-test/1.0")
1462            .build()
1463            .unwrap();
1464        let outcome = client.check(&site, &user()).await;
1465        assert_eq!(outcome.kind, MatchKind::Found);
1466    }
1467
1468    #[tokio::test]
1469    async fn rate_limit_429_yields_uncertain_with_note() {
1470        let server = MockServer::start().await;
1471        Mock::given(any())
1472            .and(path("/alice"))
1473            .respond_with(ResponseTemplate::new(429))
1474            .mount(&server)
1475            .await;
1476        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1477        let outcome = build_client().check(&site, &user()).await;
1478        assert_eq!(outcome.kind, MatchKind::Uncertain);
1479        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1480    }
1481
1482    #[tokio::test]
1483    async fn cloudflare_server_header_yields_uncertain() {
1484        let server = MockServer::start().await;
1485        Mock::given(any())
1486            .and(path("/alice"))
1487            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1488            .mount(&server)
1489            .await;
1490        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1491        let outcome = build_client().check(&site, &user()).await;
1492        assert_eq!(outcome.kind, MatchKind::Uncertain);
1493        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1494    }
1495
1496    #[tokio::test]
1497    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1498        // Body-based ban detection only runs when a signal already needs
1499        // the body — this site uses BodyAbsent so the body is read.
1500        let server = MockServer::start().await;
1501        Mock::given(any())
1502            .and(path("/alice"))
1503            .respond_with(
1504                ResponseTemplate::new(200)
1505                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1506            )
1507            .mount(&server)
1508            .await;
1509        let site = site_with(
1510            &server,
1511            vec![Signal::BodyAbsent {
1512                text: "Profile not found".into(),
1513            }],
1514        );
1515        let outcome = build_client().check(&site, &user()).await;
1516        assert_eq!(outcome.kind, MatchKind::Uncertain);
1517        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1518    }
1519
1520    #[tokio::test]
1521    async fn ban_detection_does_not_fire_on_legitimate_403() {
1522        let server = MockServer::start().await;
1523        Mock::given(any())
1524            .and(path("/alice"))
1525            .respond_with(ResponseTemplate::new(403))
1526            .mount(&server)
1527            .await;
1528        let site = site_with(
1529            &server,
1530            vec![
1531                Signal::StatusFound { codes: vec![200] },
1532                Signal::StatusNotFound { codes: vec![403] },
1533            ],
1534        );
1535        let outcome = build_client().check(&site, &user()).await;
1536        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1537        assert_eq!(outcome.kind, MatchKind::NotFound);
1538        assert!(outcome.reason.is_none());
1539    }
1540
1541    #[tokio::test]
1542    async fn retry_recovers_after_transient_429() {
1543        let server = MockServer::start().await;
1544        // First request: 429. Subsequent: 200.
1545        Mock::given(any())
1546            .and(path("/alice"))
1547            .respond_with(ResponseTemplate::new(429))
1548            .up_to_n_times(1)
1549            .mount(&server)
1550            .await;
1551        Mock::given(any())
1552            .and(path("/alice"))
1553            .respond_with(ResponseTemplate::new(200))
1554            .mount(&server)
1555            .await;
1556        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1557        let client = Client::builder()
1558            .timeout(Duration::from_secs(2))
1559            .min_request_interval(Duration::ZERO)
1560            .max_retries(2)
1561            .base_backoff_delay(Duration::from_millis(20))
1562            .max_backoff_delay(Duration::from_millis(100))
1563            .build()
1564            .unwrap();
1565        let outcome = client.check(&site, &user()).await;
1566        assert_eq!(outcome.kind, MatchKind::Found);
1567        assert!(outcome.reason.is_none());
1568    }
1569
1570    #[tokio::test]
1571    async fn retry_exhausts_and_returns_uncertain() {
1572        let server = MockServer::start().await;
1573        Mock::given(any())
1574            .and(path("/alice"))
1575            .respond_with(ResponseTemplate::new(429))
1576            .mount(&server)
1577            .await;
1578        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1579        let client = Client::builder()
1580            .timeout(Duration::from_secs(2))
1581            .min_request_interval(Duration::ZERO)
1582            .max_retries(2)
1583            .base_backoff_delay(Duration::from_millis(10))
1584            .max_backoff_delay(Duration::from_millis(50))
1585            .build()
1586            .unwrap();
1587        let outcome = client.check(&site, &user()).await;
1588        assert_eq!(outcome.kind, MatchKind::Uncertain);
1589        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1590    }
1591
1592    #[tokio::test]
1593    async fn retry_does_not_fire_on_network_error() {
1594        // Connection refused → Uncertain note starts with "request:", not a
1595        // ban marker. We must NOT retry — otherwise a single dead site
1596        // burns the full backoff budget before reporting.
1597        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1598        let port = listener.local_addr().unwrap().port();
1599        drop(listener);
1600        let site = Site {
1601            name: "Dead".into(),
1602            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1603            signals: vec![Signal::StatusFound { codes: vec![200] }],
1604            known_present: None,
1605            known_absent: None,
1606            extract: Vec::new(),
1607            tags: Vec::new(),
1608            request_headers: std::collections::BTreeMap::new(),
1609            regex_check: None,
1610            engine: None,
1611            strip_bad_char: None,
1612            request_method: crate::site::HttpMethod::Get,
1613            request_body: None,
1614            protection: Vec::new(),
1615            disabled: false,
1616            source: None,
1617            popularity: None,
1618            access: crate::AccessPolicy::default(),
1619        };
1620        let client = Client::builder()
1621            .timeout(Duration::from_millis(500))
1622            .connect_timeout(Duration::from_millis(500))
1623            .min_request_interval(Duration::ZERO)
1624            .max_retries(3)
1625            .base_backoff_delay(Duration::from_secs(60))
1626            .build()
1627            .unwrap();
1628        let started = Instant::now();
1629        let outcome = client.check(&site, &user()).await;
1630        // If retry fired, we'd be sleeping minutes; instead this returns
1631        // promptly with an Uncertain.
1632        assert!(started.elapsed() < Duration::from_secs(5));
1633        assert_eq!(outcome.kind, MatchKind::Uncertain);
1634        assert!(
1635            matches!(outcome.reason, Some(UncertainReason::Network(_))),
1636            "got {:?}",
1637            outcome.reason,
1638        );
1639    }
1640
1641    #[tokio::test]
1642    async fn rotates_user_agent_per_request() {
1643        // The mock only matches when the request carries one of the pooled
1644        // UAs; if rotation weren't applied, the default adler/x.y UA would
1645        // miss and the verdict would be NotFound.
1646        let server = MockServer::start().await;
1647        Mock::given(any())
1648            .and(path("/alice"))
1649            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1650            .respond_with(ResponseTemplate::new(200))
1651            .mount(&server)
1652            .await;
1653        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1654        let client = Client::builder()
1655            .min_request_interval(Duration::ZERO)
1656            .max_retries(0)
1657            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1658            .build()
1659            .unwrap();
1660        let outcome = client.check(&site, &user()).await;
1661        assert_eq!(outcome.kind, MatchKind::Found);
1662    }
1663
1664    #[test]
1665    fn invalid_proxy_url_fails_build() {
1666        let err = Client::builder().proxy("not a url").build().unwrap_err();
1667        assert!(matches!(err, Error::HttpSetup { .. }));
1668    }
1669
1670    #[test]
1671    fn schemeless_proxy_is_rejected_up_front() {
1672        // reqwest would silently treat this as a host; we require a scheme.
1673        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1674        let Error::HttpSetup { message } = err else {
1675            panic!("expected HttpSetup, got {err:?}");
1676        };
1677        assert!(message.contains("must start with"), "{message}");
1678    }
1679
1680    #[test]
1681    fn socks5_proxy_scheme_is_accepted() {
1682        // Valid scheme + endpoint builds fine (no connection is attempted).
1683        assert!(
1684            Client::builder()
1685                .proxy("socks5://127.0.0.1:9050")
1686                .build()
1687                .is_ok()
1688        );
1689    }
1690
1691    #[tokio::test]
1692    async fn global_rps_cap_spaces_requests_across_hosts() {
1693        // Two distinct host paths; per-host throttle is disabled, so any
1694        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1695        let server = MockServer::start().await;
1696        Mock::given(any())
1697            .respond_with(ResponseTemplate::new(200))
1698            .mount(&server)
1699            .await;
1700        let site_a = Site {
1701            name: "A".into(),
1702            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1703            signals: vec![Signal::StatusFound { codes: vec![200] }],
1704            known_present: None,
1705            known_absent: None,
1706            extract: Vec::new(),
1707            tags: Vec::new(),
1708            request_headers: std::collections::BTreeMap::new(),
1709            regex_check: None,
1710            engine: None,
1711            strip_bad_char: None,
1712            request_method: crate::site::HttpMethod::Get,
1713            request_body: None,
1714            protection: Vec::new(),
1715            disabled: false,
1716            source: None,
1717            popularity: None,
1718            access: crate::AccessPolicy::default(),
1719        };
1720        let site_b = Site {
1721            name: "B".into(),
1722            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1723            signals: vec![Signal::StatusFound { codes: vec![200] }],
1724            known_present: None,
1725            known_absent: None,
1726            extract: Vec::new(),
1727            tags: Vec::new(),
1728            request_headers: std::collections::BTreeMap::new(),
1729            regex_check: None,
1730            engine: None,
1731            strip_bad_char: None,
1732            request_method: crate::site::HttpMethod::Get,
1733            request_body: None,
1734            protection: Vec::new(),
1735            disabled: false,
1736            source: None,
1737            popularity: None,
1738            access: crate::AccessPolicy::default(),
1739        };
1740        // 2 RPS → ~500 ms between requests. A large interval keeps the
1741        // assertion robust even when the first probe's own duration (which
1742        // eats into the measured gap) is inflated by test instrumentation
1743        // such as coverage tooling.
1744        let client = Client::builder()
1745            .min_request_interval(Duration::ZERO)
1746            .max_retries(0)
1747            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1748            .build()
1749            .unwrap();
1750        // First request consumes the slot at t≈0; second waits ~500 ms even
1751        // though it targets a different host.
1752        client.check(&site_a, &user()).await;
1753        let started = Instant::now();
1754        client.check(&site_b, &user()).await;
1755        assert!(
1756            started.elapsed() >= Duration::from_millis(350),
1757            "global cap should space cross-host requests, got {:?}",
1758            started.elapsed(),
1759        );
1760    }
1761
1762    #[tokio::test]
1763    async fn respect_robots_skips_disallowed_paths() {
1764        let server = MockServer::start().await;
1765        Mock::given(any())
1766            .and(path("/robots.txt"))
1767            .respond_with(
1768                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1769            )
1770            .mount(&server)
1771            .await;
1772        Mock::given(any())
1773            .and(path("/no/alice"))
1774            .respond_with(ResponseTemplate::new(200))
1775            .mount(&server)
1776            .await;
1777        Mock::given(any())
1778            .and(path("/yes/alice"))
1779            .respond_with(ResponseTemplate::new(200))
1780            .mount(&server)
1781            .await;
1782        let client = Client::builder()
1783            .min_request_interval(Duration::ZERO)
1784            .max_retries(0)
1785            .respect_robots(true)
1786            .build()
1787            .unwrap();
1788
1789        let disallowed = Site {
1790            name: "No".into(),
1791            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1792            signals: vec![Signal::StatusFound { codes: vec![200] }],
1793            known_present: None,
1794            known_absent: None,
1795            extract: Vec::new(),
1796            tags: Vec::new(),
1797            request_headers: std::collections::BTreeMap::new(),
1798            regex_check: None,
1799            engine: None,
1800            strip_bad_char: None,
1801            request_method: crate::site::HttpMethod::Get,
1802            request_body: None,
1803            protection: Vec::new(),
1804            disabled: false,
1805            source: None,
1806            popularity: None,
1807            access: crate::AccessPolicy::default(),
1808        };
1809        let allowed = Site {
1810            name: "Yes".into(),
1811            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1812            signals: vec![Signal::StatusFound { codes: vec![200] }],
1813            known_present: None,
1814            known_absent: None,
1815            extract: Vec::new(),
1816            tags: Vec::new(),
1817            request_headers: std::collections::BTreeMap::new(),
1818            regex_check: None,
1819            engine: None,
1820            strip_bad_char: None,
1821            request_method: crate::site::HttpMethod::Get,
1822            request_body: None,
1823            protection: Vec::new(),
1824            disabled: false,
1825            source: None,
1826            popularity: None,
1827            access: crate::AccessPolicy::default(),
1828        };
1829
1830        let no = client.check(&disallowed, &user()).await;
1831        assert_eq!(no.kind, MatchKind::Uncertain);
1832        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1833
1834        let yes = client.check(&allowed, &user()).await;
1835        assert_eq!(yes.kind, MatchKind::Found);
1836    }
1837
1838    #[tokio::test]
1839    async fn body_read_skipped_when_no_body_signal_needed() {
1840        // Mock returns body that would fail a body_absent check — but since
1841        // we only have a status signal, body is never read.
1842        let server = MockServer::start().await;
1843        Mock::given(any())
1844            .and(path("/alice"))
1845            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1846            .mount(&server)
1847            .await;
1848        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1849        let outcome = build_client().check(&site, &user()).await;
1850        assert_eq!(outcome.kind, MatchKind::Found);
1851    }
1852
1853    // ===== Browser routing =====
1854
1855    /// Test backend that returns a canned page and counts calls. Lets the
1856    /// routing tests assert "Client did/did not invoke the browser" without
1857    /// involving a real Chrome process.
1858    #[derive(Debug)]
1859    struct RecordingBackend {
1860        page: RenderedPage,
1861        calls: std::sync::atomic::AtomicUsize,
1862    }
1863
1864    impl RecordingBackend {
1865        fn with_page(page: RenderedPage) -> Self {
1866            Self {
1867                page,
1868                calls: std::sync::atomic::AtomicUsize::new(0),
1869            }
1870        }
1871        fn call_count(&self) -> usize {
1872            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1873        }
1874    }
1875
1876    #[async_trait::async_trait]
1877    impl BrowserBackend for RecordingBackend {
1878        async fn fetch(
1879            &self,
1880            _url: &url::Url,
1881            _headers: &std::collections::BTreeMap<String, String>,
1882            _timeout: Duration,
1883        ) -> Result<RenderedPage> {
1884            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1885            Ok(self.page.clone())
1886        }
1887    }
1888
1889    fn site_bot_protected(server: &MockServer) -> Site {
1890        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1891        s.tags = vec!["bot-protected".into()];
1892        s
1893    }
1894
1895    #[tokio::test]
1896    async fn browser_routes_bot_protected_sites() {
1897        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1898        // returns its canned page directly.
1899        let server = MockServer::start().await;
1900        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1901            status: 200,
1902            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1903            body: "<html></html>".into(),
1904            elapsed_ms: 42,
1905        }));
1906        let client = Client::builder()
1907            .min_request_interval(Duration::ZERO)
1908            .max_retries(0)
1909            .browser(backend.clone())
1910            .build()
1911            .unwrap();
1912        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1913        assert_eq!(outcome.kind, MatchKind::Found);
1914        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1915    }
1916
1917    #[tokio::test]
1918    async fn non_bot_protected_sites_skip_browser() {
1919        let server = MockServer::start().await;
1920        Mock::given(any())
1921            .and(path("/alice"))
1922            .respond_with(ResponseTemplate::new(200))
1923            .mount(&server)
1924            .await;
1925        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1926            status: 500, // would make wiremock case fail if browser was taken
1927            final_url: url::Url::parse("https://x/").unwrap(),
1928            body: String::new(),
1929            elapsed_ms: 0,
1930        }));
1931        let client = Client::builder()
1932            .min_request_interval(Duration::ZERO)
1933            .max_retries(0)
1934            .browser(backend.clone())
1935            .build()
1936            .unwrap();
1937        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1938        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1939        let outcome = client.check(&site, &user()).await;
1940        assert_eq!(outcome.kind, MatchKind::Found);
1941        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1942    }
1943
1944    #[tokio::test]
1945    async fn browser_budget_exhaust_yields_uncertain() {
1946        let server = MockServer::start().await;
1947        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1948            status: 200,
1949            final_url: url::Url::parse("https://x/").unwrap(),
1950            body: String::new(),
1951            elapsed_ms: 0,
1952        }));
1953        let client = Client::builder()
1954            .min_request_interval(Duration::ZERO)
1955            .max_retries(0)
1956            .browser(backend.clone())
1957            .browser_budget(1)
1958            .build()
1959            .unwrap();
1960        let site = site_bot_protected(&server);
1961        // First call consumes the only slot.
1962        let first = client.check(&site, &user()).await;
1963        assert_eq!(first.kind, MatchKind::Found);
1964        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1965        let second = client.check(&site, &user()).await;
1966        assert_eq!(second.kind, MatchKind::Uncertain);
1967        assert!(matches!(
1968            second.reason,
1969            Some(UncertainReason::BrowserBudget)
1970        ));
1971        assert_eq!(
1972            backend.call_count(),
1973            1,
1974            "second call must not invoke backend"
1975        );
1976    }
1977
1978    #[tokio::test]
1979    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1980        struct FailingBackend;
1981        #[async_trait::async_trait]
1982        impl BrowserBackend for FailingBackend {
1983            async fn fetch(
1984                &self,
1985                _url: &url::Url,
1986                _headers: &std::collections::BTreeMap<String, String>,
1987                _timeout: Duration,
1988            ) -> Result<RenderedPage> {
1989                Err(Error::BrowserSetup {
1990                    message: "simulated crash".into(),
1991                })
1992            }
1993        }
1994        impl std::fmt::Debug for FailingBackend {
1995            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1996                f.write_str("FailingBackend")
1997            }
1998        }
1999
2000        let server = MockServer::start().await;
2001        let client = Client::builder()
2002            .min_request_interval(Duration::ZERO)
2003            .max_retries(0)
2004            .browser(Arc::new(FailingBackend))
2005            .build()
2006            .unwrap();
2007        let outcome = client.check(&site_bot_protected(&server), &user()).await;
2008        assert_eq!(outcome.kind, MatchKind::Uncertain);
2009        match outcome.reason {
2010            Some(UncertainReason::BrowserFailed(msg)) => {
2011                assert!(msg.contains("simulated crash"), "got: {msg}");
2012            }
2013            other => panic!("expected BrowserFailed, got {other:?}"),
2014        }
2015    }
2016
2017    #[tokio::test]
2018    async fn status_only_site_uses_head_request() {
2019        // Site with only status signals (no body markers, no enrichment)
2020        // should be probed with HEAD — saves the body download on
2021        // ~30% of the registry.
2022        let server = MockServer::start().await;
2023        Mock::given(method("HEAD"))
2024            .and(path("/alice"))
2025            .respond_with(ResponseTemplate::new(200))
2026            .mount(&server)
2027            .await;
2028        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2029        let outcome = build_client().check(&site, &user()).await;
2030        assert_eq!(outcome.kind, MatchKind::Found);
2031        let recvd = server.received_requests().await.unwrap_or_default();
2032        assert_eq!(recvd.len(), 1);
2033        assert_eq!(recvd[0].method.as_str(), "HEAD");
2034    }
2035
2036    #[tokio::test]
2037    async fn body_signal_site_uses_get_request() {
2038        // Same baseline plus a body-marker signal — must still GET so
2039        // the body actually arrives for matching.
2040        let server = MockServer::start().await;
2041        Mock::given(any())
2042            .and(path("/alice"))
2043            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
2044            .mount(&server)
2045            .await;
2046        let site = site_with(
2047            &server,
2048            vec![Signal::BodyPresent {
2049                text: "hello".into(),
2050            }],
2051        );
2052        let outcome = build_client().check(&site, &user()).await;
2053        assert_eq!(outcome.kind, MatchKind::Found);
2054        let recvd = server.received_requests().await.unwrap_or_default();
2055        assert_eq!(recvd[0].method.as_str(), "GET");
2056    }
2057
2058    #[tokio::test]
2059    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
2060        // A site that declares `protection: [Cloudflare]` but doesn't
2061        // carry the legacy `bot-protected` tag should still route
2062        // through the browser backend — the new structured field is
2063        // an additional signal, not a tag replacement.
2064        let server = MockServer::start().await;
2065        Mock::given(any())
2066            .respond_with(ResponseTemplate::new(200))
2067            .mount(&server)
2068            .await;
2069        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2070        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
2071        // No bot-protected tag — pure structured-field test.
2072        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2073            status: 200,
2074            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2075            body: String::new(),
2076            elapsed_ms: 0,
2077        }));
2078        let client = Client::builder()
2079            .min_request_interval(Duration::ZERO)
2080            .max_retries(0)
2081            .browser(backend)
2082            .build()
2083            .unwrap();
2084        let outcome = client.check(&site, &user()).await;
2085        // The recording backend always returns a synthetic 200, so
2086        // Found means we went through the browser path.
2087        assert_eq!(outcome.kind, MatchKind::Found);
2088        // No raw HTTP probe should have hit the mock server.
2089        let recvd = server.received_requests().await.unwrap_or_default();
2090        assert_eq!(
2091            recvd.len(),
2092            0,
2093            "structured protection must skip the raw HTTP path"
2094        );
2095    }
2096
2097    #[tokio::test]
2098    async fn post_method_sends_body_with_username_substituted() {
2099        // A POST-probed site (e.g. Anilist GraphQL) — the username
2100        // goes in the body, not the URL. Adler should substitute
2101        // `{username}` and send a POST with the rendered payload.
2102        let server = MockServer::start().await;
2103        Mock::given(method("POST"))
2104            .and(path("/api"))
2105            .respond_with(ResponseTemplate::new(200))
2106            .mount(&server)
2107            .await;
2108        // URL substitution still requires the `{username}` placeholder,
2109        // even for POST sites where the username also lives in the
2110        // body. Most real POST endpoints encode the username in both
2111        // (e.g. query string + body); we mirror that.
2112        let site = Site {
2113            name: "ApiPost".into(),
2114            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
2115            signals: vec![Signal::StatusFound { codes: vec![200] }],
2116            known_present: None,
2117            known_absent: None,
2118            extract: Vec::new(),
2119            tags: Vec::new(),
2120            request_headers: std::collections::BTreeMap::new(),
2121            regex_check: None,
2122            engine: None,
2123            strip_bad_char: None,
2124            request_method: HttpMethod::Post,
2125            request_body: Some(r#"{"name":"{username}"}"#.into()),
2126            protection: Vec::new(),
2127            disabled: false,
2128            source: None,
2129            popularity: None,
2130            access: crate::AccessPolicy::default(),
2131        };
2132        let outcome = build_client().check(&site, &user()).await;
2133        assert_eq!(outcome.kind, MatchKind::Found);
2134        let recvd = server.received_requests().await.unwrap_or_default();
2135        assert_eq!(recvd.len(), 1);
2136        assert_eq!(recvd[0].method.as_str(), "POST");
2137        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
2138        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
2139    }
2140
2141    #[tokio::test]
2142    async fn head_405_falls_back_to_get() {
2143        // A server that rejects HEAD with 405 — Adler should silently
2144        // retry with GET so the optimisation can never cost accuracy.
2145        let server = MockServer::start().await;
2146        Mock::given(method("HEAD"))
2147            .and(path("/alice"))
2148            .respond_with(ResponseTemplate::new(405))
2149            .mount(&server)
2150            .await;
2151        Mock::given(any())
2152            .and(path("/alice"))
2153            .respond_with(ResponseTemplate::new(200))
2154            .mount(&server)
2155            .await;
2156        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2157        let outcome = build_client().check(&site, &user()).await;
2158        assert_eq!(outcome.kind, MatchKind::Found);
2159        let recvd = server.received_requests().await.unwrap_or_default();
2160        assert_eq!(recvd.len(), 2);
2161        assert_eq!(recvd[0].method.as_str(), "HEAD");
2162        assert_eq!(recvd[1].method.as_str(), "GET");
2163    }
2164
2165    // ------------------------------------------------------------------
2166    // Phase 4 — automatic escalation when the cheap transport hits a
2167    // Cloudflare / rate-limit Uncertain that the browser could resolve.
2168    // ------------------------------------------------------------------
2169
2170    /// Mocked HTTP that always responds with a Cloudflare 503 (server
2171    /// header + 503 status — what the pre-body ban detector turns into
2172    /// `Uncertain(CloudflareChallenge)`).
2173    async fn cloudflare_503_server() -> MockServer {
2174        let server = MockServer::start().await;
2175        Mock::given(any())
2176            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
2177            .mount(&server)
2178            .await;
2179        server
2180    }
2181
2182    #[tokio::test]
2183    async fn http_success_stamps_http_transport_no_escalations() {
2184        let server = MockServer::start().await;
2185        Mock::given(any())
2186            .respond_with(ResponseTemplate::new(200))
2187            .mount(&server)
2188            .await;
2189        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2190        let outcome = build_client().check(&site, &user()).await;
2191        assert_eq!(outcome.kind, MatchKind::Found);
2192        assert_eq!(
2193            outcome.transport,
2194            Some(crate::escalation::TransportTier::Http),
2195            "successful HTTP probe must stamp Http transport"
2196        );
2197        assert_eq!(outcome.escalations, 0, "no escalation on the happy path");
2198    }
2199
2200    #[tokio::test]
2201    async fn escalates_cloudflare_uncertain_to_browser_and_stamps_one() {
2202        let server = cloudflare_503_server().await;
2203        // Browser returns a 200 that the StatusFound signal turns into Found.
2204        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2205            status: 200,
2206            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2207            body: String::new(),
2208            elapsed_ms: 5,
2209        }));
2210        let client = Client::builder()
2211            .min_request_interval(Duration::ZERO)
2212            .max_retries(0)
2213            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2214            .build()
2215            .unwrap();
2216        // Non-bot-protected site — HTTP path runs first, hits Cloudflare,
2217        // escalation routes to the browser, browser's 200 → Found.
2218        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2219        let outcome = client.check(&site, &user()).await;
2220        assert_eq!(
2221            outcome.kind,
2222            MatchKind::Found,
2223            "escalation should flip CF challenge to Found via browser (reason {:?})",
2224            outcome.reason
2225        );
2226        assert_eq!(
2227            outcome.transport,
2228            Some(crate::escalation::TransportTier::Browser),
2229            "escalated outcome must be stamped Browser"
2230        );
2231        assert_eq!(
2232            outcome.escalations, 1,
2233            "exactly one escalation should have fired"
2234        );
2235        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
2236    }
2237
2238    #[tokio::test]
2239    async fn disable_escalation_leaves_cloudflare_uncertain_untouched() {
2240        let server = cloudflare_503_server().await;
2241        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2242            status: 200,
2243            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2244            body: String::new(),
2245            elapsed_ms: 0,
2246        }));
2247        let client = Client::builder()
2248            .min_request_interval(Duration::ZERO)
2249            .max_retries(0)
2250            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2251            .disable_escalation()
2252            .build()
2253            .unwrap();
2254        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2255        let outcome = client.check(&site, &user()).await;
2256        assert_eq!(outcome.kind, MatchKind::Uncertain);
2257        assert!(matches!(
2258            outcome.reason,
2259            Some(UncertainReason::CloudflareChallenge)
2260        ));
2261        assert_eq!(
2262            outcome.transport,
2263            Some(crate::escalation::TransportTier::Http),
2264            "primary transport must still be stamped"
2265        );
2266        assert_eq!(outcome.escalations, 0);
2267        assert_eq!(
2268            backend.call_count(),
2269            0,
2270            "browser must not be touched when --no-escalation"
2271        );
2272    }
2273
2274    #[tokio::test]
2275    async fn escalation_budget_zero_keeps_browser_untouched() {
2276        let server = cloudflare_503_server().await;
2277        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2278            status: 200,
2279            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2280            body: String::new(),
2281            elapsed_ms: 0,
2282        }));
2283        let client = Client::builder()
2284            .min_request_interval(Duration::ZERO)
2285            .max_retries(0)
2286            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2287            .escalation_budget(0)
2288            .build()
2289            .unwrap();
2290        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2291        let outcome = client.check(&site, &user()).await;
2292        assert_eq!(outcome.kind, MatchKind::Uncertain);
2293        assert!(matches!(
2294            outcome.reason,
2295            Some(UncertainReason::CloudflareChallenge)
2296        ));
2297        assert_eq!(outcome.escalations, 0);
2298        assert_eq!(
2299            backend.call_count(),
2300            0,
2301            "zero budget must deny every escalation"
2302        );
2303    }
2304
2305    #[tokio::test]
2306    async fn escalation_consumes_budget_then_stops() {
2307        let server = cloudflare_503_server().await;
2308        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2309            status: 200,
2310            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2311            body: String::new(),
2312            elapsed_ms: 0,
2313        }));
2314        let client = Client::builder()
2315            .min_request_interval(Duration::ZERO)
2316            .max_retries(0)
2317            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2318            .escalation_budget(1)
2319            .build()
2320            .unwrap();
2321        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2322        // First call burns the only escalation slot.
2323        let first = client.check(&site, &user()).await;
2324        assert_eq!(first.kind, MatchKind::Found);
2325        assert_eq!(first.escalations, 1);
2326        // Second call's escalation is denied → cheap-path Uncertain survives.
2327        let second = client.check(&site, &user()).await;
2328        assert_eq!(second.kind, MatchKind::Uncertain);
2329        assert!(matches!(
2330            second.reason,
2331            Some(UncertainReason::CloudflareChallenge)
2332        ));
2333        assert_eq!(second.escalations, 0);
2334        assert_eq!(backend.call_count(), 1, "browser called exactly once total");
2335    }
2336}