Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use reqwest::redirect;
16
17use crate::ban;
18use crate::browser::{BrowserBackend, BrowserBudget, RenderedPage};
19use crate::check::{CheckOutcome, MatchKind, UncertainReason};
20use crate::error::{Error, Result};
21use crate::retry::{self, RetryPolicy};
22use crate::robots::RobotsCache;
23use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
24use crate::throttle::HostThrottle;
25use crate::username::Username;
26
/// Per-request timeout installed by [`ClientBuilder::default`].
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// TCP-connect timeout installed by [`ClientBuilder::default`].
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect cap installed by [`ClientBuilder::default`]; only applies while
/// redirect following is enabled.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Minimum spacing between two requests to the same host installed by
/// [`ClientBuilder::default`].
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single fixed key for the global rate limiter (it gates all hosts).
const GLOBAL_THROTTLE_KEY: &str = "*global*";
33
/// HTTP client used to probe sites.
///
/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
/// internally, and the throttle is `Arc`-backed, so cloning is the
/// recommended way to share a client between tasks. Cloned clients share
/// throttle state, which is what you want: a fan-out scan must not
/// accidentally exceed a per-host budget by spawning more clients.
#[derive(Clone)]
pub struct Client {
    /// Underlying `reqwest` client; every raw-HTTP request goes through it.
    inner: reqwest::Client,
    /// Per-host request spacing, keyed by the URL's host.
    throttle: HostThrottle,
    /// Global RPS cap applied across all hosts. `None` → uncapped.
    global_throttle: Option<HostThrottle>,
    /// Retry policy consulted by [`Client::check`] after each probe attempt.
    retry: RetryPolicy,
    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
    /// `Arc<[String]>` so cloning a client per task stays cheap.
    user_agents: Arc<[String]>,
    /// Extract profile fields from `Found` pages that declare extractors.
    enrich: bool,
    /// When set, skip probes disallowed by the host's `robots.txt`.
    robots: Option<RobotsCache>,
    /// Browser backend used for `bot-protected` sites. `None` → those sites
    /// stay on the raw HTTP path and typically end up `Uncertain`.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
    /// for a single scan, so several tasks compete for the same budget.
    browser_budget: Arc<BrowserBudget>,
}
62
63impl Client {
64    /// Start configuring a new client.
65    pub fn builder() -> ClientBuilder {
66        ClientBuilder::default()
67    }
68
69    /// Probe a single site for `username`, retrying on transient bans.
70    ///
71    /// Network failures, timeouts, and unexpected response shapes all yield
72    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
73    /// returns an error: at the executor level we want a partial result for
74    /// every site, not abort-on-first-failure semantics.
75    ///
76    /// When ban detection classifies a response as `rate_limited` /
77    /// `cloudflare_challenge`, the call is retried with jittered exponential
78    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
79    /// Uncertain (network errors, body read failures) is **not** retried —
80    /// those failures rarely fix themselves in the seconds-to-minutes window
81    /// we'd block for.
82    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
83    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
84        let mut attempt: u32 = 0;
85        loop {
86            let outcome = self.probe_once(site, username).await;
87            if !retry::should_retry(&outcome, attempt, &self.retry) {
88                return outcome;
89            }
90            let delay = retry::backoff_delay(attempt, &self.retry);
91            tracing::info!(
92                site = %site.name,
93                attempt = attempt + 1,
94                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
95                ?delay,
96                "transient ban, retrying",
97            );
98            tokio::time::sleep(delay).await;
99            attempt += 1;
100        }
101    }
102
103    /// Fetch a URL and return raw response data (status, final URL, body)
104    /// with the same throttle / User-Agent / proxy machinery as `check`,
105    /// but without signal evaluation or retry.
106    ///
107    /// Returns `None` on any network/transport error. Intended for
108    /// diagnostics such as `adler --doctor --fix`, which diffs the
109    /// responses for a known-present and a nonsense user to derive a
110    /// signature.
111    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
112        let host = host_of(url);
113        if let Some(global) = &self.global_throttle {
114            global.wait(GLOBAL_THROTTLE_KEY).await;
115        }
116        self.throttle.wait(&host).await;
117        let mut request = self.inner.get(url);
118        if let Some(ua) = self.pick_user_agent() {
119            request = request.header(reqwest::header::USER_AGENT, ua);
120        }
121        let response = request.send().await.ok()?;
122        let status = response.status().as_u16();
123        let final_url = response.url().to_string();
124        let body = response.text().await.unwrap_or_default();
125        Some(RawResponse {
126            status,
127            final_url,
128            body,
129        })
130    }
131
132    /// Same as [`Self::fetch`] but routes through the configured browser
133    /// backend when the site is tagged `bot-protected` and a backend is
134    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
135    /// so that the diff-derivation works against the JS-rendered page
136    /// (login wall vs. real profile) rather than two identical raw-HTTP
137    /// shells.
138    ///
139    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
140    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
141    /// callers get the same `Option<RawResponse>` shape either way.
142    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
143        if let Some(backend) = self.browser.as_deref() {
144            let has_tag = site
145                .tags
146                .iter()
147                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
148            if has_tag || !site.protection.is_empty() {
149                let parsed = url::Url::parse(url).ok()?;
150                match backend
151                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
152                    .await
153                {
154                    Ok(page) => {
155                        return Some(RawResponse {
156                            status: page.status,
157                            final_url: page.final_url.to_string(),
158                            body: page.body,
159                        });
160                    }
161                    Err(err) => {
162                        tracing::warn!(
163                            site = %site.name, %url, error = %err,
164                            "browser fetch failed in doctor; falling back to raw HTTP",
165                        );
166                    }
167                }
168            }
169        }
170        self.fetch(url).await
171    }
172
173    /// Pick a User-Agent for the next request from the rotation pool, or
174    /// `None` to fall back on the client's fixed header.
175    fn pick_user_agent(&self) -> Option<&str> {
176        match self.user_agents.len() {
177            0 => None,
178            1 => Some(&self.user_agents[0]),
179            n => Some(&self.user_agents[fastrand::usize(0..n)]),
180        }
181    }
182
183    // Splitting probe_once into helpers would scatter the request/response
184    // flow that has to read top-to-bottom; one long function reads better.
185    #[allow(clippy::too_many_lines)]
186    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
187        let url = site.url_for(username);
188
189        // Site-level username constraint (Sherlock's `regexCheck`).
190        // Mismatch → skip the probe entirely. Saves a request and
191        // sidesteps the false-positive class where a site 404s on
192        // illegal usernames in a way our signal can't distinguish
193        // from a missing account. If the pattern fails to compile
194        // (Sherlock occasionally uses lookarounds, which our `regex`
195        // crate can't express), we let validate's warn-log stand
196        // and silently fall through — the rest of the probe still
197        // works.
198        if let Some(pat) = &site.regex_check {
199            if let Ok(re) = regex::Regex::new(pat) {
200                if !re.is_match(username.as_str()) {
201                    return uncertain(
202                        &site.name,
203                        url,
204                        Instant::now(),
205                        UncertainReason::UsernameNotAllowed,
206                    );
207                }
208            }
209        }
210
211        // Auto-route bot-protected sites through the browser backend when
212        // one is configured. Raw HTTP can't see past their JS/login wall,
213        // so this is the only way they ever produce a Found verdict.
214        // A site is "bot-protected" in the routing sense if it carries
215        // the legacy tag OR declares any specific protection mechanism
216        // via the new `protection` field — either signal is enough.
217        if let Some(backend) = self.browser.as_deref() {
218            let has_tag = site
219                .tags
220                .iter()
221                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
222            if has_tag || !site.protection.is_empty() {
223                if self.browser_budget.try_consume() {
224                    return self.probe_with_browser(site, &url, backend).await;
225                }
226                tracing::warn!(site = %site.name, "browser budget exhausted");
227                return uncertain(
228                    &site.name,
229                    url,
230                    Instant::now(),
231                    UncertainReason::BrowserBudget,
232                );
233            }
234        }
235
236        let host = host_of(&url);
237
238        // robots.txt gate, before consuming a throttle slot or probing.
239        if let Some(robots) = &self.robots {
240            if let Some((origin, path)) = origin_and_path(&url) {
241                if !robots.allowed(&origin, &path).await {
242                    tracing::debug!(%url, "skipped by robots.txt");
243                    return uncertain(
244                        &site.name,
245                        url,
246                        Instant::now(),
247                        UncertainReason::RobotsDisallowed,
248                    );
249                }
250            }
251        }
252
253        // Global cap first (gates every request), then per-host spacing.
254        if let Some(global) = &self.global_throttle {
255            global.wait(GLOBAL_THROTTLE_KEY).await;
256        }
257        self.throttle.wait(&host).await;
258        let started = Instant::now();
259        tracing::debug!(%url, %host, "probing");
260
261        // Read the body if a signal needs it, or if enrichment is on and the
262        // site has extractor rules (extraction needs the body).
263        let want_enrich = self.enrich && !site.extract.is_empty();
264        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
265
266        // POST sites carry their own body payload (the username goes in
267        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
268        // HEAD optimisation only applies to GET-probed sites: a HEAD
269        // for a POST endpoint would defeat its purpose. Body
270        // substitution mirrors URL substitution: `{username}` in
271        // `Site::request_body` is replaced before sending.
272        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
273            const USERNAME_PH: &str = "{username}";
274            site.request_body
275                .as_deref()
276                .map(|t| t.replace(USERNAME_PH, username.as_str()))
277        } else {
278            None
279        };
280
281        // For status-only sites (only StatusFound / StatusNotFound /
282        // RedirectAbsent signals, no enrichment), HEAD avoids the body
283        // download entirely — saving bandwidth and time on the
284        // ~30% of the registry that doesn't need a body marker.
285        // Some servers reject HEAD with 405; we transparently retry
286        // with GET so the optimisation never costs accuracy. POST
287        // probes always go out as POST regardless of body needs.
288        let response = match site.request_method {
289            HttpMethod::Post => {
290                send_request_with_body(
291                    &self.inner,
292                    reqwest::Method::POST,
293                    &url,
294                    self.pick_user_agent(),
295                    body_for_post.as_deref(),
296                )
297                .await
298            }
299            HttpMethod::Get if needs_body => {
300                send_request(
301                    &self.inner,
302                    reqwest::Method::GET,
303                    &url,
304                    self.pick_user_agent(),
305                )
306                .await
307            }
308            HttpMethod::Get => {
309                match send_request(
310                    &self.inner,
311                    reqwest::Method::HEAD,
312                    &url,
313                    self.pick_user_agent(),
314                )
315                .await
316                {
317                    Ok(r) if r.status().as_u16() == 405 => {
318                        send_request(
319                            &self.inner,
320                            reqwest::Method::GET,
321                            &url,
322                            self.pick_user_agent(),
323                        )
324                        .await
325                    }
326                    other => other,
327                }
328            }
329        };
330        let response = match response {
331            Ok(r) => r,
332            Err(err) => {
333                tracing::debug!(error = %err, "request failed");
334                return uncertain(
335                    &site.name,
336                    url,
337                    started,
338                    UncertainReason::Network(err.to_string()),
339                );
340            }
341        };
342
343        let status = response.status().as_u16();
344        let final_url = response.url().to_string();
345
346        if let Some(reason) = ban::detect_pre_body(status, response.headers()) {
347            tracing::warn!(%host, status, %reason, "ban-like response");
348            return uncertain(&site.name, url, started, reason);
349        }
350        let body = if needs_body {
351            match response.text().await {
352                Ok(b) => b,
353                Err(err) => {
354                    return uncertain(
355                        &site.name,
356                        url,
357                        started,
358                        UncertainReason::BodyRead(err.to_string()),
359                    );
360                }
361            }
362        } else {
363            String::new()
364        };
365
366        if !body.is_empty() {
367            if let Some(reason) = ban::detect_in_body(&body) {
368                tracing::warn!(%host, %reason, "ban-like body");
369                return uncertain(&site.name, url, started, reason);
370            }
371        }
372
373        let probe = Probe {
374            status,
375            final_url: &final_url,
376            body: &body,
377        };
378        let votes: Vec<(&Signal, SignalVerdict)> = site
379            .signals
380            .iter()
381            .map(|s| (s, s.evaluate(&probe)))
382            .collect();
383        let kind = aggregate(votes.iter().map(|(_, v)| *v));
384        let mut result = outcome(&site.name, url, started, kind);
385        // Record which signals produced the verdict (the winning polarity).
386        let winning = match kind {
387            MatchKind::Found => Some(SignalVerdict::Found),
388            MatchKind::NotFound => Some(SignalVerdict::NotFound),
389            MatchKind::Uncertain => None,
390        };
391        if let Some(want) = winning {
392            result.evidence = votes
393                .iter()
394                .filter(|(_, v)| *v == want)
395                .map(|(s, _)| s.describe_match(&probe))
396                .collect();
397        }
398        if want_enrich && kind == MatchKind::Found {
399            result.enrichment = crate::enrich::extract(&body, &site.extract);
400        }
401        result
402    }
403
404    /// Render `url` through the configured [`BrowserBackend`] and run the
405    /// same signal pipeline on the result. Per-fetch failures (timeout,
406    /// navigation error, etc.) surface as `Uncertain(BrowserFailed)` so
407    /// one flaky bot-protected site can't abort the scan.
408    async fn probe_with_browser(
409        &self,
410        site: &Site,
411        url: &str,
412        backend: &dyn BrowserBackend,
413    ) -> CheckOutcome {
414        let started = Instant::now();
415        let parsed = match url::Url::parse(url) {
416            Ok(u) => u,
417            Err(err) => {
418                return uncertain(
419                    &site.name,
420                    url.to_owned(),
421                    started,
422                    UncertainReason::Other(format!("invalid url: {err}")),
423                );
424            }
425        };
426
427        let page: RenderedPage = match backend
428            .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
429            .await
430        {
431            Ok(p) => p,
432            Err(err) => {
433                tracing::warn!(site = %site.name, %url, error = %err, "browser fetch failed");
434                return uncertain(
435                    &site.name,
436                    url.to_owned(),
437                    started,
438                    UncertainReason::BrowserFailed(err.to_string()),
439                );
440            }
441        };
442
443        let final_url_str = page.final_url.as_str().to_owned();
444        let probe = Probe {
445            status: page.status,
446            final_url: &final_url_str,
447            body: &page.body,
448        };
449        let votes: Vec<(&Signal, SignalVerdict)> = site
450            .signals
451            .iter()
452            .map(|s| (s, s.evaluate(&probe)))
453            .collect();
454        let kind = aggregate(votes.iter().map(|(_, v)| *v));
455        let mut result = outcome(&site.name, url.to_owned(), started, kind);
456        let winning = match kind {
457            MatchKind::Found => Some(SignalVerdict::Found),
458            MatchKind::NotFound => Some(SignalVerdict::NotFound),
459            MatchKind::Uncertain => None,
460        };
461        if let Some(want) = winning {
462            result.evidence = votes
463                .iter()
464                .filter(|(_, v)| *v == want)
465                .map(|(s, _)| s.describe_match(&probe))
466                .collect();
467        }
468        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
469            result.enrichment = crate::enrich::extract(&page.body, &site.extract);
470        }
471        result
472    }
473}
474
/// Raw response data returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`] for diagnostics.
///
/// The same shape is produced whether the page came from raw HTTP or from
/// the browser backend.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code.
    pub status: u16,
    /// Final URL after redirects.
    pub final_url: String,
    /// Decoded response body.
    pub body: String,
}
485
/// Builder for [`Client`].
///
/// Every field has a setter of the same name (or a closely related one) on
/// the builder, except `redirect_limit`, which has no setter in this file
/// and keeps its default.
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
pub struct ClientBuilder {
    /// Per-request timeout handed to `reqwest`.
    timeout: Duration,
    /// TCP-connect timeout handed to `reqwest`.
    connect_timeout: Duration,
    /// Fixed `User-Agent` header configured on the `reqwest` client.
    user_agent: String,
    /// Whether redirects are followed at all.
    follow_redirects: bool,
    /// Redirect cap used when `follow_redirects` is `true`.
    redirect_limit: usize,
    /// Minimum spacing between requests to the same host.
    min_request_interval: Duration,
    /// Optional global requests-per-second cap. `None` → uncapped.
    max_rps: Option<NonZeroU32>,
    /// Retry policy copied into the built client.
    retry: RetryPolicy,
    /// Optional proxy URL, validated in [`ClientBuilder::build`].
    proxy: Option<String>,
    /// User-Agent rotation pool. Empty → use the fixed `user_agent`.
    user_agents: Vec<String>,
    /// Whether profile-field extraction is enabled.
    enrich: bool,
    /// Whether a `robots.txt` cache is created for the client.
    respect_robots: bool,
    /// Optional browser backend for bot-protected sites.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Cap on browser-backed probes handed to the client's `BrowserBudget`.
    browser_budget: usize,
}
505
506impl Default for ClientBuilder {
507    fn default() -> Self {
508        Self {
509            timeout: DEFAULT_TIMEOUT,
510            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
511            user_agent: default_user_agent(),
512            follow_redirects: true,
513            redirect_limit: DEFAULT_REDIRECT_LIMIT,
514            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
515            max_rps: None,
516            retry: RetryPolicy::default(),
517            proxy: None,
518            user_agents: Vec::new(),
519            enrich: false,
520            respect_robots: false,
521            browser: None,
522            browser_budget: DEFAULT_BROWSER_BUDGET,
523        }
524    }
525}
526
527impl ClientBuilder {
528    /// Per-request timeout (covers connect, headers, and body read).
529    pub fn timeout(mut self, timeout: Duration) -> Self {
530        self.timeout = timeout;
531        self
532    }
533
534    /// TCP-connect timeout, applied independently of the request timeout.
535    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
536        self.connect_timeout = timeout;
537        self
538    }
539
540    /// Override the `User-Agent` header sent on every request.
541    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
542        self.user_agent = user_agent.into();
543        self
544    }
545
546    /// Toggle automatic redirect following. Defaults to `true`; disable when
547    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
548    pub fn follow_redirects(mut self, follow: bool) -> Self {
549        self.follow_redirects = follow;
550        self
551    }
552
553    /// Minimum time between consecutive requests to the same host.
554    ///
555    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
556    /// rate-limit responses on common OSINT targets while keeping fan-out
557    /// across many sites fast.
558    pub fn min_request_interval(mut self, interval: Duration) -> Self {
559        self.min_request_interval = interval;
560        self
561    }
562
563    /// Cap the total request rate across *all* hosts to `rps` requests per
564    /// second. Independent of (and composed with) the per-host interval —
565    /// useful on a metered connection or behind a shared-quota proxy.
566    /// Uncapped by default.
567    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
568        self.max_rps = Some(rps);
569        self
570    }
571
572    /// Maximum retry attempts after a transient ban response. Defaults to 2
573    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
574    pub fn max_retries(mut self, n: u32) -> Self {
575        self.retry.max_retries = n;
576        self
577    }
578
579    /// Base delay for the first retry. Subsequent retries double until
580    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
581    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
582        self.retry.base_delay = d;
583        self
584    }
585
586    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
587    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
588        self.retry.max_delay = d;
589        self
590    }
591
592    /// Route all requests through a proxy. Accepts `http://`, `https://`,
593    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
594    pub fn proxy(mut self, url: impl Into<String>) -> Self {
595        self.proxy = Some(url.into());
596        self
597    }
598
599    /// Rotate the `User-Agent` header per request, picking uniformly at
600    /// random from `agents`. An empty list (the default) keeps the single
601    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
602    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
603        self.user_agents = agents;
604        self
605    }
606
607    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
608    /// pages. Off by default; enables an extra body read for matching sites.
609    pub fn enrich(mut self, enrich: bool) -> Self {
610        self.enrich = enrich;
611        self
612    }
613
614    /// Honor each host's `robots.txt`: probes to disallowed paths are
615    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
616    /// default. Adds one cached `robots.txt` fetch per origin.
617    pub fn respect_robots(mut self, respect: bool) -> Self {
618        self.respect_robots = respect;
619        self
620    }
621
622    /// Attach a browser backend. Sites tagged `bot-protected` will be
623    /// routed through it instead of the raw HTTP path, up to the
624    /// [`browser_budget`](Self::browser_budget) cap.
625    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
626        self.browser = Some(backend);
627        self
628    }
629
630    /// Per-scan cap on how many `bot-protected` sites are allowed to use
631    /// the browser backend. Once exhausted, the rest fall back to
632    /// `Uncertain(BrowserBudget)`. Defaults to
633    /// [`DEFAULT_BROWSER_BUDGET`].
634    pub const fn browser_budget(mut self, cap: usize) -> Self {
635        self.browser_budget = cap;
636        self
637    }
638
639    /// Build a [`Client`].
640    pub fn build(self) -> Result<Client> {
641        let redirect_policy = if self.follow_redirects {
642            redirect::Policy::limited(self.redirect_limit)
643        } else {
644            redirect::Policy::none()
645        };
646        let mut builder = reqwest::Client::builder()
647            .user_agent(self.user_agent)
648            .timeout(self.timeout)
649            .connect_timeout(self.connect_timeout)
650            .redirect(redirect_policy);
651        if let Some(proxy_url) = &self.proxy {
652            // reqwest treats a schemeless string (e.g. "not-a-url") as a host
653            // and silently defaults it to http://, so every probe would fail
654            // confusingly. Require an explicit, supported scheme up front.
655            const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
656            if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
657                return Err(Error::HttpSetup {
658                    message: format!(
659                        "invalid proxy {proxy_url:?}: must start with one of {}",
660                        SCHEMES.join(", ")
661                    ),
662                });
663            }
664            let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
665                message: format!("invalid proxy {proxy_url:?}: {e}"),
666            })?;
667            builder = builder.proxy(proxy);
668        }
669        let inner = builder.build().map_err(|e| Error::HttpSetup {
670            message: e.to_string(),
671        })?;
672        let global_throttle = self.max_rps.map(|rps| {
673            // Min spacing between any two requests = 1s / rps.
674            let interval = Duration::from_secs(1) / rps.get();
675            HostThrottle::new(interval)
676        });
677        let robots = self
678            .respect_robots
679            .then(|| RobotsCache::new(inner.clone(), "adler"));
680        Ok(Client {
681            inner,
682            throttle: HostThrottle::new(self.min_request_interval),
683            global_throttle,
684            retry: self.retry,
685            user_agents: Arc::from(self.user_agents),
686            enrich: self.enrich,
687            robots,
688            browser: self.browser,
689            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
690        })
691    }
692}
693
/// Default ceiling on browser-backed probes per scan when no other value
/// is specified via [`ClientBuilder::browser_budget`].
///
/// Sized as ~5× the typical `bot-protected` registry subset — comfortable
/// headroom while still being a guardrail against a misconfigured flag
/// burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
701
702impl fmt::Debug for Client {
703    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
704        f.debug_struct("Client")
705            .field("throttle", &self.throttle)
706            .field("global_throttle", &self.global_throttle)
707            .field("retry", &self.retry)
708            .field("user_agents", &self.user_agents)
709            .field("enrich", &self.enrich)
710            .field("robots", &self.robots.is_some())
711            .field("browser", &self.browser.is_some())
712            .field("browser_budget", &self.browser_budget)
713            .finish_non_exhaustive()
714    }
715}
716
717impl fmt::Debug for ClientBuilder {
718    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719        f.debug_struct("ClientBuilder")
720            .field("timeout", &self.timeout)
721            .field("connect_timeout", &self.connect_timeout)
722            .field("user_agent", &self.user_agent)
723            .field("follow_redirects", &self.follow_redirects)
724            .field("redirect_limit", &self.redirect_limit)
725            .field("min_request_interval", &self.min_request_interval)
726            .field("max_rps", &self.max_rps)
727            .field("retry", &self.retry)
728            .field("proxy", &self.proxy)
729            .field("user_agents", &self.user_agents)
730            .field("enrich", &self.enrich)
731            .field("respect_robots", &self.respect_robots)
732            .field("browser", &self.browser.is_some())
733            .field("browser_budget", &self.browser_budget)
734            .finish()
735    }
736}
737
/// Per-fetch timeout passed to [`BrowserBackend::fetch`]. Browser fetches
/// (JS execution + waits) are inherently slower than raw HTTP, so this is
/// generous on purpose.
const BROWSER_TIMEOUT: Duration = Duration::from_secs(60);

/// Site tag that marks a site for browser routing. Compared against
/// `Site::tags` ASCII-case-insensitively.
const BOT_PROTECTED_TAG: &str = "bot-protected";
744
745fn default_user_agent() -> String {
746    format!("adler/{}", env!("CARGO_PKG_VERSION"))
747}
748
749/// Issue a single HTTP request with the configured client, an optional
750/// User-Agent override, and the given method. Centralised so the probe
751/// path can transparently swap HEAD for GET (and retry on 405) without
752/// duplicating the request-build logic.
753async fn send_request(
754    client: &reqwest::Client,
755    method: reqwest::Method,
756    url: &str,
757    ua: Option<&str>,
758) -> reqwest::Result<reqwest::Response> {
759    send_request_with_body(client, method, url, ua, None).await
760}
761
762/// Same as [`send_request`] but with an optional request body — used
763/// for POST probes against API endpoints (GraphQL, login form, …).
764/// When `body` is `Some`, the request is sent with a `application/json`
765/// content type by default; sites that need a different content type
766/// declare it through [`Site::request_headers`].
767async fn send_request_with_body(
768    client: &reqwest::Client,
769    method: reqwest::Method,
770    url: &str,
771    ua: Option<&str>,
772    body: Option<&str>,
773) -> reqwest::Result<reqwest::Response> {
774    let mut request = client.request(method, url);
775    if let Some(ua) = ua {
776        request = request.header(reqwest::header::USER_AGENT, ua);
777    }
778    if let Some(b) = body {
779        request = request
780            .header(reqwest::header::CONTENT_TYPE, "application/json")
781            .body(b.to_owned());
782    }
783    request.send().await
784}
785
786fn host_of(url: &str) -> String {
787    reqwest::Url::parse(url)
788        .ok()
789        .and_then(|u| u.host_str().map(str::to_owned))
790        .unwrap_or_else(|| "unknown".into())
791}
792
793/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
794/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
795fn origin_and_path(url: &str) -> Option<(String, String)> {
796    let parsed = reqwest::Url::parse(url).ok()?;
797    let host = parsed.host_str()?;
798    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
799    let origin = format!("{}://{host}{port}", parsed.scheme());
800    let path = parsed.query().map_or_else(
801        || parsed.path().to_owned(),
802        |q| format!("{}?{q}", parsed.path()),
803    );
804    Some((origin, path))
805}
806
807fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
808    CheckOutcome {
809        site: site.to_owned(),
810        url,
811        kind,
812        reason: None,
813        elapsed_ms: elapsed_ms(started),
814        enrichment: std::collections::BTreeMap::new(),
815        evidence: Vec::new(),
816    }
817}
818
819fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
820    CheckOutcome {
821        site: site.to_owned(),
822        url,
823        kind: MatchKind::Uncertain,
824        reason: Some(reason),
825        elapsed_ms: elapsed_ms(started),
826        enrichment: std::collections::BTreeMap::new(),
827        evidence: Vec::new(),
828    }
829}
830
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// if the count does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    u64::try_from(millis).unwrap_or(u64::MAX)
}
834
835#[cfg(test)]
836mod tests {
837    use super::*;
838    use crate::site::{Signal, UrlTemplate};
839    use wiremock::matchers::{any, method, path};
840    use wiremock::{Mock, MockServer, ResponseTemplate};
841
842    fn build_client() -> Client {
843        Client::builder()
844            .timeout(Duration::from_secs(2))
845            // Tests share `127.0.0.1` as host — keep throttle out of the
846            // way for everything but the dedicated throttle test below.
847            .min_request_interval(Duration::ZERO)
848            // Default retry would re-hit ban-test mocks; tests opt in
849            // explicitly when they want to exercise the retry path.
850            .max_retries(0)
851            .build()
852            .expect("client builds")
853    }
854
855    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
856        Site {
857            name: "Mock".into(),
858            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
859            signals,
860            known_present: None,
861            known_absent: None,
862            extract: Vec::new(),
863            tags: Vec::new(),
864            request_headers: std::collections::BTreeMap::new(),
865            regex_check: None,
866            engine: None,
867            strip_bad_char: None,
868            request_method: crate::site::HttpMethod::Get,
869            request_body: None,
870            protection: Vec::new(),
871            disabled: false,
872            source: None,
873            popularity: None,
874        }
875    }
876
877    fn user() -> Username {
878        Username::new("alice").unwrap()
879    }
880
881    #[tokio::test]
882    async fn regex_check_short_circuits_before_any_request() {
883        // Stand up a mock that would 200 on *anything* — if probe_once
884        // failed to short-circuit on regex mismatch, the username
885        // "alice" (5 chars) would resolve to Found here.
886        let server = MockServer::start().await;
887        Mock::given(any())
888            .respond_with(ResponseTemplate::new(200))
889            .mount(&server)
890            .await;
891        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
892        // The site only accepts usernames of 8+ chars; "alice" is 5.
893        site.regex_check = Some("^[A-Za-z]{8,}$".into());
894        let outcome = build_client().check(&site, &user()).await;
895        assert_eq!(outcome.kind, MatchKind::Uncertain);
896        assert!(
897            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
898            "expected UsernameNotAllowed, got {:?}",
899            outcome.reason,
900        );
901        // No request should have hit the mock — assert by counting
902        // received_requests on the wiremock server.
903        let recvd = server.received_requests().await.unwrap_or_default();
904        assert_eq!(
905            recvd.len(),
906            0,
907            "regex_check mismatch must skip the HTTP request entirely"
908        );
909    }
910
911    #[tokio::test]
912    async fn regex_check_pass_proceeds_to_probe() {
913        let server = MockServer::start().await;
914        Mock::given(any())
915            .and(path("/alice"))
916            .respond_with(ResponseTemplate::new(200))
917            .mount(&server)
918            .await;
919        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
920        // Pattern that matches "alice".
921        site.regex_check = Some("^[a-z]{3,}$".into());
922        let outcome = build_client().check(&site, &user()).await;
923        assert_eq!(outcome.kind, MatchKind::Found);
924    }
925
926    #[tokio::test]
927    async fn status_signal_reports_found_on_match() {
928        let server = MockServer::start().await;
929        Mock::given(any())
930            .and(path("/alice"))
931            .respond_with(ResponseTemplate::new(200))
932            .mount(&server)
933            .await;
934        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
935        let outcome = build_client().check(&site, &user()).await;
936        assert_eq!(outcome.kind, MatchKind::Found);
937        assert!(outcome.url.ends_with("/alice"));
938        assert!(outcome.reason.is_none());
939        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
940    }
941
942    #[tokio::test]
943    async fn status_signal_pair_reports_not_found_on_404() {
944        let server = MockServer::start().await;
945        Mock::given(any())
946            .and(path("/alice"))
947            .respond_with(ResponseTemplate::new(404))
948            .mount(&server)
949            .await;
950        let site = site_with(
951            &server,
952            vec![
953                Signal::StatusFound { codes: vec![200] },
954                Signal::StatusNotFound { codes: vec![404] },
955            ],
956        );
957        let outcome = build_client().check(&site, &user()).await;
958        assert_eq!(outcome.kind, MatchKind::NotFound);
959        // Only the NotFound-voting signal is cited as evidence.
960        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
961    }
962
963    #[tokio::test]
964    async fn body_absent_signal_detects_missing_account() {
965        let server = MockServer::start().await;
966        Mock::given(any())
967            .and(path("/alice"))
968            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
969            .mount(&server)
970            .await;
971        let site = site_with(
972            &server,
973            vec![Signal::BodyAbsent {
974                text: "Profile not found".into(),
975            }],
976        );
977        let outcome = build_client().check(&site, &user()).await;
978        assert_eq!(outcome.kind, MatchKind::NotFound);
979    }
980
981    #[tokio::test]
982    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
983        // Phase 2 semantics: absence of an absence-marker is not evidence
984        // of presence — it just means we have no signal that fired.
985        let server = MockServer::start().await;
986        Mock::given(any())
987            .and(path("/alice"))
988            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
989            .mount(&server)
990            .await;
991        let site = site_with(
992            &server,
993            vec![Signal::BodyAbsent {
994                text: "Profile not found".into(),
995            }],
996        );
997        let outcome = build_client().check(&site, &user()).await;
998        assert_eq!(outcome.kind, MatchKind::Uncertain);
999    }
1000
1001    #[tokio::test]
1002    async fn body_present_plus_absent_resolve_to_found() {
1003        let server = MockServer::start().await;
1004        Mock::given(any())
1005            .and(path("/alice"))
1006            .respond_with(
1007                ResponseTemplate::new(200)
1008                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
1009            )
1010            .mount(&server)
1011            .await;
1012        let site = site_with(
1013            &server,
1014            vec![
1015                Signal::BodyPresent {
1016                    text: "profile-card".into(),
1017                },
1018                Signal::BodyAbsent {
1019                    text: "Profile not found".into(),
1020                },
1021            ],
1022        );
1023        let outcome = build_client().check(&site, &user()).await;
1024        assert_eq!(outcome.kind, MatchKind::Found);
1025    }
1026
1027    #[tokio::test]
1028    async fn redirect_absent_signal_detects_missing_account() {
1029        let server = MockServer::start().await;
1030        Mock::given(any())
1031            .and(path("/alice"))
1032            .respond_with(
1033                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1034            )
1035            .mount(&server)
1036            .await;
1037        Mock::given(any())
1038            .and(path("/login"))
1039            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1040            .mount(&server)
1041            .await;
1042        let site = site_with(
1043            &server,
1044            vec![Signal::RedirectAbsent {
1045                fragment: "/login".into(),
1046            }],
1047        );
1048        let outcome = build_client().check(&site, &user()).await;
1049        assert_eq!(outcome.kind, MatchKind::NotFound);
1050    }
1051
1052    #[tokio::test]
1053    async fn negative_signal_wins_over_positive() {
1054        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
1055        // (error marker appears). Negative-priority aggregation → NotFound.
1056        // This is the canonical Sherlock "message" pattern: a site that
1057        // returns 200 for everyone and differentiates via an error string.
1058        let server = MockServer::start().await;
1059        Mock::given(any())
1060            .and(path("/alice"))
1061            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1062            .mount(&server)
1063            .await;
1064        let site = site_with(
1065            &server,
1066            vec![
1067                Signal::StatusFound { codes: vec![200] },
1068                Signal::BodyAbsent {
1069                    text: "Profile not found".into(),
1070                },
1071            ],
1072        );
1073        let outcome = build_client().check(&site, &user()).await;
1074        assert_eq!(outcome.kind, MatchKind::NotFound);
1075    }
1076
1077    #[tokio::test]
1078    async fn network_failure_yields_uncertain() {
1079        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1080        let port = listener.local_addr().unwrap().port();
1081        drop(listener);
1082
1083        let site = Site {
1084            name: "Dead".into(),
1085            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1086            signals: vec![Signal::StatusFound { codes: vec![200] }],
1087            known_present: None,
1088            known_absent: None,
1089            extract: Vec::new(),
1090            tags: Vec::new(),
1091            request_headers: std::collections::BTreeMap::new(),
1092            regex_check: None,
1093            engine: None,
1094            strip_bad_char: None,
1095            request_method: crate::site::HttpMethod::Get,
1096            request_body: None,
1097            protection: Vec::new(),
1098            disabled: false,
1099            source: None,
1100            popularity: None,
1101        };
1102        let client = Client::builder()
1103            .timeout(Duration::from_millis(500))
1104            .connect_timeout(Duration::from_millis(500))
1105            .max_retries(0)
1106            .build()
1107            .unwrap();
1108        let outcome = client.check(&site, &user()).await;
1109        assert_eq!(outcome.kind, MatchKind::Uncertain);
1110        assert!(outcome.reason.is_some());
1111    }
1112
    /// Two back-to-back probes of the same host through one client must be
    /// separated by (most of) the configured `min_request_interval`.
    #[tokio::test]
    async fn throttle_spaces_consecutive_calls_to_same_host() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        // Interval is intentionally much larger than typical wiremock latency
        // (≤10 ms locally, can spike under heavy parallel test load). Any
        // value too close to HTTP latency would let the first request burn
        // through the throttle window and make the assertion flaky.
        let client = Client::builder()
            .timeout(Duration::from_secs(2))
            .min_request_interval(Duration::from_millis(300))
            .build()
            .unwrap();

        // Warm-up probe; only the second probe below is timed.
        client.check(&site, &user()).await;
        let started = Instant::now();
        client.check(&site, &user()).await;
        let elapsed = started.elapsed();
        // The bound is 200 ms, not the full 300 ms: the clock starts after
        // the first probe has returned, so that probe's own duration
        // presumably counts against the throttle window.
        assert!(
            elapsed >= Duration::from_millis(200),
            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
        );
    }
1141
1142    #[tokio::test]
1143    async fn builder_overrides_user_agent() {
1144        let server = MockServer::start().await;
1145        Mock::given(any())
1146            .and(path("/alice"))
1147            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1148            .respond_with(ResponseTemplate::new(200))
1149            .mount(&server)
1150            .await;
1151        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1152        let client = Client::builder()
1153            .user_agent("adler-test/1.0")
1154            .build()
1155            .unwrap();
1156        let outcome = client.check(&site, &user()).await;
1157        assert_eq!(outcome.kind, MatchKind::Found);
1158    }
1159
1160    #[tokio::test]
1161    async fn rate_limit_429_yields_uncertain_with_note() {
1162        let server = MockServer::start().await;
1163        Mock::given(any())
1164            .and(path("/alice"))
1165            .respond_with(ResponseTemplate::new(429))
1166            .mount(&server)
1167            .await;
1168        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1169        let outcome = build_client().check(&site, &user()).await;
1170        assert_eq!(outcome.kind, MatchKind::Uncertain);
1171        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1172    }
1173
1174    #[tokio::test]
1175    async fn cloudflare_server_header_yields_uncertain() {
1176        let server = MockServer::start().await;
1177        Mock::given(any())
1178            .and(path("/alice"))
1179            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1180            .mount(&server)
1181            .await;
1182        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1183        let outcome = build_client().check(&site, &user()).await;
1184        assert_eq!(outcome.kind, MatchKind::Uncertain);
1185        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1186    }
1187
1188    #[tokio::test]
1189    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1190        // Body-based ban detection only runs when a signal already needs
1191        // the body — this site uses BodyAbsent so the body is read.
1192        let server = MockServer::start().await;
1193        Mock::given(any())
1194            .and(path("/alice"))
1195            .respond_with(
1196                ResponseTemplate::new(200)
1197                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1198            )
1199            .mount(&server)
1200            .await;
1201        let site = site_with(
1202            &server,
1203            vec![Signal::BodyAbsent {
1204                text: "Profile not found".into(),
1205            }],
1206        );
1207        let outcome = build_client().check(&site, &user()).await;
1208        assert_eq!(outcome.kind, MatchKind::Uncertain);
1209        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1210    }
1211
1212    #[tokio::test]
1213    async fn ban_detection_does_not_fire_on_legitimate_403() {
1214        let server = MockServer::start().await;
1215        Mock::given(any())
1216            .and(path("/alice"))
1217            .respond_with(ResponseTemplate::new(403))
1218            .mount(&server)
1219            .await;
1220        let site = site_with(
1221            &server,
1222            vec![
1223                Signal::StatusFound { codes: vec![200] },
1224                Signal::StatusNotFound { codes: vec![403] },
1225            ],
1226        );
1227        let outcome = build_client().check(&site, &user()).await;
1228        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1229        assert_eq!(outcome.kind, MatchKind::NotFound);
1230        assert!(outcome.reason.is_none());
1231    }
1232
1233    #[tokio::test]
1234    async fn retry_recovers_after_transient_429() {
1235        let server = MockServer::start().await;
1236        // First request: 429. Subsequent: 200.
1237        Mock::given(any())
1238            .and(path("/alice"))
1239            .respond_with(ResponseTemplate::new(429))
1240            .up_to_n_times(1)
1241            .mount(&server)
1242            .await;
1243        Mock::given(any())
1244            .and(path("/alice"))
1245            .respond_with(ResponseTemplate::new(200))
1246            .mount(&server)
1247            .await;
1248        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1249        let client = Client::builder()
1250            .timeout(Duration::from_secs(2))
1251            .min_request_interval(Duration::ZERO)
1252            .max_retries(2)
1253            .base_backoff_delay(Duration::from_millis(20))
1254            .max_backoff_delay(Duration::from_millis(100))
1255            .build()
1256            .unwrap();
1257        let outcome = client.check(&site, &user()).await;
1258        assert_eq!(outcome.kind, MatchKind::Found);
1259        assert!(outcome.reason.is_none());
1260    }
1261
1262    #[tokio::test]
1263    async fn retry_exhausts_and_returns_uncertain() {
1264        let server = MockServer::start().await;
1265        Mock::given(any())
1266            .and(path("/alice"))
1267            .respond_with(ResponseTemplate::new(429))
1268            .mount(&server)
1269            .await;
1270        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1271        let client = Client::builder()
1272            .timeout(Duration::from_secs(2))
1273            .min_request_interval(Duration::ZERO)
1274            .max_retries(2)
1275            .base_backoff_delay(Duration::from_millis(10))
1276            .max_backoff_delay(Duration::from_millis(50))
1277            .build()
1278            .unwrap();
1279        let outcome = client.check(&site, &user()).await;
1280        assert_eq!(outcome.kind, MatchKind::Uncertain);
1281        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1282    }
1283
    /// A connection-level failure must be reported promptly instead of
    /// being retried with backoff.
    #[tokio::test]
    async fn retry_does_not_fire_on_network_error() {
        // Connection refused → Uncertain with a `Network` reason, not a
        // ban marker. We must NOT retry — otherwise a single dead site
        // burns the full backoff budget before reporting.
        // Bind an ephemeral port and drop the listener so the port is closed.
        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
        let port = listener.local_addr().unwrap().port();
        drop(listener);
        let site = Site {
            name: "Dead".into(),
            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
            signals: vec![Signal::StatusFound { codes: vec![200] }],
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            source: None,
            popularity: None,
        };
        // The 60 s base backoff is deliberately huge: a single retry would
        // blow straight through the 5 s bound asserted below.
        let client = Client::builder()
            .timeout(Duration::from_millis(500))
            .connect_timeout(Duration::from_millis(500))
            .min_request_interval(Duration::ZERO)
            .max_retries(3)
            .base_backoff_delay(Duration::from_secs(60))
            .build()
            .unwrap();
        let started = Instant::now();
        let outcome = client.check(&site, &user()).await;
        // If retry fired, we'd be sleeping minutes; instead this returns
        // promptly with an Uncertain.
        assert!(started.elapsed() < Duration::from_secs(5));
        assert_eq!(outcome.kind, MatchKind::Uncertain);
        assert!(
            matches!(outcome.reason, Some(UncertainReason::Network(_))),
            "got {:?}",
            outcome.reason,
        );
    }
1331
1332    #[tokio::test]
1333    async fn rotates_user_agent_per_request() {
1334        // The mock only matches when the request carries one of the pooled
1335        // UAs; if rotation weren't applied, the default adler/x.y UA would
1336        // miss and the verdict would be NotFound.
1337        let server = MockServer::start().await;
1338        Mock::given(any())
1339            .and(path("/alice"))
1340            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1341            .respond_with(ResponseTemplate::new(200))
1342            .mount(&server)
1343            .await;
1344        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1345        let client = Client::builder()
1346            .min_request_interval(Duration::ZERO)
1347            .max_retries(0)
1348            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1349            .build()
1350            .unwrap();
1351        let outcome = client.check(&site, &user()).await;
1352        assert_eq!(outcome.kind, MatchKind::Found);
1353    }
1354
1355    #[test]
1356    fn invalid_proxy_url_fails_build() {
1357        let err = Client::builder().proxy("not a url").build().unwrap_err();
1358        assert!(matches!(err, Error::HttpSetup { .. }));
1359    }
1360
1361    #[test]
1362    fn schemeless_proxy_is_rejected_up_front() {
1363        // reqwest would silently treat this as a host; we require a scheme.
1364        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1365        let Error::HttpSetup { message } = err else {
1366            panic!("expected HttpSetup, got {err:?}");
1367        };
1368        assert!(message.contains("must start with"), "{message}");
1369    }
1370
1371    #[test]
1372    fn socks5_proxy_scheme_is_accepted() {
1373        // Valid scheme + endpoint builds fine (no connection is attempted).
1374        assert!(
1375            Client::builder()
1376                .proxy("socks5://127.0.0.1:9050")
1377                .build()
1378                .is_ok()
1379        );
1380    }
1381
1382    #[tokio::test]
1383    async fn global_rps_cap_spaces_requests_across_hosts() {
1384        // Two distinct host paths; per-host throttle is disabled, so any
1385        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1386        let server = MockServer::start().await;
1387        Mock::given(any())
1388            .respond_with(ResponseTemplate::new(200))
1389            .mount(&server)
1390            .await;
1391        let site_a = Site {
1392            name: "A".into(),
1393            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1394            signals: vec![Signal::StatusFound { codes: vec![200] }],
1395            known_present: None,
1396            known_absent: None,
1397            extract: Vec::new(),
1398            tags: Vec::new(),
1399            request_headers: std::collections::BTreeMap::new(),
1400            regex_check: None,
1401            engine: None,
1402            strip_bad_char: None,
1403            request_method: crate::site::HttpMethod::Get,
1404            request_body: None,
1405            protection: Vec::new(),
1406            disabled: false,
1407            source: None,
1408            popularity: None,
1409        };
1410        let site_b = Site {
1411            name: "B".into(),
1412            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1413            signals: vec![Signal::StatusFound { codes: vec![200] }],
1414            known_present: None,
1415            known_absent: None,
1416            extract: Vec::new(),
1417            tags: Vec::new(),
1418            request_headers: std::collections::BTreeMap::new(),
1419            regex_check: None,
1420            engine: None,
1421            strip_bad_char: None,
1422            request_method: crate::site::HttpMethod::Get,
1423            request_body: None,
1424            protection: Vec::new(),
1425            disabled: false,
1426            source: None,
1427            popularity: None,
1428        };
1429        // 2 RPS → ~500 ms between requests. A large interval keeps the
1430        // assertion robust even when the first probe's own duration (which
1431        // eats into the measured gap) is inflated by test instrumentation
1432        // such as coverage tooling.
1433        let client = Client::builder()
1434            .min_request_interval(Duration::ZERO)
1435            .max_retries(0)
1436            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1437            .build()
1438            .unwrap();
1439        // First request consumes the slot at t≈0; second waits ~500 ms even
1440        // though it targets a different host.
1441        client.check(&site_a, &user()).await;
1442        let started = Instant::now();
1443        client.check(&site_b, &user()).await;
1444        assert!(
1445            started.elapsed() >= Duration::from_millis(350),
1446            "global cap should space cross-host requests, got {:?}",
1447            started.elapsed(),
1448        );
1449    }
1450
1451    #[tokio::test]
1452    async fn respect_robots_skips_disallowed_paths() {
1453        let server = MockServer::start().await;
1454        Mock::given(any())
1455            .and(path("/robots.txt"))
1456            .respond_with(
1457                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1458            )
1459            .mount(&server)
1460            .await;
1461        Mock::given(any())
1462            .and(path("/no/alice"))
1463            .respond_with(ResponseTemplate::new(200))
1464            .mount(&server)
1465            .await;
1466        Mock::given(any())
1467            .and(path("/yes/alice"))
1468            .respond_with(ResponseTemplate::new(200))
1469            .mount(&server)
1470            .await;
1471        let client = Client::builder()
1472            .min_request_interval(Duration::ZERO)
1473            .max_retries(0)
1474            .respect_robots(true)
1475            .build()
1476            .unwrap();
1477
1478        let disallowed = Site {
1479            name: "No".into(),
1480            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1481            signals: vec![Signal::StatusFound { codes: vec![200] }],
1482            known_present: None,
1483            known_absent: None,
1484            extract: Vec::new(),
1485            tags: Vec::new(),
1486            request_headers: std::collections::BTreeMap::new(),
1487            regex_check: None,
1488            engine: None,
1489            strip_bad_char: None,
1490            request_method: crate::site::HttpMethod::Get,
1491            request_body: None,
1492            protection: Vec::new(),
1493            disabled: false,
1494            source: None,
1495            popularity: None,
1496        };
1497        let allowed = Site {
1498            name: "Yes".into(),
1499            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1500            signals: vec![Signal::StatusFound { codes: vec![200] }],
1501            known_present: None,
1502            known_absent: None,
1503            extract: Vec::new(),
1504            tags: Vec::new(),
1505            request_headers: std::collections::BTreeMap::new(),
1506            regex_check: None,
1507            engine: None,
1508            strip_bad_char: None,
1509            request_method: crate::site::HttpMethod::Get,
1510            request_body: None,
1511            protection: Vec::new(),
1512            disabled: false,
1513            source: None,
1514            popularity: None,
1515        };
1516
1517        let no = client.check(&disallowed, &user()).await;
1518        assert_eq!(no.kind, MatchKind::Uncertain);
1519        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1520
1521        let yes = client.check(&allowed, &user()).await;
1522        assert_eq!(yes.kind, MatchKind::Found);
1523    }
1524
1525    #[tokio::test]
1526    async fn body_read_skipped_when_no_body_signal_needed() {
1527        // Mock returns body that would fail a body_absent check — but since
1528        // we only have a status signal, body is never read.
1529        let server = MockServer::start().await;
1530        Mock::given(any())
1531            .and(path("/alice"))
1532            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1533            .mount(&server)
1534            .await;
1535        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1536        let outcome = build_client().check(&site, &user()).await;
1537        assert_eq!(outcome.kind, MatchKind::Found);
1538    }
1539
1540    // ===== Browser routing =====
1541
1542    /// Test backend that returns a canned page and counts calls. Lets the
1543    /// routing tests assert "Client did/did not invoke the browser" without
1544    /// involving a real Chrome process.
1545    #[derive(Debug)]
1546    struct RecordingBackend {
1547        page: RenderedPage,
1548        calls: std::sync::atomic::AtomicUsize,
1549    }
1550
1551    impl RecordingBackend {
1552        fn with_page(page: RenderedPage) -> Self {
1553            Self {
1554                page,
1555                calls: std::sync::atomic::AtomicUsize::new(0),
1556            }
1557        }
1558        fn call_count(&self) -> usize {
1559            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1560        }
1561    }
1562
1563    #[async_trait::async_trait]
1564    impl BrowserBackend for RecordingBackend {
1565        async fn fetch(
1566            &self,
1567            _url: &url::Url,
1568            _headers: &std::collections::BTreeMap<String, String>,
1569            _timeout: Duration,
1570        ) -> Result<RenderedPage> {
1571            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1572            Ok(self.page.clone())
1573        }
1574    }
1575
1576    fn site_bot_protected(server: &MockServer) -> Site {
1577        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1578        s.tags = vec!["bot-protected".into()];
1579        s
1580    }
1581
1582    #[tokio::test]
1583    async fn browser_routes_bot_protected_sites() {
1584        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1585        // returns its canned page directly.
1586        let server = MockServer::start().await;
1587        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1588            status: 200,
1589            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1590            body: "<html></html>".into(),
1591            elapsed_ms: 42,
1592        }));
1593        let client = Client::builder()
1594            .min_request_interval(Duration::ZERO)
1595            .max_retries(0)
1596            .browser(backend.clone())
1597            .build()
1598            .unwrap();
1599        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1600        assert_eq!(outcome.kind, MatchKind::Found);
1601        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1602    }
1603
1604    #[tokio::test]
1605    async fn non_bot_protected_sites_skip_browser() {
1606        let server = MockServer::start().await;
1607        Mock::given(any())
1608            .and(path("/alice"))
1609            .respond_with(ResponseTemplate::new(200))
1610            .mount(&server)
1611            .await;
1612        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1613            status: 500, // would make wiremock case fail if browser was taken
1614            final_url: url::Url::parse("https://x/").unwrap(),
1615            body: String::new(),
1616            elapsed_ms: 0,
1617        }));
1618        let client = Client::builder()
1619            .min_request_interval(Duration::ZERO)
1620            .max_retries(0)
1621            .browser(backend.clone())
1622            .build()
1623            .unwrap();
1624        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1625        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1626        let outcome = client.check(&site, &user()).await;
1627        assert_eq!(outcome.kind, MatchKind::Found);
1628        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1629    }
1630
1631    #[tokio::test]
1632    async fn browser_budget_exhaust_yields_uncertain() {
1633        let server = MockServer::start().await;
1634        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1635            status: 200,
1636            final_url: url::Url::parse("https://x/").unwrap(),
1637            body: String::new(),
1638            elapsed_ms: 0,
1639        }));
1640        let client = Client::builder()
1641            .min_request_interval(Duration::ZERO)
1642            .max_retries(0)
1643            .browser(backend.clone())
1644            .browser_budget(1)
1645            .build()
1646            .unwrap();
1647        let site = site_bot_protected(&server);
1648        // First call consumes the only slot.
1649        let first = client.check(&site, &user()).await;
1650        assert_eq!(first.kind, MatchKind::Found);
1651        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1652        let second = client.check(&site, &user()).await;
1653        assert_eq!(second.kind, MatchKind::Uncertain);
1654        assert!(matches!(
1655            second.reason,
1656            Some(UncertainReason::BrowserBudget)
1657        ));
1658        assert_eq!(
1659            backend.call_count(),
1660            1,
1661            "second call must not invoke backend"
1662        );
1663    }
1664
1665    #[tokio::test]
1666    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1667        struct FailingBackend;
1668        #[async_trait::async_trait]
1669        impl BrowserBackend for FailingBackend {
1670            async fn fetch(
1671                &self,
1672                _url: &url::Url,
1673                _headers: &std::collections::BTreeMap<String, String>,
1674                _timeout: Duration,
1675            ) -> Result<RenderedPage> {
1676                Err(Error::BrowserSetup {
1677                    message: "simulated crash".into(),
1678                })
1679            }
1680        }
1681        impl std::fmt::Debug for FailingBackend {
1682            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1683                f.write_str("FailingBackend")
1684            }
1685        }
1686
1687        let server = MockServer::start().await;
1688        let client = Client::builder()
1689            .min_request_interval(Duration::ZERO)
1690            .max_retries(0)
1691            .browser(Arc::new(FailingBackend))
1692            .build()
1693            .unwrap();
1694        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1695        assert_eq!(outcome.kind, MatchKind::Uncertain);
1696        match outcome.reason {
1697            Some(UncertainReason::BrowserFailed(msg)) => {
1698                assert!(msg.contains("simulated crash"), "got: {msg}");
1699            }
1700            other => panic!("expected BrowserFailed, got {other:?}"),
1701        }
1702    }
1703
1704    #[tokio::test]
1705    async fn status_only_site_uses_head_request() {
1706        // Site with only status signals (no body markers, no enrichment)
1707        // should be probed with HEAD — saves the body download on
1708        // ~30% of the registry.
1709        let server = MockServer::start().await;
1710        Mock::given(method("HEAD"))
1711            .and(path("/alice"))
1712            .respond_with(ResponseTemplate::new(200))
1713            .mount(&server)
1714            .await;
1715        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1716        let outcome = build_client().check(&site, &user()).await;
1717        assert_eq!(outcome.kind, MatchKind::Found);
1718        let recvd = server.received_requests().await.unwrap_or_default();
1719        assert_eq!(recvd.len(), 1);
1720        assert_eq!(recvd[0].method.as_str(), "HEAD");
1721    }
1722
1723    #[tokio::test]
1724    async fn body_signal_site_uses_get_request() {
1725        // Same baseline plus a body-marker signal — must still GET so
1726        // the body actually arrives for matching.
1727        let server = MockServer::start().await;
1728        Mock::given(any())
1729            .and(path("/alice"))
1730            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
1731            .mount(&server)
1732            .await;
1733        let site = site_with(
1734            &server,
1735            vec![Signal::BodyPresent {
1736                text: "hello".into(),
1737            }],
1738        );
1739        let outcome = build_client().check(&site, &user()).await;
1740        assert_eq!(outcome.kind, MatchKind::Found);
1741        let recvd = server.received_requests().await.unwrap_or_default();
1742        assert_eq!(recvd[0].method.as_str(), "GET");
1743    }
1744
1745    #[tokio::test]
1746    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
1747        // A site that declares `protection: [Cloudflare]` but doesn't
1748        // carry the legacy `bot-protected` tag should still route
1749        // through the browser backend — the new structured field is
1750        // an additional signal, not a tag replacement.
1751        let server = MockServer::start().await;
1752        Mock::given(any())
1753            .respond_with(ResponseTemplate::new(200))
1754            .mount(&server)
1755            .await;
1756        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1757        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
1758        // No bot-protected tag — pure structured-field test.
1759        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1760            status: 200,
1761            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1762            body: String::new(),
1763            elapsed_ms: 0,
1764        }));
1765        let client = Client::builder()
1766            .min_request_interval(Duration::ZERO)
1767            .max_retries(0)
1768            .browser(backend)
1769            .build()
1770            .unwrap();
1771        let outcome = client.check(&site, &user()).await;
1772        // The recording backend always returns a synthetic 200, so
1773        // Found means we went through the browser path.
1774        assert_eq!(outcome.kind, MatchKind::Found);
1775        // No raw HTTP probe should have hit the mock server.
1776        let recvd = server.received_requests().await.unwrap_or_default();
1777        assert_eq!(
1778            recvd.len(),
1779            0,
1780            "structured protection must skip the raw HTTP path"
1781        );
1782    }
1783
1784    #[tokio::test]
1785    async fn post_method_sends_body_with_username_substituted() {
1786        // A POST-probed site (e.g. Anilist GraphQL) — the username
1787        // goes in the body, not the URL. Adler should substitute
1788        // `{username}` and send a POST with the rendered payload.
1789        let server = MockServer::start().await;
1790        Mock::given(method("POST"))
1791            .and(path("/api"))
1792            .respond_with(ResponseTemplate::new(200))
1793            .mount(&server)
1794            .await;
1795        // URL substitution still requires the `{username}` placeholder,
1796        // even for POST sites where the username also lives in the
1797        // body. Most real POST endpoints encode the username in both
1798        // (e.g. query string + body); we mirror that.
1799        let site = Site {
1800            name: "ApiPost".into(),
1801            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
1802            signals: vec![Signal::StatusFound { codes: vec![200] }],
1803            known_present: None,
1804            known_absent: None,
1805            extract: Vec::new(),
1806            tags: Vec::new(),
1807            request_headers: std::collections::BTreeMap::new(),
1808            regex_check: None,
1809            engine: None,
1810            strip_bad_char: None,
1811            request_method: HttpMethod::Post,
1812            request_body: Some(r#"{"name":"{username}"}"#.into()),
1813            protection: Vec::new(),
1814            disabled: false,
1815            source: None,
1816            popularity: None,
1817        };
1818        let outcome = build_client().check(&site, &user()).await;
1819        assert_eq!(outcome.kind, MatchKind::Found);
1820        let recvd = server.received_requests().await.unwrap_or_default();
1821        assert_eq!(recvd.len(), 1);
1822        assert_eq!(recvd[0].method.as_str(), "POST");
1823        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
1824        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
1825    }
1826
1827    #[tokio::test]
1828    async fn head_405_falls_back_to_get() {
1829        // A server that rejects HEAD with 405 — Adler should silently
1830        // retry with GET so the optimisation can never cost accuracy.
1831        let server = MockServer::start().await;
1832        Mock::given(method("HEAD"))
1833            .and(path("/alice"))
1834            .respond_with(ResponseTemplate::new(405))
1835            .mount(&server)
1836            .await;
1837        Mock::given(any())
1838            .and(path("/alice"))
1839            .respond_with(ResponseTemplate::new(200))
1840            .mount(&server)
1841            .await;
1842        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1843        let outcome = build_client().check(&site, &user()).await;
1844        assert_eq!(outcome.kind, MatchKind::Found);
1845        let recvd = server.received_requests().await.unwrap_or_default();
1846        assert_eq!(recvd.len(), 2);
1847        assert_eq!(recvd[0].method.as_str(), "HEAD");
1848        assert_eq!(recvd[1].method.as_str(), "GET");
1849    }
1850}