Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use reqwest::redirect;
16
17use crate::ban;
18use crate::browser::{BrowserBackend, BrowserBudget, RenderedPage};
19use crate::check::{CheckOutcome, MatchKind, UncertainReason};
20use crate::error::{Error, Result};
21use crate::retry::{self, RetryPolicy};
22use crate::robots::RobotsCache;
23use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
24use crate::throttle::HostThrottle;
25use crate::username::Username;
26
/// Whole-request timeout used when the builder is not given one.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connect-phase timeout used when the builder is not given one.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect hops followed per request while redirect following is enabled.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default minimum spacing between two requests to the same host.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// The global rate limiter gates every host, so it is always queried under
/// this one fixed key.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
33
34/// HTTP client used to probe sites.
35///
36/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
37/// internally, and the throttle is `Arc`-backed, so cloning is the
38/// recommended way to share a client between tasks. Cloned clients share
39/// throttle state, which is what you want: a fan-out scan must not
40/// accidentally exceed a per-host budget by spawning more clients.
41#[derive(Clone)]
42pub struct Client {
43    inner: reqwest::Client,
44    throttle: HostThrottle,
45    /// Global RPS cap applied across all hosts. `None` → uncapped.
46    global_throttle: Option<HostThrottle>,
47    retry: RetryPolicy,
48    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
49    /// `Arc<[String]>` so cloning a client per task stays cheap.
50    user_agents: Arc<[String]>,
51    /// Extract profile fields from `Found` pages that declare extractors.
52    enrich: bool,
53    /// When set, skip probes disallowed by the host's `robots.txt`.
54    robots: Option<RobotsCache>,
55    /// Browser backend used for `bot-protected` sites. `None` → those sites
56    /// stay on the raw HTTP path and typically end up `Uncertain`.
57    browser: Option<Arc<dyn BrowserBackend>>,
58    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
59    /// for a single scan, so several tasks compete for the same budget.
60    browser_budget: Arc<BrowserBudget>,
61}
62
63impl Client {
64    /// Start configuring a new client.
65    pub fn builder() -> ClientBuilder {
66        ClientBuilder::default()
67    }
68
69    /// Probe a single site for `username`, retrying on transient bans.
70    ///
71    /// Network failures, timeouts, and unexpected response shapes all yield
72    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
73    /// returns an error: at the executor level we want a partial result for
74    /// every site, not abort-on-first-failure semantics.
75    ///
76    /// When ban detection classifies a response as `rate_limited` /
77    /// `cloudflare_challenge`, the call is retried with jittered exponential
78    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
79    /// Uncertain (network errors, body read failures) is **not** retried —
80    /// those failures rarely fix themselves in the seconds-to-minutes window
81    /// we'd block for.
82    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
83    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
84        let mut attempt: u32 = 0;
85        loop {
86            let outcome = self.probe_once(site, username).await;
87            if !retry::should_retry(&outcome, attempt, &self.retry) {
88                return outcome;
89            }
90            let delay = retry::backoff_delay(attempt, &self.retry);
91            tracing::info!(
92                site = %site.name,
93                attempt = attempt + 1,
94                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
95                ?delay,
96                "transient ban, retrying",
97            );
98            tokio::time::sleep(delay).await;
99            attempt += 1;
100        }
101    }
102
103    /// Fetch a URL and return raw response data (status, final URL, body)
104    /// with the same throttle / User-Agent / proxy machinery as `check`,
105    /// but without signal evaluation or retry.
106    ///
107    /// Returns `None` on any network/transport error. Intended for
108    /// diagnostics such as `adler --doctor --fix`, which diffs the
109    /// responses for a known-present and a nonsense user to derive a
110    /// signature.
111    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
112        let host = host_of(url);
113        if let Some(global) = &self.global_throttle {
114            global.wait(GLOBAL_THROTTLE_KEY).await;
115        }
116        self.throttle.wait(&host).await;
117        let mut request = self.inner.get(url);
118        if let Some(ua) = self.pick_user_agent() {
119            request = request.header(reqwest::header::USER_AGENT, ua);
120        }
121        let response = request.send().await.ok()?;
122        let status = response.status().as_u16();
123        let final_url = response.url().to_string();
124        let body = response.text().await.unwrap_or_default();
125        Some(RawResponse {
126            status,
127            final_url,
128            body,
129        })
130    }
131
132    /// Same as [`Self::fetch`] but routes through the configured browser
133    /// backend when the site is tagged `bot-protected` and a backend is
134    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
135    /// so that the diff-derivation works against the JS-rendered page
136    /// (login wall vs. real profile) rather than two identical raw-HTTP
137    /// shells.
138    ///
139    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
140    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
141    /// callers get the same `Option<RawResponse>` shape either way.
142    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
143        if let Some(backend) = self.browser.as_deref() {
144            let has_tag = site
145                .tags
146                .iter()
147                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
148            if has_tag || !site.protection.is_empty() {
149                let parsed = url::Url::parse(url).ok()?;
150                match backend
151                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
152                    .await
153                {
154                    Ok(page) => {
155                        return Some(RawResponse {
156                            status: page.status,
157                            final_url: page.final_url.to_string(),
158                            body: page.body,
159                        });
160                    }
161                    Err(err) => {
162                        tracing::warn!(
163                            site = %site.name, %url, error = %err,
164                            "browser fetch failed in doctor; falling back to raw HTTP",
165                        );
166                    }
167                }
168            }
169        }
170        self.fetch(url).await
171    }
172
173    /// Pick a User-Agent for the next request from the rotation pool, or
174    /// `None` to fall back on the client's fixed header.
175    fn pick_user_agent(&self) -> Option<&str> {
176        match self.user_agents.len() {
177            0 => None,
178            1 => Some(&self.user_agents[0]),
179            n => Some(&self.user_agents[fastrand::usize(0..n)]),
180        }
181    }
182
183    // Splitting probe_once into helpers would scatter the request/response
184    // flow that has to read top-to-bottom; one long function reads better.
185    #[allow(clippy::too_many_lines)]
186    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
187        let url = site.url_for(username);
188
189        // Site-level username constraint (Sherlock's `regexCheck`).
190        // Mismatch → skip the probe entirely. Saves a request and
191        // sidesteps the false-positive class where a site 404s on
192        // illegal usernames in a way our signal can't distinguish
193        // from a missing account. If the pattern fails to compile
194        // (Sherlock occasionally uses lookarounds, which our `regex`
195        // crate can't express), we let validate's warn-log stand
196        // and silently fall through — the rest of the probe still
197        // works.
198        if let Some(pat) = &site.regex_check {
199            if let Ok(re) = regex::Regex::new(pat) {
200                if !re.is_match(username.as_str()) {
201                    return uncertain(
202                        &site.name,
203                        url,
204                        Instant::now(),
205                        UncertainReason::UsernameNotAllowed,
206                    );
207                }
208            }
209        }
210
211        // Auto-route bot-protected sites through the browser backend when
212        // one is configured. Raw HTTP can't see past their JS/login wall,
213        // so this is the only way they ever produce a Found verdict.
214        // A site is "bot-protected" in the routing sense if it carries
215        // the legacy tag OR declares any specific protection mechanism
216        // via the new `protection` field — either signal is enough.
217        if let Some(backend) = self.browser.as_deref() {
218            let has_tag = site
219                .tags
220                .iter()
221                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
222            if has_tag || !site.protection.is_empty() {
223                if self.browser_budget.try_consume() {
224                    return self.probe_with_browser(site, &url, backend).await;
225                }
226                tracing::warn!(site = %site.name, "browser budget exhausted");
227                return uncertain(
228                    &site.name,
229                    url,
230                    Instant::now(),
231                    UncertainReason::BrowserBudget,
232                );
233            }
234        }
235
236        let host = host_of(&url);
237
238        // robots.txt gate, before consuming a throttle slot or probing.
239        if let Some(robots) = &self.robots {
240            if let Some((origin, path)) = origin_and_path(&url) {
241                if !robots.allowed(&origin, &path).await {
242                    tracing::debug!(%url, "skipped by robots.txt");
243                    return uncertain(
244                        &site.name,
245                        url,
246                        Instant::now(),
247                        UncertainReason::RobotsDisallowed,
248                    );
249                }
250            }
251        }
252
253        // Global cap first (gates every request), then per-host spacing.
254        if let Some(global) = &self.global_throttle {
255            global.wait(GLOBAL_THROTTLE_KEY).await;
256        }
257        self.throttle.wait(&host).await;
258        let started = Instant::now();
259        tracing::debug!(%url, %host, "probing");
260
261        // Read the body if a signal needs it, or if enrichment is on and the
262        // site has extractor rules (extraction needs the body).
263        let want_enrich = self.enrich && !site.extract.is_empty();
264        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
265
266        // POST sites carry their own body payload (the username goes in
267        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
268        // HEAD optimisation only applies to GET-probed sites: a HEAD
269        // for a POST endpoint would defeat its purpose. Body
270        // substitution mirrors URL substitution: `{username}` in
271        // `Site::request_body` is replaced before sending.
272        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
273            const USERNAME_PH: &str = "{username}";
274            site.request_body
275                .as_deref()
276                .map(|t| t.replace(USERNAME_PH, username.as_str()))
277        } else {
278            None
279        };
280
281        // For status-only sites (only StatusFound / StatusNotFound /
282        // RedirectAbsent signals, no enrichment), HEAD avoids the body
283        // download entirely — saving bandwidth and time on the
284        // ~30% of the registry that doesn't need a body marker.
285        // Some servers reject HEAD with 405; we transparently retry
286        // with GET so the optimisation never costs accuracy. POST
287        // probes always go out as POST regardless of body needs.
288        let response = match site.request_method {
289            HttpMethod::Post => {
290                send_request_with_body(
291                    &self.inner,
292                    reqwest::Method::POST,
293                    &url,
294                    self.pick_user_agent(),
295                    body_for_post.as_deref(),
296                )
297                .await
298            }
299            HttpMethod::Get if needs_body => {
300                send_request(
301                    &self.inner,
302                    reqwest::Method::GET,
303                    &url,
304                    self.pick_user_agent(),
305                )
306                .await
307            }
308            HttpMethod::Get => {
309                match send_request(
310                    &self.inner,
311                    reqwest::Method::HEAD,
312                    &url,
313                    self.pick_user_agent(),
314                )
315                .await
316                {
317                    Ok(r) if r.status().as_u16() == 405 => {
318                        send_request(
319                            &self.inner,
320                            reqwest::Method::GET,
321                            &url,
322                            self.pick_user_agent(),
323                        )
324                        .await
325                    }
326                    other => other,
327                }
328            }
329        };
330        let response = match response {
331            Ok(r) => r,
332            Err(err) => {
333                tracing::debug!(error = %err, "request failed");
334                return uncertain(
335                    &site.name,
336                    url,
337                    started,
338                    UncertainReason::Network(err.to_string()),
339                );
340            }
341        };
342
343        let status = response.status().as_u16();
344        let final_url = response.url().to_string();
345
346        if let Some(reason) = ban::detect_pre_body(status, response.headers()) {
347            tracing::warn!(%host, status, %reason, "ban-like response");
348            return uncertain(&site.name, url, started, reason);
349        }
350        let body = if needs_body {
351            match response.text().await {
352                Ok(b) => b,
353                Err(err) => {
354                    return uncertain(
355                        &site.name,
356                        url,
357                        started,
358                        UncertainReason::BodyRead(err.to_string()),
359                    );
360                }
361            }
362        } else {
363            String::new()
364        };
365
366        if !body.is_empty() {
367            if let Some(reason) = ban::detect_in_body(&body) {
368                tracing::warn!(%host, %reason, "ban-like body");
369                return uncertain(&site.name, url, started, reason);
370            }
371        }
372
373        let probe = Probe {
374            status,
375            final_url: &final_url,
376            body: &body,
377        };
378        let votes: Vec<(&Signal, SignalVerdict)> = site
379            .signals
380            .iter()
381            .map(|s| (s, s.evaluate(&probe)))
382            .collect();
383        let kind = aggregate(votes.iter().map(|(_, v)| *v));
384        let mut result = outcome(&site.name, url, started, kind);
385        // Record which signals produced the verdict (the winning polarity).
386        let winning = match kind {
387            MatchKind::Found => Some(SignalVerdict::Found),
388            MatchKind::NotFound => Some(SignalVerdict::NotFound),
389            MatchKind::Uncertain => None,
390        };
391        if let Some(want) = winning {
392            result.evidence = votes
393                .iter()
394                .filter(|(_, v)| *v == want)
395                .map(|(s, _)| s.describe_match(&probe))
396                .collect();
397        }
398        if want_enrich && kind == MatchKind::Found {
399            result.enrichment = crate::enrich::extract(&body, &site.extract);
400        }
401        result
402    }
403
404    /// Render `url` through the configured [`BrowserBackend`] and run the
405    /// same signal pipeline on the result. Per-fetch failures (timeout,
406    /// navigation error, etc.) surface as `Uncertain(BrowserFailed)` so
407    /// one flaky bot-protected site can't abort the scan.
408    async fn probe_with_browser(
409        &self,
410        site: &Site,
411        url: &str,
412        backend: &dyn BrowserBackend,
413    ) -> CheckOutcome {
414        let started = Instant::now();
415        let parsed = match url::Url::parse(url) {
416            Ok(u) => u,
417            Err(err) => {
418                return uncertain(
419                    &site.name,
420                    url.to_owned(),
421                    started,
422                    UncertainReason::Other(format!("invalid url: {err}")),
423                );
424            }
425        };
426
427        let page: RenderedPage = match backend
428            .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
429            .await
430        {
431            Ok(p) => p,
432            Err(err) => {
433                tracing::warn!(site = %site.name, %url, error = %err, "browser fetch failed");
434                return uncertain(
435                    &site.name,
436                    url.to_owned(),
437                    started,
438                    UncertainReason::BrowserFailed(err.to_string()),
439                );
440            }
441        };
442
443        let final_url_str = page.final_url.as_str().to_owned();
444        let probe = Probe {
445            status: page.status,
446            final_url: &final_url_str,
447            body: &page.body,
448        };
449        let votes: Vec<(&Signal, SignalVerdict)> = site
450            .signals
451            .iter()
452            .map(|s| (s, s.evaluate(&probe)))
453            .collect();
454        let kind = aggregate(votes.iter().map(|(_, v)| *v));
455        let mut result = outcome(&site.name, url.to_owned(), started, kind);
456        let winning = match kind {
457            MatchKind::Found => Some(SignalVerdict::Found),
458            MatchKind::NotFound => Some(SignalVerdict::NotFound),
459            MatchKind::Uncertain => None,
460        };
461        if let Some(want) = winning {
462            result.evidence = votes
463                .iter()
464                .filter(|(_, v)| *v == want)
465                .map(|(s, _)| s.describe_match(&probe))
466                .collect();
467        }
468        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
469            result.enrichment = crate::enrich::extract(&page.body, &site.extract);
470        }
471        result
472    }
473}
474
/// Unevaluated response data handed back by [`Client::fetch`] for
/// diagnostic use.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// Numeric HTTP status code of the response.
    pub status: u16,
    /// URL the response was ultimately served from, after any redirects.
    pub final_url: String,
    /// Response body decoded to text.
    pub body: String,
}
485
486/// Builder for [`Client`].
487#[derive(Clone)]
488#[must_use = "ClientBuilder does nothing until `.build()` is called"]
489pub struct ClientBuilder {
490    timeout: Duration,
491    connect_timeout: Duration,
492    user_agent: String,
493    follow_redirects: bool,
494    redirect_limit: usize,
495    min_request_interval: Duration,
496    max_rps: Option<NonZeroU32>,
497    retry: RetryPolicy,
498    proxy: Option<String>,
499    user_agents: Vec<String>,
500    enrich: bool,
501    respect_robots: bool,
502    browser: Option<Arc<dyn BrowserBackend>>,
503    browser_budget: usize,
504}
505
506impl Default for ClientBuilder {
507    fn default() -> Self {
508        Self {
509            timeout: DEFAULT_TIMEOUT,
510            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
511            user_agent: default_user_agent(),
512            follow_redirects: true,
513            redirect_limit: DEFAULT_REDIRECT_LIMIT,
514            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
515            max_rps: None,
516            retry: RetryPolicy::default(),
517            proxy: None,
518            user_agents: Vec::new(),
519            enrich: false,
520            respect_robots: false,
521            browser: None,
522            browser_budget: DEFAULT_BROWSER_BUDGET,
523        }
524    }
525}
526
527impl ClientBuilder {
528    /// Per-request timeout (covers connect, headers, and body read).
529    pub fn timeout(mut self, timeout: Duration) -> Self {
530        self.timeout = timeout;
531        self
532    }
533
534    /// TCP-connect timeout, applied independently of the request timeout.
535    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
536        self.connect_timeout = timeout;
537        self
538    }
539
540    /// Override the `User-Agent` header sent on every request.
541    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
542        self.user_agent = user_agent.into();
543        self
544    }
545
546    /// Toggle automatic redirect following. Defaults to `true`; disable when
547    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
548    pub fn follow_redirects(mut self, follow: bool) -> Self {
549        self.follow_redirects = follow;
550        self
551    }
552
553    /// Minimum time between consecutive requests to the same host.
554    ///
555    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
556    /// rate-limit responses on common OSINT targets while keeping fan-out
557    /// across many sites fast.
558    pub fn min_request_interval(mut self, interval: Duration) -> Self {
559        self.min_request_interval = interval;
560        self
561    }
562
563    /// Cap the total request rate across *all* hosts to `rps` requests per
564    /// second. Independent of (and composed with) the per-host interval —
565    /// useful on a metered connection or behind a shared-quota proxy.
566    /// Uncapped by default.
567    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
568        self.max_rps = Some(rps);
569        self
570    }
571
572    /// Maximum retry attempts after a transient ban response. Defaults to 2
573    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
574    pub fn max_retries(mut self, n: u32) -> Self {
575        self.retry.max_retries = n;
576        self
577    }
578
579    /// Base delay for the first retry. Subsequent retries double until
580    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
581    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
582        self.retry.base_delay = d;
583        self
584    }
585
586    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
587    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
588        self.retry.max_delay = d;
589        self
590    }
591
592    /// Route all requests through a proxy. Accepts `http://`, `https://`,
593    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
594    pub fn proxy(mut self, url: impl Into<String>) -> Self {
595        self.proxy = Some(url.into());
596        self
597    }
598
599    /// Rotate the `User-Agent` header per request, picking uniformly at
600    /// random from `agents`. An empty list (the default) keeps the single
601    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
602    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
603        self.user_agents = agents;
604        self
605    }
606
607    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
608    /// pages. Off by default; enables an extra body read for matching sites.
609    pub fn enrich(mut self, enrich: bool) -> Self {
610        self.enrich = enrich;
611        self
612    }
613
614    /// Honor each host's `robots.txt`: probes to disallowed paths are
615    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
616    /// default. Adds one cached `robots.txt` fetch per origin.
617    pub fn respect_robots(mut self, respect: bool) -> Self {
618        self.respect_robots = respect;
619        self
620    }
621
622    /// Attach a browser backend. Sites tagged `bot-protected` will be
623    /// routed through it instead of the raw HTTP path, up to the
624    /// [`browser_budget`](Self::browser_budget) cap.
625    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
626        self.browser = Some(backend);
627        self
628    }
629
630    /// Per-scan cap on how many `bot-protected` sites are allowed to use
631    /// the browser backend. Once exhausted, the rest fall back to
632    /// `Uncertain(BrowserBudget)`. Defaults to
633    /// [`DEFAULT_BROWSER_BUDGET`].
634    pub const fn browser_budget(mut self, cap: usize) -> Self {
635        self.browser_budget = cap;
636        self
637    }
638
639    /// Build a [`Client`].
640    pub fn build(self) -> Result<Client> {
641        let redirect_policy = if self.follow_redirects {
642            redirect::Policy::limited(self.redirect_limit)
643        } else {
644            redirect::Policy::none()
645        };
646        let mut builder = reqwest::Client::builder()
647            .user_agent(self.user_agent)
648            .timeout(self.timeout)
649            .connect_timeout(self.connect_timeout)
650            .redirect(redirect_policy);
651        if let Some(proxy_url) = &self.proxy {
652            // reqwest treats a schemeless string (e.g. "not-a-url") as a host
653            // and silently defaults it to http://, so every probe would fail
654            // confusingly. Require an explicit, supported scheme up front.
655            const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
656            if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
657                return Err(Error::HttpSetup {
658                    message: format!(
659                        "invalid proxy {proxy_url:?}: must start with one of {}",
660                        SCHEMES.join(", ")
661                    ),
662                });
663            }
664            let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
665                message: format!("invalid proxy {proxy_url:?}: {e}"),
666            })?;
667            builder = builder.proxy(proxy);
668        }
669        let inner = builder.build().map_err(|e| Error::HttpSetup {
670            message: e.to_string(),
671        })?;
672        let global_throttle = self.max_rps.map(|rps| {
673            // Min spacing between any two requests = 1s / rps.
674            let interval = Duration::from_secs(1) / rps.get();
675            HostThrottle::new(interval)
676        });
677        let robots = self
678            .respect_robots
679            .then(|| RobotsCache::new(inner.clone(), "adler"));
680        Ok(Client {
681            inner,
682            throttle: HostThrottle::new(self.min_request_interval),
683            global_throttle,
684            retry: self.retry,
685            user_agents: Arc::from(self.user_agents),
686            enrich: self.enrich,
687            robots,
688            browser: self.browser,
689            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
690        })
691    }
692}
693
/// Ceiling on browser-backed probes per scan, used unless the builder is
/// given another value.
///
/// Roughly 5× the typical `bot-protected` slice of the registry: generous
/// headroom, yet still a guardrail that keeps a misconfigured flag from
/// burning through an entire Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
701
702impl fmt::Debug for Client {
703    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
704        f.debug_struct("Client")
705            .field("throttle", &self.throttle)
706            .field("global_throttle", &self.global_throttle)
707            .field("retry", &self.retry)
708            .field("user_agents", &self.user_agents)
709            .field("enrich", &self.enrich)
710            .field("robots", &self.robots.is_some())
711            .field("browser", &self.browser.is_some())
712            .field("browser_budget", &self.browser_budget)
713            .finish_non_exhaustive()
714    }
715}
716
717impl fmt::Debug for ClientBuilder {
718    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719        f.debug_struct("ClientBuilder")
720            .field("timeout", &self.timeout)
721            .field("connect_timeout", &self.connect_timeout)
722            .field("user_agent", &self.user_agent)
723            .field("follow_redirects", &self.follow_redirects)
724            .field("redirect_limit", &self.redirect_limit)
725            .field("min_request_interval", &self.min_request_interval)
726            .field("max_rps", &self.max_rps)
727            .field("retry", &self.retry)
728            .field("proxy", &self.proxy)
729            .field("user_agents", &self.user_agents)
730            .field("enrich", &self.enrich)
731            .field("respect_robots", &self.respect_robots)
732            .field("browser", &self.browser.is_some())
733            .field("browser_budget", &self.browser_budget)
734            .finish()
735    }
736}
737
/// Timeout handed to every [`BrowserBackend::fetch`] call. Deliberately
/// generous: a browser fetch (JS execution + waits) is inherently slower
/// than a raw HTTP request.
const BROWSER_TIMEOUT: Duration = Duration::from_secs(60);

/// Site tag (compared ASCII-case-insensitively) that routes a site through
/// the browser backend.
const BOT_PROTECTED_TAG: &str = "bot-protected";
744
745fn default_user_agent() -> String {
746    format!("adler/{}", env!("CARGO_PKG_VERSION"))
747}
748
749/// Issue a single HTTP request with the configured client, an optional
750/// User-Agent override, and the given method. Centralised so the probe
751/// path can transparently swap HEAD for GET (and retry on 405) without
752/// duplicating the request-build logic.
753async fn send_request(
754    client: &reqwest::Client,
755    method: reqwest::Method,
756    url: &str,
757    ua: Option<&str>,
758) -> reqwest::Result<reqwest::Response> {
759    send_request_with_body(client, method, url, ua, None).await
760}
761
762/// Same as [`send_request`] but with an optional request body — used
763/// for POST probes against API endpoints (GraphQL, login form, …).
764/// When `body` is `Some`, the request is sent with a `application/json`
765/// content type by default; sites that need a different content type
766/// declare it through [`Site::request_headers`].
767async fn send_request_with_body(
768    client: &reqwest::Client,
769    method: reqwest::Method,
770    url: &str,
771    ua: Option<&str>,
772    body: Option<&str>,
773) -> reqwest::Result<reqwest::Response> {
774    let mut request = client.request(method, url);
775    if let Some(ua) = ua {
776        request = request.header(reqwest::header::USER_AGENT, ua);
777    }
778    if let Some(b) = body {
779        request = request
780            .header(reqwest::header::CONTENT_TYPE, "application/json")
781            .body(b.to_owned());
782    }
783    request.send().await
784}
785
786fn host_of(url: &str) -> String {
787    reqwest::Url::parse(url)
788        .ok()
789        .and_then(|u| u.host_str().map(str::to_owned))
790        .unwrap_or_else(|| "unknown".into())
791}
792
793/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
794/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
795fn origin_and_path(url: &str) -> Option<(String, String)> {
796    let parsed = reqwest::Url::parse(url).ok()?;
797    let host = parsed.host_str()?;
798    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
799    let origin = format!("{}://{host}{port}", parsed.scheme());
800    let path = parsed.query().map_or_else(
801        || parsed.path().to_owned(),
802        |q| format!("{}?{q}", parsed.path()),
803    );
804    Some((origin, path))
805}
806
807fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
808    CheckOutcome {
809        site: site.to_owned(),
810        url,
811        kind,
812        reason: None,
813        elapsed_ms: elapsed_ms(started),
814        enrichment: std::collections::BTreeMap::new(),
815        evidence: Vec::new(),
816    }
817}
818
819fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
820    CheckOutcome {
821        site: site.to_owned(),
822        url,
823        kind: MatchKind::Uncertain,
824        reason: Some(reason),
825        elapsed_ms: elapsed_ms(started),
826        enrichment: std::collections::BTreeMap::new(),
827        evidence: Vec::new(),
828    }
829}
830
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// when the `u128` millisecond count does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis = started.elapsed().as_millis();
    match u64::try_from(millis) {
        Ok(ms) => ms,
        Err(_) => u64::MAX,
    }
}
834
835#[cfg(test)]
836mod tests {
837    use super::*;
838    use crate::site::{Signal, UrlTemplate};
839    use wiremock::matchers::{any, method, path};
840    use wiremock::{Mock, MockServer, ResponseTemplate};
841
842    fn build_client() -> Client {
843        Client::builder()
844            .timeout(Duration::from_secs(2))
845            // Tests share `127.0.0.1` as host — keep throttle out of the
846            // way for everything but the dedicated throttle test below.
847            .min_request_interval(Duration::ZERO)
848            // Default retry would re-hit ban-test mocks; tests opt in
849            // explicitly when they want to exercise the retry path.
850            .max_retries(0)
851            .build()
852            .expect("client builds")
853    }
854
855    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
856        Site {
857            name: "Mock".into(),
858            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
859            signals,
860            known_present: None,
861            known_absent: None,
862            extract: Vec::new(),
863            tags: Vec::new(),
864            request_headers: std::collections::BTreeMap::new(),
865            regex_check: None,
866            engine: None,
867            strip_bad_char: None,
868            request_method: crate::site::HttpMethod::Get,
869            request_body: None,
870            protection: Vec::new(),
871        }
872    }
873
874    fn user() -> Username {
875        Username::new("alice").unwrap()
876    }
877
878    #[tokio::test]
879    async fn regex_check_short_circuits_before_any_request() {
880        // Stand up a mock that would 200 on *anything* — if probe_once
881        // failed to short-circuit on regex mismatch, the username
882        // "alice" (5 chars) would resolve to Found here.
883        let server = MockServer::start().await;
884        Mock::given(any())
885            .respond_with(ResponseTemplate::new(200))
886            .mount(&server)
887            .await;
888        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
889        // The site only accepts usernames of 8+ chars; "alice" is 5.
890        site.regex_check = Some("^[A-Za-z]{8,}$".into());
891        let outcome = build_client().check(&site, &user()).await;
892        assert_eq!(outcome.kind, MatchKind::Uncertain);
893        assert!(
894            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
895            "expected UsernameNotAllowed, got {:?}",
896            outcome.reason,
897        );
898        // No request should have hit the mock — assert by counting
899        // received_requests on the wiremock server.
900        let recvd = server.received_requests().await.unwrap_or_default();
901        assert_eq!(
902            recvd.len(),
903            0,
904            "regex_check mismatch must skip the HTTP request entirely"
905        );
906    }
907
908    #[tokio::test]
909    async fn regex_check_pass_proceeds_to_probe() {
910        let server = MockServer::start().await;
911        Mock::given(any())
912            .and(path("/alice"))
913            .respond_with(ResponseTemplate::new(200))
914            .mount(&server)
915            .await;
916        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
917        // Pattern that matches "alice".
918        site.regex_check = Some("^[a-z]{3,}$".into());
919        let outcome = build_client().check(&site, &user()).await;
920        assert_eq!(outcome.kind, MatchKind::Found);
921    }
922
923    #[tokio::test]
924    async fn status_signal_reports_found_on_match() {
925        let server = MockServer::start().await;
926        Mock::given(any())
927            .and(path("/alice"))
928            .respond_with(ResponseTemplate::new(200))
929            .mount(&server)
930            .await;
931        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
932        let outcome = build_client().check(&site, &user()).await;
933        assert_eq!(outcome.kind, MatchKind::Found);
934        assert!(outcome.url.ends_with("/alice"));
935        assert!(outcome.reason.is_none());
936        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
937    }
938
939    #[tokio::test]
940    async fn status_signal_pair_reports_not_found_on_404() {
941        let server = MockServer::start().await;
942        Mock::given(any())
943            .and(path("/alice"))
944            .respond_with(ResponseTemplate::new(404))
945            .mount(&server)
946            .await;
947        let site = site_with(
948            &server,
949            vec![
950                Signal::StatusFound { codes: vec![200] },
951                Signal::StatusNotFound { codes: vec![404] },
952            ],
953        );
954        let outcome = build_client().check(&site, &user()).await;
955        assert_eq!(outcome.kind, MatchKind::NotFound);
956        // Only the NotFound-voting signal is cited as evidence.
957        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
958    }
959
960    #[tokio::test]
961    async fn body_absent_signal_detects_missing_account() {
962        let server = MockServer::start().await;
963        Mock::given(any())
964            .and(path("/alice"))
965            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
966            .mount(&server)
967            .await;
968        let site = site_with(
969            &server,
970            vec![Signal::BodyAbsent {
971                text: "Profile not found".into(),
972            }],
973        );
974        let outcome = build_client().check(&site, &user()).await;
975        assert_eq!(outcome.kind, MatchKind::NotFound);
976    }
977
978    #[tokio::test]
979    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
980        // Phase 2 semantics: absence of an absence-marker is not evidence
981        // of presence — it just means we have no signal that fired.
982        let server = MockServer::start().await;
983        Mock::given(any())
984            .and(path("/alice"))
985            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
986            .mount(&server)
987            .await;
988        let site = site_with(
989            &server,
990            vec![Signal::BodyAbsent {
991                text: "Profile not found".into(),
992            }],
993        );
994        let outcome = build_client().check(&site, &user()).await;
995        assert_eq!(outcome.kind, MatchKind::Uncertain);
996    }
997
998    #[tokio::test]
999    async fn body_present_plus_absent_resolve_to_found() {
1000        let server = MockServer::start().await;
1001        Mock::given(any())
1002            .and(path("/alice"))
1003            .respond_with(
1004                ResponseTemplate::new(200)
1005                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
1006            )
1007            .mount(&server)
1008            .await;
1009        let site = site_with(
1010            &server,
1011            vec![
1012                Signal::BodyPresent {
1013                    text: "profile-card".into(),
1014                },
1015                Signal::BodyAbsent {
1016                    text: "Profile not found".into(),
1017                },
1018            ],
1019        );
1020        let outcome = build_client().check(&site, &user()).await;
1021        assert_eq!(outcome.kind, MatchKind::Found);
1022    }
1023
1024    #[tokio::test]
1025    async fn redirect_absent_signal_detects_missing_account() {
1026        let server = MockServer::start().await;
1027        Mock::given(any())
1028            .and(path("/alice"))
1029            .respond_with(
1030                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1031            )
1032            .mount(&server)
1033            .await;
1034        Mock::given(any())
1035            .and(path("/login"))
1036            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1037            .mount(&server)
1038            .await;
1039        let site = site_with(
1040            &server,
1041            vec![Signal::RedirectAbsent {
1042                fragment: "/login".into(),
1043            }],
1044        );
1045        let outcome = build_client().check(&site, &user()).await;
1046        assert_eq!(outcome.kind, MatchKind::NotFound);
1047    }
1048
1049    #[tokio::test]
1050    async fn negative_signal_wins_over_positive() {
1051        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
1052        // (error marker appears). Negative-priority aggregation → NotFound.
1053        // This is the canonical Sherlock "message" pattern: a site that
1054        // returns 200 for everyone and differentiates via an error string.
1055        let server = MockServer::start().await;
1056        Mock::given(any())
1057            .and(path("/alice"))
1058            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1059            .mount(&server)
1060            .await;
1061        let site = site_with(
1062            &server,
1063            vec![
1064                Signal::StatusFound { codes: vec![200] },
1065                Signal::BodyAbsent {
1066                    text: "Profile not found".into(),
1067                },
1068            ],
1069        );
1070        let outcome = build_client().check(&site, &user()).await;
1071        assert_eq!(outcome.kind, MatchKind::NotFound);
1072    }
1073
1074    #[tokio::test]
1075    async fn network_failure_yields_uncertain() {
1076        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1077        let port = listener.local_addr().unwrap().port();
1078        drop(listener);
1079
1080        let site = Site {
1081            name: "Dead".into(),
1082            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1083            signals: vec![Signal::StatusFound { codes: vec![200] }],
1084            known_present: None,
1085            known_absent: None,
1086            extract: Vec::new(),
1087            tags: Vec::new(),
1088            request_headers: std::collections::BTreeMap::new(),
1089            regex_check: None,
1090            engine: None,
1091            strip_bad_char: None,
1092            request_method: crate::site::HttpMethod::Get,
1093            request_body: None,
1094            protection: Vec::new(),
1095        };
1096        let client = Client::builder()
1097            .timeout(Duration::from_millis(500))
1098            .connect_timeout(Duration::from_millis(500))
1099            .max_retries(0)
1100            .build()
1101            .unwrap();
1102        let outcome = client.check(&site, &user()).await;
1103        assert_eq!(outcome.kind, MatchKind::Uncertain);
1104        assert!(outcome.reason.is_some());
1105    }
1106
1107    #[tokio::test]
1108    async fn throttle_spaces_consecutive_calls_to_same_host() {
1109        let server = MockServer::start().await;
1110        Mock::given(any())
1111            .and(path("/alice"))
1112            .respond_with(ResponseTemplate::new(200))
1113            .mount(&server)
1114            .await;
1115        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1116        // Interval is intentionally much larger than typical wiremock latency
1117        // (≤10 ms locally, can spike under heavy parallel test load). Any
1118        // value too close to HTTP latency would let the first request burn
1119        // through the throttle window and make the assertion flaky.
1120        let client = Client::builder()
1121            .timeout(Duration::from_secs(2))
1122            .min_request_interval(Duration::from_millis(300))
1123            .build()
1124            .unwrap();
1125
1126        client.check(&site, &user()).await;
1127        let started = Instant::now();
1128        client.check(&site, &user()).await;
1129        let elapsed = started.elapsed();
1130        assert!(
1131            elapsed >= Duration::from_millis(200),
1132            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1133        );
1134    }
1135
1136    #[tokio::test]
1137    async fn builder_overrides_user_agent() {
1138        let server = MockServer::start().await;
1139        Mock::given(any())
1140            .and(path("/alice"))
1141            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1142            .respond_with(ResponseTemplate::new(200))
1143            .mount(&server)
1144            .await;
1145        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1146        let client = Client::builder()
1147            .user_agent("adler-test/1.0")
1148            .build()
1149            .unwrap();
1150        let outcome = client.check(&site, &user()).await;
1151        assert_eq!(outcome.kind, MatchKind::Found);
1152    }
1153
1154    #[tokio::test]
1155    async fn rate_limit_429_yields_uncertain_with_note() {
1156        let server = MockServer::start().await;
1157        Mock::given(any())
1158            .and(path("/alice"))
1159            .respond_with(ResponseTemplate::new(429))
1160            .mount(&server)
1161            .await;
1162        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1163        let outcome = build_client().check(&site, &user()).await;
1164        assert_eq!(outcome.kind, MatchKind::Uncertain);
1165        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1166    }
1167
1168    #[tokio::test]
1169    async fn cloudflare_server_header_yields_uncertain() {
1170        let server = MockServer::start().await;
1171        Mock::given(any())
1172            .and(path("/alice"))
1173            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1174            .mount(&server)
1175            .await;
1176        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1177        let outcome = build_client().check(&site, &user()).await;
1178        assert_eq!(outcome.kind, MatchKind::Uncertain);
1179        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1180    }
1181
1182    #[tokio::test]
1183    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1184        // Body-based ban detection only runs when a signal already needs
1185        // the body — this site uses BodyAbsent so the body is read.
1186        let server = MockServer::start().await;
1187        Mock::given(any())
1188            .and(path("/alice"))
1189            .respond_with(
1190                ResponseTemplate::new(200)
1191                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1192            )
1193            .mount(&server)
1194            .await;
1195        let site = site_with(
1196            &server,
1197            vec![Signal::BodyAbsent {
1198                text: "Profile not found".into(),
1199            }],
1200        );
1201        let outcome = build_client().check(&site, &user()).await;
1202        assert_eq!(outcome.kind, MatchKind::Uncertain);
1203        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1204    }
1205
1206    #[tokio::test]
1207    async fn ban_detection_does_not_fire_on_legitimate_403() {
1208        let server = MockServer::start().await;
1209        Mock::given(any())
1210            .and(path("/alice"))
1211            .respond_with(ResponseTemplate::new(403))
1212            .mount(&server)
1213            .await;
1214        let site = site_with(
1215            &server,
1216            vec![
1217                Signal::StatusFound { codes: vec![200] },
1218                Signal::StatusNotFound { codes: vec![403] },
1219            ],
1220        );
1221        let outcome = build_client().check(&site, &user()).await;
1222        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1223        assert_eq!(outcome.kind, MatchKind::NotFound);
1224        assert!(outcome.reason.is_none());
1225    }
1226
1227    #[tokio::test]
1228    async fn retry_recovers_after_transient_429() {
1229        let server = MockServer::start().await;
1230        // First request: 429. Subsequent: 200.
1231        Mock::given(any())
1232            .and(path("/alice"))
1233            .respond_with(ResponseTemplate::new(429))
1234            .up_to_n_times(1)
1235            .mount(&server)
1236            .await;
1237        Mock::given(any())
1238            .and(path("/alice"))
1239            .respond_with(ResponseTemplate::new(200))
1240            .mount(&server)
1241            .await;
1242        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1243        let client = Client::builder()
1244            .timeout(Duration::from_secs(2))
1245            .min_request_interval(Duration::ZERO)
1246            .max_retries(2)
1247            .base_backoff_delay(Duration::from_millis(20))
1248            .max_backoff_delay(Duration::from_millis(100))
1249            .build()
1250            .unwrap();
1251        let outcome = client.check(&site, &user()).await;
1252        assert_eq!(outcome.kind, MatchKind::Found);
1253        assert!(outcome.reason.is_none());
1254    }
1255
1256    #[tokio::test]
1257    async fn retry_exhausts_and_returns_uncertain() {
1258        let server = MockServer::start().await;
1259        Mock::given(any())
1260            .and(path("/alice"))
1261            .respond_with(ResponseTemplate::new(429))
1262            .mount(&server)
1263            .await;
1264        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1265        let client = Client::builder()
1266            .timeout(Duration::from_secs(2))
1267            .min_request_interval(Duration::ZERO)
1268            .max_retries(2)
1269            .base_backoff_delay(Duration::from_millis(10))
1270            .max_backoff_delay(Duration::from_millis(50))
1271            .build()
1272            .unwrap();
1273        let outcome = client.check(&site, &user()).await;
1274        assert_eq!(outcome.kind, MatchKind::Uncertain);
1275        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1276    }
1277
1278    #[tokio::test]
1279    async fn retry_does_not_fire_on_network_error() {
1280        // Connection refused → Uncertain note starts with "request:", not a
1281        // ban marker. We must NOT retry — otherwise a single dead site
1282        // burns the full backoff budget before reporting.
1283        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1284        let port = listener.local_addr().unwrap().port();
1285        drop(listener);
1286        let site = Site {
1287            name: "Dead".into(),
1288            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1289            signals: vec![Signal::StatusFound { codes: vec![200] }],
1290            known_present: None,
1291            known_absent: None,
1292            extract: Vec::new(),
1293            tags: Vec::new(),
1294            request_headers: std::collections::BTreeMap::new(),
1295            regex_check: None,
1296            engine: None,
1297            strip_bad_char: None,
1298            request_method: crate::site::HttpMethod::Get,
1299            request_body: None,
1300            protection: Vec::new(),
1301        };
1302        let client = Client::builder()
1303            .timeout(Duration::from_millis(500))
1304            .connect_timeout(Duration::from_millis(500))
1305            .min_request_interval(Duration::ZERO)
1306            .max_retries(3)
1307            .base_backoff_delay(Duration::from_secs(60))
1308            .build()
1309            .unwrap();
1310        let started = Instant::now();
1311        let outcome = client.check(&site, &user()).await;
1312        // If retry fired, we'd be sleeping minutes; instead this returns
1313        // promptly with an Uncertain.
1314        assert!(started.elapsed() < Duration::from_secs(5));
1315        assert_eq!(outcome.kind, MatchKind::Uncertain);
1316        assert!(
1317            matches!(outcome.reason, Some(UncertainReason::Network(_))),
1318            "got {:?}",
1319            outcome.reason,
1320        );
1321    }
1322
1323    #[tokio::test]
1324    async fn rotates_user_agent_per_request() {
1325        // The mock only matches when the request carries one of the pooled
1326        // UAs; if rotation weren't applied, the default adler/x.y UA would
1327        // miss and the verdict would be NotFound.
1328        let server = MockServer::start().await;
1329        Mock::given(any())
1330            .and(path("/alice"))
1331            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1332            .respond_with(ResponseTemplate::new(200))
1333            .mount(&server)
1334            .await;
1335        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1336        let client = Client::builder()
1337            .min_request_interval(Duration::ZERO)
1338            .max_retries(0)
1339            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1340            .build()
1341            .unwrap();
1342        let outcome = client.check(&site, &user()).await;
1343        assert_eq!(outcome.kind, MatchKind::Found);
1344    }
1345
1346    #[test]
1347    fn invalid_proxy_url_fails_build() {
1348        let err = Client::builder().proxy("not a url").build().unwrap_err();
1349        assert!(matches!(err, Error::HttpSetup { .. }));
1350    }
1351
1352    #[test]
1353    fn schemeless_proxy_is_rejected_up_front() {
1354        // reqwest would silently treat this as a host; we require a scheme.
1355        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1356        let Error::HttpSetup { message } = err else {
1357            panic!("expected HttpSetup, got {err:?}");
1358        };
1359        assert!(message.contains("must start with"), "{message}");
1360    }
1361
1362    #[test]
1363    fn socks5_proxy_scheme_is_accepted() {
1364        // Valid scheme + endpoint builds fine (no connection is attempted).
1365        assert!(
1366            Client::builder()
1367                .proxy("socks5://127.0.0.1:9050")
1368                .build()
1369                .is_ok()
1370        );
1371    }
1372
1373    #[tokio::test]
1374    async fn global_rps_cap_spaces_requests_across_hosts() {
1375        // Two distinct host paths; per-host throttle is disabled, so any
1376        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1377        let server = MockServer::start().await;
1378        Mock::given(any())
1379            .respond_with(ResponseTemplate::new(200))
1380            .mount(&server)
1381            .await;
1382        let site_a = Site {
1383            name: "A".into(),
1384            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1385            signals: vec![Signal::StatusFound { codes: vec![200] }],
1386            known_present: None,
1387            known_absent: None,
1388            extract: Vec::new(),
1389            tags: Vec::new(),
1390            request_headers: std::collections::BTreeMap::new(),
1391            regex_check: None,
1392            engine: None,
1393            strip_bad_char: None,
1394            request_method: crate::site::HttpMethod::Get,
1395            request_body: None,
1396            protection: Vec::new(),
1397        };
1398        let site_b = Site {
1399            name: "B".into(),
1400            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1401            signals: vec![Signal::StatusFound { codes: vec![200] }],
1402            known_present: None,
1403            known_absent: None,
1404            extract: Vec::new(),
1405            tags: Vec::new(),
1406            request_headers: std::collections::BTreeMap::new(),
1407            regex_check: None,
1408            engine: None,
1409            strip_bad_char: None,
1410            request_method: crate::site::HttpMethod::Get,
1411            request_body: None,
1412            protection: Vec::new(),
1413        };
1414        // 2 RPS → ~500 ms between requests. A large interval keeps the
1415        // assertion robust even when the first probe's own duration (which
1416        // eats into the measured gap) is inflated by test instrumentation
1417        // such as coverage tooling.
1418        let client = Client::builder()
1419            .min_request_interval(Duration::ZERO)
1420            .max_retries(0)
1421            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1422            .build()
1423            .unwrap();
1424        // First request consumes the slot at t≈0; second waits ~500 ms even
1425        // though it targets a different host.
1426        client.check(&site_a, &user()).await;
1427        let started = Instant::now();
1428        client.check(&site_b, &user()).await;
1429        assert!(
1430            started.elapsed() >= Duration::from_millis(350),
1431            "global cap should space cross-host requests, got {:?}",
1432            started.elapsed(),
1433        );
1434    }
1435
1436    #[tokio::test]
1437    async fn respect_robots_skips_disallowed_paths() {
1438        let server = MockServer::start().await;
1439        Mock::given(any())
1440            .and(path("/robots.txt"))
1441            .respond_with(
1442                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1443            )
1444            .mount(&server)
1445            .await;
1446        Mock::given(any())
1447            .and(path("/no/alice"))
1448            .respond_with(ResponseTemplate::new(200))
1449            .mount(&server)
1450            .await;
1451        Mock::given(any())
1452            .and(path("/yes/alice"))
1453            .respond_with(ResponseTemplate::new(200))
1454            .mount(&server)
1455            .await;
1456        let client = Client::builder()
1457            .min_request_interval(Duration::ZERO)
1458            .max_retries(0)
1459            .respect_robots(true)
1460            .build()
1461            .unwrap();
1462
1463        let disallowed = Site {
1464            name: "No".into(),
1465            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1466            signals: vec![Signal::StatusFound { codes: vec![200] }],
1467            known_present: None,
1468            known_absent: None,
1469            extract: Vec::new(),
1470            tags: Vec::new(),
1471            request_headers: std::collections::BTreeMap::new(),
1472            regex_check: None,
1473            engine: None,
1474            strip_bad_char: None,
1475            request_method: crate::site::HttpMethod::Get,
1476            request_body: None,
1477            protection: Vec::new(),
1478        };
1479        let allowed = Site {
1480            name: "Yes".into(),
1481            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1482            signals: vec![Signal::StatusFound { codes: vec![200] }],
1483            known_present: None,
1484            known_absent: None,
1485            extract: Vec::new(),
1486            tags: Vec::new(),
1487            request_headers: std::collections::BTreeMap::new(),
1488            regex_check: None,
1489            engine: None,
1490            strip_bad_char: None,
1491            request_method: crate::site::HttpMethod::Get,
1492            request_body: None,
1493            protection: Vec::new(),
1494        };
1495
1496        let no = client.check(&disallowed, &user()).await;
1497        assert_eq!(no.kind, MatchKind::Uncertain);
1498        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1499
1500        let yes = client.check(&allowed, &user()).await;
1501        assert_eq!(yes.kind, MatchKind::Found);
1502    }
1503
1504    #[tokio::test]
1505    async fn body_read_skipped_when_no_body_signal_needed() {
1506        // Mock returns body that would fail a body_absent check — but since
1507        // we only have a status signal, body is never read.
1508        let server = MockServer::start().await;
1509        Mock::given(any())
1510            .and(path("/alice"))
1511            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1512            .mount(&server)
1513            .await;
1514        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1515        let outcome = build_client().check(&site, &user()).await;
1516        assert_eq!(outcome.kind, MatchKind::Found);
1517    }
1518
1519    // ===== Browser routing =====
1520
1521    /// Test backend that returns a canned page and counts calls. Lets the
1522    /// routing tests assert "Client did/did not invoke the browser" without
1523    /// involving a real Chrome process.
1524    #[derive(Debug)]
1525    struct RecordingBackend {
1526        page: RenderedPage,
1527        calls: std::sync::atomic::AtomicUsize,
1528    }
1529
1530    impl RecordingBackend {
1531        fn with_page(page: RenderedPage) -> Self {
1532            Self {
1533                page,
1534                calls: std::sync::atomic::AtomicUsize::new(0),
1535            }
1536        }
1537        fn call_count(&self) -> usize {
1538            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1539        }
1540    }
1541
1542    #[async_trait::async_trait]
1543    impl BrowserBackend for RecordingBackend {
1544        async fn fetch(
1545            &self,
1546            _url: &url::Url,
1547            _headers: &std::collections::BTreeMap<String, String>,
1548            _timeout: Duration,
1549        ) -> Result<RenderedPage> {
1550            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1551            Ok(self.page.clone())
1552        }
1553    }
1554
1555    fn site_bot_protected(server: &MockServer) -> Site {
1556        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1557        s.tags = vec!["bot-protected".into()];
1558        s
1559    }
1560
1561    #[tokio::test]
1562    async fn browser_routes_bot_protected_sites() {
1563        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1564        // returns its canned page directly.
1565        let server = MockServer::start().await;
1566        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1567            status: 200,
1568            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1569            body: "<html></html>".into(),
1570            elapsed_ms: 42,
1571        }));
1572        let client = Client::builder()
1573            .min_request_interval(Duration::ZERO)
1574            .max_retries(0)
1575            .browser(backend.clone())
1576            .build()
1577            .unwrap();
1578        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1579        assert_eq!(outcome.kind, MatchKind::Found);
1580        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1581    }
1582
1583    #[tokio::test]
1584    async fn non_bot_protected_sites_skip_browser() {
1585        let server = MockServer::start().await;
1586        Mock::given(any())
1587            .and(path("/alice"))
1588            .respond_with(ResponseTemplate::new(200))
1589            .mount(&server)
1590            .await;
1591        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1592            status: 500, // would make wiremock case fail if browser was taken
1593            final_url: url::Url::parse("https://x/").unwrap(),
1594            body: String::new(),
1595            elapsed_ms: 0,
1596        }));
1597        let client = Client::builder()
1598            .min_request_interval(Duration::ZERO)
1599            .max_retries(0)
1600            .browser(backend.clone())
1601            .build()
1602            .unwrap();
1603        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1604        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1605        let outcome = client.check(&site, &user()).await;
1606        assert_eq!(outcome.kind, MatchKind::Found);
1607        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1608    }
1609
1610    #[tokio::test]
1611    async fn browser_budget_exhaust_yields_uncertain() {
1612        let server = MockServer::start().await;
1613        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1614            status: 200,
1615            final_url: url::Url::parse("https://x/").unwrap(),
1616            body: String::new(),
1617            elapsed_ms: 0,
1618        }));
1619        let client = Client::builder()
1620            .min_request_interval(Duration::ZERO)
1621            .max_retries(0)
1622            .browser(backend.clone())
1623            .browser_budget(1)
1624            .build()
1625            .unwrap();
1626        let site = site_bot_protected(&server);
1627        // First call consumes the only slot.
1628        let first = client.check(&site, &user()).await;
1629        assert_eq!(first.kind, MatchKind::Found);
1630        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1631        let second = client.check(&site, &user()).await;
1632        assert_eq!(second.kind, MatchKind::Uncertain);
1633        assert!(matches!(
1634            second.reason,
1635            Some(UncertainReason::BrowserBudget)
1636        ));
1637        assert_eq!(
1638            backend.call_count(),
1639            1,
1640            "second call must not invoke backend"
1641        );
1642    }
1643
1644    #[tokio::test]
1645    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1646        struct FailingBackend;
1647        #[async_trait::async_trait]
1648        impl BrowserBackend for FailingBackend {
1649            async fn fetch(
1650                &self,
1651                _url: &url::Url,
1652                _headers: &std::collections::BTreeMap<String, String>,
1653                _timeout: Duration,
1654            ) -> Result<RenderedPage> {
1655                Err(Error::BrowserSetup {
1656                    message: "simulated crash".into(),
1657                })
1658            }
1659        }
1660        impl std::fmt::Debug for FailingBackend {
1661            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1662                f.write_str("FailingBackend")
1663            }
1664        }
1665
1666        let server = MockServer::start().await;
1667        let client = Client::builder()
1668            .min_request_interval(Duration::ZERO)
1669            .max_retries(0)
1670            .browser(Arc::new(FailingBackend))
1671            .build()
1672            .unwrap();
1673        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1674        assert_eq!(outcome.kind, MatchKind::Uncertain);
1675        match outcome.reason {
1676            Some(UncertainReason::BrowserFailed(msg)) => {
1677                assert!(msg.contains("simulated crash"), "got: {msg}");
1678            }
1679            other => panic!("expected BrowserFailed, got {other:?}"),
1680        }
1681    }
1682
1683    #[tokio::test]
1684    async fn status_only_site_uses_head_request() {
1685        // Site with only status signals (no body markers, no enrichment)
1686        // should be probed with HEAD — saves the body download on
1687        // ~30% of the registry.
1688        let server = MockServer::start().await;
1689        Mock::given(method("HEAD"))
1690            .and(path("/alice"))
1691            .respond_with(ResponseTemplate::new(200))
1692            .mount(&server)
1693            .await;
1694        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1695        let outcome = build_client().check(&site, &user()).await;
1696        assert_eq!(outcome.kind, MatchKind::Found);
1697        let recvd = server.received_requests().await.unwrap_or_default();
1698        assert_eq!(recvd.len(), 1);
1699        assert_eq!(recvd[0].method.as_str(), "HEAD");
1700    }
1701
1702    #[tokio::test]
1703    async fn body_signal_site_uses_get_request() {
1704        // Same baseline plus a body-marker signal — must still GET so
1705        // the body actually arrives for matching.
1706        let server = MockServer::start().await;
1707        Mock::given(any())
1708            .and(path("/alice"))
1709            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
1710            .mount(&server)
1711            .await;
1712        let site = site_with(
1713            &server,
1714            vec![Signal::BodyPresent {
1715                text: "hello".into(),
1716            }],
1717        );
1718        let outcome = build_client().check(&site, &user()).await;
1719        assert_eq!(outcome.kind, MatchKind::Found);
1720        let recvd = server.received_requests().await.unwrap_or_default();
1721        assert_eq!(recvd[0].method.as_str(), "GET");
1722    }
1723
1724    #[tokio::test]
1725    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
1726        // A site that declares `protection: [Cloudflare]` but doesn't
1727        // carry the legacy `bot-protected` tag should still route
1728        // through the browser backend — the new structured field is
1729        // an additional signal, not a tag replacement.
1730        let server = MockServer::start().await;
1731        Mock::given(any())
1732            .respond_with(ResponseTemplate::new(200))
1733            .mount(&server)
1734            .await;
1735        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1736        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
1737        // No bot-protected tag — pure structured-field test.
1738        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1739            status: 200,
1740            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1741            body: String::new(),
1742            elapsed_ms: 0,
1743        }));
1744        let client = Client::builder()
1745            .min_request_interval(Duration::ZERO)
1746            .max_retries(0)
1747            .browser(backend)
1748            .build()
1749            .unwrap();
1750        let outcome = client.check(&site, &user()).await;
1751        // The recording backend always returns a synthetic 200, so
1752        // Found means we went through the browser path.
1753        assert_eq!(outcome.kind, MatchKind::Found);
1754        // No raw HTTP probe should have hit the mock server.
1755        let recvd = server.received_requests().await.unwrap_or_default();
1756        assert_eq!(
1757            recvd.len(),
1758            0,
1759            "structured protection must skip the raw HTTP path"
1760        );
1761    }
1762
1763    #[tokio::test]
1764    async fn post_method_sends_body_with_username_substituted() {
1765        // A POST-probed site (e.g. Anilist GraphQL) — the username
1766        // goes in the body, not the URL. Adler should substitute
1767        // `{username}` and send a POST with the rendered payload.
1768        let server = MockServer::start().await;
1769        Mock::given(method("POST"))
1770            .and(path("/api"))
1771            .respond_with(ResponseTemplate::new(200))
1772            .mount(&server)
1773            .await;
1774        // URL substitution still requires the `{username}` placeholder,
1775        // even for POST sites where the username also lives in the
1776        // body. Most real POST endpoints encode the username in both
1777        // (e.g. query string + body); we mirror that.
1778        let site = Site {
1779            name: "ApiPost".into(),
1780            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
1781            signals: vec![Signal::StatusFound { codes: vec![200] }],
1782            known_present: None,
1783            known_absent: None,
1784            extract: Vec::new(),
1785            tags: Vec::new(),
1786            request_headers: std::collections::BTreeMap::new(),
1787            regex_check: None,
1788            engine: None,
1789            strip_bad_char: None,
1790            request_method: HttpMethod::Post,
1791            request_body: Some(r#"{"name":"{username}"}"#.into()),
1792            protection: Vec::new(),
1793        };
1794        let outcome = build_client().check(&site, &user()).await;
1795        assert_eq!(outcome.kind, MatchKind::Found);
1796        let recvd = server.received_requests().await.unwrap_or_default();
1797        assert_eq!(recvd.len(), 1);
1798        assert_eq!(recvd[0].method.as_str(), "POST");
1799        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
1800        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
1801    }
1802
1803    #[tokio::test]
1804    async fn head_405_falls_back_to_get() {
1805        // A server that rejects HEAD with 405 — Adler should silently
1806        // retry with GET so the optimisation can never cost accuracy.
1807        let server = MockServer::start().await;
1808        Mock::given(method("HEAD"))
1809            .and(path("/alice"))
1810            .respond_with(ResponseTemplate::new(405))
1811            .mount(&server)
1812            .await;
1813        Mock::given(any())
1814            .and(path("/alice"))
1815            .respond_with(ResponseTemplate::new(200))
1816            .mount(&server)
1817            .await;
1818        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1819        let outcome = build_client().check(&site, &user()).await;
1820        assert_eq!(outcome.kind, MatchKind::Found);
1821        let recvd = server.received_requests().await.unwrap_or_default();
1822        assert_eq!(recvd.len(), 2);
1823        assert_eq!(recvd[0].method.as_str(), "HEAD");
1824        assert_eq!(recvd[1].method.as_str(), "GET");
1825    }
1826}